diff --git a/NEWS.md b/NEWS.md index 693b46f01..8a1a52ae6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -40,6 +40,142 @@ program: args_list(f_opt(number), opt_tail(string), number) https://github.com/ruby/lrama/pull/779 +### [EXPERIMENTAL] Support core PSLR(1) parser generation + +Added experimental support for generating a PSLR(1)-style parser based on this dissertation. +https://open.clemson.edu/all_dissertations/519/ + +This adds the following PSLR-related grammar directives and integration points: + +- `%define lr.type pslr` enables PSLR parser generation +- `%token-pattern` declares token candidates and their regular expressions for PSLR-aware lexical disambiguation +- `%lex-prec` declares explicit lexical precedence for overlapping token patterns +- `%symbol-set` declares reusable sets of terminal tokens for PSLR lexical declarations +- `%lex-tie` expands parser-state acceptable-token sets for tied terminals +- `%lex-no-tie` records an explicit no-tie decision for terminals with overlapping token patterns +- `YYLAYOUT*` token patterns are recognized in every parser state and discarded by PSLR-aware lexers +- `%define pslr.max-states` and `%define pslr.max-state-ratio` are Lrama-specific safety guards for state growth +- `%define api.pslr.state-member` names the parser-state field to be shared with the lexer when using the generated helper macros + +Typical usage looks like this: + +```yacc +%define api.pure +%define lr.type pslr +%define api.pslr.state-member current_state + +%parse-param {struct parse_params *p} +%lex-param {struct parse_params *p} + +%token-pattern RSHIFT />>/ "right shift" +%token-pattern RANGLE />/ "right angle" +%token-pattern ID /[a-z]+/ + +%lex-no-tie RANGLE RSHIFT +``` + +In this setup, `%token-pattern` lists the tokens that the generated pseudo-scanner FSA should consider, and +`%lex-no-tie` records that `RANGLE` and `RSHIFT` should not be treated as tied tokens. 
In a template-closing +parser state only `RANGLE` is syntactically acceptable, while a shift-expression state accepts `RSHIFT`; PSLR +state splitting and scanner profiles choose between the two without a global shortest-match rule. Use `%lex-prec` +for real lexical precedence relations, such as comment patterns that need a shortest-match or longest-match rule +in every parser context. + +For normal parser-state scanner rows, unresolved pseudo-scanner conflicts are not resolved by token declaration +order. They are reported as errors so the grammar can add an explicit `%lex-prec`, `%lex-tie`, or `%lex-no-tie` +declaration. For syntax-error handling, Lrama also emits a fallback scanner row. The fallback row first applies +explicit PSLR lexical precedence declarations. If a scanner conflict remains unresolved only because it is not a +pseudo-scanner conflict for any parser state, the fallback row completes the decision with scanner-default rules: +length conflicts use longest match and identity conflicts use token declaration order only for token pairs without +an explicit identity precedence relation. These defaults are composed with the explicit graph, so explicit identity +precedence is still honored when fallback length precedence is needed. For normal parser-state rows, explicit +precedence cycles remain unresolved and are reported as PSLR scanner conflicts. For the syntax-error fallback row +only, if explicit PSLR precedence plus fallback length defaults still do not determine a unique identity winner, +Lrama completes the fallback decision with traditional token declaration order so that a token-pattern match is +returned whenever `M(input, T0)` is non-empty. If no token pattern matches at all, the PSLR helper consumes one +byte and returns `YYUNDEF` as a character-token fallback, so error paths do not loop forever. 
+ +`%lex-prec` uses ASCII spellings for the PSLR lexical precedence operators: + +| Lrama | Meaning | +|---|---| +| `<~` | identity conflict: right token wins; length conflict: longest match wins | +| `<-` | identity conflict: right token wins | +| `-~` | length conflict: longest match wins | +| `<<` | identity and length conflicts: right token wins | +| `-<` | length conflict: right token wins | +| `<s` | identity conflict: right token wins; length conflict: shortest match wins | +| `-s` | length conflict: shortest match wins | + +The shortest-match operators are not meant for context-dependent cases such as `>` versus `>>`; those cases should be handled by PSLR state +splitting plus `%lex-no-tie` when the tokens are intentionally not tied. + +Contradictory length precedence declarations are rejected instead of being silently overwritten. For example, +declaring both `%lex-prec A -~ B` and `%lex-prec A -s B` reports the token pair, the two operators and lines, and +the scan direction where the contradiction occurs. + +Lexical ties are separate from precedence. For example: + +```yacc +%token-pattern IF /if/ +%token-pattern ID /[a-z]+/ +%symbol-set keywords IF +%lex-tie ID keywords +%lex-prec ID <~ keywords +``` + +Here, `IF` can be considered when the parser state accepts `ID`, but `%lex-tie` does not choose a winner. +The `%lex-prec ID <~ keywords` declaration resolves the `if` identity conflict in favor of `IF` while keeping +longer identifiers such as `ifx` as `ID`. + +When both operands of `%lex-tie` are explicit tokens, Lrama ties them even if their token patterns do not conflict. +When either operand is a `%symbol-set` or `yyall`, Lrama follows the PSLR paper and ties only token pairs that have +a pairwise scanner conflict. This avoids unnecessary transitive lexical ties and the pseudo-scanner conflicts they +would create. + +`%lex-no-tie` suppresses lexical tie candidate warnings; it does not break a final transitive tie closure. Generic +declarations such as `%lex-no-tie yyall yyall` can suppress broad candidate reports, and a more specific `%lex-tie` +can still tie the relevant token pair. 
+ +Token patterns named `YYLAYOUT` or starting with `YYLAYOUT` are layout tokens. They are included in every +parser-state scanner row and should be consumed and skipped by the PSLR-aware lexer instead of being returned to +the parser. The generated helpers include `YYPSLR_TOKEN_IS_LAYOUT(Token)` and the structured +`YYPSLR_PSEUDO_SCAN_RESULT(...)` API for this purpose. + +`yy_pseudo_scan_result` is a low-level scanner helper and may report a layout token with `result->is_layout = 1`. +A PSLR-aware `yylex` must consume that text, keep the same parser state, and scan the remaining input instead of +passing the layout token to the parser. + +The generated PSLR scanner FSA considers terminals declared with `%token-pattern` and character literal terminals +for which Lrama can synthesize an exact-match implicit token pattern. Grammar terminals without either a token +pattern or a supported literal spelling remain outside the generated pseudo-scanner helper. Thus, the paper's `T0` +fallback token set corresponds here to the set of terminals known to the generated scanner FSA. Parser-state rows +use the subset accepted by the current state plus tied tokens and layout tokens. The fallback row uses the whole +generated scanner universe so error handling can still identify and consume a token when the current parser state +has no normal scanner decision. + +`%token-pattern` currently uses an ASCII byte-oriented regular-expression subset for PSLR pseudo scanning. +Supported constructs are literals, escaped literals such as `\/`, `\*`, `\+`, `\?`, `\(`, `\)`, `\[`, `\]`, +and `\\`, grouping with `(...)`, alternation with `|`, repetition operators `*`, `+`, `?`, character classes with +escapes such as `[\]]`, `[\\]`, and `[\n\t\r]`, ranges such as `[a-z]`, negated classes such as `[^*]`, common +escapes such as `\n`, `\t`, and `\r`, and `.`. The `.` operator matches ASCII bytes except newline; negated +character classes range over ASCII bytes. 
Unicode properties and full Ruby/Onigmo regexp syntax are not supported. +Unsupported or malformed constructs are rejected during generation rather than silently reinterpreted. Nullable +token patterns such as `//`, `/()/`, `/a*/`, `/a?/`, and `/a|/` are generation errors because PSLR token lexemes +must be non-empty. + +When the parser and lexer share a context through `%parse-param` / `%lex-param`, the generated header also +provides helpers such as `YYPSLR_PSEUDO_SCAN(...)`, so the lexer can choose a token based on the current parser +state. The paper-compatible scanning path needs the lexer to pass the unconsumed input prefix, not only an +already-decided token fragment, so legacy external lexer bridges may still be limited by the text they provide. + +PSLR parsers enable a lightweight LAC check in the generated parser so syntax errors caused by LR state merging, +default reductions, or `%nonassoc` error actions are detected before user semantic actions are run for the bad +lookahead. PSLR support is still experimental. Scoped lexical declarations, lexical nonterminals, and `%lex` +are not implemented yet. If you find any bugs, please report them. 
+ ## Lrama 0.7.1 (2025-12-24) ### Optimize IELR diff --git a/lib/lrama.rb b/lib/lrama.rb index 56ba0044d..c676b32d3 100644 --- a/lib/lrama.rb +++ b/lib/lrama.rb @@ -15,8 +15,10 @@ require_relative "lrama/output" require_relative "lrama/parser" require_relative "lrama/reporter" +require_relative "lrama/scanner_fsa" require_relative "lrama/state" require_relative "lrama/states" +require_relative "lrama/length_precedences" require_relative "lrama/tracer" require_relative "lrama/version" require_relative "lrama/warnings" diff --git a/lib/lrama/command.rb b/lib/lrama/command.rb index 17aad1a1c..236dca054 100644 --- a/lib/lrama/command.rb +++ b/lib/lrama/command.rb @@ -30,11 +30,11 @@ def execute_command_workflow text = read_input grammar = build_grammar(text) states, context = compute_status(grammar) + states.validate!(@logger) render_reports(states) if @options.report_file @tracer.trace(grammar) render_diagram(grammar) render_output(context, grammar) - states.validate!(@logger) @warnings.warn(grammar, states) end @@ -84,7 +84,11 @@ def prepare_grammar(grammar) def compute_status(grammar) states = Lrama::States.new(grammar, @tracer) states.compute - states.compute_ielr if grammar.ielr_defined? + if grammar.pslr_defined? + states.compute_pslr + elsif grammar.ielr_defined? + states.compute_ielr + end [states, Lrama::Context.new(states)] end diff --git a/lib/lrama/context.rb b/lib/lrama/context.rb index eb068c1b9..2363db999 100644 --- a/lib/lrama/context.rb +++ b/lib/lrama/context.rb @@ -224,6 +224,7 @@ def compute_yydefact if state.reduces.map(&:selected_look_ahead).any? {|la| !la.empty? } # Iterate reduces with reverse order so that first rule is used. 
state.reduces.reverse_each do |reduce| + next unless reduce.look_ahead reduce.look_ahead.each do |term| actions[term.number] = rule_id_to_action_number(reduce.rule.id) end diff --git a/lib/lrama/grammar.rb b/lib/lrama/grammar.rb index 95a80bb01..327be48b7 100644 --- a/lib/lrama/grammar.rb +++ b/lib/lrama/grammar.rb @@ -20,6 +20,11 @@ require_relative "grammar/symbols" require_relative "grammar/type" require_relative "grammar/union" +require_relative "grammar/token_action" +require_relative "grammar/token_pattern" +require_relative "grammar/lex_prec" +require_relative "grammar/lex_tie" +require_relative "grammar/lexer_context" require_relative "lexer" module Lrama @@ -40,6 +45,11 @@ class Grammar # def nterms: () -> Array[Grammar::Symbol] # def find_symbol_by_s_value!: (::String s_value) -> Grammar::Symbol # def ielr_defined?: () -> bool + # def pslr_defined?: () -> bool + # def token_patterns: () -> Array[Grammar::TokenPattern] + # def lex_prec: () -> Grammar::LexPrec + # def pslr_max_states: () -> Integer? + # def pslr_max_state_ratio: () -> Float? # end # # include Symbols::Resolver::_DelegatedMethods @@ -68,6 +78,8 @@ class Grammar # @union: Union # @precedences: Array[Precedence] # @start_nterm: Lrama::Lexer::Token::Base? + # @token_patterns: Array[Grammar::TokenPattern] + # @lex_prec: Grammar::LexPrec extend Forwardable @@ -100,6 +112,18 @@ class Grammar attr_accessor :locations #: bool attr_accessor :define #: Hash[String, String] attr_accessor :required #: bool + attr_reader :token_patterns #: Array[Grammar::TokenPattern] + attr_reader :lex_prec #: Grammar::LexPrec + attr_reader :symbol_sets #: Hash[String, Array[Lexer::Token::Base]] + attr_reader :lex_tie #: Grammar::LexTie + attr_reader :lexer_contexts #: Hash[String, Grammar::LexerContext] + attr_reader :token_actions #: Array[Grammar::TokenAction] + + # Argument symbol names for each parameterized rule expansion. 
+ # @rbs () -> Hash[String, Array[String]] + def parameterized_expansion_args + @parameterized_resolver.expansion_args + end def_delegators "@symbols_resolver", :symbols, :nterms, :terms, :add_nterm, :add_term, :find_term_by_s_value, :find_symbol_by_number!, :find_symbol_by_id!, :token_to_symbol, @@ -133,6 +157,14 @@ def initialize(rule_counter, locations, define = {}) @required = false @precedences = [] @start_nterm = nil + @token_patterns = [] + @lex_prec = Grammar::LexPrec.new + @symbol_sets = {} + @lex_tie = Grammar::LexTie.new + @lexer_contexts = {} + @lexer_context_counter = 0 + @token_pattern_counter = 0 + @token_actions = [] append_special_symbols end @@ -277,6 +309,7 @@ def validate! validate_no_precedence_for_nterm! validate_rule_lhs_is_nterm! validate_duplicated_precedence! + validate_pslr_configuration! end # @rbs (Grammar::Symbol sym) -> Array[Rule] @@ -304,8 +337,247 @@ def ielr_defined? @define.key?('lr.type') && @define['lr.type'] == 'ielr' end + # @rbs () -> bool + def pslr_defined? + @define.key?('lr.type') && @define['lr.type'] == 'pslr' + end + + # @rbs () -> String? + def pslr_state_member + @define['api.pslr.state-member'] + end + + # @rbs () -> Integer? + def pslr_max_states + parse_pslr_positive_integer('pslr.max-states') + end + + # @rbs () -> Float? 
+ def pslr_max_state_ratio + parse_pslr_positive_float('pslr.max-state-ratio') + end + + # Add a token pattern from %token-pattern directive + # @rbs (id: Lexer::Token::Ident, pattern: Lexer::Token::Regex, ?alias_name: String?, ?tag: Lexer::Token::Tag?, lineno: Integer) -> Grammar::TokenPattern + def add_token_pattern(id:, pattern:, alias_name: nil, tag: nil, lineno:) + token_pattern = Grammar::TokenPattern.new( + id: id, + pattern: pattern, + alias_name: alias_name, + tag: tag, + lineno: lineno, + definition_order: @token_pattern_counter + ) + @token_pattern_counter += 1 + @token_patterns << token_pattern + + # Also register as a terminal symbol + add_term(id: id, alias_name: alias_name, tag: tag) + + token_pattern + end + + # Add a lex-prec rule from %lex-prec directive + # @rbs (left_token: Lexer::Token::Ident, operator: Symbol, right_token: Lexer::Token::Ident, lineno: Integer) -> Grammar::LexPrec::Rule + def add_lex_prec_rule(left_token:, operator:, right_token:, lineno:) + @lex_prec.add_rule( + left_token: left_token, + operator: operator, + right_token: right_token, + lineno: lineno + ) + end + + # Add a symbol set from %symbol-set directive. + # @rbs (name: String, symbols: Array[Lexer::Token::Base]) -> Array[Lexer::Token::Base] + def add_symbol_set(name:, symbols:) + @symbol_sets[name] = symbols + end + + # Add lexical tie relationships from %lex-tie directive. + # @rbs (operands: Array[Lexer::Token::Base]) -> void + def add_lex_tie(operands:) + groups = operands.map { |op| build_operand_group(op) } + @lex_tie.add_tie_declaration(groups: groups) + end + + # Add no-tie declarations from %lex-no-tie directive. 
+ # @rbs (operands: Array[Lexer::Token::Base]) -> void + def add_lex_no_tie(operands:) + groups = operands.map { |op| build_operand_group(op) } + @lex_tie.add_no_tie_declaration(groups: groups) + end + + # Add a token action from %token-action directive + # @rbs (id: Lexer::Token::Ident, code: Lexer::Token::UserCode, lineno: Integer) -> Grammar::TokenAction + def add_token_action(id:, code:, lineno:) + action = Grammar::TokenAction.new(token_id: id, code: code, lineno: lineno) + @token_actions << action + action + end + + # @rbs () -> Set[String] + def layout_token_names + @token_patterns.select(&:layout?).map(&:name).to_set + end + + # @rbs (ScannerFSA scanner_fsa) -> void + def finalize_lexical_ties!(scanner_fsa) + token_names = @token_patterns.map(&:name) + conflict_pairs = compute_scanner_conflict_pairs(scanner_fsa) + @lex_tie.finalize!(token_names, conflict_pairs) + end + + REGEX_LITERAL_ESCAPES = %w[. [ ] ( ) { } + * ? ^ $ | \\ /].freeze #: Array[String] + REGEX_CONTROL_ESCAPES = { "\\\\" => "\\\\", "\\n" => "\\n", "\\t" => "\\t", "\\r" => "\\r", "\\f" => "\\f", "\\v" => "\\v", "\\b" => "\\b" }.freeze #: Hash[String, String] + + # @rbs () -> void + def synthesize_implicit_literal_token_patterns! + char_literals = terms.select do |sym| + sym.id.is_a?(Lrama::Lexer::Token::Char) + end + + char_literals.each do |sym| + name = sym.id.s_value + next if @token_patterns.any? 
{ |tp| tp.name == name } + + inner = name[1..-2].to_s + regex = if REGEX_CONTROL_ESCAPES.key?(inner) + REGEX_CONTROL_ESCAPES[inner] + else + escaped = Regexp.escape(inner) + escaped.gsub("/", "\\/") + end + + pattern = Lrama::Lexer::Token::Regex.new(s_value: "/#{regex}/") + add_token_pattern(id: sym.id, pattern: pattern, lineno: 0) + end + end + + # Add a lexer context from %lexer-context directive + # @rbs (name: String, symbols: Array[Lexer::Token::Ident]) -> Grammar::LexerContext + def add_lexer_context(name:, symbols:) + unless ctx = @lexer_contexts[name] + ctx = Grammar::LexerContext.new(name: name, index: @lexer_context_counter) + @lexer_context_counter += 1 + @lexer_contexts[name] = ctx + end + ctx.add_symbols(symbols) + ctx + end + + # Find a token pattern by its name + # @rbs (String name) -> Grammar::TokenPattern? + def find_token_pattern(name) + @token_patterns.find { |tp| tp.name == name } + end + private + # @rbs (Lexer::Token::Base operand) -> Grammar::LexTie::OperandGroup + def build_operand_group(operand) + name = operand.s_value + if name == "*" || name == "yyall" + Grammar::LexTie::OperandGroup.new(names: [], kind: :all) + elsif @symbol_sets.key?(name) + Grammar::LexTie::OperandGroup.new(names: @symbol_sets[name].map(&:s_value), kind: :symbol_set) + else + Grammar::LexTie::OperandGroup.new(names: [name], kind: :token) + end + end + + # Compute pairs of tokens that have scanner conflicts. + # Two tokens conflict if they share an accepting state or + # if one's accepting state is reachable from the other's via transitions. + # @rbs (ScannerFSA scanner_fsa) -> Set[[String, String]] + def compute_scanner_conflict_pairs(scanner_fsa) + pairs = Set.new + + scanner_fsa.states.each do |state| + next unless state.accepting? + + # Same accepting state + tokens = state.accepting_tokens.map(&:name) + tokens.combination(2) do |a, b| + next unless a && b + pairs << (a <= b ? 
[a, b] : [b, a]) + end + + # Reachable accepting states (prefix/length conflict) + reachable = find_reachable_accepting(scanner_fsa, state) + reachable.each do |other| + state.accepting_tokens.each do |t1| + other.accepting_tokens.each do |t2| + a, b = t1.name, t2.name + pairs << (a <= b ? [a, b] : [b, a]) + end + end + end + end + + pairs + end + + # @rbs (ScannerFSA scanner_fsa, ScannerFSA::State start) -> Array[ScannerFSA::State] + def find_reachable_accepting(scanner_fsa, start) + visited = Set.new([start.id]) + queue = start.transitions.values.dup + result = [] + + while !queue.empty? + next_id = queue.shift + next if visited.include?(next_id) + visited << next_id + + next_state = scanner_fsa.states[next_id] + next unless next_state + + result << next_state if next_state.accepting? + next_state.transitions.each_value { |id| queue << id } + end + + result + end + + # @rbs () -> void + def validate_pslr_configuration! + return unless pslr_defined? + + member = pslr_state_member + if member && member !~ /\A[a-zA-Z_][a-zA-Z0-9_]*\z/ + raise %(%define api.pslr.state-member must be a valid C identifier, got "#{member}".) + end + + pslr_max_states + pslr_max_state_ratio + end + + # @rbs (String key) -> Integer? + def parse_pslr_positive_integer(key) + value = @define[key] + return nil if value.nil? || value.empty? + + parsed = Integer(value, 10) + raise %(%define #{key} must be greater than 0, got "#{value}".) unless 0 < parsed + + parsed + rescue ArgumentError + raise %(%define #{key} must be an integer, got "#{value}".) + end + + # @rbs (String key) -> Float? + def parse_pslr_positive_float(key) + value = @define[key] + return nil if value.nil? || value.empty? + + parsed = Float(value) + raise %(%define #{key} must be greater than or equal to 1.0, got "#{value}".) unless 1.0 <= parsed + + parsed + rescue ArgumentError + raise %(%define #{key} must be a number, got "#{value}".) + end + # @rbs () -> void def sort_precedence @precedences.sort_by! 
do |prec| diff --git a/lib/lrama/grammar/lex_prec.rb b/lib/lrama/grammar/lex_prec.rb new file mode 100644 index 000000000..17b38e81e --- /dev/null +++ b/lib/lrama/grammar/lex_prec.rb @@ -0,0 +1,164 @@ +# rbs_inline: enabled +# frozen_string_literal: true + +module Lrama + class Grammar + # Represents lexical precedence rules defined by %lex-prec directive + # Based on Definition 3.2.3, 3.2.4, 3.2.10 from the PSLR dissertation + # + # Example: %lex-prec RANGLE -s RSHIFT # RANGLE is shorter than RSHIFT + # %lex-prec IF - ID # IF has higher priority than ID (same length) + class LexPrec + # Precedence relation types (legacy) + SAME_PRIORITY = :same #: Symbol + HIGHER = :higher #: Symbol + SHORTER = :shorter #: Symbol + + # PSLR lex-prec operator types + # <~ identity conflict: right token wins; length conflict: longest wins + # <- identity conflict: right token wins + # -~ length conflict: longest wins + # << identity and length conflicts: right token wins + # -< length conflict: right token wins + # void + def initialize(left_token:, operator:, right_token:, lineno:) + @left_token = left_token + @operator = operator + @right_token = right_token + @lineno = lineno + end + + # @rbs () -> String + def left_name + @left_token.s_value + end + + # @rbs () -> String + def right_name + @right_token.s_value + end + end + + attr_reader :rules #: Array[Rule] + + # @rbs () -> void + def initialize + @rules = [] + end + + # @rbs (left_token: Lexer::Token::Ident, operator: Symbol, right_token: Lexer::Token::Ident, lineno: Integer) -> Rule + def add_rule(left_token:, operator:, right_token:, lineno:) + rule = Rule.new( + left_token: left_token, + operator: operator, + right_token: right_token, + lineno: lineno + ) + @rules << rule + rule + end + + # Check if token t1 has higher priority than t2 + # Based on Definition 3.2.4 + # @rbs (String t1, String t2) -> bool + def higher_priority?(t1, t2) + @rules.any? 
do |rule| + rule.operator == HIGHER && + rule.left_name == t1 && + rule.right_name == t2 + end + end + + # Check if token t1 has shorter-match priority over t2 + # Based on Definition 3.2.15 + # @rbs (String t1, String t2) -> bool + def shorter_priority?(t1, t2) + @rules.any? do |rule| + rule.operator == SHORTER && + rule.left_name == t1 && + rule.right_name == t2 + end + end + + # Check if tokens t1 and t2 are in a lex-tie relationship + # @rbs (String t1, String t2) -> bool + def same_priority?(t1, t2) + @rules.any? do |rule| + rule.operator == SAME_PRIORITY && + ((rule.left_name == t1 && rule.right_name == t2) || + (rule.left_name == t2 && rule.right_name == t1)) + end + end + + # True when winner explicitly wins an identity conflict against loser. + # The relation is intentionally not transitive. + # @rbs (String winner, String loser, ?track: bool) -> bool + def identity_precedes?(winner, loser, track: false) + @rules.any? do |rule| + IDENTITY_OPERATORS.include?(rule.operator) && + rule.left_name == loser && + rule.right_name == winner + end + end + + # True when rule declares a longest-match length relation for the pair. + # @rbs (String token1, String token2) -> bool + def longest_pair?(token1, token2) + pair_rule?(token1, token2, LONGEST_OPERATORS) + end + + # True when rule declares a shortest-match length relation for the pair. + # @rbs (String token1, String token2) -> bool + def shortest_pair?(token1, token2) + pair_rule?(token1, token2, SHORTEST_OPERATORS) + end + + # Returns the explicit right-token length winner for a pair, if any. + # @rbs (String token1, String token2) -> String? 
+ def right_token_length_winner(token1, token2) + @rules.each do |rule| + next unless RIGHT_TOKEN_LENGTH_OPERATORS.include?(rule.operator) + if (rule.left_name == token1 && rule.right_name == token2) || + (rule.left_name == token2 && rule.right_name == token1) + return rule.right_name + end + end + nil + end + + private + + # @rbs (String token1, String token2, Array[Symbol] operators) -> bool + def pair_rule?(token1, token2, operators) + @rules.any? do |rule| + operators.include?(rule.operator) && + ((rule.left_name == token1 && rule.right_name == token2) || + (rule.left_name == token2 && rule.right_name == token1)) + end + end + end + end +end diff --git a/lib/lrama/grammar/lex_tie.rb b/lib/lrama/grammar/lex_tie.rb new file mode 100644 index 000000000..b4d89557c --- /dev/null +++ b/lib/lrama/grammar/lex_tie.rb @@ -0,0 +1,290 @@ +# rbs_inline: enabled +# frozen_string_literal: true + +require "set" + +module Lrama + class Grammar + # Stores PSLR lexical ties and explicit no-tie declarations. + # + # Lexical ties expand acc(sp); they never resolve a scanner conflict by + # themselves. Conflict selection is still handled by %lex-prec. 
+    # Records %lex-tie / %lex-no-tie declarations and resolves them into tie
+    # sets (`ties`) and explicit no-tie pairs (`no_ties`).
+    class LexTie
+      # One operand of a declaration. `kind` is compared against :token and
+      # :all by the specificity rules; an :all group stands for every token name.
+      class OperandGroup
+        attr_reader :names #: Array[String]
+        attr_reader :kind #: ::Symbol
+
+        # @rbs (names: Array[String], kind: ::Symbol) -> void
+        def initialize(names:, kind:)
+          @names = names
+          @kind = kind
+        end
+      end
+
+      # A recorded declaration: its kind (:tie or :no_tie when created through
+      # add_tie_declaration / add_no_tie_declaration), its operand groups and
+      # the line number it was recorded with.
+      class Declaration
+        attr_reader :kind #: ::Symbol
+        attr_reader :groups #: Array[OperandGroup]
+        attr_reader :lineno #: Integer
+
+        # @rbs (kind: ::Symbol, groups: Array[OperandGroup], lineno: Integer) -> void
+        def initialize(kind:, groups:, lineno:)
+          @kind = kind
+          @groups = groups
+          @lineno = lineno
+        end
+      end
+
+      # The declaration currently winning for one token pair, together with the
+      # specificity it won with.
+      class Decision
+        attr_reader :kind #: ::Symbol
+        attr_reader :specificity #: Integer
+        attr_reader :lineno #: Integer
+
+        # @rbs (kind: ::Symbol, specificity: Integer, lineno: Integer) -> void
+        def initialize(kind:, specificity:, lineno:)
+          @kind = kind
+          @specificity = specificity
+          @lineno = lineno
+        end
+      end
+
+      attr_reader :ties #: Hash[String, Set[String]]
+      attr_reader :no_ties #: Set[[String, String]]
+      attr_reader :declarations #: Array[Declaration]
+
+      # @rbs () -> void
+      def initialize
+        # Unknown names default to a singleton tie set containing only themselves.
+        @ties = Hash.new { |h, k| h[k] = Set.new([k]) }
+        @no_ties = Set.new
+        @declarations = []
+      end
+
+      # Merge the tie sets of `left` and `right`; every member of the merged
+      # set receives its own copy of it.
+      #
+      # @rbs (String left, String right) -> void
+      def add_tie(left, right)
+        left_set = tied_names(left)
+        right_set = tied_names(right)
+        merged = left_set | right_set
+
+        merged.each do |name|
+          @ties[name] = merged.dup
+        end
+      end
+
+      # Record an explicit no-tie pair (stored in normalized order).
+      #
+      # @rbs (String left, String right) -> void
+      def add_no_tie(left, right)
+        @no_ties << pair_key(left, right)
+      end
+
+      # @rbs (groups: Array[OperandGroup], ?lineno: Integer) -> void
+      def add_tie_declaration(groups:, lineno: 0)
+        @declarations << Declaration.new(kind: :tie, groups: groups, lineno: lineno)
+      end
+
+      # @rbs (groups: Array[OperandGroup], ?lineno: Integer) -> void
+      def add_no_tie_declaration(groups:, lineno: 0)
+        @declarations << Declaration.new(kind: :no_tie, groups: groups, lineno: lineno)
+      end
+
+      # Resolve every recorded declaration into per-pair decisions and rebuild
+      # `ties` / `no_ties` from them. Both relations are replaced, so entries
+      # added directly through add_tie / add_no_tie do not survive this call.
+      # Raises RuntimeError when a tie and a no-tie of equal specificity target
+      # the same pair, or when a no-tie contradicts the tie closure.
+      #
+      # @rbs (Array[String] token_names, Set[[String, String]] conflict_pairs) -> void
+      def finalize!(token_names, conflict_pairs)
+        decisions = {} #: Hash[[String, String], Decision]
+
+        @declarations.each do |declaration|
+          declaration_pairs(declaration, token_names, conflict_pairs).each do |pair, specificity|
+            apply_decision(decisions, pair, Decision.new(kind: declaration.kind, specificity: specificity, lineno: declaration.lineno))
+          end
+        end
+
+        rebuild_relations(token_names, decisions)
+      end
+
+      # Returns a copy, so callers cannot mutate the stored set.
+      #
+      # @rbs (String name) -> Set[String]
+      def tied_names(name)
+        @ties[name].dup
+      end
+
+      # @rbs (String left, String right) -> bool
+      def tied?(left, right)
+        tied_names(left).include?(right)
+      end
+
+      # @rbs (String left, String right) -> bool
+      def no_tie?(left, right)
+        @no_ties.include?(pair_key(left, right))
+      end
+
+      # No-tie pairs whose two members are nevertheless in the same tie set.
+      #
+      # @rbs () -> Array[[String, String]]
+      def no_ties_conflicting_with_ties
+        @no_ties.select do |left, right|
+          tied?(left, right)
+        end
+      end
+
+      private
+
+      # Keep the most specific decision per pair. An earlier decision of equal
+      # specificity and the same kind is kept; equal specificity with a
+      # different kind is a hard error.
+      #
+      # @rbs (Hash[[String, String], Decision] decisions, [String, String] pair, Decision decision) -> void
+      def apply_decision(decisions, pair, decision)
+        current = decisions[pair]
+        if current.nil? || current.specificity < decision.specificity
+          decisions[pair] = decision
+          return
+        end
+
+        if current.specificity == decision.specificity && current.kind != decision.kind
+          raise "%lex-tie and %lex-no-tie conflict for #{pair.join(' ')}."
+        end
+      end
+
+      # Expand a declaration into [pair, specificity] entries, one per pair of
+      # distinct names taken from two different operand groups.
+      #
+      # @rbs (Declaration declaration, Array[String] token_names, Set[[String, String]] conflict_pairs) -> Array[[[String, String], Integer]]
+      def declaration_pairs(declaration, token_names, conflict_pairs)
+        pairs = [] #: Array[[[String, String], Integer]]
+        # :tie and :no_tie share one filter rule, so the kind is tested once
+        # instead of repeating the same branch per kind.
+        filterable = declaration.kind == :tie || declaration.kind == :no_tie
+
+        declaration.groups.combination(2) do |left_group, right_group|
+          next unless left_group && right_group
+
+          specificity = group_specificity(left_group, right_group)
+          # Only token-vs-token operands (specificity 3) bypass the conflict filter.
+          conflicts_only = filterable && specificity < 3
+          left_names = names_for_group(left_group, token_names)
+          right_names = names_for_group(right_group, token_names)
+
+          left_names.product(right_names).each do |left, right|
+            next if left == right
+
+            pair = pair_key(left, right)
+            next if conflicts_only && !conflict_pairs.include?(pair)
+
+            pairs << [pair, specificity]
+          end
+        end
+
+        pairs
+      end
+
+      # @rbs (OperandGroup group, Array[String] token_names) -> Array[String]
+      def names_for_group(group, token_names)
+        return token_names if group.kind == :all
+
+        group.names
+      end
+
+      # 3: token vs token, 2: exactly one side is a token, 0: both sides :all,
+      # 1: anything else.
+      #
+      # @rbs (OperandGroup left, OperandGroup right) -> Integer
+      def group_specificity(left, right)
+        return 3 if left.kind == :token && right.kind == :token
+        return 0 if left.kind == :all && right.kind == :all
+        return 2 if left.kind == :token || right.kind == :token
+
+        1
+      end
+
+      # Rebuild `@ties` (union-find over the :tie decisions) and `@no_ties`.
+      # A :no_tie decision on a pair that the tie closure already joins is
+      # dropped when it is less specific than that closure and raises otherwise.
+      #
+      # @rbs (Array[String] token_names, Hash[[String, String], Decision] decisions) -> void
+      def rebuild_relations(token_names, decisions)
+        parents = {} #: Hash[String, String]
+        tie_specificities = {} #: Hash[[String, String], Integer]
+        token_names.each {|name| parents[name] = name }
+
+        decisions.each do |pair, decision|
+          next unless decision.kind == :tie
+
+          union(parents, pair[0], pair[1])
+          tie_specificities[pair] = decision.specificity
+        end
+
+        @ties = Hash.new { |h, k| h[k] = Set.new([k]) }
+        token_names.group_by {|name| root(parents, name) }.each_value do |names|
+          tied = names.to_set
+          names.each {|name| @ties[name] = tied.dup }
+        end
+
+        # The adjacency list is shared by every lookup below, and closure
+        # specificity is only computed for pairs that carry a :no_tie decision.
+        graph = tie_graph(tie_specificities)
+
+        @no_ties = Set.new
+        decisions.each do |pair, decision|
+          next unless decision.kind == :no_tie
+
+          left, right = pair
+          # key? avoids triggering the default block for names outside token_names.
+          unless @ties.key?(left) && @ties[left].include?(right)
+            @no_ties << pair
+            next
+          end
+
+          tie_specificity = tie_specificity_between(left, right, tie_specificities, graph)
+          if decision.specificity >= tie_specificity
+            raise "%lex-no-tie #{pair[0]} #{pair[1]} conflicts with an existing %lex-tie closure."
+          end
+        end
+      end
+
+      # Union-find lookup with path halving.
+      #
+      # @rbs (Hash[String, String] parents, String name) -> String
+      def root(parents, name)
+        parents[name] ||= name
+        while parents[name] != name
+          parents[name] = parents[parents[name]]
+          name = parents[name]
+        end
+        name
+      end
+
+      # @rbs (Hash[String, String] parents, String left, String right) -> void
+      def union(parents, left, right)
+        left_root = root(parents, left)
+        right_root = root(parents, right)
+        return if left_root == right_root
+
+        parents[right_root] = left_root
+      end
+
+      # Undirected adjacency list of the tie edges: node => [[neighbor, specificity], ...].
+      #
+      # @rbs (Hash[[String, String], Integer] tie_specificities) -> Hash[String, Array[[String, Integer]]]
+      def tie_graph(tie_specificities)
+        graph = {} #: Hash[String, Array[[String, Integer]]]
+        tie_specificities.each do |(a, b), specificity|
+          (graph[a] ||= []) << [b, specificity]
+          (graph[b] ||= []) << [a, specificity]
+        end
+        graph
+      end
+
+      # Compute closure specificity between two tokens via tie graph BFS.
+      # Path specificity = min(edge specificities on the path).
+      # Result = max over all paths connecting left and right; a direct edge,
+      # when present, is returned as-is without searching other paths.
+      # `graph` may be passed in to reuse an adjacency list built by tie_graph;
+      # when omitted it is built from `tie_specificities`.
+      #
+      # @rbs (String left, String right, Hash[[String, String], Integer] tie_specificities, ?Hash[String, Array[[String, Integer]]]? graph) -> Integer
+      def tie_specificity_between(left, right, tie_specificities, graph = nil)
+        direct = tie_specificities[pair_key(left, right)]
+        return direct if direct
+
+        graph ||= tie_graph(tie_specificities)
+        return 0 unless graph.key?(left)
+
+        # best[node] = best (max) path-min-specificity to reach node from left
+        best = { left => Float::INFINITY } #: Hash[String, Integer | Float]
+        queue = [[left, Float::INFINITY]] #: Array[[String, Integer | Float]]
+
+        until queue.empty?
+          node, path_min = queue.shift
+          next unless node && path_min
+
+          graph.fetch(node, []).each do |neighbor, edge_spec|
+            new_min = [path_min, edge_spec].min
+            # Re-queue a node whenever a strictly better bottleneck is found.
+            if !best.key?(neighbor) || new_min > best[neighbor]
+              best[neighbor] = new_min
+              queue << [neighbor, new_min]
+            end
+          end
+        end
+
+        result = best[right]
+        result && result != Float::INFINITY ? result.to_i : 0
+      end
+
+      # Normalized (sorted) key so that [a, b] and [b, a] address the same pair.
+      #
+      # @rbs (String left, String right) -> [String, String]
+      def pair_key(left, right)
+        left <= right ? [left, right] : [right, left]
+      end
+    end
+  end
+end
diff --git a/lib/lrama/grammar/lexer_context.rb b/lib/lrama/grammar/lexer_context.rb
new file mode 100644
index 000000000..e779899f4
--- /dev/null
+++ b/lib/lrama/grammar/lexer_context.rb
@@ -0,0 +1,39 @@
+# rbs_inline: enabled
+# frozen_string_literal: true
+
+module Lrama
+  class Grammar
+    # Represents a lexer context defined by %lexer-context directive.
+    #
+    # Example:
+    #   %lexer-context BEG keyword_if keyword_unless '(' '[' '{'
+    #
+    # The bitmask value is automatically assigned by definition order (1 << index).
+    class LexerContext
+      attr_reader :name #: String
+      attr_reader :index #: Integer
+      attr_reader :symbols #: Array[Lexer::Token::Ident]
+
+      # Starts with an empty symbol list; symbols are attached later through add_symbols.
+      # @rbs (name: String, index: Integer) -> void
+      def initialize(name:, index:)
+        @symbols = []
+        @index = index
+        @name = name
+      end
+
+      # Single-bit mask derived from the context index (1 << index).
+      # @rbs () -> Integer
+      def bitmask
+        1 << @index
+      end
+
+      # Append each given symbol to this context, keeping the given order.
+      # @rbs (Array[Lexer::Token::Ident] syms) -> void
+      def add_symbols(syms)
+        syms.each { |sym| @symbols.push(sym) }
+      end
+    end
+  end
+end
diff --git a/lib/lrama/grammar/parameterized/resolver.rb b/lib/lrama/grammar/parameterized/resolver.rb
index 558f30819..9544a8cbf 100644
--- a/lib/lrama/grammar/parameterized/resolver.rb
+++ b/lib/lrama/grammar/parameterized/resolver.rb
@@ -7,11 +7,13 @@ class Parameterized
       class Resolver
         attr_accessor :rules #: Array[Rule]
         attr_accessor :created_lhs_list #: Array[Lexer::Token::Base]
+        attr_reader :expansion_args #: Hash[String, Array[String]]
 
         # @rbs () -> void
         def initialize
           @rules = []
           @created_lhs_list = []
+          @expansion_args = {}
         end
 
         # @rbs (Rule rule) -> Array[Rule]
@@ -34,6 +36,14 @@ def created_lhs(lhs_s_value)
           @created_lhs_list.reverse.find { |created_lhs| created_lhs.s_value == lhs_s_value }
         end
 
+        # Remember, per generated lhs name, the s_value of every argument the
+        # parameterized rule was instantiated with. A later registration for
+        # the same lhs name replaces the earlier one.
+        #
+        # @rbs (String lhs_s_value, Array[Lexer::Token::Base] args) -> void
+        def register_expansion_args(lhs_s_value, args)
+          @expansion_args[lhs_s_value] = args.map { |arg| arg.s_value }
+        end
+
         # @rbs () -> Array[Rule]
         def redefined_rules
           @rules.select { |rule| @rules.count { |r| r.name == rule.name && r.required_parameters_count == rule.required_parameters_count } > 1 }
diff --git a/lib/lrama/grammar/rule_builder.rb b/lib/lrama/grammar/rule_builder.rb
index 34fdca6c8..33bf959d9 100644
--- a/lib/lrama/grammar/rule_builder.rb
+++ b/lib/lrama/grammar/rule_builder.rb
@@ -157,6 +157,7 @@ def process_rhs
               lhs_token = Lrama::Lexer::Token::Ident.new(s_value: lhs_s_value, location: token.location)
               replaced_rhs << lhs_token
               @parameterized_resolver.created_lhs_list << lhs_token
+              @parameterized_resolver.register_expansion_args(lhs_s_value, token.args)
               parameterized_rule.rhs.each do |r|
                 rule_builder = RuleBuilder.new(@rule_counter, @midrule_action_counter, @parameterized_resolver, lhs_tag: token.lhs_tag || parameterized_rule.tag)
                 rule_builder.lhs = lhs_token
diff --git a/lib/lrama/grammar/scoped_lex_decl.rb b/lib/lrama/grammar/scoped_lex_decl.rb
new file mode 100644
index 000000000..10c0ce42c
--- /dev/null
+++ b/lib/lrama/grammar/scoped_lex_decl.rb
@@ -0,0 +1,40 @@
+# rbs_inline: enabled
+# frozen_string_literal: true
+
+module Lrama
+  class Grammar
+    # Represents scoped lexical declarations defined by %lex-scope directive.
+    #
+    # Scoped declarations allow lexical precedence and tie rules to apply
+    # only when parsing within the scope of a particular nonterminal.
+    #
+    # Example:
+    #   %lex-scope template_args {
+    #     %lex-prec RANGLE -~ RSHIFT
+    #   }
+    class ScopedLexDecl
+      attr_reader :scope_name #: String
+      attr_reader :lex_prec_rules #: Array[LexPrec::Rule]
+      attr_reader :lex_tie_declarations #: Array[LexTie::Declaration]
+      attr_reader :lineno #: Integer
+
+      # Both rule lists start empty and are filled through the add_* methods.
+      # @rbs (scope_name: String, lineno: Integer) -> void
+      def initialize(scope_name:, lineno:)
+        @scope_name = scope_name
+        @lineno = lineno
+        @lex_prec_rules = []
+        @lex_tie_declarations = []
+      end
+
+      # @rbs (LexPrec::Rule rule) -> void
+      def add_lex_prec_rule(rule)
+        @lex_prec_rules.push(rule)
+      end
+
+      # @rbs (LexTie::Declaration declaration) -> void
+      def add_lex_tie_declaration(declaration)
+        @lex_tie_declarations.push(declaration)
+      end
+    end
+  end
+end
diff --git a/lib/lrama/grammar/symbols/resolver.rb b/lib/lrama/grammar/symbols/resolver.rb
index 085a835d2..72ab17a1c 100644
--- a/lib/lrama/grammar/symbols/resolver.rb
+++ b/lib/lrama/grammar/symbols/resolver.rb
@@ -52,15 +52,17 @@ def sort_by_number!
         def add_term(id:, alias_name: nil, tag: nil, token_id: nil, replace: false)
           if token_id && (sym = find_symbol_by_token_id(token_id))
             if replace
-              sym.id = id
-              sym.alias_name = alias_name
-              sym.tag = tag
+              replace_term_attributes(sym, id: id, alias_name: alias_name, tag: tag, token_id: token_id)
             end
 
             return sym
           end
 
           if (sym = find_symbol_by_id(id))
+            if replace
+              replace_term_attributes(sym, id: id, alias_name: alias_name, tag: tag, token_id: token_id)
+            end
+
             return sym
           end
 
@@ -229,6 +231,14 @@ def find_nterm_by_id!(id)
           end || (raise "Symbol not found. #{id}")
         end
 
+        # Overwrite id, alias_name and tag on an existing symbol; token_id is
+        # only overwritten when a value is given.
+        # @rbs (Grammar::Symbol sym, id: Lexer::Token::Base, ?alias_name: String?, ?tag: Lexer::Token::Tag?, ?token_id: Integer?) -> void
+        def replace_term_attributes(sym, id:, alias_name: nil, tag: nil, token_id: nil)
+          sym.id = id
+          sym.alias_name = alias_name
+          sym.tag = tag
+          return unless token_id
+
+          sym.token_id = token_id
+        end
+
         # @rbs () -> void
         def fill_terms_number
           # Character literal in grammar file has
diff --git a/lib/lrama/grammar/token_action.rb b/lib/lrama/grammar/token_action.rb
new file mode 100644
index 000000000..cb3a154f6
--- /dev/null
+++ b/lib/lrama/grammar/token_action.rb
@@ -0,0 +1,33 @@
+# rbs_inline: enabled
+# frozen_string_literal: true
+
+module Lrama
+  class Grammar
+    # Represents a token action defined by %token-action directive.
+    #
+    # Token actions are user code blocks associated with token patterns.
+    # When a token is matched by the pseudo-scanner, the associated code runs.
+    # Layout tokens are accumulated, and the accumulated text is available
+    # to the next non-layout token's action.
+    #
+    # Example:
+    #   %token-action ID { printf("matched ID: %.*s\n", yyleng, yytext); }
+    class TokenAction
+      attr_reader :token_id #: Lexer::Token::Ident
+      attr_reader :code #: Lexer::Token::UserCode
+      attr_reader :lineno #: Integer
+
+      # @rbs (token_id: Lexer::Token::Ident, code: Lexer::Token::UserCode, lineno: Integer) -> void
+      def initialize(token_id:, code:, lineno:)
+        @lineno = lineno
+        @code = code
+        @token_id = token_id
+      end
+
+      # Name of the token this action belongs to (the identifier's s_value).
+      # @rbs () -> String
+      def token_name
+        token_id.s_value
+      end
+    end
+  end
+end
diff --git a/lib/lrama/grammar/token_pattern.rb b/lib/lrama/grammar/token_pattern.rb
new file mode 100644
index 000000000..e990b1a49
--- /dev/null
+++ b/lib/lrama/grammar/token_pattern.rb
@@ -0,0 +1,43 @@
+# rbs_inline: enabled
+# frozen_string_literal: true
+
+module Lrama
+  class Grammar
+    # Represents a token pattern defined by %token-pattern directive
+    # Example: %token-pattern RSHIFT />>/ "right shift"
+    class TokenPattern
+      attr_reader :id #: Lexer::Token::Ident
+      attr_reader :pattern #: Lexer::Token::Regex
+      attr_reader :alias_name #: String?
+      attr_reader :tag #: Lexer::Token::Tag?
+      attr_reader :lineno #: Integer
+      attr_reader :definition_order #: Integer
+
+      # @rbs (id: Lexer::Token::Ident, pattern: Lexer::Token::Regex, ?alias_name: String?, ?tag: Lexer::Token::Tag?, lineno: Integer, definition_order: Integer) -> void
+      def initialize(id:, pattern:, alias_name: nil, tag: nil, lineno:, definition_order:)
+        @id = id
+        @pattern = pattern
+        @lineno = lineno
+        @definition_order = definition_order
+        @alias_name = alias_name
+        @tag = tag
+      end
+
+      # Token name, taken from the identifier's s_value.
+      # @rbs () -> String
+      def name
+        id.s_value
+      end
+
+      # Returns the regex pattern string (without slashes)
+      # @rbs () -> String
+      def regex_pattern
+        pattern.pattern
+      end
+
+      # True when the token name begins with "YYLAYOUT".
+      # @rbs () -> bool
+      def layout?
+        id.s_value.start_with?("YYLAYOUT")
+      end
+    end
+  end
+end
diff --git a/lib/lrama/length_precedences.rb b/lib/lrama/length_precedences.rb
new file mode 100644
index 000000000..15ba218c6
--- /dev/null
+++ b/lib/lrama/length_precedences.rb
@@ -0,0 +1,57 @@
+# rbs_inline: enabled
+# frozen_string_literal: true
+
+module Lrama
+  # Length precedences table for PSLR(1)
+  # Based on Definition 3.2.15 from the PSLR dissertation
+  #
+  # Determines which token should be preferred when there's a length conflict:
+  # - :left - the shorter token (t1) should be preferred
+  # - :right - the longer token (t2) should be preferred
+  # - :undefined - no preference defined, use default (longest match)
+  class LengthPrecedences
+    # Result of length precedence lookup
+    LEFT = :left #: Symbol
+    RIGHT = :right #: Symbol
+    UNDEFINED = :undefined #: Symbol
+
+    attr_reader :table #: Hash[[String, String], Symbol]
+
+    # @rbs (Grammar::LexPrec lex_prec) -> void
+    def initialize(lex_prec)
+      @table = build_table(lex_prec)
+    end
+
+    # Look up the entry for the ordered pair (t1, t2); UNDEFINED when absent.
+    # @rbs (String t1, String t2) -> Symbol
+    def precedence(t1, t2)
+      entry = @table[[t1, t2]]
+      entry ? entry : UNDEFINED
+    end
+
+    # Check if t1 (shorter) should be preferred over t2 (longer)
+    # @rbs (String t1, String t2) -> bool
+    def prefer_shorter?(t1, t2)
+      LEFT == precedence(t1, t2)
+    end
+
+    private
+
+    # Build the table from the lex-prec rules; only rules whose operator
+    # matches Grammar::LexPrec::SHORTER contribute entries.
+    # @rbs (Grammar::LexPrec lex_prec) -> Hash[[String, String], Symbol]
+    def build_table(lex_prec)
+      lex_prec.rules.each_with_object({}) do |rule, table|
+        next unless Grammar::LexPrec::SHORTER === rule.operator
+
+        shorter = rule.left_name
+        longer = rule.right_name
+        # t1 -s t2: t1 (shorter) should be preferred over t2 (longer)
+        table[[shorter, longer]] = LEFT
+        # Inverse: t2 (longer) should not be preferred over t1 (shorter)
+        table[[longer, shorter]] = RIGHT
+      end
+    end
+  end
+end
diff --git a/lib/lrama/lexer.rb b/lib/lrama/lexer.rb
index ce98b505a..ebaa23f42 100644
--- a/lib/lrama/lexer.rb
+++ b/lib/lrama/lexer.rb
@@ -18,7 +18,8 @@ class Lexer
     #   [::Symbol, Token::Char] |
     #   [::Symbol, Token::Str] |
     #   [::Symbol, Token::Int] |
-    #   [::Symbol, Token::Ident]
+    #   [::Symbol, Token::Ident] |
+    #   [::Symbol, Token::Regex]
     #
     # type c_token = [:C_DECLARATION, Token::UserCode]
 
@@ -32,6 +33,7 @@ class Lexer
     PERCENT_TOKENS = %w(
       %union
       %token
+      %token-pattern
       %type
       %nterm
       %left
@@ -43,6 +45,12 @@ class Lexer
       %printer
       %destructor
       %lex-param
+      %lexer-context
+      %lex-prec
+      %lex-tie
+      %lex-no-tie
+      %symbol-set
+      %token-action
       %parse-param
       %initial-action
       %precedence
@@ -121,7 +129,7 @@ def lex_token
         return
       when @scanner.scan(/#{SYMBOLS.join('|')}/)
         return [@scanner.matched, Lrama::Lexer::Token::Token.new(s_value: @scanner.matched, location: location)]
-      when @scanner.scan(/#{PERCENT_TOKENS.join('|')}/)
+      when @scanner.scan(/#{PERCENT_TOKENS.sort_by { |s| -s.length }.join('|')}/)
         return [@scanner.matched, Lrama::Lexer::Token::Token.new(s_value: @scanner.matched, location: location)]
       when @scanner.scan(/[\?\+\*]/)
         return [@scanner.matched, Lrama::Lexer::Token::Token.new(s_value: @scanner.matched, location: location)]
@@ -133,6 +141,24 @@ def lex_token
         return [:CHARACTER, Lrama::Lexer::Token::Char.new(s_value: @scanner.matched, location: location)]
       when
@scanner.scan(/".*?"/) return [:STRING, Lrama::Lexer::Token::Str.new(s_value: %Q(#{@scanner.matched}), location: location)] + when @scanner.scan(%r{/(?:[^/\\]|\\.)+/}) + return [:REGEX, Lrama::Lexer::Token::Regex.new(s_value: @scanner.matched, location: location)] + when @scanner.scan(/<~(?=\s)/) + return ['<~', Lrama::Lexer::Token::Token.new(s_value: @scanner.matched, location: location)] + when @scanner.scan(/<-(?=\s)/) + return ['<-', Lrama::Lexer::Token::Token.new(s_value: @scanner.matched, location: location)] + when @scanner.scan(/-~(?=\s)/) + return ['-~', Lrama::Lexer::Token::Token.new(s_value: @scanner.matched, location: location)] + when @scanner.scan(/<<(?=\s)/) + return ['<<', Lrama::Lexer::Token::Token.new(s_value: @scanner.matched, location: location)] + when @scanner.scan(/-<(?=\s)/) + return ['-<', Lrama::Lexer::Token::Token.new(s_value: @scanner.matched, location: location)] + when @scanner.scan(/ String + def pattern + # Remove leading and trailing slashes + s_value[1..-2].to_s + end + end + end + end +end diff --git a/lib/lrama/lexer_context_classifier.rb b/lib/lrama/lexer_context_classifier.rb new file mode 100644 index 000000000..e61c990a1 --- /dev/null +++ b/lib/lrama/lexer_context_classifier.rb @@ -0,0 +1,161 @@ +# rbs_inline: enabled +# frozen_string_literal: true + +module Lrama + # Classifies parser states into lexer context categories. + # + # When LALR states are merged, states from different grammatical contexts + # (e.g., BEG vs CMDARG) share the same state number, making them + # indistinguishable to the lexer. This classifier analyzes kernel items + # to determine the lexer context of each state, enabling context-aware + # state splitting. + # + # Context definitions come from %lexer-context directives in the grammar file. 
+  # Each directive maps a context name to a set of symbols:
+  #
+  #   %lexer-context BEG keyword_if keyword_unless '(' '[' '{'
+  #   %lexer-context CMDARG tIDENTIFIER tFID tCONSTANT
+  #
+  class LexerContextClassifier
+    # @rbs (Hash[String, Grammar::LexerContext] lexer_contexts, ?Hash[String, Array[String]] expansion_args) -> void
+    def initialize(lexer_contexts, expansion_args = {})
+      @lexer_contexts = lexer_contexts
+      @expansion_args = expansion_args
+      @symbol_to_context = build_symbol_to_context_map
+      @context_names = build_context_names
+    end
+
+    # Group the kernel items of a state by the context inferred for each item.
+    #
+    # @rbs (State state) -> Hash[Integer, Array[State::Item]]
+    def classify(state)
+      state.kernels.group_by { |item| infer_item_context(item) }
+    end
+
+    # Context of one kernel item: the default context when the dot is at
+    # position 0, otherwise the context of the symbol just before the dot.
+    #
+    # @rbs (State::Item item) -> Integer
+    def infer_item_context(item)
+      if item.position == 0
+        default_beg_context
+      else
+        classify_symbol_context(item.rhs[item.position - 1])
+      end
+    end
+
+    # Look a symbol up by its name, then by its name with one leading and one
+    # trailing quote character removed, then through the arguments recorded
+    # for it in expansion_args. Returns 0 when nothing matches.
+    #
+    # @rbs (Grammar::Symbol sym) -> Integer
+    def classify_symbol_context(sym)
+      name = sym.id.s_value
+      direct = @symbol_to_context[name] || @symbol_to_context[name.gsub(/\A["']|["']\z/, "")]
+      return direct if direct
+
+      # First argument that has a known context wins.
+      arg_names = @expansion_args[name] || []
+      arg_names.each do |arg_name|
+        inherited = @symbol_to_context[arg_name]
+        return inherited if inherited
+      end
+
+      0
+    end
+
+    # For backward compatibility with states.rb split logic
+    # @rbs (Grammar::Symbol sym) -> Integer
+    def classify_terminal_context(sym)
+      classify_symbol_context(sym)
+    end
+
+    # For backward compatibility with states.rb split logic
+    # @rbs (Grammar::Symbol sym) -> Integer
+    def classify_nonterminal_context(sym)
+      classify_symbol_context(sym)
+    end
+
+    # Names of all contexts whose bit is set in ctx, joined with "|";
+    # "UNKNOWN" when ctx is 0 or no known bit is set.
+    #
+    # @rbs (Integer ctx) -> String
+    def context_name(ctx)
+      return "UNKNOWN" if ctx == 0
+
+      matched = @context_names.each_with_object([]) do |(flag, label), acc|
+        acc << label if (ctx & flag) != 0
+      end
+      matched.empty? ? "UNKNOWN" : matched.join("|")
+    end
+
+    # Class-level context_name for use without an instance (e.g., output.rb).
+    # Requires lexer_contexts to build the name map.
+    #
+    # @rbs (Integer ctx, Hash[String, Grammar::LexerContext] lexer_contexts) -> String
+    def self.context_name(ctx, lexer_contexts)
+      return "UNKNOWN" if ctx == 0
+
+      matched = lexer_contexts.each_value.select { |lc| (ctx & lc.bitmask) != 0 }.map { |lc| lc.name }
+      matched.empty? ? "UNKNOWN" : matched.join("|")
+    end
+
+    # All context bitmasks OR'd together (for "is context known?" checks).
+    # @rbs () -> Integer
+    def all_contexts_mask
+      @lexer_contexts.each_value.reduce(0) { |mask, lc| mask | lc.bitmask }
+    end
+
+    # Context definitions ordered by their index.
+    # @rbs () -> Array[Grammar::LexerContext]
+    def contexts
+      @lexer_contexts.values.sort_by { |lc| lc.index }
+    end
+
+    private
+
+    # Map each symbol name to the OR of the bitmasks of every context listing it.
+    # @rbs () -> Hash[String, Integer]
+    def build_symbol_to_context_map
+      @lexer_contexts.each_value.each_with_object({}) do |lc, map|
+        lc.symbols.each do |sym|
+          key = sym.s_value
+          map[key] = map.fetch(key, 0) | lc.bitmask
+        end
+      end
+    end
+
+    # Map each context bitmask to its context name.
+    # @rbs () -> Hash[Integer, String]
+    def build_context_names
+      @lexer_contexts.each_value.each_with_object({}) do |lc, names|
+        names[lc.bitmask] = lc.name
+      end
+    end
+
+    # Bitmask of the context with the smallest index (used for position-0
+    # items); 0 when no contexts are defined.
+    # @rbs () -> Integer
+    def default_beg_context
+      earliest = @lexer_contexts.values.min_by { |lc| lc.index }
+      earliest ? earliest.bitmask : 0
+    end
+  end
+end
diff --git a/lib/lrama/option_parser.rb b/lib/lrama/option_parser.rb
index 5a15d59c7..29ae759c4 100644
--- a/lib/lrama/option_parser.rb
+++ b/lib/lrama/option_parser.rb
@@ -97,6 +97,7 @@ def parse_by_option_parser(argv)
         o.on_tail ' lookaheads explicitly associate lookahead tokens to items'
         o.on_tail ' solved describe shift/reduce conflicts solving'
         o.on_tail ' counterexamples, cex generate conflict counterexamples'
+        o.on_tail ' pslr report PSLR split and scanner metrics'
         o.on_tail ' rules list unused rules'
         o.on_tail ' terms list unused terminals'
         o.on_tail ' verbose report detailed internal state and analysis results'
@@ -141,7 +142,7 @@ def parse_by_option_parser(argv)
     end
 
     ALIASED_REPORTS = { cex: :counterexamples }.freeze #: Hash[Symbol, Symbol]
-    VALID_REPORTS = %i[states itemsets lookaheads solved counterexamples rules terms verbose].freeze #: Array[Symbol]
+    VALID_REPORTS = %i[states itemsets lookaheads solved counterexamples pslr rules terms verbose].freeze #: Array[Symbol]
 
     # @rbs (Array[String]) ->
Hash[Symbol, bool] + def validate_report(report)
diff --git a/lib/lrama/output.rb b/lib/lrama/output.rb
index 24cf725c7..fdb2a68cc 100644
--- a/lib/lrama/output.rb
+++ b/lib/lrama/output.rb
@@ -401,6 +401,495 @@ def percent_code(name)
       end.join
     end
 
+    # PSLR Output Helper Methods
+    # Based on PSLR::OutputHelper - generates PSLR-specific C code
+
+    # Check if the grammar requested PSLR output.
+    def pslr_enabled?
+      @grammar.pslr_defined?
+    end
+
+    # Check if PSLR scanner tables are available, i.e. a scanner FSA exists
+    # and has at least one state.
+    def pslr_scanner_enabled?
+      scanner_fsa = @context.states.scanner_fsa
+      !scanner_fsa.nil? && !scanner_fsa.states.empty?
+    end
+
+    # C prototypes and helper macros for the PSLR API, joined with newlines.
+    # Returns "" when PSLR is not enabled. Scanner prototypes and macros are
+    # only emitted when scanner tables exist; the context macros are only
+    # emitted when a state member name is configured.
+    def pslr_function_declarations
+      return "" unless pslr_enabled?
+
+      declarations = [<<~C_CODE]
+        int yy_state_accepts_token (int yystate, int yychar);
+      C_CODE
+
+      if pslr_scanner_enabled?
+        declarations << <<~C_CODE
+          int yy_pseudo_scan (int parser_state, const char *input, int *match_length);
+
+          #ifndef YYPSLR_SCAN_RESULT_DEFINED
+          #define YYPSLR_SCAN_RESULT_DEFINED
+          typedef struct {
+            int length;
+            int is_layout;
+          } yypslr_scan_result;
+          #endif
+
+          int yy_pseudo_scan_full (int parser_state, const char *input, yypslr_scan_result *result);
+        C_CODE
+
+        declarations << <<~C_CODE
+          #define YYPSLR_ENABLED 1
+          #define YYPSLR_NO_MATCH YYEMPTY
+
+          #ifndef YYPSLR_PSEUDO_SCAN_STATE
+          # define YYPSLR_PSEUDO_SCAN_STATE(ParserState, Input, MatchLength) \\
+              yy_pseudo_scan ((ParserState), (Input), (MatchLength))
+          #endif
+        C_CODE
+      end
+
+      if (member = pslr_state_member)
+        declarations << <<~C_CODE
+          #ifndef YYGETSTATE_CONTEXT
+          # define YYGETSTATE_CONTEXT(Context) ((Context)->#{member})
+          #endif
+        C_CODE
+
+        if pslr_scanner_enabled?
+          # Both macros below expand their Context argument twice.
+          declarations << <<~C_CODE
+            #ifndef YYPSLR_PSEUDO_SCAN
+            # define YYPSLR_PSEUDO_SCAN(Context, Input, MatchLength) \\
+                ((Context) != 0 \\
+                 ? YYPSLR_PSEUDO_SCAN_STATE (YYGETSTATE_CONTEXT (Context), (Input), (MatchLength)) \\
+                 : YYEMPTY)
+            #endif
+
+            #ifndef YYPSLR_PSEUDO_SCAN_RESULT
+            # define YYPSLR_PSEUDO_SCAN_RESULT(Context, Input, ResultPtr) \\
+                ((Context) != 0 \\
+                 ? yy_pseudo_scan_full (YYGETSTATE_CONTEXT (Context), (Input), (ResultPtr)) \\
+                 : (((ResultPtr)->length = 0), ((ResultPtr)->is_layout = 0), YYEMPTY))
+            #endif
+          C_CODE
+        end
+
+        if !parse_param_name.empty?
+          declarations << <<~C_CODE
+            #ifndef YYSETSTATE_CONTEXT
+            # define YYSETSTATE_CONTEXT(CurrentState) \\
+                do { \\
+                  if (#{parse_param_name} != 0) { \\
+                    YYGETSTATE_CONTEXT (#{parse_param_name}) = (CurrentState); \\
+                  } \\
+                } while (0)
+            #endif
+          C_CODE
+        end
+      end
+
+      declarations.join("\n")
+    end
+
+    # Name of the parser-state member shared with the lexer, stripped of
+    # surrounding whitespace. Returns nil when the grammar defines none or
+    # when the value is blank: a blank name is truthy in Ruby and would make
+    # pslr_function_declarations emit `((Context)->)`, which is not valid C.
+    def pslr_state_member
+      member = @grammar.pslr_state_member&.strip
+      member unless member.nil? || member.empty?
+    end
+
+    # Accepting states of the scanner FSA, in FSA order; [] without a scanner.
+    def pslr_accepting_states
+      return [] unless pslr_scanner_enabled?
+
+      @context.states.scanner_fsa.states.select(&:accepting?)
+    end
+
+    def pslr_token_pattern_count
+      @context.states.token_patterns.size
+    end
+
+    # Parser token id of the grammar symbol named like the token pattern.
+    def pslr_token_id(token_pattern)
+      @context.states.find_symbol_by_s_value!(token_pattern.name).token_id
+    end
+
+    # Generate Scanner FSA transition table as C code
+    # One row of 256 entries per FSA state; -1 marks a missing transition.
+    def scanner_transition_table
+      return "" unless pslr_scanner_enabled?
+      scanner_fsa = @context.states.scanner_fsa
+
+      lines = []
+      lines << "/* Scanner FSA transition table */"
+      lines << "#define YY_SCANNER_NUM_STATES #{scanner_fsa.states.size}"
+      lines << "#define YY_SCANNER_INVALID_STATE (-1)"
+      lines << ""
+      lines << "static const int yy_scanner_transition[YY_SCANNER_NUM_STATES][256] = {"
+
+      scanner_fsa.states.each_with_index do |state, idx|
+        transitions = Array.new(256, -1)
+        # NOTE(review): a transition key whose #ord exceeds 255 would grow the
+        # row past the declared [256] bound — confirm keys are single bytes.
+        state.transitions.each do |char, target_id|
+          transitions[char.ord] = target_id
+        end
+        lines << " /* state #{idx} */ {#{transitions.join(', ')}}#{idx < scanner_fsa.states.size - 1 ? ',' : ''}"
+      end
+
+      lines << "};"
+      lines.join("\n")
+    end
+
+    # Generate state_to_accepting table as C code
+    # Maps each FSA state id to its position among the accepting states, or
+    # -1 when the state is not accepting.
+    def state_to_accepting_table
+      return "" unless pslr_scanner_enabled?
+      scanner_fsa = @context.states.scanner_fsa
+      accepting_indices = Array.new(scanner_fsa.states.size, -1)
+
+      pslr_accepting_states.each_with_index do |state, index|
+        accepting_indices[state.id] = index
+      end
+
+      lines = []
+      lines << ""
+      lines << "/* FSA state -> accepting state index mapping */"
+      lines << "#define YY_ACCEPTING_NONE (-1)"
+      lines << ""
+      lines << "static const int yy_state_to_accepting[YY_SCANNER_NUM_STATES] = {"
+      lines << " #{accepting_indices.join(', ')}"
+      lines << "};"
+      lines.join("\n")
+    end
+
+    # Table translating a token pattern index (position in token_patterns)
+    # into the parser token id.
+    def token_pattern_token_ids_table
+      return "" unless pslr_scanner_enabled?
+
+      lines = []
+      lines << ""
+      lines << "/* token pattern index -> parser token id */"
+      lines << "#define YY_PSLR_EMPTY_PATTERN (-1)"
+      lines << "#define YY_NUM_TOKEN_PATTERNS #{pslr_token_pattern_count}"
+      lines << ""
+      lines << "static const int yy_token_pattern_to_token_id[YY_NUM_TOKEN_PATTERNS] = {"
+      lines << " #{@context.states.token_patterns.map {|token_pattern| pslr_token_id(token_pattern) }.join(', ')}"
+      lines << "};"
+      lines.join("\n")
+    end
+
+    # Generate token IDs for accepting states as C code
+    def accepting_tokens_table
+      return "" unless pslr_scanner_enabled?
+      scanner_fsa = @context.states.scanner_fsa
+
+      lines = []
+      lines << ""
+      lines << "/* Accepting state token IDs */"
+      lines << "/* For each accepting state, list of (token_id, definition_order) pairs */"
+      lines << ""
+
+      # Collect all unique tokens
+      all_tokens = @context.states.token_patterns.map(&:name)
+      lines << "/* Token pattern names: #{all_tokens.join(', ')} */"
+      lines << ""
+
+      # Generate accepting tokens for each FSA state
+      scanner_fsa.states.each do |state|
+        next unless state.accepting?
+
+        token_names = state.accepting_tokens.map(&:name)
+        lines << "/* State #{state.id} accepts: #{token_names.join(', ')} */"
+      end
+
+      lines.join("\n")
+    end
+
+    # Generate scanner_accepts table as C code
+    # One row per parser state and one column per accepting FSA state. Each
+    # cell is the definition_order of the accepted token pattern, or -1 when
+    # the parser state accepts nothing there. Returns "" when the scanner or
+    # the accepts table is unavailable; the array itself is omitted when there
+    # are no accepting states.
+    def scanner_accepts_table_code
+      return "" unless pslr_scanner_enabled?
+      scanner_accepts = @context.states.scanner_accepts_table
+      return "" unless scanner_accepts
+
+      # Fetched once: pslr_accepting_states re-filters the FSA on every call.
+      parser_states = @context.states.states
+      accepting_states = pslr_accepting_states
+      num_parser_states = parser_states.size
+      num_accepting_states = accepting_states.size
+
+      lines = []
+      lines << ""
+      lines << "/* scanner_accepts[parser_state][accepting_state] -> token pattern index */"
+      lines << "/* YY_PSLR_EMPTY_PATTERN means no token accepted */"
+      lines << ""
+      lines << "#define YY_NUM_PARSER_STATES #{num_parser_states}"
+      lines << "#define YY_NUM_ACCEPTING_STATES #{num_accepting_states}"
+      lines << ""
+
+      if num_accepting_states > 0
+        lines << "static const int yy_scanner_accepts[YY_NUM_PARSER_STATES][YY_NUM_ACCEPTING_STATES] = {"
+
+        # NOTE(review): rows are emitted in list order while lookups use
+        # parser_state.id — presumably the two agree; confirm.
+        parser_states.each_with_index do |parser_state, ps_idx|
+          row = accepting_states.map do |fsa_state|
+            token = scanner_accepts[parser_state.id, fsa_state.id]
+            token ? token.definition_order : -1
+          end
+
+          lines << " /* parser state #{ps_idx} */ {#{row.join(', ')}}#{ps_idx < num_parser_states - 1 ? ',' : ''}"
+        end
+
+        lines << "};"
+      end
+
+      lines.join("\n")
+    end
+
+    # Generate length_precedences table as C code
+    # Square table over the token patterns: 1 for :left, 2 for :right and 0
+    # for anything else. The array is omitted when there are no patterns.
+    def length_precedences_table_code
+      return "" unless pslr_scanner_enabled?
+      length_precedences = @context.states.length_precedences
+      return "" unless length_precedences
+
+      lines = []
+      lines << ""
+      lines << "/* length_precedences[token1][token2] -> precedence */"
+      lines << "#define YY_LENGTH_PREC_UNDEFINED 0"
+      lines << "#define YY_LENGTH_PREC_LEFT 1 /* shorter token wins */"
+      lines << "#define YY_LENGTH_PREC_RIGHT 2 /* longer token wins */"
+      lines << ""
+
+      num_tokens = pslr_token_pattern_count
+      if num_tokens > 0
+        lines << "static const int yy_length_precedences[#{num_tokens}][#{num_tokens}] = {"
+
+        @context.states.token_patterns.each_with_index do |t1, i|
+          row = @context.states.token_patterns.map do |t2|
+            case length_precedences.precedence(t1.name, t2.name)
+            when :left then 1
+            when :right then 2
+            else 0
+            end
+          end
+          lines << " /* #{t1.name} */ {#{row.join(', ')}}#{i < num_tokens - 1 ? ',' : ''}"
+        end
+
+        lines << "};"
+      end
+
+      lines.join("\n")
+    end
+
+    # Generate pseudo_scan function as C code
+    def pseudo_scan_function
+      return "" unless pslr_scanner_enabled?
+
+      # The C emitted below walks yy_scanner_transition one input byte at a
+      # time and remembers the best accepting pattern that yy_scanner_accepts
+      # allows for parser_state; yy_length_precedences decides whether a later
+      # match replaces the current best. The backslash in `\\0` is doubled so
+      # that the heredoc emits the C character literal '\0'.
+      <<~C_CODE
+
+        /*
+         * pseudo_scan: PSLR(1) scanning function
+         * Based on Definition 3.2.16 from the PSLR dissertation
+         *
+         * Input:
+         * parser_state: Current parser state
+         * input: Input buffer pointer
+         * match_length: Output parameter for matched length
+         *
+         * Returns: Selected parser token ID, or YYEMPTY if no match
+         */
+        int
+        yy_pseudo_scan(int parser_state, const char *input, int *match_length)
+        {
+          int local_match_length = 0;
+          int ss = 0; /* FSA initial state */
+          int ibest = 0;
+          int pbest = YY_PSLR_EMPTY_PATTERN;
+          int i = 0;
+
+          if (match_length == NULL) {
+            match_length = &local_match_length;
+          }
+
+          *match_length = 0;
+
+          if (parser_state < 0 || parser_state >= YY_NUM_PARSER_STATES || input == NULL) {
+            return YYEMPTY;
+          }
+
+          while (input[i] != '\\0') {
+            int c = (unsigned char)input[i];
+            int next_ss = yy_scanner_transition[ss][c];
+
+            if (next_ss == YY_SCANNER_INVALID_STATE) {
+              break;
+            }
+
+            ss = next_ss;
+            i++;
+
+            /* Check if this is an accepting state */
+            int sa = yy_state_to_accepting[ss];
+            if (sa != YY_ACCEPTING_NONE) {
+              int pattern_index = yy_scanner_accepts[parser_state][sa];
+              if (pattern_index != YY_PSLR_EMPTY_PATTERN) {
+                /* Check length precedences */
+                if (pbest == YY_PSLR_EMPTY_PATTERN ||
+                    (i > ibest && yy_length_precedences[pbest][pattern_index] != YY_LENGTH_PREC_LEFT) ||
+                    (i == ibest && yy_length_precedences[pattern_index][pbest] == YY_LENGTH_PREC_LEFT)) {
+                  pbest = pattern_index;
+                  ibest = i;
+                }
+              }
+            }
+          }
+
+          *match_length = ibest;
+          if (pbest == YY_PSLR_EMPTY_PATTERN) {
+            return YYEMPTY;
+          }
+
+          return yy_token_pattern_to_token_id[pbest];
+        }
+      C_CODE
+    end
+
+    # Generate the per-token-pattern layout flag table (1 for layout patterns,
+    # 0 otherwise) and its bounds-checked lookup helper as C code.
+    def token_pattern_is_layout_function
+      layout_flags = @context.states.token_patterns.map { |tp| tp.layout? ? 
1 : 0 } + + <<~C_CODE + static const int yy_token_pattern_layout[YY_NUM_TOKEN_PATTERNS] = { + #{layout_flags.join(', ')} + }; + + static int + yy_token_pattern_is_layout (int pattern_id) + { + if (pattern_id < 0 || pattern_id >= YY_NUM_TOKEN_PATTERNS) + return 0; + return yy_token_pattern_layout[pattern_id]; + } + C_CODE + end + + def pseudo_scan_full_function + <<~C_CODE + int + yy_pseudo_scan_full (int parser_state, const char *input, yypslr_scan_result *result) + { + int local_match_length = 0; + int ss = 0; + int ibest = 0; + int pbest = YY_PSLR_EMPTY_PATTERN; + int i = 0; + + if (result == NULL) + return YYEMPTY; + + result->length = 0; + result->is_layout = 0; + + if (parser_state < 0 || parser_state >= YY_NUM_PARSER_STATES || input == NULL) + return YYEMPTY; + + while (input[i] != '\\0') { + int c = (unsigned char)input[i]; + int next_ss = yy_scanner_transition[ss][c]; + + if (next_ss == YY_SCANNER_INVALID_STATE) + break; + + ss = next_ss; + i++; + + int sa = yy_state_to_accepting[ss]; + if (sa != YY_ACCEPTING_NONE) { + int pattern_index = yy_scanner_accepts[parser_state][sa]; + if (pattern_index != YY_PSLR_EMPTY_PATTERN) { + if (pbest == YY_PSLR_EMPTY_PATTERN || + (i > ibest && yy_length_precedences[pbest][pattern_index] != YY_LENGTH_PREC_LEFT) || + (i == ibest && yy_length_precedences[pattern_index][pbest] == YY_LENGTH_PREC_LEFT)) { + pbest = pattern_index; + ibest = i; + } + } + } + } + + result->length = ibest; + if (pbest == YY_PSLR_EMPTY_PATTERN) + return YYEMPTY; + + result->is_layout = yy_token_pattern_is_layout(pbest); + return yy_token_pattern_to_token_id[pbest]; + } + C_CODE + end + + # Check if lexer context table is available. + def lexer_context_enabled? + @context.states.lexer_context_enabled? + end + + # Generate #define constants for lexer contexts, emitted early in the output + # so that user code in %{ ... %} can reference them. + def lexer_context_defines_code + return "" unless lexer_context_enabled? 
+ + classifier = @context.states.lexer_context_classifier + lines = [] + lines << "/* Lexer context constants — generated from %lexer-context directives */" + classifier.contexts.each do |lc| + lines << "#define YY_CTX_%-8s 0x%02x" % [lc.name, lc.bitmask] + end + lines.join("\n") + end + + # Generate the lexer context table as C code. + def lexer_context_table_code + return "" unless lexer_context_enabled? + + table = @context.states.lexer_context_table + lexer_contexts = @grammar.lexer_contexts + lines = [] + + lines << "/* Lexer Context Classification Table */" + lines << "/* Maps parser state -> lexer context flags */" + lines << "" + lines << "static const unsigned char yy_lexer_context[] = {" + + table.each_with_index do |ctx, idx| + ctx_name = LexerContextClassifier.context_name(ctx, lexer_contexts) + comma = idx < table.size - 1 ? "," : "" + lines << " /* state #{idx} */ #{ctx}#{comma} /* #{ctx_name} */" + end + + lines << "};" + lines << "" + lines << "int" + lines << "yy_lexer_context_is(int yystate, int ctx_mask) {" + lines << " if (yystate < 0 || yystate >= #{table.size}) return 0;" + lines << " return yy_lexer_context[yystate] & ctx_mask;" + lines << "}" + + lines.join("\n") + end + + # Generate all PSLR C code + def pslr_tables_and_functions + return "" unless pslr_scanner_enabled? 
+ + parts = [ + "/* PSLR(1) Scanner Tables and Functions */", + "/* Generated by Lrama PSLR implementation */", + "", + scanner_transition_table, + state_to_accepting_table, + token_pattern_token_ids_table, + accepting_tokens_table, + scanner_accepts_table_code, + length_precedences_table_code, + token_pattern_is_layout_function, + pseudo_scan_function, + pseudo_scan_full_function, + ] + + parts.join("\n") + end + private def eval_template(file, path) diff --git a/lib/lrama/parser.rb b/lib/lrama/parser.rb index 04632cbae..c9ef86185 100644 --- a/lib/lrama/parser.rb +++ b/lib/lrama/parser.rb @@ -655,7 +655,7 @@ def token_to_str(t) module Lrama class Parser < Racc::Parser -module_eval(<<'...end parser.y/module_eval...', 'parser.y', 505) +module_eval(<<'...end parser.y/module_eval...', 'parser.y', 636) include Lrama::Tracer::Duration @@ -745,325 +745,395 @@ def raise_parse_error(error_message, location) ##### State transition tables begin ### racc_action_table = [ - 98, 98, 99, 99, 87, 53, 53, 52, 178, 110, - 110, 97, 53, 53, 184, 178, 110, 110, 53, 181, - 184, 162, 110, 6, 163, 181, 181, 53, 53, 52, - 52, 181, 79, 79, 53, 53, 52, 52, 43, 79, - 79, 53, 4, 52, 5, 110, 88, 94, 182, 125, - 126, 163, 100, 100, 180, 193, 194, 195, 137, 185, - 188, 180, 4, 44, 5, 185, 188, 94, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 46, 33, 34, - 35, 36, 37, 38, 39, 40, 41, 47, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 47, 33, 34, - 35, 36, 37, 38, 39, 40, 41, 12, 13, 50, - 57, 14, 15, 16, 17, 18, 19, 20, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 57, 33, 34, - 35, 36, 37, 38, 39, 40, 41, 12, 13, 57, - 60, 14, 15, 16, 17, 18, 19, 20, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 57, 33, 34, - 35, 36, 37, 38, 39, 40, 41, 53, 53, 52, - 52, 110, 105, 53, 53, 52, 52, 110, 105, 53, - 53, 52, 52, 110, 105, 53, 53, 52, 52, 110, - 105, 53, 53, 52, 52, 110, 110, 53, 53, 52, - 209, 110, 110, 53, 53, 209, 225, 110, 110, 53, - 53, 209, 209, 110, 110, 193, 194, 195, 137, 216, - 222, 232, 217, 217, 217, 235, 57, 53, 217, 
52, - 53, 53, 52, 52, 193, 194, 195, 57, 57, 57, - 66, 67, 68, 69, 70, 72, 72, 72, 86, 89, - 47, 57, 57, 113, 117, 117, 79, 123, 124, 131, - 47, 133, 137, 139, 143, 149, 150, 151, 152, 133, - 155, 156, 157, 110, 166, 149, 169, 172, 173, 72, - 175, 176, 183, 189, 166, 196, 137, 200, 202, 137, - 166, 211, 166, 137, 72, 176, 218, 176, 72, 137, - 228, 137, 72, 231, 72 ] + 125, 125, 126, 126, 114, 4, 60, 5, 227, 60, + 96, 233, 124, 96, 60, 6, 227, 60, 96, 233, + 230, 96, 60, 230, 59, 50, 211, 106, 230, 212, + 51, 230, 151, 152, 153, 154, 155, 156, 157, 151, + 152, 153, 154, 155, 156, 157, 60, 60, 59, 59, + 53, 106, 106, 60, 231, 59, 54, 212, 106, 121, + 60, 115, 59, 265, 96, 132, 266, 127, 127, 229, + 271, 234, 237, 266, 60, 54, 59, 229, 96, 234, + 237, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 57, 33, 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 121, 60, 4, 59, + 5, 96, 132, 48, 60, 60, 59, 59, 96, 132, + 60, 64, 59, 281, 96, 132, 266, 64, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 64, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 242, 243, 244, 180, 12, 13, 67, + 48, 14, 15, 16, 17, 18, 19, 20, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 64, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 242, 243, 244, 180, 12, 13, 64, + 48, 14, 15, 16, 17, 18, 19, 20, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 64, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 60, 60, 59, 59, 96, 96, 60, + 48, 59, 60, 96, 59, 60, 96, 59, 60, 96, + 59, 60, 96, 59, 60, 96, 59, 60, 96, 59, + 60, 96, 59, 60, 96, 59, 60, 96, 59, 60, + 96, 59, 60, 96, 258, 60, 96, 258, 60, 96, + 274, 60, 96, 258, 60, 96, 258, 284, 96, 60, + 266, 59, 60, 60, 59, 59, 60, 60, 59, 59, + 60, 64, 59, 242, 243, 244, 168, 169, 64, 73, + 74, 75, 76, 77, 79, 79, 86, 88, 90, 79, + 79, 113, 116, 54, 64, 64, 137, 141, 144, 86, + 64, 161, 161, 106, 166, 167, 174, 54, 176, 180, + 182, 186, 144, 144, 190, 197, 198, 199, 200, 176, + 203, 204, 205, 96, 96, 215, 197, 218, 221, 222, + 79, 224, 225, 
232, 238, 215, 245, 180, 249, 251, + 180, 215, 260, 215, 180, 79, 225, 267, 225, 79, + 180, 277, 180, 79, 280, 79 ] racc_action_check = [ - 51, 97, 51, 97, 41, 75, 165, 75, 165, 75, - 165, 51, 171, 190, 171, 190, 171, 190, 201, 165, - 201, 148, 201, 1, 148, 171, 190, 36, 37, 36, - 37, 201, 36, 37, 38, 39, 38, 39, 5, 38, - 39, 117, 0, 117, 0, 117, 41, 46, 168, 88, - 88, 168, 51, 97, 165, 177, 177, 177, 177, 171, - 171, 190, 2, 6, 2, 201, 201, 90, 46, 46, - 46, 46, 46, 46, 46, 46, 46, 9, 46, 46, - 46, 46, 46, 46, 46, 46, 46, 10, 90, 90, - 90, 90, 90, 90, 90, 90, 90, 11, 90, 90, - 90, 90, 90, 90, 90, 90, 90, 3, 3, 12, - 14, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 15, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 8, 8, 16, - 17, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 18, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 63, 13, 63, - 13, 63, 63, 64, 73, 64, 73, 64, 64, 65, - 78, 65, 78, 65, 65, 106, 79, 106, 79, 106, - 106, 118, 180, 118, 180, 118, 180, 188, 196, 188, - 196, 188, 196, 202, 217, 202, 217, 202, 217, 218, - 231, 218, 231, 218, 231, 186, 186, 186, 186, 208, - 213, 227, 208, 213, 227, 234, 24, 113, 234, 113, - 114, 123, 114, 123, 210, 210, 210, 25, 26, 27, - 28, 29, 30, 31, 32, 33, 34, 35, 40, 42, - 47, 55, 60, 71, 74, 76, 80, 81, 87, 91, - 92, 93, 94, 102, 116, 124, 125, 126, 127, 133, - 136, 137, 138, 144, 150, 151, 153, 156, 158, 162, - 163, 164, 170, 174, 176, 178, 179, 182, 184, 187, - 189, 199, 200, 204, 205, 207, 209, 212, 214, 216, - 221, 222, 224, 225, 229 ] + 58, 124, 58, 124, 48, 0, 214, 0, 214, 220, + 214, 220, 58, 220, 239, 1, 239, 250, 239, 250, + 214, 250, 43, 220, 43, 5, 196, 43, 239, 196, + 6, 250, 92, 92, 92, 92, 92, 92, 92, 93, + 93, 93, 93, 93, 93, 93, 44, 45, 44, 45, + 9, 44, 45, 46, 217, 46, 10, 217, 46, 53, + 70, 48, 70, 257, 70, 70, 257, 58, 124, 214, + 262, 220, 220, 262, 38, 11, 38, 239, 38, 250, + 250, 53, 53, 53, 53, 53, 53, 53, 53, 53, + 12, 53, 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 117, 71, 2, 71, + 2, 71, 71, 53, 72, 13, 
72, 13, 72, 72, + 133, 14, 133, 276, 133, 133, 276, 15, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 16, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 226, 226, 226, 226, 3, 3, 17, + 117, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 18, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 235, 235, 235, 235, 8, 8, 24, + 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 25, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 39, 40, 39, 40, 39, 40, 88, + 8, 88, 90, 88, 90, 98, 90, 98, 100, 98, + 100, 102, 100, 102, 147, 102, 147, 150, 147, 150, + 158, 150, 158, 161, 158, 161, 229, 161, 229, 237, + 229, 237, 245, 237, 245, 251, 245, 251, 266, 251, + 266, 267, 266, 267, 280, 267, 280, 283, 280, 80, + 283, 80, 105, 106, 105, 106, 137, 138, 137, 138, + 166, 26, 166, 259, 259, 259, 115, 115, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 41, + 42, 47, 49, 54, 62, 67, 78, 81, 82, 84, + 86, 101, 103, 107, 108, 114, 118, 119, 120, 121, + 129, 140, 141, 142, 144, 167, 168, 169, 170, 176, + 179, 180, 181, 187, 190, 198, 199, 201, 204, 206, + 211, 212, 213, 219, 223, 225, 227, 228, 231, 233, + 236, 238, 248, 249, 253, 254, 256, 258, 261, 263, + 265, 270, 271, 273, 274, 278 ] racc_action_pointer = [ - 32, 23, 52, 93, nil, 31, 63, nil, 123, 68, - 74, 84, 103, 165, 94, 111, 123, 135, 141, nil, - nil, nil, nil, nil, 210, 221, 222, 223, 235, 236, - 237, 238, 239, 237, 238, 239, 24, 25, 31, 32, - 243, -1, 247, nil, nil, nil, 43, 237, nil, nil, - nil, -5, nil, nil, nil, 235, nil, nil, nil, nil, - 236, nil, nil, 164, 170, 176, nil, nil, nil, nil, - nil, 245, nil, 171, 246, 2, 247, nil, 177, 183, - 248, 249, nil, nil, nil, nil, nil, 214, 45, nil, - 63, 250, 247, 248, 207, nil, nil, -4, nil, nil, - nil, nil, 261, nil, nil, nil, 182, nil, nil, nil, - nil, nil, nil, 224, 227, nil, 258, 38, 188, nil, - nil, nil, nil, 228, 260, 220, 223, 257, nil, nil, - nil, nil, nil, 256, nil, nil, 224, 266, 255, nil, - nil, nil, nil, nil, 266, nil, nil, nil, -24, nil, - 
224, 270, nil, 274, nil, nil, 221, nil, 261, nil, - nil, nil, 271, 275, 232, 3, nil, nil, 3, nil, - 233, 9, nil, nil, 237, nil, 234, 3, 241, 231, - 189, nil, 241, nil, 244, nil, 163, 234, 194, 240, - 10, nil, nil, nil, nil, nil, 195, nil, nil, 289, - 242, 15, 200, nil, 238, 286, nil, 246, 174, 252, - 182, nil, 248, 175, 290, nil, 244, 201, 206, nil, - nil, 283, 246, nil, 294, 259, nil, 176, nil, 296, - nil, 207, nil, nil, 180, nil ] + -6, 15, 97, 142, nil, 18, 30, nil, 182, 40, + 42, 61, 84, 112, 104, 110, 120, 154, 160, nil, + nil, nil, nil, nil, 182, 200, 284, 291, 304, 305, + 306, 307, 308, 306, 307, 311, 312, 313, 71, 230, + 231, 311, 312, 19, 43, 44, 50, 316, -1, 320, + nil, nil, nil, 55, 309, nil, nil, nil, -5, nil, + nil, nil, 307, nil, nil, nil, nil, 308, nil, nil, + 57, 104, 111, nil, nil, nil, nil, nil, 318, nil, + 286, 319, 323, nil, 324, nil, 313, nil, 236, nil, + 239, nil, -19, -12, nil, nil, nil, nil, 242, nil, + 245, 323, 248, 324, nil, 289, 290, 325, 326, nil, + nil, nil, nil, nil, 276, 302, nil, 102, 326, 323, + 324, 269, nil, nil, -4, nil, nil, nil, nil, 338, + nil, nil, nil, 117, nil, nil, nil, 293, 294, nil, + 335, 337, 338, nil, 335, nil, nil, 251, nil, nil, + 254, nil, nil, nil, nil, nil, nil, nil, 257, nil, + nil, 260, nil, nil, nil, nil, 297, 340, 285, 288, + 336, nil, nil, nil, nil, nil, 335, nil, nil, 289, + 346, 334, nil, nil, nil, nil, nil, 346, nil, nil, + 347, nil, nil, nil, nil, nil, -34, nil, 290, 351, + nil, 355, nil, nil, 287, nil, 341, nil, nil, nil, + nil, 352, 356, 298, 3, nil, nil, -6, nil, 299, + 6, nil, nil, 303, nil, 300, 86, 307, 297, 263, + nil, 307, nil, 310, nil, 126, 300, 266, 306, 11, + nil, nil, nil, nil, nil, 269, nil, nil, 370, 308, + 14, 272, nil, 304, 367, nil, 312, 3, 318, 236, + nil, 314, 10, 371, nil, 310, 275, 278, nil, nil, + 363, 312, nil, 375, 325, nil, 63, nil, 377, nil, + 281, nil, nil, 227, nil ] racc_action_default = [ - -1, -137, -1, -3, -10, -137, -137, -2, -3, -137, - -14, -14, -137, -137, -137, 
-137, -137, -137, -137, -28, - -29, -34, -35, -36, -137, -137, -137, -137, -137, -137, - -137, -137, -137, -54, -54, -54, -137, -137, -137, -137, - -137, -137, -137, -13, 236, -4, -137, -14, -16, -17, - -20, -132, -100, -101, -131, -18, -23, -89, -24, -25, - -137, -27, -37, -137, -137, -137, -41, -42, -43, -44, - -45, -46, -55, -137, -47, -137, -48, -49, -92, -137, - -95, -97, -98, -50, -51, -52, -53, -137, -137, -11, - -5, -7, -14, -137, -72, -15, -21, -132, -133, -134, - -135, -19, -137, -26, -30, -31, -32, -38, -87, -88, - -136, -39, -40, -137, -56, -58, -60, -137, -83, -85, - -93, -94, -96, -137, -137, -137, -137, -137, -6, -8, - -9, -129, -104, -102, -105, -73, -137, -137, -137, -90, - -33, -59, -57, -61, -80, -86, -84, -99, -137, -66, - -70, -137, -12, -137, -103, -109, -137, -22, -137, -62, - -81, -82, -54, -137, -64, -68, -71, -74, -137, -130, - -106, -107, -128, -91, -137, -67, -70, -72, -100, -72, - -137, -125, -137, -109, -100, -110, -72, -72, -137, -70, - -69, -75, -76, -116, -117, -118, -137, -78, -79, -137, - -70, -108, -137, -111, -72, -54, -115, -63, -137, -100, - -119, -126, -65, -137, -54, -114, -72, -137, -137, -120, - -121, -137, -72, -112, -54, -100, -122, -137, -127, -54, - -77, -137, -124, -113, -137, -123 ] + -1, -167, -1, -3, -10, -167, -167, -2, -3, -167, + -14, -14, -167, -167, -167, -167, -167, -167, -167, -28, + -29, -34, -35, -36, -167, -167, -167, -167, -167, -167, + -167, -167, -167, -61, -61, -167, -167, -167, -167, -167, + -167, -61, -61, -167, -167, -167, -167, -167, -167, -167, + -13, 285, -4, -167, -14, -16, -17, -20, -162, -130, + -131, -161, -18, -23, -119, -24, -25, -167, -27, -37, + -167, -167, -167, -41, -42, -43, -44, -45, -46, -62, + -167, -47, -167, -48, -91, -93, -167, -49, -167, -50, + -167, -51, -79, -167, -117, -118, -166, -52, -167, -53, + -167, -54, -167, -55, -56, -122, -167, -125, -127, -128, + -57, -58, -59, -60, -167, -167, -11, -5, -7, -14, + -167, -104, -15, -21, -162, -163, -164, -165, -19, -167, + -26, -30, 
-31, -32, -38, -39, -40, -167, -63, -65, + -67, -167, -70, -72, -167, -92, -94, -75, -78, -77, + -167, -82, -83, -84, -85, -86, -87, -88, -167, -89, + -90, -167, -115, -123, -124, -126, -167, -167, -167, -167, + -167, -6, -8, -9, -159, -134, -132, -135, -105, -167, + -167, -167, -120, -33, -66, -64, -68, -112, -73, -71, + -112, -76, -81, -80, -116, -129, -167, -98, -102, -167, + -12, -167, -133, -139, -167, -22, -167, -69, -113, -114, + -74, -61, -167, -96, -100, -103, -106, -167, -160, -136, + -137, -158, -121, -167, -99, -102, -104, -130, -104, -167, + -155, -167, -139, -130, -140, -104, -104, -167, -102, -101, + -107, -108, -146, -147, -148, -167, -110, -111, -167, -102, + -138, -167, -141, -104, -61, -145, -95, -167, -130, -149, + -156, -97, -167, -61, -144, -104, -167, -167, -150, -151, + -167, -104, -142, -61, -130, -152, -167, -157, -61, -109, + -167, -154, -143, -167, -153 ] racc_goto_table = [ - 73, 118, 136, 54, 48, 49, 164, 96, 91, 120, - 121, 93, 187, 208, 107, 111, 112, 119, 134, 213, - 56, 58, 59, 171, 61, 1, 78, 78, 78, 78, - 62, 63, 64, 65, 115, 227, 129, 192, 148, 74, - 76, 95, 187, 118, 118, 207, 204, 3, 234, 7, - 130, 201, 128, 138, 147, 93, 212, 140, 154, 145, - 146, 101, 9, 116, 42, 168, 103, 45, 78, 78, - 219, 127, 51, 71, 141, 142, 77, 83, 84, 85, - 159, 144, 190, 160, 161, 191, 132, 197, 102, 158, - 122, 177, 170, 220, 203, 205, 199, 186, 221, 153, - nil, nil, nil, 116, 116, nil, 198, nil, nil, nil, - nil, nil, 214, 78, 206, nil, 177, nil, nil, nil, - nil, nil, 210, nil, 224, nil, nil, 186, 210, 174, - 229, nil, nil, nil, nil, nil, nil, nil, nil, nil, - nil, nil, nil, 226, 210, nil, nil, nil, nil, nil, - nil, nil, nil, nil, nil, nil, nil, 210, nil, nil, + 80, 82, 61, 179, 93, 98, 100, 148, 123, 149, + 55, 56, 118, 120, 163, 164, 68, 159, 85, 160, + 213, 162, 69, 70, 71, 72, 134, 135, 136, 139, + 241, 143, 105, 105, 105, 105, 131, 131, 131, 253, + 177, 196, 9, 1, 63, 65, 66, 52, 236, 3, + 257, 7, 220, 172, 122, 207, 262, 173, 210, 208, + 256, 
49, 208, 268, 170, 130, 191, 145, 58, 140, + 78, 261, 276, 217, 181, 195, 171, 120, 236, 81, + 194, 250, 101, 103, 146, 283, 184, 185, 83, 183, + 188, 189, 128, 87, 105, 105, 202, 150, 158, 131, + 104, 110, 111, 112, 89, 91, 97, 99, 240, 187, + 246, 92, 239, 129, 206, 165, 192, 252, 254, 175, + 219, 269, 248, 270, 193, 201, 140, 140, nil, nil, + nil, nil, nil, nil, nil, 263, nil, nil, nil, nil, + nil, nil, nil, nil, nil, nil, nil, 273, nil, nil, + nil, nil, nil, 278, nil, 105, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, - nil, nil, 215, nil, nil, nil, nil, nil, nil, nil, - nil, 223, nil, nil, nil, nil, nil, nil, nil, nil, - nil, 230, nil, nil, nil, nil, 233 ] + nil, nil, nil, nil, nil, nil, nil, nil, 223, nil, + 226, nil, nil, nil, nil, nil, 235, nil, nil, nil, + nil, nil, nil, nil, nil, 247, nil, nil, nil, nil, + nil, nil, nil, 255, nil, 226, nil, nil, nil, nil, + nil, 259, nil, nil, nil, nil, 235, 259, nil, nil, + nil, 264, nil, nil, nil, nil, nil, nil, nil, nil, + 272, nil, 275, 259, nil, nil, nil, nil, nil, nil, + 279, nil, nil, nil, nil, 282, 259 ] racc_goto_check = [ - 29, 22, 42, 31, 14, 14, 35, 16, 8, 48, - 48, 13, 40, 39, 24, 24, 24, 45, 52, 39, - 18, 18, 18, 54, 17, 1, 31, 31, 31, 31, - 17, 17, 17, 17, 30, 39, 5, 38, 34, 26, - 26, 14, 40, 22, 22, 35, 38, 6, 39, 6, - 9, 54, 8, 16, 48, 13, 35, 24, 52, 45, - 45, 18, 7, 31, 10, 34, 17, 7, 31, 31, - 38, 11, 15, 25, 30, 30, 27, 27, 27, 27, - 32, 33, 36, 43, 44, 42, 14, 42, 46, 47, - 50, 22, 53, 55, 42, 42, 56, 22, 57, 58, - nil, nil, nil, 31, 31, nil, 22, nil, nil, nil, - nil, nil, 42, 31, 22, nil, 22, nil, nil, nil, - nil, nil, 22, nil, 42, nil, nil, 22, 22, 29, - 42, nil, nil, nil, nil, nil, nil, nil, nil, nil, - nil, nil, nil, 22, 22, nil, nil, nil, nil, nil, - nil, nil, nil, nil, nil, nil, nil, 22, nil, nil, + 36, 36, 38, 56, 22, 22, 22, 43, 16, 43, + 14, 14, 8, 13, 61, 61, 17, 43, 47, 43, + 49, 43, 17, 17, 17, 17, 24, 24, 24, 37, + 52, 42, 38, 38, 38, 38, 22, 22, 22, 52, + 65, 48, 
7, 1, 18, 18, 18, 7, 54, 6, + 53, 6, 67, 5, 14, 39, 53, 9, 39, 57, + 49, 10, 57, 52, 11, 17, 43, 47, 15, 38, + 25, 49, 53, 48, 16, 61, 8, 13, 54, 26, + 43, 67, 33, 33, 17, 53, 37, 37, 27, 24, + 42, 42, 18, 28, 38, 38, 65, 45, 45, 22, + 34, 34, 34, 34, 29, 30, 31, 32, 56, 40, + 56, 44, 50, 59, 60, 63, 22, 56, 56, 14, + 66, 68, 69, 70, 22, 71, 38, 38, nil, nil, + nil, nil, nil, nil, nil, 56, nil, nil, nil, nil, + nil, nil, nil, nil, nil, nil, nil, 56, nil, nil, + nil, nil, nil, 56, nil, 38, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, - nil, nil, 29, nil, nil, nil, nil, nil, nil, nil, - nil, 29, nil, nil, nil, nil, nil, nil, nil, nil, - nil, 29, nil, nil, nil, nil, 29 ] + nil, nil, nil, nil, nil, nil, nil, nil, 36, nil, + 22, nil, nil, nil, nil, nil, 22, nil, nil, nil, + nil, nil, nil, nil, nil, 22, nil, nil, nil, nil, + nil, nil, nil, 22, nil, 22, nil, nil, nil, nil, + nil, 22, nil, nil, nil, nil, 22, 22, nil, nil, + nil, 36, nil, nil, nil, nil, nil, nil, nil, nil, + 36, nil, 22, 22, nil, nil, nil, nil, nil, nil, + 36, nil, nil, nil, nil, 36, 22 ] racc_goto_pointer = [ - nil, 25, nil, nil, nil, -55, 47, 59, -38, -41, - 60, -18, nil, -35, -6, 59, -44, 6, 6, nil, - nil, nil, -74, nil, -49, 40, 5, 40, nil, -33, - -39, -10, -64, -35, -86, -144, -94, nil, -140, -183, - -159, nil, -92, -61, -60, -58, 31, -50, -69, nil, - 10, nil, -75, -63, -132, -117, -85, -113, -32 ] + nil, 43, nil, nil, nil, -65, 49, 39, -41, -61, + 57, -52, nil, -40, 0, 55, -50, -2, 30, nil, + nil, nil, -34, nil, -44, 37, 45, 53, 57, 67, + 67, 67, 67, 41, 57, nil, -33, -51, -11, -132, + -31, nil, -51, -81, 73, 5, nil, -17, -126, -178, + -113, nil, -196, -195, -172, nil, -118, -128, nil, 49, + -68, -91, nil, 8, nil, -80, -83, -151, -138, -108, + -137, -49 ] racc_goto_default = [ - nil, nil, 2, 8, 90, nil, nil, nil, nil, nil, - nil, nil, 10, 11, nil, nil, nil, 55, nil, 21, - 22, 23, 104, 106, nil, nil, nil, nil, 114, 75, - nil, 108, nil, nil, nil, nil, 165, 135, nil, nil, - 179, 
167, nil, 109, nil, nil, nil, nil, 81, 80, - 82, 92, nil, nil, nil, nil, nil, nil, nil ] + nil, nil, 2, 8, 117, nil, nil, nil, nil, nil, + nil, nil, 10, 11, nil, nil, nil, 62, nil, 21, + 22, 23, 147, 133, nil, nil, nil, nil, nil, nil, + nil, nil, nil, nil, nil, 138, 102, nil, 94, nil, + nil, 142, nil, nil, nil, nil, 84, nil, nil, nil, + 214, 178, nil, nil, 228, 216, nil, 95, 209, nil, + nil, 108, 107, 109, 119, nil, nil, nil, nil, nil, + nil, nil ] racc_reduce_table = [ 0, 0, :racc_error, - 0, 64, :_reduce_1, - 2, 64, :_reduce_2, - 0, 65, :_reduce_3, - 2, 65, :_reduce_4, - 1, 66, :_reduce_5, - 2, 66, :_reduce_6, - 0, 67, :_reduce_none, - 1, 67, :_reduce_none, - 5, 59, :_reduce_none, - 0, 68, :_reduce_10, - 0, 69, :_reduce_11, - 5, 60, :_reduce_12, - 2, 60, :_reduce_13, - 0, 72, :_reduce_14, - 2, 72, :_reduce_15, - 2, 61, :_reduce_none, - 2, 61, :_reduce_none, - 1, 76, :_reduce_18, - 2, 76, :_reduce_19, - 2, 70, :_reduce_20, - 3, 70, :_reduce_21, - 5, 70, :_reduce_22, - 2, 70, :_reduce_none, - 2, 70, :_reduce_24, - 2, 70, :_reduce_25, - 3, 70, :_reduce_26, - 2, 70, :_reduce_27, - 1, 70, :_reduce_28, - 1, 70, :_reduce_29, - 1, 81, :_reduce_30, - 1, 81, :_reduce_31, - 1, 82, :_reduce_32, - 2, 82, :_reduce_33, - 1, 71, :_reduce_none, - 1, 71, :_reduce_none, - 1, 71, :_reduce_none, - 2, 71, :_reduce_37, - 3, 71, :_reduce_38, - 3, 71, :_reduce_39, - 3, 71, :_reduce_40, - 2, 71, :_reduce_41, - 2, 71, :_reduce_42, - 2, 71, :_reduce_43, - 2, 71, :_reduce_44, - 2, 71, :_reduce_45, - 2, 77, :_reduce_none, - 2, 77, :_reduce_47, - 2, 77, :_reduce_48, - 2, 77, :_reduce_49, - 2, 77, :_reduce_50, - 2, 77, :_reduce_51, - 2, 77, :_reduce_52, - 2, 77, :_reduce_53, - 0, 87, :_reduce_none, - 1, 87, :_reduce_none, - 1, 88, :_reduce_56, - 2, 88, :_reduce_57, - 2, 83, :_reduce_58, - 3, 83, :_reduce_59, - 0, 91, :_reduce_none, - 1, 91, :_reduce_none, - 3, 86, :_reduce_62, - 8, 78, :_reduce_63, - 5, 79, :_reduce_64, - 8, 79, :_reduce_65, - 1, 92, :_reduce_66, - 3, 92, :_reduce_67, - 1, 93, 
:_reduce_68, - 3, 93, :_reduce_69, - 0, 99, :_reduce_none, - 1, 99, :_reduce_none, - 0, 100, :_reduce_none, + 0, 79, :_reduce_1, + 2, 79, :_reduce_2, + 0, 80, :_reduce_3, + 2, 80, :_reduce_4, + 1, 81, :_reduce_5, + 2, 81, :_reduce_6, + 0, 82, :_reduce_none, + 1, 82, :_reduce_none, + 5, 74, :_reduce_none, + 0, 83, :_reduce_10, + 0, 84, :_reduce_11, + 5, 75, :_reduce_12, + 2, 75, :_reduce_13, + 0, 87, :_reduce_14, + 2, 87, :_reduce_15, + 2, 76, :_reduce_none, + 2, 76, :_reduce_none, + 1, 91, :_reduce_18, + 2, 91, :_reduce_19, + 2, 85, :_reduce_20, + 3, 85, :_reduce_21, + 5, 85, :_reduce_22, + 2, 85, :_reduce_none, + 2, 85, :_reduce_24, + 2, 85, :_reduce_25, + 3, 85, :_reduce_26, + 2, 85, :_reduce_27, + 1, 85, :_reduce_28, + 1, 85, :_reduce_29, + 1, 96, :_reduce_30, + 1, 96, :_reduce_31, + 1, 97, :_reduce_32, + 2, 97, :_reduce_33, + 1, 86, :_reduce_none, + 1, 86, :_reduce_none, + 1, 86, :_reduce_none, + 2, 86, :_reduce_37, + 3, 86, :_reduce_38, + 3, 86, :_reduce_39, + 3, 86, :_reduce_40, + 2, 86, :_reduce_41, + 2, 86, :_reduce_42, + 2, 86, :_reduce_43, + 2, 86, :_reduce_44, + 2, 86, :_reduce_45, + 2, 92, :_reduce_none, + 2, 92, :_reduce_none, + 2, 92, :_reduce_none, + 2, 92, :_reduce_none, + 2, 92, :_reduce_none, + 2, 92, :_reduce_none, + 2, 92, :_reduce_none, + 2, 92, :_reduce_none, + 2, 92, :_reduce_54, + 2, 92, :_reduce_55, + 2, 92, :_reduce_56, + 2, 92, :_reduce_57, + 2, 92, :_reduce_58, + 2, 92, :_reduce_59, + 2, 92, :_reduce_60, + 0, 109, :_reduce_none, + 1, 109, :_reduce_none, + 1, 110, :_reduce_63, + 2, 110, :_reduce_64, + 2, 98, :_reduce_65, + 3, 98, :_reduce_66, + 0, 113, :_reduce_none, + 1, 113, :_reduce_none, + 3, 108, :_reduce_69, + 1, 115, :_reduce_70, + 2, 115, :_reduce_71, + 2, 99, :_reduce_72, + 3, 99, :_reduce_73, + 3, 114, :_reduce_74, + 1, 116, :_reduce_75, + 2, 116, :_reduce_76, + 2, 102, :_reduce_77, + 2, 101, :_reduce_78, + 1, 103, :_reduce_79, + 3, 117, :_reduce_80, + 3, 117, :_reduce_81, + 1, 118, :_reduce_82, + 1, 118, :_reduce_83, + 1, 118, 
:_reduce_84, + 1, 118, :_reduce_85, + 1, 118, :_reduce_86, + 1, 118, :_reduce_87, + 1, 118, :_reduce_88, + 2, 104, :_reduce_89, + 2, 105, :_reduce_90, + 1, 120, :_reduce_91, + 2, 120, :_reduce_92, 1, 100, :_reduce_none, - 1, 94, :_reduce_74, - 3, 94, :_reduce_75, - 3, 94, :_reduce_76, - 7, 94, :_reduce_77, - 3, 94, :_reduce_78, - 3, 94, :_reduce_79, - 0, 102, :_reduce_none, - 1, 102, :_reduce_none, - 1, 90, :_reduce_82, - 1, 103, :_reduce_83, - 2, 103, :_reduce_84, - 2, 84, :_reduce_85, - 3, 84, :_reduce_86, - 1, 80, :_reduce_none, - 1, 80, :_reduce_none, - 0, 104, :_reduce_89, - 0, 105, :_reduce_90, - 5, 75, :_reduce_91, - 1, 106, :_reduce_92, - 2, 106, :_reduce_93, - 2, 107, :_reduce_94, - 1, 108, :_reduce_95, - 2, 108, :_reduce_96, - 1, 85, :_reduce_97, - 1, 85, :_reduce_98, - 3, 85, :_reduce_99, + 2, 119, :_reduce_94, + 8, 93, :_reduce_95, + 5, 94, :_reduce_96, + 8, 94, :_reduce_97, + 1, 121, :_reduce_98, + 3, 121, :_reduce_99, + 1, 122, :_reduce_100, + 3, 122, :_reduce_101, + 0, 128, :_reduce_none, + 1, 128, :_reduce_none, + 0, 129, :_reduce_none, + 1, 129, :_reduce_none, + 1, 123, :_reduce_106, + 3, 123, :_reduce_107, + 3, 123, :_reduce_108, + 7, 123, :_reduce_109, + 3, 123, :_reduce_110, + 3, 123, :_reduce_111, + 0, 131, :_reduce_none, + 1, 131, :_reduce_none, + 1, 112, :_reduce_114, + 2, 106, :_reduce_115, + 3, 106, :_reduce_116, + 1, 95, :_reduce_none, + 1, 95, :_reduce_none, + 0, 132, :_reduce_119, + 0, 133, :_reduce_120, + 5, 90, :_reduce_121, + 1, 134, :_reduce_122, + 2, 134, :_reduce_123, + 2, 135, :_reduce_124, + 1, 136, :_reduce_125, + 2, 136, :_reduce_126, + 1, 107, :_reduce_127, + 1, 107, :_reduce_128, + 3, 107, :_reduce_129, + 1, 111, :_reduce_none, + 1, 111, :_reduce_none, + 1, 138, :_reduce_132, + 2, 138, :_reduce_133, + 2, 77, :_reduce_none, + 2, 77, :_reduce_none, + 4, 137, :_reduce_136, + 1, 139, :_reduce_137, + 3, 139, :_reduce_138, + 0, 140, :_reduce_139, + 2, 140, :_reduce_140, + 3, 140, :_reduce_141, + 5, 140, :_reduce_142, + 7, 140, 
:_reduce_143, + 4, 140, :_reduce_144, + 3, 140, :_reduce_145, + 1, 125, :_reduce_146, + 1, 125, :_reduce_147, + 1, 125, :_reduce_148, + 0, 141, :_reduce_none, + 1, 141, :_reduce_none, + 2, 126, :_reduce_151, + 3, 126, :_reduce_152, + 6, 126, :_reduce_153, + 4, 126, :_reduce_154, + 0, 142, :_reduce_155, + 0, 143, :_reduce_156, + 5, 127, :_reduce_157, + 3, 124, :_reduce_158, + 0, 144, :_reduce_159, + 3, 78, :_reduce_160, + 1, 88, :_reduce_none, + 0, 89, :_reduce_none, 1, 89, :_reduce_none, 1, 89, :_reduce_none, - 1, 110, :_reduce_102, - 2, 110, :_reduce_103, - 2, 62, :_reduce_none, - 2, 62, :_reduce_none, - 4, 109, :_reduce_106, - 1, 111, :_reduce_107, - 3, 111, :_reduce_108, - 0, 112, :_reduce_109, - 2, 112, :_reduce_110, - 3, 112, :_reduce_111, - 5, 112, :_reduce_112, - 7, 112, :_reduce_113, - 4, 112, :_reduce_114, - 3, 112, :_reduce_115, - 1, 96, :_reduce_116, - 1, 96, :_reduce_117, - 1, 96, :_reduce_118, - 0, 113, :_reduce_none, - 1, 113, :_reduce_none, - 2, 97, :_reduce_121, - 3, 97, :_reduce_122, - 6, 97, :_reduce_123, - 4, 97, :_reduce_124, - 0, 114, :_reduce_125, - 0, 115, :_reduce_126, - 5, 98, :_reduce_127, - 3, 95, :_reduce_128, - 0, 116, :_reduce_129, - 3, 63, :_reduce_130, - 1, 73, :_reduce_none, - 0, 74, :_reduce_none, - 1, 74, :_reduce_none, - 1, 74, :_reduce_none, - 1, 74, :_reduce_none, - 1, 101, :_reduce_136 ] - -racc_reduce_n = 137 - -racc_shift_n = 236 + 1, 89, :_reduce_none, + 1, 130, :_reduce_166 ] + +racc_reduce_n = 167 + +racc_shift_n = 285 racc_token_table = { false => 0, @@ -1075,57 +1145,72 @@ def raise_parse_error(error_message, location) :INTEGER => 6, :STRING => 7, :TAG => 8, - "%%" => 9, - "%{" => 10, - "%}" => 11, - "%require" => 12, - ";" => 13, - "%expect" => 14, - "%define" => 15, - "{" => 16, - "}" => 17, - "%param" => 18, - "%lex-param" => 19, - "%parse-param" => 20, - "%code" => 21, - "%initial-action" => 22, - "%no-stdlib" => 23, - "%locations" => 24, - "%union" => 25, - "%destructor" => 26, - "%printer" => 27, - "%error-token" 
=> 28, - "%after-shift" => 29, - "%before-reduce" => 30, - "%after-reduce" => 31, - "%after-shift-error-token" => 32, - "%after-pop-stack" => 33, - "-temp-group" => 34, - "%token" => 35, - "%type" => 36, - "%nterm" => 37, - "%left" => 38, - "%right" => 39, - "%precedence" => 40, - "%nonassoc" => 41, - "%start" => 42, - "%rule" => 43, - "(" => 44, - ")" => 45, - ":" => 46, - "%inline" => 47, - "," => 48, - "|" => 49, - "%empty" => 50, - "%prec" => 51, - "?" => 52, - "+" => 53, - "*" => 54, - "[" => 55, - "]" => 56, - "{...}" => 57 } - -racc_nt_base = 58 + :REGEX => 9, + "%%" => 10, + "%{" => 11, + "%}" => 12, + "%require" => 13, + ";" => 14, + "%expect" => 15, + "%define" => 16, + "{" => 17, + "}" => 18, + "%param" => 19, + "%lex-param" => 20, + "%parse-param" => 21, + "%code" => 22, + "%initial-action" => 23, + "%no-stdlib" => 24, + "%locations" => 25, + "%union" => 26, + "%destructor" => 27, + "%printer" => 28, + "%error-token" => 29, + "%after-shift" => 30, + "%before-reduce" => 31, + "%after-reduce" => 32, + "%after-shift-error-token" => 33, + "%after-pop-stack" => 34, + "-temp-group" => 35, + "%token" => 36, + "%token-pattern" => 37, + "%token-action" => 38, + "%symbol-set" => 39, + "%lexer-context" => 40, + "%lex-prec" => 41, + "%lex-tie" => 42, + "%lex-no-tie" => 43, + "%type" => 44, + "%nterm" => 45, + "%left" => 46, + "%right" => 47, + "%precedence" => 48, + "%nonassoc" => 49, + "%start" => 50, + "<~" => 51, + "<-" => 52, + "-~" => 53, + "<<" => 54, + "-<" => 55, + " 56, + "-s" => 57, + "%rule" => 58, + "(" => 59, + ")" => 60, + ":" => 61, + "%inline" => 62, + "," => 63, + "|" => 64, + "%empty" => 65, + "%prec" => 66, + "?" 
=> 67, + "+" => 68, + "*" => 69, + "[" => 70, + "]" => 71, + "{...}" => 72 } + +racc_nt_base = 73 racc_use_result_var = true @@ -1156,6 +1241,7 @@ def raise_parse_error(error_message, location) "INTEGER", "STRING", "TAG", + "REGEX", "\"%%\"", "\"%{\"", "\"%}\"", @@ -1183,6 +1269,13 @@ def raise_parse_error(error_message, location) "\"%after-pop-stack\"", "\"-temp-group\"", "\"%token\"", + "\"%token-pattern\"", + "\"%token-action\"", + "\"%symbol-set\"", + "\"%lexer-context\"", + "\"%lex-prec\"", + "\"%lex-tie\"", + "\"%lex-no-tie\"", "\"%type\"", "\"%nterm\"", "\"%left\"", @@ -1190,6 +1283,13 @@ def raise_parse_error(error_message, location) "\"%precedence\"", "\"%nonassoc\"", "\"%start\"", + "\"<~\"", + "\"<-\"", + "\"-~\"", + "\"<<\"", + "\"-<\"", + "\" 1 empties = val[0].rhs.select { |sym| sym.is_a?(Lrama::Lexer::Token::Empty) } empties.each do |empty| @@ -2049,8 +2378,8 @@ def _reduce_107(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 374) - def _reduce_108(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 505) + def _reduce_138(val, _values, result) builder = val[2] if !builder.line builder.line = @lexer.line - 1 @@ -2061,8 +2390,8 @@ def _reduce_108(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 384) - def _reduce_109(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 515) + def _reduce_139(val, _values, result) reset_precs result = @grammar.create_rule_builder(@rule_counter, @midrule_action_counter) @@ -2070,8 +2399,8 @@ def _reduce_109(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 389) - def _reduce_110(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 520) + def _reduce_140(val, _values, result) builder = val[0] builder.add_rhs(Lrama::Lexer::Token::Empty.new(location: @lexer.location)) result = builder @@ -2080,8 +2409,8 @@ def _reduce_110(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 395) - def _reduce_111(val, _values, result) +module_eval(<<'.,.,', 
'parser.y', 526) + def _reduce_141(val, _values, result) on_action_error("intermediate %prec in a rule", val[1]) if @trailing_prec_seen token = val[1] token.alias_name = val[2] @@ -2093,8 +2422,8 @@ def _reduce_111(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 404) - def _reduce_112(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 535) + def _reduce_142(val, _values, result) on_action_error("intermediate %prec in a rule", val[1]) if @trailing_prec_seen token = Lrama::Lexer::Token::InstantiateRule.new(s_value: val[2], alias_name: val[3], location: @lexer.location, args: [val[1]], lhs_tag: val[4]) builder = val[0] @@ -2106,8 +2435,8 @@ def _reduce_112(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 413) - def _reduce_113(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 544) + def _reduce_143(val, _values, result) on_action_error("intermediate %prec in a rule", val[1]) if @trailing_prec_seen token = Lrama::Lexer::Token::InstantiateRule.new(s_value: val[1].s_value, alias_name: val[5], location: @lexer.location, args: val[3], lhs_tag: val[6]) builder = val[0] @@ -2119,8 +2448,8 @@ def _reduce_113(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 422) - def _reduce_114(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 553) + def _reduce_144(val, _values, result) user_code = val[1] user_code.alias_name = val[2] user_code.tag = val[3] @@ -2132,8 +2461,8 @@ def _reduce_114(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 431) - def _reduce_115(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 562) + def _reduce_145(val, _values, result) on_action_error("multiple %prec in a rule", val[0]) if prec_seen? sym = @grammar.find_symbol_by_id!(val[2]) if val[0].rhs.empty? 
@@ -2149,33 +2478,33 @@ def _reduce_115(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 444) - def _reduce_116(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 575) + def _reduce_146(val, _values, result) result = "option" result end .,., -module_eval(<<'.,.,', 'parser.y', 445) - def _reduce_117(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 576) + def _reduce_147(val, _values, result) result = "nonempty_list" result end .,., -module_eval(<<'.,.,', 'parser.y', 446) - def _reduce_118(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 577) + def _reduce_148(val, _values, result) result = "list" result end .,., -# reduce 119 omitted +# reduce 149 omitted -# reduce 120 omitted +# reduce 150 omitted -module_eval(<<'.,.,', 'parser.y', 451) - def _reduce_121(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 582) + def _reduce_151(val, _values, result) result = if val[1] [Lrama::Lexer::Token::InstantiateRule.new(s_value: val[1].s_value, location: @lexer.location, args: val[0])] else @@ -2186,29 +2515,29 @@ def _reduce_121(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 457) - def _reduce_122(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 588) + def _reduce_152(val, _values, result) result = val[0].append(val[2]) result end .,., -module_eval(<<'.,.,', 'parser.y', 458) - def _reduce_123(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 589) + def _reduce_153(val, _values, result) result = val[0].append(Lrama::Lexer::Token::InstantiateRule.new(s_value: val[2].s_value, location: @lexer.location, args: val[4])) result end .,., -module_eval(<<'.,.,', 'parser.y', 459) - def _reduce_124(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 590) + def _reduce_154(val, _values, result) result = [Lrama::Lexer::Token::InstantiateRule.new(s_value: val[0].s_value, location: @lexer.location, args: val[2])] result end .,., -module_eval(<<'.,.,', 'parser.y', 464) - def _reduce_125(val, _values, result) 
+module_eval(<<'.,.,', 'parser.y', 595) + def _reduce_155(val, _values, result) if prec_seen? on_action_error("multiple User_code after %prec", val[0]) if @code_after_prec @code_after_prec = true @@ -2219,39 +2548,39 @@ def _reduce_125(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 472) - def _reduce_126(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 603) + def _reduce_156(val, _values, result) end_c_declaration result end .,., -module_eval(<<'.,.,', 'parser.y', 476) - def _reduce_127(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 607) + def _reduce_157(val, _values, result) result = val[2] result end .,., -module_eval(<<'.,.,', 'parser.y', 479) - def _reduce_128(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 610) + def _reduce_158(val, _values, result) result = val[1].s_value result end .,., -module_eval(<<'.,.,', 'parser.y', 484) - def _reduce_129(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 615) + def _reduce_159(val, _values, result) begin_c_declaration('\Z') result end .,., -module_eval(<<'.,.,', 'parser.y', 488) - def _reduce_130(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 619) + def _reduce_160(val, _values, result) end_c_declaration @grammar.epilogue_first_lineno = val[0].first_line + 1 @grammar.epilogue = val[2].s_value @@ -2260,18 +2589,18 @@ def _reduce_130(val, _values, result) end .,., -# reduce 131 omitted +# reduce 161 omitted -# reduce 132 omitted +# reduce 162 omitted -# reduce 133 omitted +# reduce 163 omitted -# reduce 134 omitted +# reduce 164 omitted -# reduce 135 omitted +# reduce 165 omitted -module_eval(<<'.,.,', 'parser.y', 500) - def _reduce_136(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 631) + def _reduce_166(val, _values, result) result = Lrama::Lexer::Token::Ident.new(s_value: val[0].s_value) result end diff --git a/lib/lrama/reporter.rb b/lib/lrama/reporter.rb index ed25cc7f8..a98203855 100644 --- a/lib/lrama/reporter.rb +++ b/lib/lrama/reporter.rb 
@@ -5,6 +5,7 @@ require_relative 'reporter/grammar' require_relative 'reporter/precedences' require_relative 'reporter/profile' +require_relative 'reporter/pslr' require_relative 'reporter/rules' require_relative 'reporter/states' require_relative 'reporter/terms' @@ -20,6 +21,7 @@ def initialize(**options) @terms = Terms.new(**options) @conflicts = Conflicts.new @precedences = Precedences.new + @pslr = Pslr.new(**options) @grammar = Grammar.new(**options) @states = States.new(**options) end @@ -31,6 +33,7 @@ def report(io, states) report_duration(:report_terms) { @terms.report(io, states) } report_duration(:report_conflicts) { @conflicts.report(io, states) } report_duration(:report_precedences) { @precedences.report(io, states) } + report_duration(:report_pslr) { @pslr.report(io, states) } report_duration(:report_grammar) { @grammar.report(io, states) } report_duration(:report_states) { @states.report(io, states, ielr: states.ielr_defined?) } end diff --git a/lib/lrama/reporter/pslr.rb b/lib/lrama/reporter/pslr.rb new file mode 100644 index 000000000..766c5855b --- /dev/null +++ b/lib/lrama/reporter/pslr.rb @@ -0,0 +1,42 @@ +# rbs_inline: enabled +# frozen_string_literal: true + +module Lrama + class Reporter + class Pslr + # @rbs (?pslr: bool, **bool _) -> void + def initialize(pslr: false, **_) + @pslr = pslr + end + + # @rbs (IO io, Lrama::States states) -> void + def report(io, states) + return unless @pslr + return unless states.pslr_defined? 
+ + metrics = states.pslr_metrics + + io << "PSLR Summary\n\n" + io << " Base states: #{metrics[:base_states_count]}\n" + io << " Total states: #{metrics[:total_states_count]}\n" + io << " Split states: #{metrics[:split_state_count]}\n" + io << " State growth: +#{metrics[:growth_count]} (#{format_ratio(metrics[:growth_ratio])})\n" + io << " Token patterns: #{metrics[:token_pattern_count]}\n" + io << " Scanner states: #{metrics[:scanner_fsa_state_count]}\n" + io << " Inadequacies: #{metrics[:inadequacies_count]}\n" + io << " Max states: #{states.pslr_max_states || 'unbounded'}\n" + io << " Max ratio: #{states.pslr_max_state_ratio || 'unbounded'}\n" + io << "\n" + end + + private + + # @rbs (Numeric?) -> String + def format_ratio(value) + return "n/a" if value.nil? + + "#{format('%.2f', value)}x" + end + end + end +end diff --git a/lib/lrama/scanner_fsa.rb b/lib/lrama/scanner_fsa.rb new file mode 100644 index 000000000..f8082c014 --- /dev/null +++ b/lib/lrama/scanner_fsa.rb @@ -0,0 +1,514 @@ +# rbs_inline: enabled +# frozen_string_literal: true + +module Lrama + # Scanner Finite State Automaton for PSLR(1) + # Built from token patterns defined by %token-pattern directives + # Based on Definitions 3.2.12, 3.2.13 from the PSLR dissertation + class ScannerFSA + # Represents a state in the scanner FSA + class State + attr_reader :id #: Integer + attr_reader :transitions #: Hash[String, Integer] + attr_reader :accepting_tokens #: Array[Grammar::TokenPattern] + + # @rbs (Integer id) -> void + def initialize(id) + @id = id + @transitions = {} + @accepting_tokens = [] + end + + # @rbs () -> bool + def accepting? + !@accepting_tokens.empty? 
+ end + + # @rbs (String char, Integer target_state_id) -> void + def add_transition(char, target_state_id) + @transitions[char] = target_state_id + end + + # @rbs (Grammar::TokenPattern token_pattern) -> void + def add_accepting_token(token_pattern) + @accepting_tokens << token_pattern + end + end + + attr_reader :states #: Array[State] + attr_reader :initial_state #: State + attr_reader :token_patterns #: Array[Grammar::TokenPattern] + + # @rbs (Array[Grammar::TokenPattern] token_patterns) -> void + def initialize(token_patterns) + @token_patterns = token_patterns + @states = [] + @state_counter = 0 + build_fsa + end + + # Returns the accepting state for a given FSA state + # Definition 3.2.13 (state_to_accepting_state) + # @rbs (Integer state_id) -> State? + def state_to_accepting_state(state_id) + state = @states[state_id] + return nil unless state&.accepting? + state + end + + # Returns the set of tokens accepted at FSA state ss + # Definition 3.2.12 acc(ss) + # @rbs (Integer state_id) -> Array[Grammar::TokenPattern] + def acc_ss(state_id) + state = @states[state_id] + return [] unless state + state.accepting_tokens + end + + # Simulate the FSA on input string starting from initial state + # Returns all accepting states reached during the scan + # @rbs (String input) -> Array[{state: State, position: Integer, token: Grammar::TokenPattern}] + def scan(input) + results = [] + current_state_id = 0 + + input.each_char.with_index do |char, index| + current_state = @states[current_state_id] + break unless current_state + + next_state_id = current_state.transitions[char] + break unless next_state_id + + current_state_id = next_state_id + next_state = @states[next_state_id] + + if next_state.accepting? 
+ next_state.accepting_tokens.each do |token_pattern| + results << { state: next_state, position: index + 1, token: token_pattern } + end + end + end + + results + end + + private + + # Build the FSA from token patterns + # Uses Thompson's construction for NFAs followed by subset construction for DFA + # @rbs () -> void + def build_fsa + return if @token_patterns.empty? + + # Create initial state + @initial_state = create_state + + # Build NFA for each token pattern and convert to DFA + nfa_states = build_nfa + convert_nfa_to_dfa(nfa_states) + end + + # @rbs () -> State + def create_state + state = State.new(@state_counter) + @state_counter += 1 + @states << state + state + end + + # Simple NFA state for regex compilation + class NFAState + attr_reader :id #: Integer + attr_accessor :transitions #: Hash[String?, Array[NFAState]] + attr_accessor :accepting_token #: Grammar::TokenPattern? + + # @rbs (Integer id) -> void + def initialize(id) + @id = id + @transitions = Hash.new { |h, k| h[k] = [] } + @accepting_token = nil + end + + # @rbs (String? char, NFAState target) -> void + def add_transition(char, target) + @transitions[char] << target + end + + # @rbs () -> bool + def accepting? + !@accepting_token.nil? 
+ end + end + + # Build NFA from all token patterns + # @rbs () -> Array[NFAState] + def build_nfa + nfa_states = [] + nfa_counter = [0] + + # Create NFA start state + nfa_start = create_nfa_state(nfa_counter, nfa_states) + + @token_patterns.each do |token_pattern| + # Build NFA fragment for this pattern + start_state, end_state = compile_regex(token_pattern.regex_pattern, nfa_counter, nfa_states) + + # Connect NFA start to this pattern's start with epsilon + nfa_start.add_transition(nil, start_state) + + # Mark end state as accepting + end_state.accepting_token = token_pattern + end + + nfa_states + end + + # @rbs (Array[Integer] counter, Array[NFAState] states) -> NFAState + def create_nfa_state(counter, states) + state = NFAState.new(counter[0]) + counter[0] += 1 + states << state + state + end + + # Compile a regex pattern to NFA fragment + # Returns [start_state, end_state] + # @rbs (String pattern, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_regex(pattern, counter, states) + # Simple regex compiler supporting: + # - Literal characters + # - Character classes [...] + # - Quantifiers *, +, ? 
+ # - Alternation | + # - Grouping () + + compile_sequence(pattern, 0, counter, states) + end + + # Compile a sequence of regex elements + # @rbs (String pattern, Integer pos, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_sequence(pattern, pos, counter, states) + fragments = [] + i = pos + + while i < pattern.length + char = pattern[i] + + case char + when '\\' + # Escape sequence + if i + 1 < pattern.length + i += 1 + next_char = pattern[i] + case next_char + when 'd' + # \d matches digit + frag = compile_char_class('0-9', counter, states) + when 'w' + # \w matches word character + frag = compile_char_class('a-zA-Z0-9_', counter, states) + when 's' + # \s matches whitespace + frag = compile_char_class(' \t\n\r\f\v', counter, states) + else + # Literal escaped character + frag = compile_literal(next_char, counter, states) + end + fragments << frag + end + when '[' + # Character class + class_end = pattern.index(']', i) + raise "Unclosed character class in pattern: #{pattern}" unless class_end + + char_class = pattern[i + 1...class_end] + frag = compile_char_class(char_class, counter, states) + fragments << frag + i = class_end + when '*', '+', '?' + # Quantifier - modify the last fragment + if fragments.empty? 
+ raise "Quantifier #{char} without preceding element in pattern: #{pattern}" + end + last_frag = fragments.pop + quantified = apply_quantifier(last_frag, char, counter, states) + fragments << quantified + when '|' + # Alternation - compile remaining and merge + left_start, left_end = concatenate_fragments(fragments, counter, states) + right_start, right_end = compile_sequence(pattern, i + 1, counter, states) + + # Create alternation + alt_start = create_nfa_state(counter, states) + alt_end = create_nfa_state(counter, states) + + alt_start.add_transition(nil, left_start) + alt_start.add_transition(nil, right_start) + left_end.add_transition(nil, alt_end) + right_end.add_transition(nil, alt_end) + + return [alt_start, alt_end] + when '(' + # Find matching closing paren + depth = 1 + j = i + 1 + while j < pattern.length && depth > 0 + if pattern[j] == '(' + depth += 1 + elsif pattern[j] == ')' + depth -= 1 + end + j += 1 + end + raise "Unclosed group in pattern: #{pattern}" if depth > 0 + + group_content = pattern[i + 1...j - 1] + frag = compile_sequence(group_content, 0, counter, states) + fragments << frag + i = j - 1 + when ')' + # End of group - return + break + when '.' + # Match any character (simplified: printable ASCII) + frag = compile_any_char(counter, states) + fragments << frag + else + # Literal character + frag = compile_literal(char, counter, states) + fragments << frag + end + + i += 1 + end + + if fragments.empty? 
+ # Empty pattern + state = create_nfa_state(counter, states) + return [state, state] + end + + concatenate_fragments(fragments, counter, states) + end + + # Compile a single literal character + # @rbs (String char, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_literal(char, counter, states) + start_state = create_nfa_state(counter, states) + end_state = create_nfa_state(counter, states) + start_state.add_transition(char, end_state) + [start_state, end_state] + end + + # Compile a character class [...] + # @rbs (String char_class, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_char_class(char_class, counter, states) + start_state = create_nfa_state(counter, states) + end_state = create_nfa_state(counter, states) + + chars = expand_char_class(char_class) + chars.each do |c| + start_state.add_transition(c, end_state) + end + + [start_state, end_state] + end + + # Expand character class string to array of characters + # @rbs (String char_class) -> Array[String] + def expand_char_class(char_class) + chars = [] + i = 0 + negated = false + + if char_class[0] == '^' + negated = true + i = 1 + end + + while i < char_class.length + if i + 2 < char_class.length && char_class[i + 1] == '-' + # Range + start_char = char_class[i] + end_char = char_class[i + 2] + (start_char..end_char).each { |c| chars << c } + i += 3 + else + chars << char_class[i] + i += 1 + end + end + + if negated + all_printable = (32..126).map(&:chr) + chars = all_printable - chars + end + + chars + end + + # Compile . 
(any character) + # @rbs (Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_any_char(counter, states) + start_state = create_nfa_state(counter, states) + end_state = create_nfa_state(counter, states) + + # Match printable ASCII + (32..126).each do |code| + start_state.add_transition(code.chr, end_state) + end + + [start_state, end_state] + end + + # Apply a quantifier to a fragment + # @rbs ([NFAState, NFAState] fragment, String quantifier, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def apply_quantifier(fragment, quantifier, counter, states) + frag_start, frag_end = fragment + + case quantifier + when '*' + # Zero or more + new_start = create_nfa_state(counter, states) + new_end = create_nfa_state(counter, states) + + new_start.add_transition(nil, frag_start) + new_start.add_transition(nil, new_end) + frag_end.add_transition(nil, frag_start) + frag_end.add_transition(nil, new_end) + + [new_start, new_end] + when '+' + # One or more + new_end = create_nfa_state(counter, states) + + frag_end.add_transition(nil, frag_start) + frag_end.add_transition(nil, new_end) + + [frag_start, new_end] + when '?' + # Zero or one + new_start = create_nfa_state(counter, states) + new_end = create_nfa_state(counter, states) + + new_start.add_transition(nil, frag_start) + new_start.add_transition(nil, new_end) + frag_end.add_transition(nil, new_end) + + [new_start, new_end] + else + fragment + end + end + + # Concatenate multiple NFA fragments into one + # @rbs (Array[[NFAState, NFAState]] fragments, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def concatenate_fragments(fragments, counter, states) + return create_nfa_state(counter, states).then { |s| [s, s] } if fragments.empty? 
+ return fragments[0] if fragments.size == 1 + + result_start = fragments[0][0] + current_end = fragments[0][1] + + fragments[1..-1].each do |frag_start, frag_end| + current_end.add_transition(nil, frag_start) + current_end = frag_end + end + + [result_start, current_end] + end + + # Convert NFA to DFA using subset construction + # @rbs (Array[NFAState] nfa_states) -> void + def convert_nfa_to_dfa(nfa_states) + return if nfa_states.empty? + + # Clear existing DFA states + @states = [] + @state_counter = 0 + + # Compute epsilon closure of start state + nfa_start = nfa_states[0] + start_closure = epsilon_closure([nfa_start]) + + # Map NFA state sets to DFA states using frozen sorted id arrays as keys + start_key = start_closure.map(&:id).sort.freeze + dfa_states = {} + # Also cache the key for each closure to avoid recomputing + closure_keys = {} + work_list = [start_closure] + dfa_states[start_key] = create_state + closure_keys[start_closure.object_id] = start_key + + @initial_state = @states[0] + + # Mark accepting tokens for initial state + start_closure.each do |nfa_state| + if nfa_state.accepting? + @initial_state.add_accepting_token(nfa_state.accepting_token) + end + end + + while !work_list.empty? + current_nfa_set = work_list.shift + current_key = closure_keys[current_nfa_set.object_id] + current_dfa = dfa_states[current_key] + + # Find all possible transitions, grouping targets by character + transitions = {} + current_nfa_set.each do |nfa_state| + nfa_state.transitions.each do |char, targets| + next if char.nil? # Skip epsilon transitions + transitions[char] ||= [] + transitions[char].concat(targets) + end + end + + transitions.each do |char, targets| + target_closure = epsilon_closure(targets.uniq) + target_key = target_closure.map(&:id).sort.freeze + + unless dfa_states.key?(target_key) + new_dfa_state = create_state + dfa_states[target_key] = new_dfa_state + + # Mark accepting tokens + target_closure.each do |nfa_state| + if nfa_state.accepting? 
+ new_dfa_state.add_accepting_token(nfa_state.accepting_token) + end + end + + closure_keys[target_closure.object_id] = target_key + work_list << target_closure + end + + current_dfa.add_transition(char, dfa_states[target_key].id) + end + end + end + + # Compute epsilon closure of a set of NFA states + # @rbs (Array[NFAState] nfa_states) -> Array[NFAState] + def epsilon_closure(nfa_states) + closure = nfa_states.dup + seen = Set.new(nfa_states.map(&:id)) + work_list = nfa_states.dup + + while !work_list.empty? + state = work_list.shift + epsilon_targets = state.transitions[nil] || [] + + epsilon_targets.each do |target| + unless seen.include?(target.id) + seen << target.id + closure << target + work_list << target + end + end + end + + closure + end + end +end diff --git a/lib/lrama/state.rb b/lib/lrama/state.rb index 50912e094..826d40b1a 100644 --- a/lib/lrama/state.rb +++ b/lib/lrama/state.rb @@ -4,8 +4,10 @@ require_relative "state/action" require_relative "state/inadequacy_annotation" require_relative "state/item" +require_relative "state/pslr_inadequacy" require_relative "state/reduce_reduce_conflict" require_relative "state/resolved_conflict" +require_relative "state/scanner_accepts" require_relative "state/shift_reduce_conflict" module Lrama @@ -55,6 +57,8 @@ class State attr_accessor :follow_kernel_items #: Hash[Action::Goto, Hash[Item, bool]] attr_accessor :always_follows #: Hash[Action::Goto, Array[Grammar::Symbol]] attr_accessor :goto_follows #: Hash[Action::Goto, Array[Grammar::Symbol]] + attr_accessor :pslr_item_lookahead_set #: lookahead_set? + attr_accessor :lexer_context #: Integer? 
# @rbs (Integer id, Grammar::Symbol accessing_symbol, Array[Item] kernels) -> void def initialize(id, accessing_symbol, kernels) @@ -78,6 +82,8 @@ def initialize(id, accessing_symbol, kernels) @follow_kernel_items = {} @always_follows = {} @goto_follows = {} + @pslr_item_lookahead_set = nil + @lexer_context = nil @lhs_contributions = {} @lane_items = {} end @@ -147,6 +153,18 @@ def set_look_ahead(rule, look_ahead) reduce.look_ahead = look_ahead end + # @rbs (Action::Reduce reduce) -> Array[Grammar::Symbol] + def acceptable_reduce_lookahead(reduce) + reduce.look_ahead || item_lookahead_set[reduce.item] || [] + end + + # @rbs (Action::Reduce reduce) -> Array[Grammar::Symbol] + def acceptable_pslr_reduce_lookahead(reduce) + return acceptable_reduce_lookahead(reduce) unless @pslr_item_lookahead_set + + @pslr_item_lookahead_set[reduce.item] || acceptable_reduce_lookahead(reduce) + end + # @rbs (Grammar::Rule rule, Hash[Grammar::Symbol, Array[Action::Goto]] sources) -> void def set_look_ahead_sources(rule, sources) reduce = reduces.find do |r| @@ -288,6 +306,16 @@ def split_state? 
# # @rbs (State next_state) -> lookahead_set def propagate_lookaheads(next_state) + propagate_lookaheads_with_filter(next_state, true) + end + + # @rbs (State next_state) -> lookahead_set + def propagate_lookaheads_without_filter(next_state) + propagate_lookaheads_with_filter(next_state, false) + end + + # @rbs (State next_state, bool apply_filter) -> lookahead_set + def propagate_lookaheads_with_filter(next_state, apply_filter) next_state.kernels.map {|next_kernel| lookahead_sets = if next_kernel.position > 1 @@ -297,7 +325,14 @@ def propagate_lookaheads(next_state) goto_follow_set(next_kernel.lhs) end - [next_kernel, lookahead_sets & next_state.lookahead_set_filters[next_kernel]] + lookahead_sets = + if apply_filter + lookahead_sets & next_state.lookahead_set_filters[next_kernel] + else + lookahead_sets + end + + [next_kernel, lookahead_sets] }.to_h end @@ -441,11 +476,14 @@ def item_lookahead_set [] elsif kernel.position > 1 prev_items = predecessors_with_item(kernel) - prev_items.map {|st, i| st.item_lookahead_set[i] }.reduce([]) {|acc, syms| acc |= syms } + prev_items + .map {|st, i| st.item_lookahead_set[i] } + .compact + .reduce([]) {|acc, syms| acc | syms } elsif kernel.position == 1 prev_state = @predecessors.find {|p| p.transitions.any? {|transition| transition.next_sym == kernel.lhs } } - goto = prev_state.nterm_transitions.find {|goto| goto.next_sym == kernel.lhs } - prev_state.goto_follows[goto] + goto = prev_state&.nterm_transitions&.find {|goto| goto.next_sym == kernel.lhs } + prev_state&.goto_follows&.fetch(goto, []) || [] end [kernel, value] }.to_h @@ -479,11 +517,15 @@ def append_predecessor(prev_state) def goto_follow_set(nterm_token) return [] if nterm_token.accept_symbol? 
goto = @lalr_isocore.nterm_transitions.find {|g| g.next_sym == nterm_token } + return [] unless goto + + base_terms = Array(@lalr_isocore.always_follows[goto]) @kernels .select {|kernel| @lalr_isocore.follow_kernel_items[goto][kernel] } .map {|kernel| item_lookahead_set[kernel] } - .reduce(@lalr_isocore.always_follows[goto]) {|result, terms| result |= terms } + .compact + .reduce(base_terms) {|result, terms| result | terms } end # Definition 3.8 (Goto Follows Internal Relation) diff --git a/lib/lrama/state/pslr_inadequacy.rb b/lib/lrama/state/pslr_inadequacy.rb new file mode 100644 index 000000000..451458cf5 --- /dev/null +++ b/lib/lrama/state/pslr_inadequacy.rb @@ -0,0 +1,79 @@ +# rbs_inline: enabled +# frozen_string_literal: true + +module Lrama + class State + # PSLR Inadequacy detection + # Based on Section 3.4.3 from the PSLR dissertation + # + # PSLR inadequacy occurs when state merging causes different + # pseudo-scanner behavior + class PslrInadequacy + # Inadequacy types + LR_RELATIVE = :lr_relative #: Symbol + PSLR_RELATIVE = :pslr_relative #: Symbol + + attr_reader :type #: Symbol + attr_reader :state #: State + attr_reader :conflicting_states #: Array[State] + attr_reader :details #: Hash[Symbol, untyped] + + # @rbs (type: Symbol, state: State, conflicting_states: Array[State], details: Hash[Symbol, untyped]) -> void + def initialize(type:, state:, conflicting_states:, details:) + @type = type + @state = state + @conflicting_states = conflicting_states + @details = details + end + + # @rbs () -> String + def to_s + message = "PSLR Inadequacy (#{type}): state #{state.id} conflicts with states #{conflicting_states.map(&:id).join(', ')}" + return message if details[:profiles].nil? 
+ + profiles = details[:profiles].map do |profile, state_ids| + "#{state_ids.join(', ')} => #{profile.inspect}" + end + + "#{message} (profiles: #{profiles.join(' | ')})" + end + end + + # PSLR Compatibility checker + # Based on Definition 3.4.1 from the dissertation + class PslrCompatibilityChecker + # @rbs (ScannerAccepts scanner_accepts, LengthPrecedences length_prec) -> void + def initialize(scanner_accepts, length_prec) + @scanner_accepts = scanner_accepts + @length_prec = length_prec + end + + # Build a stable scanner profile for a parser state + # @rbs (State state, ScannerFSA scanner_fsa) -> Array[[Integer, String?]] + def profile(state, scanner_fsa) + scanner_fsa.states.each_with_object([]) do |fsa_state, result| + next unless fsa_state.accepting? + + token = @scanner_accepts[state.id, fsa_state.id] + result << [fsa_state.id, token&.name] + end + end + + # Partition states by scanner profile + # @rbs (Array[State] states, ScannerFSA scanner_fsa) -> Hash[Array[[Integer, String?]], Array[State]] + def group_by_profile(states, scanner_fsa) + states.group_by do |state| + profile(state, scanner_fsa) + end + end + + # Check if two states are PSLR-compatible + # Definition 3.4.1: States are compatible if for any input, + # the pseudo-scanner selects the same token + # @rbs (State s1, State s2, ScannerFSA scanner_fsa) -> bool + def compatible?(s1, s2, scanner_fsa) + profile(s1, scanner_fsa) == profile(s2, scanner_fsa) + end + end + end +end diff --git a/lib/lrama/state/scanner_accepts.rb b/lib/lrama/state/scanner_accepts.rb new file mode 100644 index 000000000..8e9c2a173 --- /dev/null +++ b/lib/lrama/state/scanner_accepts.rb @@ -0,0 +1,184 @@ +# rbs_inline: enabled +# frozen_string_literal: true + +module Lrama + class State + # Scanner accepts table for PSLR(1) + # Based on Definition 3.2.14 from the PSLR dissertation + # + # scanner_accepts[sp, sa]: For parser state sp and accepting state sa, + # returns the token that should be selected + class ScannerAccepts + 
# Mapping from [parser state id, FSA accepting state id] to the token pattern
# the pseudo-scanner selects for that pair. Pairs for which no token is both
# scannable and parser-acceptable have no entry.
attr_reader :table #: Hash[[Integer, Integer], Grammar::TokenPattern?]

# @rbs (Array[State] parser_states, ScannerFSA scanner_fsa, Grammar::LexPrec lex_prec, LengthPrecedences length_prec) -> void
def initialize(parser_states, scanner_fsa, lex_prec, length_prec)
  @parser_states = parser_states
  @scanner_fsa = scanner_fsa
  @lex_prec = lex_prec
  @length_prec = length_prec
  @table = {}
  # acc(sp) memo, keyed by parser state id (filled by compute_acc_sp)
  @acc_sp_cache = {} #: Hash[Integer, Set[String]]
  # Reachable accepting FSA states in DFS preorder; computed once per #build
  @reachable_accepting_states = nil #: Array[[Integer, ScannerFSA::State]]?
end

# Build the scanner_accepts table
# Based on Definition 3.2.20 (compute_scanner_accepts)
#
# The set of FSA accepting states reachable from the initial state does not
# depend on the parser state, so it is collected once here instead of being
# re-traversed for every parser state.
# @rbs () -> void
def build
  # Drop any traversal memo from a previous build so the FSA is re-read.
  @reachable_accepting_states = nil

  @parser_states.each do |parser_state|
    compute_for_parser_state(parser_state)
  end
end

# Get the accepted token for a parser state and accepting state
# Returns nil when the pair has no entry in the table.
# @rbs (Integer parser_state_id, Integer accepting_state_id) -> Grammar::TokenPattern?
def [](parser_state_id, accepting_state_id)
  @table[[parser_state_id, accepting_state_id]]
end

private

# Compute scanner_accepts for a single parser state
# Resolves a token for every reachable accepting FSA state and records the
# non-nil results in @table under [parser state id, FSA state id].
# @rbs (State parser_state) -> void
def compute_for_parser_state(parser_state)
  reachable_accepting_states.each do |fsa_state_id, fsa_state|
    token = resolve(parser_state, fsa_state)
    @table[[parser_state.id, fsa_state_id]] = token if token
  end
end

# Accepting FSA states reachable from the initial state (id 0), paired with
# the id they were reached by, in depth-first preorder.
#
# The walk uses an explicit stack instead of recursion so the traversal depth
# is not bounded by the Ruby call stack. Successors are pushed in reverse so
# they are popped in transition order, which yields the same visiting order
# as a recursive preorder DFS and keeps @table insertion order stable.
# @rbs () -> Array[[Integer, ScannerFSA::State]]
def reachable_accepting_states
  @reachable_accepting_states ||= begin
    fsa_states = @scanner_fsa.states
    visited = Set.new
    pending = [0] # Start from FSA initial state (id 0)
    accepting = [] #: Array[[Integer, ScannerFSA::State]]

    until pending.empty?
      fsa_state_id = pending.pop
      # Set#add? returns nil when the id was already seen
      next unless visited.add?(fsa_state_id)

      fsa_state = fsa_states[fsa_state_id]
      # A transition may name an id with no state object; ignore it
      next unless fsa_state

      accepting << [fsa_state_id, fsa_state] if fsa_state.accepting?
      pending.concat(fsa_state.transitions.values.reverse)
    end

    accepting
  end
end

# Resolve which token should be accepted
# Based on Definition 3.2.19 (resolve)
#
# Candidates are the tokens that are both accepted by the FSA at this state
# (acc(ss)) and acceptable to the parser at this state (acc(sp)). Returns nil
# when that intersection is empty; otherwise defers to select_best_token.
# @rbs (State parser_state, ScannerFSA::State fsa_state) -> Grammar::TokenPattern?
def resolve(parser_state, fsa_state)
  acc_sp = compute_acc_sp(parser_state)

  # Intersection: tokens that can be both scanned and parsed
  acc_sp_ss = fsa_state.accepting_tokens.select do |token_pattern|
    acc_sp.include?(token_pattern.name)
  end

  return nil if acc_sp_ss.empty?

  # Select the highest priority token
  select_best_token(acc_sp_ss)
end

# Compute acc(sp): set of terminal symbol names acceptable at parser state sp
# Memoized per parser state id
# @rbs (State parser_state) -> Set[String]
def compute_acc_sp(parser_state)
  @acc_sp_cache[parser_state.id] ||= begin
    tokens = Set.new

    # Layout tokens must be accepted in every parser state
    @scanner_fsa.token_patterns.each do |tp|
      tokens << tp.name if tp.layout?
    end

    # Add tokens from shift actions (term_transitions)
    parser_state.term_transitions.each do |shift|
      next_sym = shift.next_sym
      tokens << next_sym.id.s_value if next_sym.term?
    end

    # Add tokens from reduce actions (lookahead)
    parser_state.reduces.each do |reduce|
      parser_state.acceptable_pslr_reduce_lookahead(reduce).each do |la|
        tokens << la.id.s_value
      end
    end

    tokens
  end
end

# Select the best token from candidates based on precedence rules
# @rbs (Array[Grammar::TokenPattern] candidates) -> Grammar::TokenPattern?
+ def select_best_token(candidates) + return candidates.first if candidates.size <= 1 + + # Sort by: + # 1. Explicit precedence (from %lex-prec - rules) + # 2. Definition order (first defined wins) + candidates.min_by do |token| + priority_rank(token, candidates) + end + end + + # Compute priority rank for a token among candidates + # Lower rank = higher priority + # @rbs (Grammar::TokenPattern token, Array[Grammar::TokenPattern] candidates) -> [Integer, Integer] + def priority_rank(token, candidates) + # Check if this token has explicit higher priority over others + higher_count = candidates.count do |other| + next false if other == token + @lex_prec.higher_priority?(token.name, other.name) + end + + # Tokens with more "higher than" relationships get lower rank + # Fallback to definition order + [-higher_count, token.definition_order] + end + + # Compatibility checker for PSLR state merging + # Compares two sets of acceptable token names to determine + # if they would produce the same scanner behavior + class CompatibilityChecker + # @rbs (ScannerFSA scanner_fsa, Grammar::LexPrec lex_prec, LengthPrecedences length_prec) -> void + def initialize(scanner_fsa, lex_prec, length_prec) + @scanner_fsa = scanner_fsa + @lex_prec = lex_prec + @length_prec = length_prec + end + + # @rbs (Set[String] set1, Set[String] set2) -> bool + def compatible?(set1, set2) + @scanner_fsa.states.each do |fsa_state| + next unless fsa_state.accepting? + + acc = fsa_state.accepting_tokens + names1 = acc.select { |t| set1.include?(t.name) }.map(&:name).to_set + names2 = acc.select { |t| set2.include?(t.name) }.map(&:name).to_set + + if names1.empty? && names2.empty? + next + elsif names1.empty? || names2.empty? + return false unless fsa_state.transitions.empty? 
+ else + return false unless names1 == names2 + end + end + + true + end + end + end + end +end diff --git a/lib/lrama/states.rb b/lib/lrama/states.rb index ddce627df..9b2ce9a4f 100644 --- a/lib/lrama/states.rb +++ b/lib/lrama/states.rb @@ -2,6 +2,8 @@ # frozen_string_literal: true require "forwardable" +require "set" +require_relative "lexer_context_classifier" require_relative "tracer/duration" require_relative "state/item" @@ -36,12 +38,19 @@ class States include Lrama::Tracer::Duration def_delegators "@grammar", :symbols, :terms, :nterms, :rules, :precedences, - :accept_symbol, :eof_symbol, :undef_symbol, :find_symbol_by_s_value!, :ielr_defined? + :accept_symbol, :eof_symbol, :undef_symbol, :find_symbol_by_s_value!, :ielr_defined?, :pslr_defined?, + :token_patterns, :lex_prec, :pslr_max_states, :pslr_max_state_ratio attr_reader :states #: Array[State] attr_reader :reads_relation #: Hash[State::Action::Goto, Array[State::Action::Goto]] attr_reader :includes_relation #: Hash[State::Action::Goto, Array[State::Action::Goto]] attr_reader :lookback_relation #: Hash[state_id, Hash[rule_id, Array[State::Action::Goto]]] + attr_reader :scanner_fsa #: ScannerFSA? + attr_reader :length_precedences #: LengthPrecedences? + attr_reader :scanner_accepts_table #: State::ScannerAccepts? + attr_reader :pslr_inadequacies #: Array[State::PslrInadequacy] + attr_reader :pslr_metrics #: Hash[Symbol, Integer | Float | nil] + attr_reader :lexer_context_classifier #: LexerContextClassifier? # @rbs (Grammar grammar, Tracer tracer) -> void def initialize(grammar, tracer) @@ -105,6 +114,17 @@ def initialize(grammar, tracer) # second key is rule_id, # value is bitmap of term. 
@la = {} + @pslr_inadequacies = [] + @pslr_metrics = { + base_states_count: nil, + total_states_count: nil, + split_state_count: 0, + growth_count: 0, + growth_ratio: nil, + token_pattern_count: 0, + scanner_fsa_state_count: 0, + inadequacies_count: 0 + } end # @rbs () -> void @@ -141,6 +161,42 @@ def compute_ielr report_duration(:compute_default_reduction) { compute_default_reduction } end + # Compute PSLR(1) states + # Based on Section 3.4 of the PSLR dissertation + # @rbs () -> void + def compute_pslr + capture_pslr_metrics_before_split + # Preparation + report_duration(:clear_conflicts) { clear_conflicts } + # Phase 1 + report_duration(:compute_predecessors) { compute_predecessors } + report_duration(:compute_follow_kernel_items) { compute_follow_kernel_items } + report_duration(:compute_always_follows) { compute_always_follows } + report_duration(:compute_goto_follows) { compute_goto_follows } + # Phase 2 + report_duration(:build_scanner_fsa) { build_scanner_fsa } + report_duration(:build_length_precedences) { build_length_precedences } + report_duration(:compute_inadequacy_annotations) { compute_inadequacy_annotations } + # Phase 3a: PSLR split (Scanner FSA-based) + @pslr_split_enabled = true + report_duration(:split_states) { split_states } + @pslr_split_enabled = false + # Phase 3b: Lexer context classification + context-based split + report_duration(:classify_lexer_contexts) { classify_lexer_contexts } + report_duration(:split_states_by_context) { split_states_by_context } + # Phase 4 + report_duration(:clear_look_ahead_sets) { clear_look_ahead_sets } + report_duration(:compute_look_ahead_sets) { compute_look_ahead_sets } + # Phase 5 + report_duration(:compute_conflicts) { compute_conflicts(:ielr) } + report_duration(:compute_default_reduction) { compute_default_reduction } + report_duration(:build_scanner_accepts) { build_scanner_accepts } + report_duration(:handle_pslr_inadequacies) { handle_pslr_inadequacies } + # Phase 6: Re-classify after all splits + 
report_duration(:classify_lexer_contexts) { classify_lexer_contexts } + finalize_pslr_metrics + end + # @rbs () -> Integer def states_count @states.count @@ -189,6 +245,52 @@ def rr_conflicts_count # @rbs (Logger logger) -> void def validate!(logger) validate_conflicts_within_threshold!(logger) + validate_pslr_state_growth!(logger) + validate_pslr_inadequacies!(logger) + end + + # Classify each state's lexer context based on kernel items. + # + # For each state, analyzes the kernel items to determine what lexer + # context (BEG, CMDARG, ARG, END, ENDFN, MID, DOT) the state belongs to. + # When a state has kernel items from multiple contexts, the context is + # set to the bitwise OR of all contexts (mixed context). + # + # @rbs () -> void + def classify_lexer_contexts + return if @grammar.lexer_contexts.empty? + + @lexer_context_classifier = LexerContextClassifier.new( + @grammar.lexer_contexts, + @grammar.parameterized_expansion_args + ) + + @states.each do |state| + groups = @lexer_context_classifier.classify(state) + + # Combine all contexts into a single bitmask + combined = 0 + groups.each_key do |ctx| + combined |= ctx if ctx > 0 + end + + state.lexer_context = combined + end + end + + # Return the lexer context table as an array of context values, + # one per parser state (indexed by state id). + # + # @rbs () -> Array[Integer] + def lexer_context_table + @states.map { |state| state.lexer_context || 0 } + end + + # Check if lexer context classification has been performed. + # + # @rbs () -> bool + def lexer_context_enabled? + pslr_defined? 
&& @lexer_context_classifier != nil end def compute_la_sources_for_conflicted_states @@ -755,12 +857,139 @@ def compute_always_follows_bitmaps # @rbs () -> void def split_states @states.each do |state| - state.transitions.each do |transition| + state.transitions.dup.each do |transition| compute_state(state, transition, transition.to_state) end end end + # Split states where different predecessor paths lead to different + # lexer contexts. This resolves LALR state merging that makes + # BEG vs CMDARG (and other context pairs) indistinguishable. + # + # Algorithm: + # 1. For each state, group incoming transitions by the lexer context + # that the predecessor would imply + # 2. If a state has predecessors from multiple different contexts, + # split the state so each split has a unique context + # + # @rbs () -> void + def split_states_by_context + return unless @lexer_context_classifier + + # Iterate over a snapshot of states (new states may be added) + states_snapshot = @states.dup + + states_snapshot.each do |state| + # Skip start state and states with no context + next if state.kernels.any?(&:start_item?) 
+ + # Group predecessor transitions by the context they imply + context_groups = compute_predecessor_context_groups(state) + + # Only split if there are multiple distinct non-zero contexts + meaningful_groups = context_groups.reject { |ctx, _| ctx == 0 } + next if meaningful_groups.size <= 1 + + # The largest group keeps the original state + primary_ctx, = meaningful_groups.max_by { |_, transitions| transitions.size } + + meaningful_groups.each do |ctx, transitions| + next if ctx == primary_ctx + + # Create a new split state for this context group + split = create_context_split_state(state) + split.lexer_context = ctx + + # Update predecessor transitions to point to the new split state + transitions.each do |pred_state, transition| + pred_state.update_transition(transition, split) + end + end + + # Update the original state's context to the primary + state.lexer_context = primary_ctx + end + end + + # For a given state, group its incoming transitions by the lexer context + # that the predecessor state implies for this state. + # + # The implied context is determined by what symbol was used to reach + # this state (the accessing symbol's context). + # + # @rbs (State state) -> Hash[Integer, Array[[State, State::Action::Shift | State::Action::Goto]]] + def compute_predecessor_context_groups(state) + groups = Hash.new { |h, k| h[k] = [] } + + state.predecessors.each do |pred| + pred.transitions.each do |transition| + next unless transition.to_state == state + + # The context is determined by the predecessor's context + # combined with what we're transitioning on + ctx = infer_transition_context(pred, transition) + groups[ctx] << [pred, transition] + end + end + + groups + end + + # Infer the lexer context that a transition implies for the target state. + # + # @rbs (State pred, State::Action::Shift | State::Action::Goto transition) -> Integer + def infer_transition_context(pred, transition) + sym = transition.next_sym + if sym.term? 
+ @lexer_context_classifier.classify_terminal_context(sym) + else + @lexer_context_classifier.classify_nonterminal_context(sym) + end + end + + # Create a new split state that is an isocore copy of the given state. + # + # @rbs (State original) -> State + def create_context_split_state(original) + base = original.lalr_isocore || original + new_state = State.new(@states.count, base.accessing_symbol, base.kernels) + new_state.closure = base.closure + new_state.compute_transitions_and_reduces + + # Copy transition targets from original + original.transitions.each do |transition| + new_state.set_items_to_state(transition.to_items, transition.to_state) + end + + @states << new_state + new_state.lalr_isocore = base + base.ielr_isocores << new_state + base.ielr_isocores.each do |st| + st.ielr_isocores = base.ielr_isocores + end + + new_state.lookaheads_recomputed = true + new_state.item_lookahead_set = original.item_lookahead_set + new_state.pslr_item_lookahead_set = original.pslr_item_lookahead_set + + new_state + end + + # @rbs () -> void + def capture_pslr_metrics_before_split + @pslr_metrics = { + base_states_count: @states.count, + total_states_count: @states.count, + split_state_count: 0, + growth_count: 0, + growth_ratio: 1.0, + token_pattern_count: token_patterns.size, + scanner_fsa_state_count: 0, + inadequacies_count: 0 + } + end + # @rbs () -> void def compute_inadequacy_annotations @states.each do |state| @@ -782,17 +1011,32 @@ def compute_inadequacy_annotations def merge_lookaheads(state, filtered_lookaheads) return if state.kernels.all? {|item| (filtered_lookaheads[item] - state.item_lookahead_set[item]).empty? 
} - state.item_lookahead_set = state.item_lookahead_set.merge {|_, v1, v2| v1 | v2 } + state.item_lookahead_set = state.item_lookahead_set.merge(filtered_lookaheads) {|_, v1, v2| v1 | v2 } state.transitions.each do |transition| next if transition.to_state.lookaheads_recomputed compute_state(state, transition, transition.to_state) end end + # @rbs (State state, State::lookahead_set pslr_lookaheads) -> void + def merge_pslr_lookaheads(state, pslr_lookaheads) + state.pslr_item_lookahead_set ||= state.kernels.map {|kernel| [kernel, []] }.to_h + return if state.kernels.all? {|item| (pslr_lookaheads[item] - state.pslr_item_lookahead_set[item]).empty? } + + state.pslr_item_lookahead_set = state.pslr_item_lookahead_set.merge(pslr_lookaheads) {|_, v1, v2| v1 | v2 } + end + # @rbs (State state, State::Action::Shift | State::Action::Goto transition, State next_state) -> void def compute_state(state, transition, next_state) propagating_lookaheads = state.propagate_lookaheads(next_state) - s = next_state.ielr_isocores.find {|st| st.is_compatible?(propagating_lookaheads) } + pslr_lookaheads = + if @pslr_split_enabled + state.propagate_lookaheads_without_filter(next_state) + else + propagating_lookaheads + end + + s = next_state.ielr_isocores.find {|st| compatible_split_state?(st, propagating_lookaheads, pslr_lookaheads) } if s.nil? 
s = next_state.lalr_isocore @@ -809,17 +1053,97 @@ def compute_state(state, transition, next_state) st.ielr_isocores = s.ielr_isocores end new_state.lookaheads_recomputed = true - new_state.item_lookahead_set = propagating_lookaheads + new_state.item_lookahead_set = pslr_lookaheads + new_state.pslr_item_lookahead_set = pslr_lookaheads state.update_transition(transition, new_state) elsif(!s.lookaheads_recomputed) s.lookaheads_recomputed = true - s.item_lookahead_set = propagating_lookaheads + s.item_lookahead_set = pslr_lookaheads + s.pslr_item_lookahead_set = pslr_lookaheads else + merge_pslr_lookaheads(s, pslr_lookaheads) if @pslr_split_enabled merge_lookaheads(s, propagating_lookaheads) state.update_transition(transition, s) if state.items_to_state[transition.to_items].id != s.id end end + # @rbs (State state, State::lookahead_set filtered_lookaheads, ?State::lookahead_set pslr_lookaheads) -> bool + def compatible_split_state?(state, filtered_lookaheads, pslr_lookaheads = nil) + return false unless state.is_compatible?(filtered_lookaheads) + return true unless @pslr_split_enabled && @scanner_fsa + + pslr_lookaheads ||= filtered_lookaheads + + pslr_state_signature(state) == pslr_state_signature(state, pslr_lookaheads) + end + + # @rbs (State state, ?State::lookahead_set filtered_lookaheads) -> Array[[Integer, String?]] + def pslr_state_signature(state, filtered_lookaheads = nil) + return [] unless @scanner_fsa + + acc_sp = acceptable_tokens_for_pslr(state, filtered_lookaheads) + + # Cache: use frozen acc_sp as key to avoid recomputing signature + # for states with identical acceptable token sets + cache_key = acc_sp.to_a.sort.freeze + @_pslr_sig_cache ||= {} + return @_pslr_sig_cache[cache_key] if @_pslr_sig_cache.key?(cache_key) + + # Pre-filter: only iterate over FSA accepting states (cached list) + @_fsa_accepting_states ||= @scanner_fsa.states.select(&:accepting?).freeze + + sig = @_fsa_accepting_states.each_with_object([]) do |fsa_state, signature| + 
candidates = fsa_state.accepting_tokens.select do |token_pattern| + acc_sp.include?(token_pattern.name) + end + signature << [fsa_state.id, select_best_pslr_token(candidates)&.name] + end + + @_pslr_sig_cache[cache_key] = sig + sig + end + + # @rbs (State state, ?State::lookahead_set filtered_lookaheads) -> Set[String] + def acceptable_tokens_for_pslr(state, filtered_lookaheads = nil) + tokens = Set.new + kernel_reduce_items = state.kernels.select(&:end_of_rule?).to_set + + state.term_transitions.each do |shift| + next_sym = shift.next_sym + tokens << next_sym.id.s_value if next_sym.term? + end + + state.reduces.each do |reduce| + look_ahead = + if filtered_lookaheads && kernel_reduce_items.include?(reduce.item) + filtered_lookaheads[reduce.item] || [] + else + state.acceptable_pslr_reduce_lookahead(reduce) + end + + look_ahead.each do |la| + tokens << la.id.s_value + end + end + + tokens + end + + # @rbs (Array[Grammar::TokenPattern] candidates) -> Grammar::TokenPattern? + def select_best_pslr_token(candidates) + return nil if candidates.empty? + return candidates.first if candidates.size == 1 + + candidates.min_by do |token| + higher_count = candidates.count do |other| + next false if other == token + lex_prec.higher_priority?(token.name, other.name) + end + + [-higher_count, token.definition_order] + end + end + # @rbs (Logger logger) -> void def validate_conflicts_within_threshold!(logger) exit false unless conflicts_within_threshold?(logger) @@ -863,5 +1187,143 @@ def clear_look_ahead_sets @_follow_sets = nil @_la = nil end + + # Build Scanner FSA from token patterns + # @rbs () -> void + def build_scanner_fsa + return if token_patterns.empty? 
+ + @scanner_fsa = ScannerFSA.new(token_patterns) + end + + # Build length precedences table + # @rbs () -> void + def build_length_precedences + @length_precedences = LengthPrecedences.new(lex_prec) + end + + # Build scanner_accepts table + # @rbs () -> void + def build_scanner_accepts + return unless @scanner_fsa + + @scanner_accepts_table = State::ScannerAccepts.new( + @states, + @scanner_fsa, + lex_prec, + @length_precedences + ) + @scanner_accepts_table.build + end + + # Handle PSLR inadequacies + # Detects and splits states where pseudo-scanner behavior differs + # @rbs () -> void + def handle_pslr_inadequacies + return unless @scanner_fsa && @scanner_accepts_table + + @pslr_inadequacies = detect_pslr_inadequacies + return if @pslr_inadequacies.empty? + + @tracer.warn("Detected #{@pslr_inadequacies.size} unresolved PSLR inadequacies") if @tracer.respond_to?(:warn) + end + + # @rbs () -> void + def finalize_pslr_metrics + return unless pslr_defined? + + base_states_count = @pslr_metrics[:base_states_count] || @states.count + total_states_count = @states.count + + @pslr_metrics = { + base_states_count: base_states_count, + total_states_count: total_states_count, + split_state_count: @states.count(&:split_state?), + growth_count: total_states_count - base_states_count, + growth_ratio: base_states_count.zero? ? nil : total_states_count.to_f / base_states_count, + token_pattern_count: token_patterns.size, + scanner_fsa_state_count: @scanner_fsa ? 
@scanner_fsa.states.size : 0, + inadequacies_count: @pslr_inadequacies.size + } + end + + # Detect PSLR inadequacies in isocore groups + # @rbs () -> Array[State::PslrInadequacy] + def detect_pslr_inadequacies + inadequacies = [] + + @states.each do |state| + state.transitions.each do |transition| + next_state = transition.to_state + next unless next_state + + propagating_lookaheads = state.propagate_lookaheads_without_filter(next_state.lalr_isocore) + expected_profile = pslr_state_signature(next_state, propagating_lookaheads) + actual_profile = pslr_state_signature(next_state) + + next if expected_profile == actual_profile + + matching_state = next_state.ielr_isocores.find do |candidate| + pslr_state_signature(candidate) == expected_profile + end + + inadequacies << State::PslrInadequacy.new( + type: State::PslrInadequacy::PSLR_RELATIVE, + state: next_state, + conflicting_states: [matching_state, next_state].compact.uniq, + details: { + reason: "Transition reaches a state with an incompatible PSLR scanner profile", + from_state_id: state.id, + transition_symbol: transition.next_sym.id.s_value, + expected_profile: expected_profile, + actual_profile: actual_profile, + matching_state_id: matching_state&.id + } + ) + end + end + + inadequacies + end + + # @rbs (Logger logger) -> void + def validate_pslr_inadequacies!(logger) + return unless pslr_defined? + return if @pslr_inadequacies.empty? + + @pslr_inadequacies.each do |inadequacy| + logger.warn(inadequacy.to_s) + end + + # Do not exit on PSLR inadequacies — treat as warnings. + # The handwritten lexer handles remaining ambiguities. + end + + # @rbs (Logger logger) -> void + def validate_pslr_state_growth!(logger) + return unless pslr_defined? + + errors = [] + base_states_count = @pslr_metrics[:base_states_count] || @states.count + total_states_count = @pslr_metrics[:total_states_count] || @states.count + split_state_count = @pslr_metrics[:split_state_count] || @states.count(&:split_state?) 
+ growth_ratio = @pslr_metrics[:growth_ratio] || 1.0 + + if (limit = pslr_max_states) && limit < total_states_count + errors << "PSLR state growth exceeded pslr.max-states=#{limit} (total=#{total_states_count}, base=#{base_states_count}, split=#{split_state_count})" + end + + if (limit = pslr_max_state_ratio) && limit < growth_ratio + errors << "PSLR state growth exceeded pslr.max-state-ratio=#{limit} (ratio=#{format('%.2f', growth_ratio)}x, total=#{total_states_count}, base=#{base_states_count})" + end + + return if errors.empty? + + errors.each do |message| + logger.error(message) + end + + exit false + end end end diff --git a/lib/lrama/warnings/lexical_tie_candidates.rb b/lib/lrama/warnings/lexical_tie_candidates.rb new file mode 100644 index 000000000..e45cccb2e --- /dev/null +++ b/lib/lrama/warnings/lexical_tie_candidates.rb @@ -0,0 +1,27 @@ +# rbs_inline: enabled +# frozen_string_literal: true + +module Lrama + class Warnings + class LexicalTieCandidates + # @rbs (Logger logger, bool warnings) -> void + def initialize(logger, warnings) + @logger = logger + @warnings = warnings + end + + # @rbs (Lrama::States states) -> void + def warn(states) + return unless @warnings + return unless states.respond_to?(:lexical_tie_candidates) + + states.lexical_tie_candidates.each do |left, right| + @logger.warn( + "lexical tie candidate: #{left} and #{right} conflict lexically but are not tied; " \ + "add %lex-tie #{left} #{right} or %lex-no-tie #{left} #{right}" + ) + end + end + end + end +end diff --git a/parser.y b/parser.y index f256d5330..ecf7ec66a 100644 --- a/parser.y +++ b/parser.y @@ -2,7 +2,7 @@ class Lrama::Parser expect 0 error_on_expect_mismatch - token C_DECLARATION CHARACTER IDENT_COLON IDENTIFIER INTEGER STRING TAG + token C_DECLARATION CHARACTER IDENT_COLON IDENTIFIER INTEGER STRING TAG REGEX rule @@ -132,6 +132,13 @@ rule symbol_declaration: "%token" token_declarations + | "%token-pattern" token_pattern_declarations + | "%token-action" 
token_action_declarations + | "%symbol-set" symbol_set_declaration + | "%lexer-context" lexer_context_declaration + | "%lex-prec" lex_prec_declarations + | "%lex-tie" lex_tie_declaration + | "%lex-no-tie" lex_no_tie_declaration | "%type" symbol_declarations { val[1].each {|hash| @@ -213,6 +220,130 @@ rule token_declaration: id INTEGER? alias { result = val } + token_pattern_declarations: + TAG? token_pattern_declaration+ + { + val[1].each {|decl| + @grammar.add_token_pattern( + id: decl[:id], + pattern: decl[:pattern], + alias_name: decl[:alias], + tag: val[0], + lineno: decl[:id].first_line + ) + } + } + | token_pattern_declarations TAG token_pattern_declaration+ + { + val[2].each {|decl| + @grammar.add_token_pattern( + id: decl[:id], + pattern: decl[:pattern], + alias_name: decl[:alias], + tag: val[1], + lineno: decl[:id].first_line + ) + } + } + + token_pattern_declaration: + IDENTIFIER REGEX alias + { + result = { id: val[0], pattern: val[1], alias: val[2] } + } + + lexer_context_declaration: + IDENTIFIER symbol+ + { + @grammar.add_lexer_context(name: val[0].s_value, symbols: val[1]) + } + + symbol_set_declaration: + IDENTIFIER symbol+ + { + @grammar.add_symbol_set(name: val[0].s_value, symbols: val[1]) + } + + lex_prec_declarations: + lex_prec_chain + { + val[0].each {|rule| + @grammar.add_lex_prec_rule( + left_token: rule[:left], + operator: rule[:op], + right_token: rule[:right], + lineno: rule[:left].first_line + ) + } + } + + lex_prec_chain: + symbol lex_prec_op symbol + { + result = [{ left: val[0], op: val[1], right: val[2] }] + } + | lex_prec_chain lex_prec_op symbol + { + last_right = val[0].last[:right] + result = val[0] + [{ left: last_right, op: val[1], right: val[2] }] + } + + lex_prec_op: + "<~" + { + result = Lrama::Grammar::LexPrec::IDENTITY_RIGHT_LONGEST + } + | "<-" + { + result = Lrama::Grammar::LexPrec::IDENTITY_RIGHT + } + | "-~" + { + result = Lrama::Grammar::LexPrec::LONGEST + } + | "<<" + { + result = Lrama::Grammar::LexPrec::TOKEN_RIGHT 
+ } + | "-<" + { + result = Lrama::Grammar::LexPrec::TOKEN_RIGHT_LENGTH + } + | " Grammar::Symbol def ielr_defined?: () -> bool + + def pslr_defined?: () -> bool + + def token_patterns: () -> Array[Grammar::TokenPattern] + + def lex_prec: () -> Grammar::LexPrec + + def pslr_max_states: () -> Integer? + + def pslr_max_state_ratio: () -> Float? end include Symbols::Resolver::_DelegatedMethods @@ -76,6 +86,14 @@ module Lrama @start_nterm: Lrama::Lexer::Token::Base? + @token_patterns: Array[Grammar::TokenPattern] + + @lex_prec: Grammar::LexPrec + + @symbol_sets: Hash[String, Array[Lexer::Token::Base]] + + @lex_tie: Grammar::LexTie + extend Forwardable attr_reader percent_codes: Array[PercentCode] @@ -136,6 +154,22 @@ module Lrama attr_accessor required: bool + attr_reader token_patterns: Array[Grammar::TokenPattern] + + attr_reader lex_prec: Grammar::LexPrec + + attr_reader symbol_sets: Hash[String, Array[Lexer::Token::Base]] + + attr_reader lex_tie: Grammar::LexTie + + attr_reader lexer_contexts: Hash[String, Grammar::LexerContext] + + attr_reader token_actions: Array[Grammar::TokenAction] + + # Argument symbol names for each parameterized rule expansion. + # @rbs () -> Hash[String, Array[String]] + def parameterized_expansion_args: () -> Hash[String, Array[String]] + # @rbs (Counter rule_counter, bool locations, Hash[String, String] define) -> void def initialize: (Counter rule_counter, bool locations, Hash[String, String] define) -> void @@ -227,8 +261,114 @@ module Lrama # @rbs () -> bool def ielr_defined?: () -> bool + # @rbs () -> bool + def pslr_defined?: () -> bool + + # @rbs () -> String? + def pslr_state_member: () -> String? + + # @rbs () -> Integer? + def pslr_max_states: () -> Integer? + + # @rbs () -> Float? + def pslr_max_state_ratio: () -> Float? 
+ + # @rbs () -> Array[Grammar::TokenPattern] + def layout_token_patterns: () -> Array[Grammar::TokenPattern] + + # @rbs () -> Set[String] + def layout_token_names: () -> Set[String] + + # Add a token pattern from %token-pattern directive + # @rbs (id: Lexer::Token::Ident, pattern: Lexer::Token::Regex, ?alias_name: String?, ?tag: Lexer::Token::Tag?, lineno: Integer) -> Grammar::TokenPattern + def add_token_pattern: (id: Lexer::Token::Ident, pattern: Lexer::Token::Regex, lineno: Integer, ?alias_name: String?, ?tag: Lexer::Token::Tag?) -> Grammar::TokenPattern + + # Add a symbol set from %symbol-set directive. + # @rbs (name: String, symbols: Array[Lexer::Token::Base]) -> Array[Lexer::Token::Base] + def add_symbol_set: (name: String, symbols: Array[Lexer::Token::Base]) -> Array[Lexer::Token::Base] + + # Add lex-prec rules from %lex-prec directive. + # Stores as raw declaration for delayed expansion after implicit literal synthesis. + # @rbs (left_token: Lexer::Token::Base, operator: Symbol, right_token: Lexer::Token::Base, lineno: Integer) -> Grammar::LexPrec::Declaration + def add_lex_prec_rule: (left_token: Lexer::Token::Base, operator: Symbol, right_token: Lexer::Token::Base, lineno: Integer) -> Grammar::LexPrec::Declaration + + # Finalize lexical declarations after implicit literal synthesis. + # Expands yyall and symbol-set operands using the post-synthesis token universe. + # Validates that identity-component operators are not used as self-pairs. + # @rbs () -> void + def finalize_lexical_declarations!: () -> void + + # Add lexical tie relationships from %lex-tie directive. + # @rbs (operands: Array[Lexer::Token::Base]) -> void + def add_lex_tie: (operands: Array[Lexer::Token::Base]) -> void + + # Add no-tie declarations from %lex-no-tie directive. 
+ # @rbs (operands: Array[Lexer::Token::Base]) -> void + def add_lex_no_tie: (operands: Array[Lexer::Token::Base]) -> void + + # Add a lexer context from %lexer-context directive + # @rbs (name: String, symbols: Array[Lexer::Token::Ident]) -> Grammar::LexerContext + def add_lexer_context: (name: String, symbols: Array[Lexer::Token::Ident]) -> Grammar::LexerContext + + # Add a token action from %token-action directive + # @rbs (id: Lexer::Token::Ident, code: Lexer::Token::UserCode, lineno: Integer) -> Grammar::TokenAction + def add_token_action: (id: Lexer::Token::Ident, code: Lexer::Token::UserCode, lineno: Integer) -> Grammar::TokenAction + + # Find a token pattern by its name + # @rbs (String name) -> Grammar::TokenPattern? + def find_token_pattern: (String name) -> Grammar::TokenPattern? + + # @rbs (Set[String] tokens) -> Set[String] + def expand_lexical_ties: (Set[String] tokens) -> Set[String] + + # @rbs (ScannerFSA scanner_fsa) -> void + def finalize_lexical_ties!: (ScannerFSA scanner_fsa) -> void + + REGEX_LITERAL_ESCAPES: Array[String] + + REGEX_CONTROL_ESCAPES: Hash[String, String] + + # @rbs () -> void + def synthesize_implicit_literal_token_patterns!: () -> void + private + # Validate that identity-component operators are not applied to self-pairs. + # Self-pair is allowed only for length-only operators (-~ and -s). + # @rbs (Lexer::Token::Base left, Lexer::Token::Base right, Symbol operator, Integer lineno) -> void + def validate_lex_prec_self_pair!: (Lexer::Token::Base left, Lexer::Token::Base right, Symbol operator, Integer lineno) -> void + + # @rbs (Lexer::Token::Base id) -> String? + def implicit_literal_regex_pattern: (Lexer::Token::Base id) -> String? + + # Extract the string content from a quoted string literal (e.g., "=>" -> =>) + # @rbs (String s_value) -> String? + def str_literal_value: (String s_value) -> String? + + # @rbs (String s_value) -> String? + def char_literal_value: (String s_value) -> String? 
+ + # @rbs (String literal) -> String + def escape_regex_literal: (String literal) -> String + + # @rbs (Lexer::Token::Base id) -> Integer + def token_lineno: (Lexer::Token::Base id) -> Integer + + # @rbs () -> void + def validate_pslr_configuration!: () -> void + + # @rbs (Lexer::Token::Base operand) -> Array[Lexer::Token::Base] + def expand_pslr_operand: (Lexer::Token::Base operand) -> Array[Lexer::Token::Base] + + # @rbs (Lexer::Token::Base operand) -> Grammar::LexTie::OperandGroup + def pslr_operand_group: (Lexer::Token::Base operand) -> Grammar::LexTie::OperandGroup + + # @rbs (String key) -> Integer? + def parse_pslr_positive_integer: (String key) -> Integer? + + # @rbs (String key) -> Float? + def parse_pslr_positive_float: (String key) -> Float? + # @rbs () -> void def sort_precedence: () -> void diff --git a/sig/generated/lrama/grammar/lex_prec.rbs b/sig/generated/lrama/grammar/lex_prec.rbs new file mode 100644 index 000000000..b6e6c5e21 --- /dev/null +++ b/sig/generated/lrama/grammar/lex_prec.rbs @@ -0,0 +1,121 @@ +# Generated from lib/lrama/grammar/lex_prec.rb with RBS::Inline + +module Lrama + class Grammar + # Represents lexical precedence rules defined by %lex-prec. 
+ # + # Lrama accepts ASCII spellings for the PSLR paper operators: + # <~ identity conflict: right token wins; length conflict: longest wins + # <- identity conflict: right token wins + # -~ length conflict: longest wins + # << identity and length conflicts: right token wins + # -< length conflict: right token wins + # void + def initialize: (left_operand: Lexer::Token::Base, operator: Symbol, right_operand: Lexer::Token::Base, lineno: Integer) -> void + end + + class Rule + attr_reader left_token: Lexer::Token::Base + + attr_reader operator: Symbol + + attr_reader right_token: Lexer::Token::Base + + attr_reader lineno: Integer + + # @rbs (left_token: Lexer::Token::Base, operator: Symbol, right_token: Lexer::Token::Base, lineno: Integer) -> void + def initialize: (left_token: Lexer::Token::Base, operator: Symbol, right_token: Lexer::Token::Base, lineno: Integer) -> void + + # @rbs () -> String + def left_name: () -> String + + # @rbs () -> String + def right_name: () -> String + end + + attr_reader rules: Array[Rule] + + attr_reader declarations: Array[Declaration] + + attr_reader used_rules: Set[Integer] + + # @rbs () -> void + def initialize: () -> void + + # Mark a rule as used by conflict resolution. + # @rbs (Integer rule_index) -> void + def mark_used: (Integer rule_index) -> void + + # Returns rules that were never used in conflict resolution. + # @rbs () -> Array[Rule] + def useless_rules: () -> Array[Rule] + + # Store a raw declaration for delayed expansion. 
+ # @rbs (left_operand: Lexer::Token::Base, operator: Symbol, right_operand: Lexer::Token::Base, lineno: Integer) -> Declaration + def add_declaration: (left_operand: Lexer::Token::Base, operator: Symbol, right_operand: Lexer::Token::Base, lineno: Integer) -> Declaration + + # @rbs (left_token: Lexer::Token::Base, operator: Symbol, right_token: Lexer::Token::Base, lineno: Integer) -> Rule + def add_rule: (left_token: Lexer::Token::Base, operator: Symbol, right_token: Lexer::Token::Base, lineno: Integer) -> Rule + + # True when winner explicitly wins an identity conflict against loser. + # The relation is intentionally not transitive. + # @rbs (String winner, String loser, ?track: bool) -> bool + def identity_precedes?: (String winner, String loser, ?track: bool) -> bool + + # True when rule declares a longest-match length relation for the pair. + # @rbs (String token1, String token2) -> bool + def longest_pair?: (String token1, String token2) -> bool + + # True when rule declares a shortest-match length relation for the pair. + # @rbs (String token1, String token2) -> bool + def shortest_pair?: (String token1, String token2) -> bool + + # Returns the explicit right-token length winner for a pair, if any. + # @rbs (String token1, String token2) -> String? + def right_token_length_winner: (String token1, String token2) -> String? + + private + + # @rbs (String token1, String token2, Array[Symbol] operators) -> bool + def pair_rule?: (String token1, String token2, Array[Symbol] operators) -> bool + end + end +end diff --git a/sig/generated/lrama/grammar/lex_tie.rbs b/sig/generated/lrama/grammar/lex_tie.rbs new file mode 100644 index 000000000..7655ca7ba --- /dev/null +++ b/sig/generated/lrama/grammar/lex_tie.rbs @@ -0,0 +1,110 @@ +# Generated from lib/lrama/grammar/lex_tie.rb with RBS::Inline + +module Lrama + class Grammar + # Stores PSLR lexical ties and explicit no-tie declarations. 
+ # + # Lexical ties expand acc(sp); they never resolve a scanner conflict by + # themselves. Conflict selection is still handled by %lex-prec. + class LexTie + class OperandGroup + attr_reader names: Array[String] + + attr_reader kind: ::Symbol + + # @rbs (names: Array[String], kind: ::Symbol) -> void + def initialize: (names: Array[String], kind: ::Symbol) -> void + end + + class Declaration + attr_reader kind: ::Symbol + + attr_reader groups: Array[OperandGroup] + + attr_reader lineno: Integer + + # @rbs (kind: ::Symbol, groups: Array[OperandGroup], lineno: Integer) -> void + def initialize: (kind: ::Symbol, groups: Array[OperandGroup], lineno: Integer) -> void + end + + class Decision + attr_reader kind: ::Symbol + + attr_reader specificity: Integer + + attr_reader lineno: Integer + + # @rbs (kind: ::Symbol, specificity: Integer, lineno: Integer) -> void + def initialize: (kind: ::Symbol, specificity: Integer, lineno: Integer) -> void + end + + attr_reader ties: Hash[String, Set[String]] + + attr_reader no_ties: Set[[ String, String ]] + + attr_reader declarations: Array[Declaration] + + # @rbs () -> void + def initialize: () -> void + + # @rbs (String left, String right) -> void + def add_tie: (String left, String right) -> void + + # @rbs (String left, String right) -> void + def add_no_tie: (String left, String right) -> void + + # @rbs (groups: Array[OperandGroup], ?lineno: Integer) -> void + def add_tie_declaration: (groups: Array[OperandGroup], ?lineno: Integer) -> void + + # @rbs (groups: Array[OperandGroup], ?lineno: Integer) -> void + def add_no_tie_declaration: (groups: Array[OperandGroup], ?lineno: Integer) -> void + + # @rbs (Array[String] token_names, Set[[String, String]] conflict_pairs) -> void + def finalize!: (Array[String] token_names, Set[[ String, String ]] conflict_pairs) -> void + + # @rbs (String name) -> Set[String] + def tied_names: (String name) -> Set[String] + + # @rbs (String left, String right) -> bool + def tied?: (String left, 
String right) -> bool + + # @rbs (String left, String right) -> bool + def no_tie?: (String left, String right) -> bool + + # @rbs () -> Array[[String, String]] + def no_ties_conflicting_with_ties: () -> Array[[ String, String ]] + + private + + # @rbs (Hash[[String, String], Decision] decisions, [String, String] pair, Decision decision) -> void + def apply_decision: (Hash[[ String, String ], Decision] decisions, [ String, String ] pair, Decision decision) -> void + + # @rbs (Declaration declaration, Array[String] token_names, Set[[String, String]] conflict_pairs) -> Array[[[String, String], Integer]] + def declaration_pairs: (Declaration declaration, Array[String] token_names, Set[[ String, String ]] conflict_pairs) -> Array[[ [ String, String ], Integer ]] + + # @rbs (OperandGroup group, Array[String] token_names) -> Array[String] + def names_for_group: (OperandGroup group, Array[String] token_names) -> Array[String] + + # @rbs (OperandGroup left, OperandGroup right) -> Integer + def group_specificity: (OperandGroup left, OperandGroup right) -> Integer + + # @rbs (Array[String] token_names, Hash[[String, String], Decision] decisions) -> void + def rebuild_relations: (Array[String] token_names, Hash[[ String, String ], Decision] decisions) -> void + + # @rbs (Hash[String, String] parents, String name) -> String + def root: (Hash[String, String] parents, String name) -> String + + # @rbs (Hash[String, String] parents, String left, String right) -> void + def union: (Hash[String, String] parents, String left, String right) -> void + + # Compute closure specificity between two tokens via tie graph BFS. + # Path specificity = min(edge specificities on the path). + # Result = max over all paths connecting left and right. 
+ # @rbs (String left, String right, Hash[[String, String], Integer] tie_specificities) -> Integer + def tie_specificity_between: (String left, String right, Hash[[ String, String ], Integer] tie_specificities) -> Integer + + # @rbs (String left, String right) -> [String, String] + def pair_key: (String left, String right) -> [ String, String ] + end + end +end diff --git a/sig/generated/lrama/grammar/lexer_context.rbs b/sig/generated/lrama/grammar/lexer_context.rbs new file mode 100644 index 000000000..436576d98 --- /dev/null +++ b/sig/generated/lrama/grammar/lexer_context.rbs @@ -0,0 +1,30 @@ +# Generated from lib/lrama/grammar/lexer_context.rb with RBS::Inline + +module Lrama + class Grammar + # Represents a lexer context defined by %lexer-context directive. + # + # Example: + # %lexer-context BEG keyword_if keyword_unless '(' '[' '{' + # + # The bitmask value is automatically assigned by definition order (1 << index). + class LexerContext + attr_reader name: String + + attr_reader index: Integer + + attr_reader symbols: Array[Lexer::Token::Ident] + + # @rbs (name: String, index: Integer) -> void + def initialize: (name: String, index: Integer) -> void + + # Bitmask value for this context (1 << index). + # @rbs () -> Integer + def bitmask: () -> Integer + + # Add symbols that belong to this context. 
+ # @rbs (Array[Lexer::Token::Ident] syms) -> void + def add_symbols: (Array[Lexer::Token::Ident] syms) -> void + end + end +end diff --git a/sig/generated/lrama/grammar/parameterized/resolver.rbs b/sig/generated/lrama/grammar/parameterized/resolver.rbs index d2b07e963..a4d380da7 100644 --- a/sig/generated/lrama/grammar/parameterized/resolver.rbs +++ b/sig/generated/lrama/grammar/parameterized/resolver.rbs @@ -8,6 +8,8 @@ module Lrama attr_accessor created_lhs_list: Array[Lexer::Token::Base] + attr_reader expansion_args: Hash[String, Array[String]] + # @rbs () -> void def initialize: () -> void @@ -23,6 +25,12 @@ module Lrama # @rbs (String lhs_s_value) -> Lexer::Token::Base? def created_lhs: (String lhs_s_value) -> Lexer::Token::Base? + # Register the argument symbol names for a parameterized rule expansion. + # Used by LexerContextClassifier to inherit context from arguments. + # + # @rbs (String lhs_s_value, Array[Lexer::Token::Base] args) -> void + def register_expansion_args: (String lhs_s_value, Array[Lexer::Token::Base] args) -> void + # @rbs () -> Array[Rule] def redefined_rules: () -> Array[Rule] diff --git a/sig/generated/lrama/grammar/symbols/resolver.rbs b/sig/generated/lrama/grammar/symbols/resolver.rbs index 2e5f2ebf7..8c4980b70 100644 --- a/sig/generated/lrama/grammar/symbols/resolver.rbs +++ b/sig/generated/lrama/grammar/symbols/resolver.rbs @@ -108,6 +108,9 @@ module Lrama # @rbs (Lexer::Token::Base id) -> Grammar::Symbol def find_nterm_by_id!: (Lexer::Token::Base id) -> Grammar::Symbol + # @rbs (Grammar::Symbol sym, id: Lexer::Token::Base, ?alias_name: String?, ?tag: Lexer::Token::Tag?, ?token_id: Integer?) -> void + def replace_term_attributes: (Grammar::Symbol sym, id: Lexer::Token::Base, ?alias_name: String?, ?tag: Lexer::Token::Tag?, ?token_id: Integer?) 
-> void + # @rbs () -> void def fill_terms_number: () -> void diff --git a/sig/generated/lrama/grammar/token_action.rbs b/sig/generated/lrama/grammar/token_action.rbs new file mode 100644 index 000000000..cd208edfd --- /dev/null +++ b/sig/generated/lrama/grammar/token_action.rbs @@ -0,0 +1,28 @@ +# Generated from lib/lrama/grammar/token_action.rb with RBS::Inline + +module Lrama + class Grammar + # Represents a token action defined by %token-action directive. + # + # Token actions are user code blocks associated with token patterns. + # When a token is matched by the pseudo-scanner, the associated code runs. + # Layout tokens are accumulated, and the accumulated text is available + # to the next non-layout token's action. + # + # Example: + # %token-action ID { printf("matched ID: %.*s\n", yyleng, yytext); } + class TokenAction + attr_reader token_id: Lexer::Token::Ident + + attr_reader code: Lexer::Token::UserCode + + attr_reader lineno: Integer + + # @rbs (token_id: Lexer::Token::Ident, code: Lexer::Token::UserCode, lineno: Integer) -> void + def initialize: (token_id: Lexer::Token::Ident, code: Lexer::Token::UserCode, lineno: Integer) -> void + + # @rbs () -> String + def token_name: () -> String + end + end +end diff --git a/sig/generated/lrama/grammar/token_pattern.rbs b/sig/generated/lrama/grammar/token_pattern.rbs new file mode 100644 index 000000000..2e23e14e8 --- /dev/null +++ b/sig/generated/lrama/grammar/token_pattern.rbs @@ -0,0 +1,34 @@ +# Generated from lib/lrama/grammar/token_pattern.rb with RBS::Inline + +module Lrama + class Grammar + # Represents a token pattern defined by %token-pattern directive + # Example: %token-pattern RSHIFT />>/ "right shift" + class TokenPattern + attr_reader id: Lexer::Token::Base + + attr_reader pattern: Lexer::Token::Regex + + attr_reader alias_name: String? + + attr_reader tag: Lexer::Token::Tag? 
+ + attr_reader lineno: Integer + + attr_reader definition_order: Integer + + # @rbs (id: Lexer::Token::Base, pattern: Lexer::Token::Regex, ?alias_name: String?, ?tag: Lexer::Token::Tag?, lineno: Integer, definition_order: Integer) -> void + def initialize: (id: Lexer::Token::Base, pattern: Lexer::Token::Regex, lineno: Integer, definition_order: Integer, ?alias_name: String?, ?tag: Lexer::Token::Tag?) -> void + + # @rbs () -> String + def name: () -> String + + # Returns the regex pattern string (without slashes) + # @rbs () -> String + def regex_pattern: () -> String + + # @rbs () -> bool + def layout?: () -> bool + end + end +end diff --git a/sig/generated/lrama/length_precedences.rbs b/sig/generated/lrama/length_precedences.rbs new file mode 100644 index 000000000..ca4c737d5 --- /dev/null +++ b/sig/generated/lrama/length_precedences.rbs @@ -0,0 +1,76 @@ +# Generated from lib/lrama/length_precedences.rb with RBS::Inline + +module Lrama + # Runtime length precedence matrix for PSLR pseudo-scanning. + # + # When a longer match for new_token is reached after an earlier match for + # old_token, #precedes? answers whether the longer match should replace it. 
+ class LengthPrecedences + LEFT: Symbol + + RIGHT: Symbol + + UNDEFINED: Symbol + + PREFER_NEW: Symbol + + PREFER_OLD: Symbol + + UNRESOLVED: Symbol + + class LexicalPrecedenceConflictError < StandardError + end + + class RuleSource + attr_reader operator: Symbol + + attr_reader lineno: Integer + + # @rbs (Symbol operator, Integer lineno) -> void + def initialize: (Symbol operator, Integer lineno) -> void + end + + attr_reader table: Hash[[ String, String ], bool] + + attr_reader resolution_table: Hash[[ String, String ], Symbol] + + # @rbs (Grammar::LexPrec lex_prec) -> void + def initialize: (Grammar::LexPrec lex_prec) -> void + + # @rbs (String old_token, String new_token) -> bool + def normal_precedes?: (String old_token, String new_token) -> bool + + # @rbs (String old_token, String new_token) -> bool + def precedes?: (String old_token, String new_token) -> bool + + # @rbs (String old_token, String new_token) -> bool + def fallback_precedes?: (String old_token, String new_token) -> bool + + # Backward-compatible query used by existing specs. 
+ # @rbs (String old_token, String new_token) -> bool + def prefer_shorter?: (String old_token, String new_token) -> bool + + # @rbs (String old_token, String new_token, ?fallback: bool, ?track: bool) -> Symbol + def resolution: (String old_token, String new_token, ?fallback: bool, ?track: bool) -> Symbol + + # @rbs (String old_token, String new_token) -> Symbol + def precedence: (String old_token, String new_token) -> Symbol + + # @rbs (Symbol operator) -> String + def self.operator_label: (Symbol operator) -> String + + private + + # @rbs (Grammar::LexPrec lex_prec) -> Hash[[String, String], Symbol] + def build_resolution_table: (Grammar::LexPrec lex_prec) -> Hash[[ String, String ], Symbol] + + # @rbs (Hash[[String, String], Symbol] table, Hash[[String, String], RuleSource] sources, [String, String] key, Symbol value, Grammar::LexPrec::Rule rule, Integer rule_index) -> void + def set_resolution!: (Hash[[ String, String ], Symbol] table, Hash[[ String, String ], RuleSource] sources, [ String, String ] key, Symbol value, Grammar::LexPrec::Rule rule, Integer rule_index) -> void + + # @rbs (Symbol operator) -> String + def operator_label: (Symbol operator) -> String + + # @rbs (Symbol value) -> String + def resolution_label: (Symbol value) -> String + end +end diff --git a/sig/generated/lrama/lexer.rbs b/sig/generated/lrama/lexer.rbs index 232026125..0002648c0 100644 --- a/sig/generated/lrama/lexer.rbs +++ b/sig/generated/lrama/lexer.rbs @@ -4,7 +4,7 @@ module Lrama class Lexer type token = lexer_token | c_token - type lexer_token = [ String, Token::Token ] | [ ::Symbol, Token::Tag ] | [ ::Symbol, Token::Char ] | [ ::Symbol, Token::Str ] | [ ::Symbol, Token::Int ] | [ ::Symbol, Token::Ident ] + type lexer_token = [ String, Token::Token ] | [ ::Symbol, Token::Tag ] | [ ::Symbol, Token::Char ] | [ ::Symbol, Token::Str ] | [ ::Symbol, Token::Int ] | [ ::Symbol, Token::Ident ] | [ ::Symbol, Token::Regex ] type c_token = [ :C_DECLARATION, Token::UserCode ] @@ -45,6 +45,9 
@@ module Lrama # @rbs () -> void def lex_comment: () -> void + # @rbs () -> Token::Regex? + def scan_regex_token: () -> Token::Regex? + # @rbs () -> void def reset_first_position: () -> void diff --git a/sig/generated/lrama/lexer/token/regex.rbs b/sig/generated/lrama/lexer/token/regex.rbs new file mode 100644 index 000000000..b832c4be2 --- /dev/null +++ b/sig/generated/lrama/lexer/token/regex.rbs @@ -0,0 +1,15 @@ +# Generated from lib/lrama/lexer/token/regex.rb with RBS::Inline + +module Lrama + class Lexer + module Token + # Token class for regex patterns used in %token-pattern directive + # Example: /[a-zA-Z_][a-zA-Z0-9_]*/ + class Regex < Base + # Returns the regex pattern without the surrounding slashes + # @rbs () -> String + def pattern: () -> String + end + end + end +end diff --git a/sig/generated/lrama/lexer_context_classifier.rbs b/sig/generated/lrama/lexer_context_classifier.rbs new file mode 100644 index 000000000..5a969237d --- /dev/null +++ b/sig/generated/lrama/lexer_context_classifier.rbs @@ -0,0 +1,78 @@ +# Generated from lib/lrama/lexer_context_classifier.rb with RBS::Inline + +module Lrama + # Classifies parser states into lexer context categories. + # + # When LALR states are merged, states from different grammatical contexts + # (e.g., BEG vs CMDARG) share the same state number, making them + # indistinguishable to the lexer. This classifier analyzes kernel items + # to determine the lexer context of each state, enabling context-aware + # state splitting. + # + # Context definitions come from %lexer-context directives in the grammar file. 
+ # Each directive maps a context name to a set of symbols: + # + # %lexer-context BEG keyword_if keyword_unless '(' '[' '{' + # %lexer-context CMDARG tIDENTIFIER tFID tCONSTANT + class LexerContextClassifier + # @rbs (Hash[String, Grammar::LexerContext] lexer_contexts, ?Hash[String, Array[String]] expansion_args) -> void + def initialize: (Hash[String, Grammar::LexerContext] lexer_contexts, ?Hash[String, Array[String]] expansion_args) -> void + + # Classify a state's kernel items into context groups. + # + # @rbs (State state) -> Hash[Integer, Array[State::Item]] + def classify: (State state) -> Hash[Integer, Array[State::Item]] + + # Infer the lexer context for a single kernel item. + # + # @rbs (State::Item item) -> Integer + def infer_item_context: (State::Item item) -> Integer + + # Classify context based on the symbol before the dot. + # + # @rbs (Grammar::Symbol sym) -> Integer + def classify_symbol_context: (Grammar::Symbol sym) -> Integer + + # For backward compatibility with states.rb split logic + # @rbs (Grammar::Symbol sym) -> Integer + def classify_terminal_context: (Grammar::Symbol sym) -> Integer + + # For backward compatibility with states.rb split logic + # @rbs (Grammar::Symbol sym) -> Integer + def classify_nonterminal_context: (Grammar::Symbol sym) -> Integer + + # Return a human-readable name for a context value. + # + # @rbs (Integer ctx) -> String + def context_name: (Integer ctx) -> String + + # Class-level context_name for use without an instance (e.g., output.rb). + # Requires lexer_contexts to build the name map. + # + # @rbs (Integer ctx, Hash[String, Grammar::LexerContext] lexer_contexts) -> String + def self.context_name: (Integer ctx, Hash[String, Grammar::LexerContext] lexer_contexts) -> String + + # All context bitmasks OR'd together (for "is context known?" checks). + # @rbs () -> Integer + def all_contexts_mask: () -> Integer + + # Return the ordered list of context definitions. 
+ # @rbs () -> Array[Grammar::LexerContext] + def contexts: () -> Array[Grammar::LexerContext] + + private + + # Build a map from symbol name → context bitmask. + # @rbs () -> Hash[String, Integer] + def build_symbol_to_context_map: () -> Hash[String, Integer] + + # Build a map from bitmask value → context name. + # @rbs () -> Hash[Integer, String] + def build_context_names: () -> Hash[Integer, String] + + # Return the bitmask for the first defined context (used as default for position-0 items). + # Returns 0 if no contexts are defined. + # @rbs () -> Integer + def default_beg_context: () -> Integer + end +end diff --git a/sig/generated/lrama/reporter/pslr.rbs b/sig/generated/lrama/reporter/pslr.rbs new file mode 100644 index 000000000..c9de08058 --- /dev/null +++ b/sig/generated/lrama/reporter/pslr.rbs @@ -0,0 +1,18 @@ +# Generated from lib/lrama/reporter/pslr.rb with RBS::Inline + +module Lrama + class Reporter + class Pslr + # @rbs (?pslr: bool, **bool _) -> void + def initialize: (?pslr: bool, **bool _) -> void + + # @rbs (IO io, Lrama::States states) -> void + def report: (IO io, Lrama::States states) -> void + + private + + # @rbs (Numeric?) -> String + def format_ratio: (Numeric?) 
-> String + end + end +end diff --git a/sig/generated/lrama/scanner_fsa.rbs b/sig/generated/lrama/scanner_fsa.rbs new file mode 100644 index 000000000..7397a65d3 --- /dev/null +++ b/sig/generated/lrama/scanner_fsa.rbs @@ -0,0 +1,197 @@ +# Generated from lib/lrama/scanner_fsa.rb with RBS::Inline + +module Lrama + # Scanner Finite State Automaton for PSLR(1) + # Built from token patterns defined by %token-pattern directives + # Based on Definitions 3.2.12, 3.2.13 from the PSLR dissertation + class ScannerFSA + # Represents a state in the scanner FSA + class State + attr_reader id: Integer + + attr_reader transitions: Hash[String, Integer] + + attr_reader accepting_tokens: Array[Grammar::TokenPattern] + + # @rbs (Integer id) -> void + def initialize: (Integer id) -> void + + # @rbs () -> bool + def accepting?: () -> bool + + # @rbs (String char, Integer target_state_id) -> void + def add_transition: (String char, Integer target_state_id) -> void + + # @rbs (Grammar::TokenPattern token_pattern) -> void + def add_accepting_token: (Grammar::TokenPattern token_pattern) -> void + end + + attr_reader states: Array[State] + + attr_reader initial_state: State + + attr_reader token_patterns: Array[Grammar::TokenPattern] + + # @rbs (Array[Grammar::TokenPattern] token_patterns) -> void + def initialize: (Array[Grammar::TokenPattern] token_patterns) -> void + + # Returns the accepting state for a given FSA state + # Definition 3.2.13 (state_to_accepting_state) + # @rbs (Integer state_id) -> State? + def state_to_accepting_state: (Integer state_id) -> State? 
+ + # Returns the set of tokens accepted at FSA state ss + # Definition 3.2.12 acc(ss) + # @rbs (Integer state_id) -> Array[Grammar::TokenPattern] + def acc_ss: (Integer state_id) -> Array[Grammar::TokenPattern] + + # Simulate the FSA on input string starting from initial state + # Returns all accepting states reached during the scan + # @rbs (String input) -> Array[{state: State, position: Integer, token: Grammar::TokenPattern}] + def scan: (String input) -> Array[{ state: State, position: Integer, token: Grammar::TokenPattern }] + + # Returns token pairs that can be in an identity or length scanner conflict. + # Pair keys are sorted token names. + # @rbs () -> Set[[String, String]] + def pairwise_conflict_pairs: () -> Set[[ String, String ]] + + # @rbs (String left, String right) -> bool + def pairwise_conflict?: (String left, String right) -> bool + + private + + # @rbs (String left, String right) -> [String, String] + def pair_key: (String left, String right) -> [ String, String ] + + # Build the FSA from token patterns + # Uses Thompson's construction for NFAs followed by subset construction for DFA + # @rbs () -> void + def build_fsa: () -> void + + # @rbs () -> State + def create_state: () -> State + + # Simple NFA state for regex compilation + class PatternError < StandardError + end + + class NFAState + attr_reader id: Integer + + attr_accessor transitions: Hash[String?, Array[NFAState]] + + attr_accessor accepting_token: Grammar::TokenPattern? + + # @rbs (Integer id) -> void + def initialize: (Integer id) -> void + + # @rbs (String? char, NFAState target) -> void + def add_transition: (String? 
char, NFAState target) -> void + + # @rbs () -> bool + def accepting?: () -> bool + end + + class Fragment + attr_reader start_state: NFAState + + attr_reader end_state: NFAState + + attr_reader nullable: bool + + # @rbs (NFAState start_state, NFAState end_state, bool nullable) -> void + def initialize: (NFAState start_state, NFAState end_state, bool nullable) -> void + + # @rbs () -> [NFAState, NFAState] + def to_ary: () -> [ NFAState, NFAState ] + end + + # Build NFA from all token patterns + # @rbs () -> Array[NFAState] + def build_nfa: () -> Array[NFAState] + + # @rbs (Array[Integer] counter, Array[NFAState] states) -> NFAState + def create_nfa_state: (Array[Integer] counter, Array[NFAState] states) -> NFAState + + ASCII_CHARS: Array[String] + + ANY_CHARS: Array[String] + + DIGIT_CHARS: Array[String] + + WORD_CHARS: Array[String] + + WHITESPACE_CHARS: Array[String] + + QUANTIFIERS: Array[String] + + ESCAPED_LITERAL_CHARS: Array[String] + + # Compile a regex pattern to NFA fragment. The supported dialect is a small + # ASCII regular-expression subset for PSLR pseudo scanning. + # @rbs (String pattern, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_regex: (String pattern, Array[Integer] counter, Array[NFAState] states) -> [ NFAState, NFAState ] + + # @rbs (String pattern, Integer pos, Array[Integer] counter, Array[NFAState] states, ?String? stop_char) -> [Fragment, Integer] + def compile_expression: (String pattern, Integer pos, Array[Integer] counter, Array[NFAState] states, ?String? stop_char) -> [ Fragment, Integer ] + + # @rbs (String pattern, Integer pos, Array[Integer] counter, Array[NFAState] states, String? stop_char) -> [Fragment?, Integer] + def compile_sequence: (String pattern, Integer pos, Array[Integer] counter, Array[NFAState] states, String? stop_char) -> [ Fragment?, Integer ] + + # @rbs (String? stop_char) -> String + def empty_sequence_message: (String? 
stop_char) -> String + + # @rbs (String pattern, Integer offset, Array[Integer] counter, Array[NFAState] states) -> [Fragment, Integer] + def compile_escape: (String pattern, Integer offset, Array[Integer] counter, Array[NFAState] states) -> [ Fragment, Integer ] + + # Compile a single literal character + # @rbs (String char, Array[Integer] counter, Array[NFAState] states) -> Fragment + def compile_literal: (String char, Array[Integer] counter, Array[NFAState] states) -> Fragment + + # @rbs (Array[String] chars, Array[Integer] counter, Array[NFAState] states) -> Fragment + def compile_chars: (Array[String] chars, Array[Integer] counter, Array[NFAState] states) -> Fragment + + # Compile a character class [...] + # @rbs (String char_class, Array[Integer] counter, Array[NFAState] states) -> Fragment + def compile_char_class: (String char_class, Array[Integer] counter, Array[NFAState] states) -> Fragment + + # Expand character class string to array of characters + # @rbs (String char_class) -> Array[String] + def expand_char_class: (String char_class) -> Array[String] + + # @rbs (String pattern, Integer offset) -> Integer? + def find_character_class_end: (String pattern, Integer offset) -> Integer? + + # @rbs (String char_class, Integer offset) -> [Array[String], Integer] + def read_char_class_element: (String char_class, Integer offset) -> [ Array[String], Integer ] + + # @rbs (String char, Integer offset) -> Array[String] + def escaped_char_class_chars: (String char, Integer offset) -> Array[String] + + # @rbs (Array[String] start_chars, Array[String] end_chars) -> Array[String] + def expand_char_range: (Array[String] start_chars, Array[String] end_chars) -> Array[String] + + # Compile . 
(any character) + # @rbs (Array[Integer] counter, Array[NFAState] states) -> Fragment + def compile_any_char: (Array[Integer] counter, Array[NFAState] states) -> Fragment + + # Apply a quantifier to a fragment + # @rbs (Fragment fragment, String quantifier, Array[Integer] counter, Array[NFAState] states) -> Fragment + def apply_quantifier: (Fragment fragment, String quantifier, Array[Integer] counter, Array[NFAState] states) -> Fragment + + # @rbs (Array[Fragment] fragments, Array[Integer] counter, Array[NFAState] states) -> Fragment + def alternate_fragments: (Array[Fragment] fragments, Array[Integer] counter, Array[NFAState] states) -> Fragment + + # Concatenate multiple NFA fragments into one + # @rbs (Array[Fragment] fragments, Array[Integer] counter, Array[NFAState] states) -> Fragment + def concatenate_fragments: (Array[Fragment] fragments, Array[Integer] counter, Array[NFAState] states) -> Fragment + + # Convert NFA to DFA using subset construction + # @rbs (Array[NFAState] nfa_states) -> void + def convert_nfa_to_dfa: (Array[NFAState] nfa_states) -> void + + # Compute epsilon closure of a set of NFA states + # @rbs (Array[NFAState] nfa_states) -> Array[NFAState] + def epsilon_closure: (Array[NFAState] nfa_states) -> Array[NFAState] + end +end diff --git a/sig/generated/lrama/state.rbs b/sig/generated/lrama/state.rbs index 8f585c332..42c4030de 100644 --- a/sig/generated/lrama/state.rbs +++ b/sig/generated/lrama/state.rbs @@ -76,6 +76,10 @@ module Lrama attr_accessor goto_follows: Hash[Action::Goto, Array[Grammar::Symbol]] + attr_accessor pslr_item_lookahead_set: lookahead_set? + + attr_accessor lexer_context: Integer? 
+ # @rbs (Integer id, Grammar::Symbol accessing_symbol, Array[Item] kernels) -> void def initialize: (Integer id, Grammar::Symbol accessing_symbol, Array[Item] kernels) -> void @@ -100,6 +104,12 @@ module Lrama # @rbs (Grammar::Rule rule, Array[Grammar::Symbol] look_ahead) -> void def set_look_ahead: (Grammar::Rule rule, Array[Grammar::Symbol] look_ahead) -> void + # @rbs (Action::Reduce reduce) -> Array[Grammar::Symbol] + def acceptable_reduce_lookahead: (Action::Reduce reduce) -> Array[Grammar::Symbol] + + # @rbs (Action::Reduce reduce) -> Array[Grammar::Symbol] + def acceptable_pslr_reduce_lookahead: (Action::Reduce reduce) -> Array[Grammar::Symbol] + # @rbs (Grammar::Rule rule, Hash[Grammar::Symbol, Array[Action::Goto]] sources) -> void def set_look_ahead_sources: (Grammar::Rule rule, Hash[Grammar::Symbol, Array[Action::Goto]] sources) -> void @@ -158,6 +168,12 @@ module Lrama # @rbs (State next_state) -> lookahead_set def propagate_lookaheads: (State next_state) -> lookahead_set + # @rbs (State next_state) -> lookahead_set + def propagate_lookaheads_without_filter: (State next_state) -> lookahead_set + + # @rbs (State next_state, bool apply_filter) -> lookahead_set + def propagate_lookaheads_with_filter: (State next_state, bool apply_filter) -> lookahead_set + # Definition 3.43 (is_compatible) # # @rbs (lookahead_set filtered_lookahead) -> bool diff --git a/sig/generated/lrama/state/pslr_inadequacy.rbs b/sig/generated/lrama/state/pslr_inadequacy.rbs new file mode 100644 index 000000000..54c25cff6 --- /dev/null +++ b/sig/generated/lrama/state/pslr_inadequacy.rbs @@ -0,0 +1,52 @@ +# Generated from lib/lrama/state/pslr_inadequacy.rb with RBS::Inline + +module Lrama + class State + # PSLR Inadequacy detection + # Based on Section 3.4.3 from the PSLR dissertation + # + # PSLR inadequacy occurs when state merging causes different + # pseudo-scanner behavior + class PslrInadequacy + # Inadequacy types + LR_RELATIVE: Symbol + + PSLR_RELATIVE: Symbol + + attr_reader 
type: Symbol + + attr_reader state: State + + attr_reader conflicting_states: Array[State] + + attr_reader details: Hash[Symbol, untyped] + + # @rbs (type: Symbol, state: State, conflicting_states: Array[State], details: Hash[Symbol, untyped]) -> void + def initialize: (type: Symbol, state: State, conflicting_states: Array[State], details: Hash[Symbol, untyped]) -> void + + # @rbs () -> String + def to_s: () -> String + end + + # PSLR Compatibility checker + # Based on Definition 3.4.1 from the dissertation + class PslrCompatibilityChecker + # @rbs (ScannerAccepts scanner_accepts, LengthPrecedences length_prec) -> void + def initialize: (ScannerAccepts scanner_accepts, LengthPrecedences length_prec) -> void + + # Build a stable scanner profile for a parser state + # @rbs (State state, ScannerFSA scanner_fsa) -> Array[[Integer, String?]] + def profile: (State state, ScannerFSA scanner_fsa) -> Array[[ Integer, String? ]] + + # Partition states by scanner profile + # @rbs (Array[State] states, ScannerFSA scanner_fsa) -> Hash[Array[[Integer, String?]], Array[State]] + def group_by_profile: (Array[State] states, ScannerFSA scanner_fsa) -> Hash[Array[[ Integer, String? ]], Array[State]] + + # Check if two states are PSLR-compatible + # Definition 3.4.1: States are compatible if for any input, + # the pseudo-scanner selects the same token + # @rbs (State s1, State s2, ScannerFSA scanner_fsa) -> bool + def compatible?: (State s1, State s2, ScannerFSA scanner_fsa) -> bool + end + end +end diff --git a/sig/generated/lrama/state/scanner_accepts.rbs b/sig/generated/lrama/state/scanner_accepts.rbs new file mode 100644 index 000000000..93e0260c9 --- /dev/null +++ b/sig/generated/lrama/state/scanner_accepts.rbs @@ -0,0 +1,192 @@ +# Generated from lib/lrama/state/scanner_accepts.rb with RBS::Inline + +module Lrama + class State + # scanner_accepts[parser_state, accepting_scanner_state] for PSLR(1). + # + # Construction follows complete pseudo-scanner conflict profiles. 
A profile + # contains the shorter token set Ts, the selected shorter token ts, and the + # current/longest token set Tl. + # + # Normal parser-state rows are strict: unresolved pseudo-scanner + # conflicts are reported and are not resolved by token declaration order. + # + # The fallback row is used only for syntax-error handling. It first + # applies explicit PSLR lexical precedence declarations. For scanner + # conflicts that remain unresolved only in the fallback universe, it + # completes the decision with traditional scanner behavior: fallback + # length defaults are handled by LengthPrecedences#fallback_precedes?, + # and otherwise-unresolved identity conflicts are resolved by token + # declaration order. + # + # When %lex-scope declarations are present, each parser state may use + # a merged set of global + scope-active lexical precedence rules. + class ScannerAccepts + class Conflict + attr_reader parser_state_id: Integer? + + attr_reader scanner_state_id: Integer + + attr_reader shorter_tokens: Array[String] + + attr_reader selected_shorter_token: String? + + attr_reader current_tokens: Array[String] + + # @rbs (parser_state_id: Integer?, scanner_state_id: Integer, shorter_tokens: Array[String], selected_shorter_token: String?, current_tokens: Array[String]) -> void + def initialize: (parser_state_id: Integer?, scanner_state_id: Integer, shorter_tokens: Array[String], selected_shorter_token: String?, current_tokens: Array[String]) -> void + end + + class ProfileOutcome + EMPTY: Symbol + + RESOLVED: Symbol + + UNRESOLVED: Symbol + + attr_reader kind: Symbol + + attr_reader token_name: String? + + attr_reader conflict: Conflict? + + # @rbs (kind: Symbol, ?token_name: String?, ?conflict: Conflict?) -> void + def initialize: (kind: Symbol, ?token_name: String?, ?conflict: Conflict?) 
-> void + + # @rbs () -> bool + def empty?: () -> bool + + # @rbs () -> bool + def resolved?: () -> bool + + # @rbs () -> bool + def unresolved?: () -> bool + end + + class ProfileResolver + # @rbs (Grammar::LexPrec lex_prec, LengthPrecedences length_prec, ?fallback: bool, ?token_order: Hash[String, Integer]) -> void + def initialize: (Grammar::LexPrec lex_prec, LengthPrecedences length_prec, ?fallback: bool, ?token_order: Hash[String, Integer]) -> void + + # @rbs (Set[String] shorter_tokens, String? selected_shorter_token, Set[String] current_tokens) -> ProfileOutcome + def resolve: (Set[String] shorter_tokens, String? selected_shorter_token, Set[String] current_tokens) -> ProfileOutcome + + private + + # @rbs (Set[String] shorter_tokens, String? selected_shorter_token, Set[String] current_tokens) -> ProfileOutcome + def resolve_normal: (Set[String] shorter_tokens, String? selected_shorter_token, Set[String] current_tokens) -> ProfileOutcome + + # @rbs (Set[String] shorter_tokens, String? selected_shorter_token, Set[String] current_tokens) -> ProfileOutcome + def resolve_fallback: (Set[String] shorter_tokens, String? 
selected_shorter_token, Set[String] current_tokens) -> ProfileOutcome + + # @rbs (String old_token, String new_token) -> bool + def length_prefers_old?: (String old_token, String new_token) -> bool + + # @rbs (String old_token, String new_token) -> bool + def fallback_length_prefers_old?: (String old_token, String new_token) -> bool + + # @rbs (String candidate, Set[String] current_tokens) -> bool + def identity_winner?: (String candidate, Set[String] current_tokens) -> bool + + # @rbs (String candidate, Set[String] current_tokens) -> bool + def fallback_identity_winner?: (String candidate, Set[String] current_tokens) -> bool + + # @rbs (String candidate, String other) -> bool + def token_order_precedes?: (String candidate, String other) -> bool + + # @rbs (String token) -> [Integer, String] + def token_order_key: (String token) -> [ Integer, String ] + end + + class CompleteProfileComputer + attr_reader table: Hash[Integer, Grammar::TokenPattern] + + attr_reader conflicts: Array[Conflict] + + # @rbs (ScannerFSA scanner_fsa, Grammar::LexPrec lex_prec, LengthPrecedences length_prec, Set[String] acceptable_tokens, ?Integer? parser_state_id, ?fallback: bool, ?token_order: Hash[String, Integer]) -> void + def initialize: (ScannerFSA scanner_fsa, Grammar::LexPrec lex_prec, LengthPrecedences length_prec, Set[String] acceptable_tokens, ?Integer? parser_state_id, ?fallback: bool, ?token_order: Hash[String, Integer]) -> void + + # @rbs () -> void + def compute: () -> void + + private + + # @rbs (Integer fsa_state_id, Set[String] shorter_tokens, String? selected_shorter_token, Set[untyped] visited) -> void + def visit_transitions: (Integer fsa_state_id, Set[String] shorter_tokens, String? selected_shorter_token, Set[untyped] visited) -> void + + # @rbs (Integer fsa_state_id, Set[String] shorter_tokens, String? selected_shorter_token, Set[untyped] visited) -> void + def visit_state: (Integer fsa_state_id, Set[String] shorter_tokens, String? 
selected_shorter_token, Set[untyped] visited) -> void + + # @rbs (ScannerFSA::State fsa_state) -> Set[String] + def current_acceptable_tokens: (ScannerFSA::State fsa_state) -> Set[String] + + # @rbs (String token_name) -> Grammar::TokenPattern + def token_pattern_for: (String token_name) -> Grammar::TokenPattern + + # @rbs (Integer fsa_state_id, Set[String] shorter_tokens, String? selected_shorter_token, Set[String] current_tokens) -> [Integer, Array[String], String?, Array[String]] + def profile_key: (Integer fsa_state_id, Set[String] shorter_tokens, String? selected_shorter_token, Set[String] current_tokens) -> [ Integer, Array[String], String?, Array[String] ] + end + + class CompatibilityChecker + # @rbs (ScannerFSA scanner_fsa, Grammar::LexPrec lex_prec, LengthPrecedences length_prec) -> void + def initialize: (ScannerFSA scanner_fsa, Grammar::LexPrec lex_prec, LengthPrecedences length_prec) -> void + + # @rbs (Set[String] left_tokens, Set[String] right_tokens) -> bool + def compatible?: (Set[String] left_tokens, Set[String] right_tokens) -> bool + + private + + # @rbs (Integer fsa_state_id, Set[String] left_shorter, String? left_selected, Set[String] right_shorter, String? right_selected, Set[String] left_acc, Set[String] right_acc, Set[untyped] visited) -> bool + def visit_transitions: (Integer fsa_state_id, Set[String] left_shorter, String? left_selected, Set[String] right_shorter, String? right_selected, Set[String] left_acc, Set[String] right_acc, Set[untyped] visited) -> bool + + # @rbs (Integer fsa_state_id, Set[String] left_shorter, String? left_selected, Set[String] right_shorter, String? right_selected, Set[String] left_acc, Set[String] right_acc, Set[untyped] visited) -> bool + def visit_state: (Integer fsa_state_id, Set[String] left_shorter, String? left_selected, Set[String] right_shorter, String? 
right_selected, Set[String] left_acc, Set[String] right_acc, Set[untyped] visited) -> bool + + # @rbs (ScannerFSA::State fsa_state, Set[String] accepted_names) -> Set[String] + def tokens_accepted_by: (ScannerFSA::State fsa_state, Set[String] accepted_names) -> Set[String] + + # @rbs (ProfileOutcome left, ProfileOutcome right) -> bool + def outcomes_compatible?: (ProfileOutcome left, ProfileOutcome right) -> bool + end + + FALLBACK_ROW_ID: Integer + + attr_reader table: Hash[[ Integer, Integer ], Grammar::TokenPattern?] + + attr_reader fallback_table: Hash[Integer, Grammar::TokenPattern] + + attr_reader conflicts: Array[Conflict] + + # @rbs (Array[State] parser_states, ScannerFSA scanner_fsa, Grammar::LexPrec lex_prec, LengthPrecedences length_prec, ?Grammar::LexTie? lex_tie, ?layout_token_names: Set[String]) -> void + def initialize: (Array[State] parser_states, ScannerFSA scanner_fsa, Grammar::LexPrec lex_prec, LengthPrecedences length_prec, ?Grammar::LexTie? lex_tie, ?layout_token_names: Set[String]) -> void + + # @rbs (ScannerFSA scanner_fsa, Grammar::LexPrec lex_prec, LengthPrecedences length_prec, Set[String] acceptable_tokens) -> [Hash[Integer, Grammar::TokenPattern], Array[Conflict]] + def self.compute_for_acceptable_tokens: (ScannerFSA scanner_fsa, Grammar::LexPrec lex_prec, LengthPrecedences length_prec, Set[String] acceptable_tokens) -> [ Hash[Integer, Grammar::TokenPattern], Array[Conflict] ] + + # @rbs () -> void + def build: () -> void + + # @rbs (Integer parser_state_id, Integer accepting_state_id) -> Grammar::TokenPattern? + def []: (Integer parser_state_id, Integer accepting_state_id) -> Grammar::TokenPattern? 
+ + # @rbs () -> bool + def unresolved_conflicts?: () -> bool + + private + + # @rbs (State parser_state) -> void + def compute_for_parser_state: (State parser_state) -> void + + # @rbs () -> void + def compute_fallback_row: () -> void + + # @rbs (State parser_state) -> Set[String] + def compute_acc_sp: (State parser_state) -> Set[String] + + # @rbs (Set[String] tokens) -> Set[String] + def expand_lexical_ties: (Set[String] tokens) -> Set[String] + + # @rbs () -> Hash[String, Integer] + def token_order: () -> Hash[String, Integer] + end + end +end diff --git a/sig/generated/lrama/states.rbs b/sig/generated/lrama/states.rbs index 8e4b296e9..919b6a00c 100644 --- a/sig/generated/lrama/states.rbs +++ b/sig/generated/lrama/states.rbs @@ -44,6 +44,20 @@ module Lrama attr_reader lookback_relation: Hash[state_id, Hash[rule_id, Array[State::Action::Goto]]] + attr_reader scanner_fsa: ScannerFSA? + + attr_reader length_precedences: LengthPrecedences? + + attr_reader scanner_accepts_table: State::ScannerAccepts? + + attr_reader pslr_inadequacies: Array[State::PslrInadequacy] + + attr_reader pslr_metrics: Hash[Symbol, Integer | Float | nil] + + attr_reader lexer_context_classifier: LexerContextClassifier? + + attr_reader lexical_tie_candidates: Array[[ String, String ]] + # @rbs (Grammar grammar, Tracer tracer) -> void def initialize: (Grammar grammar, Tracer tracer) -> void @@ -53,6 +67,11 @@ module Lrama # @rbs () -> void def compute_ielr: () -> void + # Compute PSLR(1) states + # Based on Section 3.4 of the PSLR dissertation + # @rbs () -> void + def compute_pslr: () -> void + # @rbs () -> Integer def states_count: () -> Integer @@ -77,6 +96,27 @@ module Lrama # @rbs (Logger logger) -> void def validate!: (Logger logger) -> void + # Classify each state's lexer context based on kernel items. + # + # For each state, analyzes the kernel items to determine what lexer + # context (BEG, CMDARG, ARG, END, ENDFN, MID, DOT) the state belongs to. 
+ # When a state has kernel items from multiple contexts, the context is + # set to the bitwise OR of all contexts (mixed context). + # + # @rbs () -> void + def classify_lexer_contexts: () -> void + + # Return the lexer context table as an array of context values, + # one per parser state (indexed by state id). + # + # @rbs () -> Array[Integer] + def lexer_context_table: () -> Array[Integer] + + # Check if lexer context classification has been performed. + # + # @rbs () -> bool + def lexer_context_enabled?: () -> bool + def compute_la_sources_for_conflicted_states: () -> untyped private @@ -188,15 +228,68 @@ module Lrama # @rbs () -> void def split_states: () -> void + # Split states where different predecessor paths lead to different + # lexer contexts. This resolves LALR state merging that makes + # BEG vs CMDARG (and other context pairs) indistinguishable. + # + # Algorithm: + # 1. For each state, group incoming transitions by the lexer context + # that the predecessor would imply + # 2. If a state has predecessors from multiple different contexts, + # split the state so each split has a unique context + # + # @rbs () -> void + def split_states_by_context: () -> void + + # For a given state, group its incoming transitions by the lexer context + # that the predecessor state implies for this state. + # + # The implied context is determined by what symbol was used to reach + # this state (the accessing symbol's context). + # + # @rbs (State state) -> Hash[Integer, Array[[State, State::Action::Shift | State::Action::Goto]]] + def compute_predecessor_context_groups: (State state) -> Hash[Integer, Array[[ State, State::Action::Shift | State::Action::Goto ]]] + + # Infer the lexer context that a transition implies for the target state. 
+ # + # @rbs (State pred, State::Action::Shift | State::Action::Goto transition) -> Integer + def infer_transition_context: (State pred, State::Action::Shift | State::Action::Goto transition) -> Integer + + # Create a new split state that is an isocore copy of the given state. + # + # @rbs (State original) -> State + def create_context_split_state: (State original) -> State + + # @rbs () -> void + def capture_pslr_metrics_before_split: () -> void + # @rbs () -> void def compute_inadequacy_annotations: () -> void # @rbs (State state, State::lookahead_set filtered_lookaheads) -> void def merge_lookaheads: (State state, State::lookahead_set filtered_lookaheads) -> void + # @rbs (State state, State::lookahead_set pslr_lookaheads) -> void + def merge_pslr_lookaheads: (State state, State::lookahead_set pslr_lookaheads) -> void + # @rbs (State state, State::Action::Shift | State::Action::Goto transition, State next_state) -> void def compute_state: (State state, State::Action::Shift | State::Action::Goto transition, State next_state) -> void + # @rbs (State state, State::lookahead_set filtered_lookaheads, ?State::lookahead_set pslr_lookaheads) -> bool + def compatible_split_state?: (State state, State::lookahead_set filtered_lookaheads, ?State::lookahead_set pslr_lookaheads) -> bool + + # @rbs (State state, ?State::lookahead_set filtered_lookaheads) -> Array[[Integer, String?]] + def pslr_state_signature: (State state, ?State::lookahead_set filtered_lookaheads) -> Array[[ Integer, String? 
]] + + # @rbs (Set[String] left_acc, Set[String] right_acc) -> bool + def pslr_compatible_accept_sets?: (Set[String] left_acc, Set[String] right_acc) -> bool + + # @rbs (State state, ?State::lookahead_set filtered_lookaheads, ?expand_ties: bool, ?include_layout: bool) -> Set[String] + def acceptable_tokens_for_pslr: (State state, ?State::lookahead_set filtered_lookaheads, ?expand_ties: bool, ?include_layout: bool) -> Set[String] + + # @rbs (?include_layout: bool) -> Set[String] + def layout_token_names_for_pslr: (?include_layout: bool) -> Set[String] + # @rbs (Logger logger) -> void def validate_conflicts_within_threshold!: (Logger logger) -> void @@ -211,5 +304,55 @@ module Lrama # @rbs () -> void def clear_look_ahead_sets: () -> void + + # Build Scanner FSA from token patterns + # @rbs () -> void + def build_scanner_fsa: () -> void + + # Build length precedences table + # @rbs () -> void + def build_length_precedences: () -> void + + # Build scanner_accepts table + # @rbs () -> void + def build_scanner_accepts: () -> void + + # Handle PSLR inadequacies + # Detects and splits states where pseudo-scanner behavior differs + # @rbs () -> void + def handle_pslr_inadequacies: () -> void + + # @rbs () -> void + def finalize_pslr_metrics: () -> void + + # Detect PSLR inadequacies in isocore groups + # @rbs () -> Array[State::PslrInadequacy] + def detect_pslr_inadequacies: () -> Array[State::PslrInadequacy] + + # @rbs () -> void + def collect_lexical_tie_candidates: () -> void + + # @rbs () -> Array[State] + def reachable_parser_states: () -> Array[State] + + # @rbs (Logger logger) -> void + def validate_pslr_inadequacies!: (Logger logger) -> void + + # @rbs (Logger logger) -> void + def validate_pslr_scanner_conflicts!: (Logger logger) -> void + + # @rbs (State::ScannerAccepts::Conflict conflict) -> String + def pslr_scanner_conflict_message: (State::ScannerAccepts::Conflict conflict) -> String + + # @rbs (Logger logger) -> void + def validate_pslr_state_growth!: (Logger 
logger) -> void + + # Report %lex-prec rules that were never used in scanner conflict resolution. + # @rbs (Logger logger) -> void + def validate_pslr_useless_lex_prec!: (Logger logger) -> void + + # Report lexical tie candidates that are not covered by %lex-tie or %lex-no-tie. + # @rbs (Logger logger) -> void + def validate_pslr_lexical_tie_candidates!: (Logger logger) -> void end end diff --git a/sig/generated/lrama/warnings/lexical_tie_candidates.rbs b/sig/generated/lrama/warnings/lexical_tie_candidates.rbs new file mode 100644 index 000000000..43e569a41 --- /dev/null +++ b/sig/generated/lrama/warnings/lexical_tie_candidates.rbs @@ -0,0 +1,13 @@ +# Generated from lib/lrama/warnings/lexical_tie_candidates.rb with RBS::Inline + +module Lrama + class Warnings + class LexicalTieCandidates + # @rbs (Logger logger, bool warnings) -> void + def initialize: (Logger logger, bool warnings) -> void + + # @rbs (Lrama::States states) -> void + def warn: (Lrama::States states) -> void + end + end +end diff --git a/spec/fixtures/command/pslr_growth_limit.y b/spec/fixtures/command/pslr_growth_limit.y new file mode 100644 index 000000000..095ed6ae8 --- /dev/null +++ b/spec/fixtures/command/pslr_growth_limit.y @@ -0,0 +1,36 @@ +%define lr.type pslr + +%token-pattern P /p/ +%token-pattern Q /q/ +%token-pattern X /x/ +%token-pattern IF /if/ +%token-pattern ID /[a-z]+/ + +%lex-prec ID <~ IF + +%% + +program + : kw_context + | id_context + ; + +kw_context + : P shared IF + ; + +id_context + : Q shared ID + ; + +shared + : n1 + ; + +n1 + : n2 + ; + +n2 + : X + ; diff --git a/spec/fixtures/command/pslr_pure_reduce.y b/spec/fixtures/command/pslr_pure_reduce.y new file mode 100644 index 000000000..99e8ec6ef --- /dev/null +++ b/spec/fixtures/command/pslr_pure_reduce.y @@ -0,0 +1,26 @@ +%define lr.type pslr + +%token-pattern RSHIFT />>/ +%token-pattern RANGLE />/ +%token-pattern ID /[a-z]+/ + +%lex-prec RANGLE -~ RSHIFT + +%% + +program + : templ + | rshift_expr + ; + +templ + : a RANGLE + 
; + +rshift_expr + : a RSHIFT ID + ; + +a + : ID + ; diff --git a/spec/fixtures/integration/pslr_context.l b/spec/fixtures/integration/pslr_context.l new file mode 100644 index 000000000..06cbd121d --- /dev/null +++ b/spec/fixtures/integration/pslr_context.l @@ -0,0 +1,50 @@ +%option noinput nounput noyywrap never-interactive + +%{ +#include <stdio.h> +#include "pslr_context.h" + +#define YY_DECL int yylex(YYSTYPE *yylval, struct parse_params *p) +%} + +ID_PATTERN [a-zA-Z_][a-zA-Z0-9_]* + +%% + +{ID_PATTERN} { + (void)yylval; + return ID; +} + +"<" { + return LANGLE; +} + +[>]{1,2} { + int match_length = 0; + int token = YYPSLR_PSEUDO_SCAN(p, yytext, &match_length); + + if (token == YYEMPTY) { + token = (yyleng == 2) ? RSHIFT : RANGLE; + match_length = yyleng; + } + + if (0 < match_length && match_length < yyleng) { + yyless(match_length); + } + + return token; +} + +[[:space:]]+ {} + +<<EOF>> { + return YYEOF; +} + +. { + fprintf(stderr, "Illegal character '%s'\n", yytext); + return YYUNDEF; +} + +%% diff --git a/spec/fixtures/integration/pslr_context.y b/spec/fixtures/integration/pslr_context.y new file mode 100644 index 000000000..74de59abd --- /dev/null +++ b/spec/fixtures/integration/pslr_context.y @@ -0,0 +1,75 @@ +%{ +#include + +#define YY_DECL int yylex(YYSTYPE *lval, struct parse_params *p) + +#include "pslr_context.h" +#include "pslr_context-lexer.h" + +extern int yylex(YYSTYPE *lval, struct parse_params *p); +static int yyerror(YYLTYPE *loc, struct parse_params *p, const char *str); +%} + +%code requires { + struct parse_params { + int current_state; + }; +} + +%define api.pure +%define lr.type pslr +%define api.pslr.state-member current_state + +%lex-param {struct parse_params *p} +%parse-param {struct parse_params *p} + +%token-pattern RSHIFT />>/ +%token-pattern RANGLE />/ +%token-pattern LANGLE / +#include "pslr_fallback_precedence.h" + +#define YY_DECL int yylex(YYSTYPE *yylval, struct parse_params *p) + +static const char * +token_name(int token) +{ + switch
(token) { + case X: + return "X"; + case COM: + return "COM"; + case A: + return "A"; + case B: + return "B"; + case SHORT: + return "SHORT"; + case IA: + return "IA"; + case IB: + return "IB"; + case CYC_A: + return "CYC_A"; + case CYC_B: + return "CYC_B"; + case CYC_C: + return "CYC_C"; + case ZA: + return "ZA"; + case ZB: + return "ZB"; + case PA: + return "PA"; + case PB: + return "PB"; + case NON: + return "NON"; + case WORD: + return "WORD"; + default: + return "UNKNOWN"; + } +} +%} + +%% + +(.|\n)+ { + yypslr_scan_result result; + int token; + + (void)yylval; + token = YYPSLR_PSEUDO_SCAN_RESULT(p, yytext, &result); + if (result.length <= 0) { + result.length = 1; + } + printf("%s %d\n", token_name(token), result.length); + if (result.length < yyleng) { + yyless(result.length); + } + return token; +} + +<<EOF>> { + return YYEOF; +} + +%% diff --git a/spec/fixtures/integration/pslr_fallback_precedence.y b/spec/fixtures/integration/pslr_fallback_precedence.y new file mode 100644 index 000000000..86f708a5e --- /dev/null +++ b/spec/fixtures/integration/pslr_fallback_precedence.y @@ -0,0 +1,93 @@ +%{ +#include <stdio.h> +#include <string.h> + +#define YY_DECL int yylex(YYSTYPE *lval, struct parse_params *p) + +#include "pslr_fallback_precedence.h" +#include "pslr_fallback_precedence-lexer.h" + +extern int yylex(YYSTYPE *lval, struct parse_params *p); +static int yyerror(YYLTYPE *loc, struct parse_params *p, const char *str); +%} + +%code requires { + struct parse_params { + int current_state; + }; +} + +%define api.pure +%define lr.type pslr +%define api.pslr.state-member current_state + +%lex-param {struct parse_params *p} +%parse-param {struct parse_params *p} + +%token-pattern X /x/ +%token-pattern COM /\/\*(.|\n)*\*\// +%token-pattern A /a/ +%token-pattern B /ab/ +%token-pattern SHORT /c/ +%token-pattern IA /cd/ +%token-pattern IB /cd/ +%token-pattern CYC_A /~/ +%token-pattern CYC_B /~/ +%token-pattern CYC_C /~/ +%token-pattern ZA /z/ +%token-pattern ZB /z/ +%token-pattern PA /@/
+%token-pattern PB /@/ +%token-pattern NON /non-/ +%token-pattern WORD /[a-z-]+/ + +%lex-prec COM -s COM +%lex-prec WORD -< NON +%lex-prec PA <- PB +%lex-prec IA <- IB +%lex-prec CYC_A <- CYC_B +%lex-prec CYC_B <- CYC_C +%lex-prec CYC_C <- CYC_A + +%% + +start + : X { printf("ok\n"); } + ; + +%% + +static int +yyerror(YYLTYPE *loc, struct parse_params *p, const char *str) +{ + (void)loc; + (void)p; + (void)str; + return 0; +} + +int +main(int argc, char *argv[]) +{ + struct parse_params params = { 0 }; + + if (argc == 2 && strcmp(argv[1], "__empty__") == 0) { + yypslr_scan_result result; + int match_length = -1; + int token = YYPSLR_PSEUDO_SCAN_RESULT(&params, "", &result); + int wrapper_token = YYPSLR_PSEUDO_SCAN(&params, "", &match_length); + int ok = token == YYEOF && result.token == YYEOF && wrapper_token == YYEOF && + result.length == 0 && result.is_layout == 0 && result.is_character_token == 0 && + match_length == 0; + + printf("%s %d %d %d %d\n", ok ? "EOF" : "BAD", result.length, result.is_layout, + result.is_character_token, match_length); + return ok ?
0 : 1; + } + + if (argc == 2) { + yy_scan_string(argv[1]); + } + + return yyparse(&params); +} diff --git a/spec/fixtures/integration/pslr_implicit_literal.l b/spec/fixtures/integration/pslr_implicit_literal.l new file mode 100644 index 000000000..31121d203 --- /dev/null +++ b/spec/fixtures/integration/pslr_implicit_literal.l @@ -0,0 +1,30 @@ +%option noinput nounput noyywrap never-interactive + +%{ +#include "pslr_implicit_literal.h" + +#define YY_DECL int yylex(YYSTYPE *yylval, struct parse_params *p) +%} + +%% + +(.|\n)+ { + yypslr_scan_result result; + int token; + + (void)yylval; + token = YYPSLR_PSEUDO_SCAN_RESULT(p, yytext, &result); + if (result.length <= 0) { + result.length = 1; + } + if (result.length < yyleng) { + yyless(result.length); + } + return token; +} + +<<EOF>> { + return YYEOF; +} + +%% diff --git a/spec/fixtures/integration/pslr_implicit_literal.y b/spec/fixtures/integration/pslr_implicit_literal.y new file mode 100644 index 000000000..1f9526429 --- /dev/null +++ b/spec/fixtures/integration/pslr_implicit_literal.y @@ -0,0 +1,66 @@ +%{ +#include <stdio.h> +#include <string.h> + +#define YY_DECL int yylex(YYSTYPE *lval, struct parse_params *p) + +#include "pslr_implicit_literal.h" +#include "pslr_implicit_literal-lexer.h" + +extern int yylex(YYSTYPE *lval, struct parse_params *p); +static int yyerror(YYLTYPE *loc, struct parse_params *p, const char *str); +%} + +%code requires { + struct parse_params { + int current_state; + }; +} + +%define api.pure +%define lr.type pslr +%define api.pslr.state-member current_state + +%lex-param {struct parse_params *p} +%parse-param {struct parse_params *p} + +%token-pattern ID /[a-z]+/ + +%% + +start + : ID ';' { printf("ok\n"); } + ; + +%% + +static int +yyerror(YYLTYPE *loc, struct parse_params *p, const char *str) +{ + (void)loc; + (void)p; + (void)str; + return 0; +} + +int +main(int argc, char *argv[]) +{ + struct parse_params params = { 0 }; + + if (argc == 2 && strcmp(argv[1], "__fallback_semi__") == 0) { + yypslr_scan_result
result; + int token = YYPSLR_PSEUDO_SCAN_RESULT(&params, ";", &result); + int ok = token == ';' && result.token == ';' && result.length == 1 && + result.is_character_token == 0; + + printf("%s %d %d\n", ok ? "SEMI" : "BAD", result.length, result.is_character_token); + return ok ? 0 : 1; + } + + if (argc == 2) { + yy_scan_string(argv[1]); + } + + return yyparse(&params); +} diff --git a/spec/fixtures/integration/pslr_keyword_context.l b/spec/fixtures/integration/pslr_keyword_context.l new file mode 100644 index 000000000..a54322064 --- /dev/null +++ b/spec/fixtures/integration/pslr_keyword_context.l @@ -0,0 +1,41 @@ +%option noinput nounput noyywrap never-interactive + +%{ +#include <stdio.h> +#include "pslr_keyword_context.h" + +#define YY_DECL int yylex(YYSTYPE *yylval, struct parse_params *p) +%} + +%% + +[a-z]+ { + (void)yylval; + + int match_length = 0; + int token = YYPSLR_PSEUDO_SCAN(p, yytext, &match_length); + + if (token == YYEMPTY) { + token = ID; + match_length = yyleng; + } + + if (0 < match_length && match_length < yyleng) { + yyless(match_length); + } + + return token; +} + +[[:space:]]+ {} + +<<EOF>> { + return YYEOF; +} + +. 
{ + fprintf(stderr, "Illegal character '%s'\n", yytext); + return YYUNDEF; +} + +%% diff --git a/spec/fixtures/integration/pslr_keyword_context.y b/spec/fixtures/integration/pslr_keyword_context.y new file mode 100644 index 000000000..6f5f6b509 --- /dev/null +++ b/spec/fixtures/integration/pslr_keyword_context.y @@ -0,0 +1,87 @@ +%{ +#include <stdio.h> + +#define YY_DECL int yylex(YYSTYPE *lval, struct parse_params *p) + +#include "pslr_keyword_context.h" +#include "pslr_keyword_context-lexer.h" + +extern int yylex(YYSTYPE *lval, struct parse_params *p); +static int yyerror(YYLTYPE *loc, struct parse_params *p, const char *str); +%} + +%code requires { + struct parse_params { + int current_state; + }; +} + +%define api.pure +%define lr.type pslr +%define api.pslr.state-member current_state + +%lex-param {struct parse_params *p} +%parse-param {struct parse_params *p} + +%token-pattern P /p/ +%token-pattern Q /q/ +%token-pattern X /x/ +%token-pattern IF /if/ +%token-pattern ID /[a-z]+/ + +%lex-prec ID <~ IF + +%% + +program + : kw_context { printf("kw\n"); } + | id_context { printf("id\n"); } + ; + +kw_context + : P shared IF + ; + +id_context + : Q shared ID + ; + +shared + : n1 + ; + +n1 + : n2 + ; + +n2 + : X + ; + +%% + +static int +yyerror(YYLTYPE *loc, struct parse_params *p, const char *str) +{ + (void)loc; + (void)p; + fprintf(stderr, "parse error: %s\n", str); + return 0; +} + +int +main(int argc, char *argv[]) +{ + struct parse_params params = { 0 }; + + if (argc == 2) { + yy_scan_string(argv[1]); + } + + if (yyparse(&params)) { + fprintf(stderr, "syntax error\n"); + return 1; + } + + return 0; +} diff --git a/spec/fixtures/integration/pslr_layout_comment.l b/spec/fixtures/integration/pslr_layout_comment.l new file mode 100644 index 000000000..82fc8fa77 --- /dev/null +++ b/spec/fixtures/integration/pslr_layout_comment.l @@ -0,0 +1,36 @@ +%option noinput nounput noyywrap never-interactive + +%{ +#include +#include "pslr_layout_comment.h" + +#define YY_DECL int
yylex(YYSTYPE *yylval, struct parse_params *p) +%} + +%% + +(.|\n)+ { + yypslr_scan_result result; + int token; + + (void)yylval; + token = YYPSLR_PSEUDO_SCAN_RESULT(p, yytext, &result); + if (result.length <= 0) { + result.length = 1; + } + if (result.length < yyleng) { + yyless(result.length); + } + /* Layout tokens are scanner-only. Consume them and keep the same parser + state while scanning the remaining input. */ + if (result.is_layout) { + return yylex(yylval, p); + } + return token; +} + +<<EOF>> { + return YYEOF; +} + +%% diff --git a/spec/fixtures/integration/pslr_layout_comment.y b/spec/fixtures/integration/pslr_layout_comment.y new file mode 100644 index 000000000..185fb927a --- /dev/null +++ b/spec/fixtures/integration/pslr_layout_comment.y @@ -0,0 +1,70 @@ +%{ +#include <stdio.h> + +#define YY_DECL int yylex(YYSTYPE *lval, struct parse_params *p) + +#include "pslr_layout_comment.h" +#include "pslr_layout_comment-lexer.h" + +extern int yylex(YYSTYPE *lval, struct parse_params *p); +static int yyerror(YYLTYPE *loc, struct parse_params *p, const char *str); +%} + +%code requires { + struct parse_params { + int current_state; + }; +} + +%define api.pure +%define lr.type pslr +%define api.pslr.state-member current_state + +%lex-param {struct parse_params *p} +%parse-param {struct parse_params *p} + +%token ID +%token DIV "/" +%token SEMI ";" + +%token-pattern ID /[a-zA-Z][a-zA-Z0-9_]*/ +%token-pattern DIV /\// +%token-pattern SEMI /;/ +%token-pattern YYLAYOUT_COMMENT /\/\*([^*]|\*+[^*/])*\*+\// +%token-pattern YYLAYOUT_WS /[ \t\r\n]+/ + +%lex-prec DIV -~ YYLAYOUT_COMMENT + +%% + +start + : ID DIV ID SEMI { printf("ok\n"); } + ; + +%% + +static int +yyerror(YYLTYPE *loc, struct parse_params *p, const char *str) +{ + (void)loc; + (void)p; + fprintf(stderr, "parse error: %s\n", str); + return 0; +} + +int +main(int argc, char *argv[]) +{ + struct parse_params params = { 0 }; + + if (argc == 2) { + yy_scan_string(argv[1]); + } + + if (yyparse(&params)) { + fprintf(stderr, 
"syntax error\n"); + return 1; + } + + return 0; +} diff --git a/spec/fixtures/integration/pslr_mixed_context.l b/spec/fixtures/integration/pslr_mixed_context.l new file mode 100644 index 000000000..5c75c7a31 --- /dev/null +++ b/spec/fixtures/integration/pslr_mixed_context.l @@ -0,0 +1,69 @@ +%option noinput nounput noyywrap never-interactive + +%{ +#include <stdio.h> +#include "pslr_mixed_context.h" + +#define YY_DECL int yylex(YYSTYPE *yylval, struct parse_params *p) +%} + +%% + +[a-z]+ { + (void)yylval; + + int match_length = 0; + int token = YYPSLR_PSEUDO_SCAN(p, yytext, &match_length); + + if (token == YYEMPTY) { + token = ID; + match_length = yyleng; + } + + if (0 < match_length && match_length < yyleng) { + yyless(match_length); + } + + return token; +} + +"<" { + return LT; +} + +"@" { + return START; +} + +"#" { + return MARK; +} + +[>]{1,2} { + int match_length = 0; + int token = YYPSLR_PSEUDO_SCAN(p, yytext, &match_length); + + if (token == YYEMPTY) { + token = (yyleng == 2) ? RSHIFT : RANGLE; + match_length = yyleng; + } + + if (0 < match_length && match_length < yyleng) { + yyless(match_length); + } + + return token; +} + +[[:space:]]+ {} + +<<EOF>> { + return YYEOF; +} + +. 
{ + fprintf(stderr, "Illegal character '%s'\n", yytext); + return YYUNDEF; +} + +%% diff --git a/spec/fixtures/integration/pslr_mixed_context.y b/spec/fixtures/integration/pslr_mixed_context.y new file mode 100644 index 000000000..38eba8c90 --- /dev/null +++ b/spec/fixtures/integration/pslr_mixed_context.y @@ -0,0 +1,102 @@ +%{ +#include <stdio.h> + +#define YY_DECL int yylex(YYSTYPE *lval, struct parse_params *p) + +#include "pslr_mixed_context.h" +#include "pslr_mixed_context-lexer.h" + +extern int yylex(YYSTYPE *lval, struct parse_params *p); +static int yyerror(YYLTYPE *loc, struct parse_params *p, const char *str); +%} + +%code requires { + struct parse_params { + int current_state; + }; +} + +%define api.pure +%define lr.type pslr +%define api.pslr.state-member current_state + +%lex-param {struct parse_params *p} +%parse-param {struct parse_params *p} + +%token-pattern LT />/ +%token-pattern RANGLE />/ + +%lex-prec ID <~ IF +%lex-no-tie RANGLE RSHIFT + +%% + +program + : kw_context { printf("kw\n"); } + | id_context { printf("id\n"); } + | template_expr { printf("template\n"); } + | shift_expr { printf("shift\n"); } + ; + +kw_context + : P shared IF + ; + +id_context + : Q shared ID + ; + +template_expr + : LT shared RANGLE + ; + +shift_expr + : START shared RSHIFT ID + ; + +shared + : n1 + ; + +n1 + : n2 + ; + +n2 + : MARK + ; + +%% + +static int +yyerror(YYLTYPE *loc, struct parse_params *p, const char *str) +{ + (void)loc; + (void)p; + fprintf(stderr, "parse error: %s\n", str); + return 0; +} + +int +main(int argc, char *argv[]) +{ + struct parse_params params = { 0 }; + + if (argc == 2) { + yy_scan_string(argv[1]); + } + + if (yyparse(&params)) { + fprintf(stderr, "syntax error\n"); + return 1; + } + + return 0; +} diff --git a/spec/fixtures/integration/pslr_shift_chain.l b/spec/fixtures/integration/pslr_shift_chain.l new file mode 100644 index 000000000..885ba62e8 --- /dev/null +++ b/spec/fixtures/integration/pslr_shift_chain.l @@ -0,0 +1,58 @@ +%option noinput 
nounput noyywrap never-interactive + +%{ +#include +#include "pslr_shift_chain.h" + +#define YY_DECL int yylex(YYSTYPE *yylval, struct parse_params *p) +%} + +ID_PATTERN [a-z]+ + +%% + +{ID_PATTERN} { + (void)yylval; + return ID; +} + +"<" { + return LT; +} + +"@" { + return START; +} + +"#" { + return MARK; +} + +[>]{1,2} { + int match_length = 0; + int token = YYPSLR_PSEUDO_SCAN(p, yytext, &match_length); + + if (token == YYEMPTY) { + token = (yyleng == 2) ? RSHIFT : RANGLE; + match_length = yyleng; + } + + if (0 < match_length && match_length < yyleng) { + yyless(match_length); + } + + return token; +} + +[[:space:]]+ {} + +<> { + return YYEOF; +} + +. { + fprintf(stderr, "Illegal character '%s'\n", yytext); + return YYUNDEF; +} + +%% diff --git a/spec/fixtures/integration/pslr_shift_chain.y b/spec/fixtures/integration/pslr_shift_chain.y new file mode 100644 index 000000000..d7b63d759 --- /dev/null +++ b/spec/fixtures/integration/pslr_shift_chain.y @@ -0,0 +1,88 @@ +%{ +#include + +#define YY_DECL int yylex(YYSTYPE *lval, struct parse_params *p) + +#include "pslr_shift_chain.h" +#include "pslr_shift_chain-lexer.h" + +extern int yylex(YYSTYPE *lval, struct parse_params *p); +static int yyerror(YYLTYPE *loc, struct parse_params *p, const char *str); +%} + +%code requires { + struct parse_params { + int current_state; + }; +} + +%define api.pure +%define lr.type pslr +%define api.pslr.state-member current_state + +%lex-param {struct parse_params *p} +%parse-param {struct parse_params *p} + +%token-pattern LT />/ +%token-pattern RANGLE />/ +%token-pattern ID /[a-z]+/ + +%lex-no-tie RANGLE RSHIFT + +%% + +program + : template_expr { printf("template\n"); } + | shift_expr { printf("shift\n"); } + ; + +template_expr + : LT shared RANGLE + ; + +shift_expr + : START shared RSHIFT ID + ; + +shared + : n1 + ; + +n1 + : n2 + ; + +n2 + : MARK + ; + +%% + +static int +yyerror(YYLTYPE *loc, struct parse_params *p, const char *str) +{ + (void)loc; + (void)p; + fprintf(stderr, 
"parse error: %s\n", str); + return 0; +} + +int +main(int argc, char *argv[]) +{ + struct parse_params params = { 0 }; + + if (argc == 2) { + yy_scan_string(argv[1]); + } + + if (yyparse(¶ms)) { + fprintf(stderr, "syntax error\n"); + return 1; + } + + return 0; +} diff --git a/spec/fixtures/integration/pslr_template_argument_lists.l b/spec/fixtures/integration/pslr_template_argument_lists.l new file mode 100644 index 000000000..aedab7edf --- /dev/null +++ b/spec/fixtures/integration/pslr_template_argument_lists.l @@ -0,0 +1,36 @@ +%option noinput nounput noyywrap never-interactive + +%{ +#include +#include "pslr_template_argument_lists.h" + +#define YY_DECL int yylex(YYSTYPE *yylval, struct parse_params *p) +%} + +%% + +(.|\n)+ { + yypslr_scan_result result; + int token; + + (void)yylval; + token = YYPSLR_PSEUDO_SCAN_RESULT(p, yytext, &result); + if (result.length <= 0) { + result.length = 1; + } + if (result.length < yyleng) { + yyless(result.length); + } + /* Layout tokens are scanner-only. Consume them and keep the same parser + state while scanning the remaining input. 
*/ + if (result.is_layout) { + return yylex(yylval, p); + } + return token; +} + +<> { + return YYEOF; +} + +%% diff --git a/spec/fixtures/integration/pslr_template_argument_lists.y b/spec/fixtures/integration/pslr_template_argument_lists.y new file mode 100644 index 000000000..baf265b57 --- /dev/null +++ b/spec/fixtures/integration/pslr_template_argument_lists.y @@ -0,0 +1,91 @@ +%{ +#include + +#define YY_DECL int yylex(YYSTYPE *lval, struct parse_params *p) + +#include "pslr_template_argument_lists.h" +#include "pslr_template_argument_lists-lexer.h" + +extern int yylex(YYSTYPE *lval, struct parse_params *p); +static int yyerror(YYLTYPE *loc, struct parse_params *p, const char *str); +%} + +%code requires { + struct parse_params { + int current_state; + }; +} + +%define api.pure +%define lr.type pslr +%define api.pslr.state-member current_state + +%lex-param {struct parse_params *p} +%parse-param {struct parse_params *p} + +%token ID +%token LT "<" +%token RANGLE ">" +%token RSHIFT ">>" +%token SEMI ";" + +%token-pattern ID /[a-zA-Z][a-zA-Z0-9_]*/ +%token-pattern LT // +%token-pattern RSHIFT />>/ +%token-pattern SEMI /;/ +%token-pattern YYLAYOUT /[ \t\r\n]+/ + +%lex-no-tie RANGLE RSHIFT + +%% + +start + : decl SEMI { printf("decl\n"); } + | expr SEMI { printf("expr\n"); } + ; + +decl + : type id + ; + +type + : id LT type RANGLE + | ID + ; + +expr + : id RSHIFT id + ; + +id + : ID + ; + +%% + +static int +yyerror(YYLTYPE *loc, struct parse_params *p, const char *str) +{ + (void)loc; + (void)p; + fprintf(stderr, "parse error: %s\n", str); + return 0; +} + +int +main(int argc, char *argv[]) +{ + struct parse_params params = { 0 }; + + if (argc == 2) { + yy_scan_string(argv[1]); + } + + if (yyparse(¶ms)) { + fprintf(stderr, "syntax error\n"); + return 1; + } + + return 0; +} diff --git a/spec/fixtures/integration/pslr_token_action.y b/spec/fixtures/integration/pslr_token_action.y new file mode 100644 index 000000000..8c36b986f --- /dev/null +++ 
b/spec/fixtures/integration/pslr_token_action.y @@ -0,0 +1,69 @@ +%{ +#include + +#define YY_DECL int yylex(YYSTYPE *lval, struct parse_params *p) + +#include "pslr_token_action.h" +#include "pslr_token_action-lexer.h" + +extern int yylex(YYSTYPE *lval, struct parse_params *p); +static int yyerror(YYLTYPE *loc, struct parse_params *p, const char *str); +%} + +%code requires { + struct parse_params { + int current_state; + }; +} + +%define api.pure +%define lr.type pslr +%define api.pslr.state-member current_state + +%lex-param {struct parse_params *p} +%parse-param {struct parse_params *p} + +%token ID NUM SEMI + +%token-pattern ID /[a-zA-Z][a-zA-Z0-9_]*/ +%token-pattern NUM /[0-9]+/ +%token-pattern YYLAYOUT_WS /[ \t\r\n]+/ + +%token-action ID { /* token action for ID */ } +%token-action NUM { /* token action for NUM */ } + +%lex-prec ID -~ NUM + +%% + +start + : ID NUM ';' { printf("ok\n"); } + ; + +%% + +static int +yyerror(YYLTYPE *loc, struct parse_params *p, const char *str) +{ + (void)loc; + (void)p; + fprintf(stderr, "parse error: %s\n", str); + return 0; +} + +int +main(int argc, char *argv[]) +{ + struct parse_params params = { 0 }; + + if (argc == 2) { + yy_scan_string(argv[1]); + } + + if (yyparse(¶ms)) { + fprintf(stderr, "syntax error\n"); + return 1; + } + + return 0; +} diff --git a/spec/lrama/command_spec.rb b/spec/lrama/command_spec.rb index 58069e4a7..ce30eccf0 100644 --- a/spec/lrama/command_spec.rb +++ b/spec/lrama/command_spec.rb @@ -81,5 +81,105 @@ File.delete("report.output") end end + + context "when a PSLR grammar needs pure-reduce lookahead to choose tokens" do + let(:outfile) { File.join(Dir.tmpdir, "pslr-pure-reduce.c") } + + before do + File.delete(outfile) if File.exist?(outfile) + end + + after do + File.delete(outfile) if File.exist?(outfile) + end + + it "emits parser output successfully" do + command = Lrama::Command.new(["-o", outfile, fixture_path("command/pslr_pure_reduce.y")]) + + expect(command.run).to be_nil + expect(File).to 
exist(outfile) + end + end + + context "when validation aborts" do + let(:outfile) { File.join(Dir.tmpdir, "validate-abort.c") } + + before do + File.delete(outfile) if File.exist?(outfile) + end + + after do + File.delete(outfile) if File.exist?(outfile) + end + + it "fails before writing parser output" do + allow_any_instance_of(Lrama::States).to receive(:validate!).and_raise(SystemExit) + + command = Lrama::Command.new(["-o", outfile, fixture_path("command/basic.y")]) + + expect { command.run }.to raise_error(SystemExit) + expect(File).not_to exist(outfile) + end + end + + context "when a PSLR grammar exceeds the configured state limit" do + let(:outfile) { File.join(Dir.tmpdir, "pslr-growth-limit.c") } + + before do + File.delete(outfile) if File.exist?(outfile) + end + + after do + File.delete(outfile) if File.exist?(outfile) + end + + it "fails before writing parser output" do + command = Lrama::Command.new([ + "-Dpslr.max-states=5", + "-o", outfile, + fixture_path("command/pslr_growth_limit.y") + ]) + + expect do + begin + command.run + rescue SystemExit + nil + end + end.to output(/error: PSLR state growth exceeded pslr.max-states=5/).to_stderr_from_any_process + + expect(File).not_to exist(outfile) + end + end + + context "when PSLR report output is requested" do + let(:outfile) { File.join(Dir.tmpdir, "pslr-report.c") } + let(:report_file) { File.join(Dir.tmpdir, "pslr-report.output") } + + before do + File.delete(outfile) if File.exist?(outfile) + File.delete(report_file) if File.exist?(report_file) + end + + after do + File.delete(outfile) if File.exist?(outfile) + File.delete(report_file) if File.exist?(report_file) + end + + it "writes PSLR metrics into the report file" do + command = Lrama::Command.new([ + "--report=pslr", + "--report-file=#{report_file}", + "-o", outfile, + fixture_path("command/pslr_growth_limit.y") + ]) + + expect(command.run).to be_nil + report = File.read(report_file) + expect(report).to include("PSLR Summary") + 
expect(report).to include("Base states:") + expect(report).to include("Total states:") + end + end end end diff --git a/spec/lrama/grammar/lex_prec_spec.rb b/spec/lrama/grammar/lex_prec_spec.rb new file mode 100644 index 000000000..8681a3a4f --- /dev/null +++ b/spec/lrama/grammar/lex_prec_spec.rb @@ -0,0 +1,50 @@ +# frozen_string_literal: true + +RSpec.describe Lrama::Grammar::LexPrec do + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + + def ident(name) + Lrama::Lexer::Token::Ident.new(s_value: name) + end + + it "stores identity-right and longest-match rules" do + lex_prec.add_rule( + left_token: ident("ID"), + operator: Lrama::Grammar::LexPrec::IDENTITY_RIGHT_LONGEST, + right_token: ident("IF"), + lineno: 1 + ) + + expect(lex_prec.rules.size).to eq(1) + expect(lex_prec.identity_precedes?("IF", "ID")).to be true + expect(lex_prec.identity_precedes?("ID", "IF")).to be false + expect(lex_prec.longest_pair?("ID", "IF")).to be true + end + + it "does not infer transitive identity precedence" do + lex_prec.add_rule( + left_token: ident("A"), + operator: Lrama::Grammar::LexPrec::IDENTITY_RIGHT, + right_token: ident("B"), + lineno: 1 + ) + lex_prec.add_rule( + left_token: ident("B"), + operator: Lrama::Grammar::LexPrec::IDENTITY_RIGHT, + right_token: ident("C"), + lineno: 1 + ) + + expect(lex_prec.identity_precedes?("B", "A")).to be true + expect(lex_prec.identity_precedes?("C", "B")).to be true + expect(lex_prec.identity_precedes?("C", "A")).to be false + end + + it "separates lexical ties from precedence" do + tie = Lrama::Grammar::LexTie.new + tie.add_tie("ID", "IF") + + expect(tie.tied?("ID", "IF")).to be true + expect(lex_prec.identity_precedes?("IF", "ID")).to be false + end +end diff --git a/spec/lrama/grammar/token_action_spec.rb b/spec/lrama/grammar/token_action_spec.rb new file mode 100644 index 000000000..238d0a2c9 --- /dev/null +++ b/spec/lrama/grammar/token_action_spec.rb @@ -0,0 +1,23 @@ +# frozen_string_literal: true + +RSpec.describe 
Lrama::Grammar::TokenAction do + def ident(name) + Lrama::Lexer::Token::Ident.new(s_value: name) + end + + def user_code(code) + Lrama::Lexer::Token::UserCode.new(s_value: code) + end + + it "stores token action attributes" do + action = Lrama::Grammar::TokenAction.new( + token_id: ident("ID"), + code: user_code('printf("matched");'), + lineno: 5 + ) + + expect(action.token_name).to eq("ID") + expect(action.code.s_value).to eq('printf("matched");') + expect(action.lineno).to eq(5) + end +end diff --git a/spec/lrama/grammar/token_pattern_spec.rb b/spec/lrama/grammar/token_pattern_spec.rb new file mode 100644 index 000000000..2318865b9 --- /dev/null +++ b/spec/lrama/grammar/token_pattern_spec.rb @@ -0,0 +1,41 @@ +# frozen_string_literal: true + +RSpec.describe Lrama::Grammar::TokenPattern do + it "stores token pattern information" do + id = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + pattern = Lrama::Lexer::Token::Regex.new(s_value: "/>>>/") + + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: pattern, + alias_name: "right shift", + tag: nil, + lineno: 1, + definition_order: 0 + ) + + expect(token_pattern.name).to eq("RSHIFT") + expect(token_pattern.regex_pattern).to eq(">>>") + expect(token_pattern.alias_name).to eq("right shift") + expect(token_pattern.definition_order).to eq(0) + end + + it "recognizes YYLAYOUT-prefixed layout tokens" do + pattern = Lrama::Lexer::Token::Regex.new(s_value: "/[ ]+/") + layout = Lrama::Grammar::TokenPattern.new( + id: Lrama::Lexer::Token::Ident.new(s_value: "YYLAYOUT_COMMENT"), + pattern: pattern, + lineno: 1, + definition_order: 0 + ) + normal = Lrama::Grammar::TokenPattern.new( + id: Lrama::Lexer::Token::Ident.new(s_value: "ID"), + pattern: pattern, + lineno: 1, + definition_order: 1 + ) + + expect(layout).to be_layout + expect(normal).not_to be_layout + end +end diff --git a/spec/lrama/grammar_spec.rb b/spec/lrama/grammar_spec.rb index 3be8eab4f..4a663401d 100644 --- a/spec/lrama/grammar_spec.rb +++ 
b/spec/lrama/grammar_spec.rb @@ -242,5 +242,247 @@ end end end + + context 'when PSLR state member is not a valid C identifier' do + before do + grammar.define = { + 'lr.type' => 'pslr', + 'api.pslr.state-member' => 'current-state' + } + end + + it 'raises an error with the invalid member name' do + expect { grammar.validate! } + .to raise_error(RuntimeError, '%define api.pslr.state-member must be a valid C identifier, got "current-state".') + end + end + + context 'when PSLR max states is not an integer' do + before do + grammar.define = { + 'lr.type' => 'pslr', + 'pslr.max-states' => 'many' + } + end + + it 'raises an error with the invalid value' do + expect { grammar.validate! } + .to raise_error(RuntimeError, '%define pslr.max-states must be an integer, got "many".') + end + end + + context 'when PSLR max state ratio is smaller than one' do + before do + grammar.define = { + 'lr.type' => 'pslr', + 'pslr.max-state-ratio' => '0.5' + } + end + + it 'raises an error with the invalid ratio' do + expect { grammar.validate! } + .to raise_error(RuntimeError, '%define pslr.max-state-ratio must be greater than or equal to 1.0, got "0.5".') + end + end + end + + describe "#finalize_lexical_ties!" do + def build_pslr_grammar(source) + grammar = Lrama::Parser.new(source, "lex_tie.y").parse + grammar.prepare + grammar.validate! 
+ grammar + end + + it "keeps token-token ties even without a scanner conflict" do + grammar = build_pslr_grammar(<<~GRAMMAR) + %define lr.type pslr + %token-pattern A /a/ + %token-pattern B /b/ + %lex-tie A B + %% + start: A | B ; + GRAMMAR + + grammar.finalize_lexical_ties!(Lrama::ScannerFSA.new(grammar.token_patterns)) + + expect(grammar.lex_tie.tied?("A", "B")).to be true + end + + it "limits set-set ties to scanner-conflicting pairs" do + grammar = build_pslr_grammar(<<~GRAMMAR) + %define lr.type pslr + %token-pattern RANGLE />/ + %token-pattern RSHIFT />>/ + %token-pattern DOT /\\./ + %token-pattern COMMA /,/ + %symbol-set punct RANGLE RSHIFT DOT COMMA + %lex-tie punct punct + %% + start: RANGLE | RSHIFT | DOT | COMMA ; + GRAMMAR + + grammar.finalize_lexical_ties!(Lrama::ScannerFSA.new(grammar.token_patterns)) + + expect(grammar.lex_tie.tied?("RANGLE", "RSHIFT")).to be true + expect(grammar.lex_tie.tied?("DOT", "COMMA")).to be false + end + + it "limits set-token ties to scanner-conflicting pairs" do + grammar = build_pslr_grammar(<<~GRAMMAR) + %define lr.type pslr + %token-pattern ID /[a-z]+/ + %token-pattern KW_IF /if/ + %token-pattern KW_WHILE /while/ + %token-pattern PLUS /\\+/ + %symbol-set keywords KW_IF KW_WHILE + %lex-tie ID keywords + %lex-tie PLUS keywords + %% + start: ID | KW_IF | KW_WHILE | PLUS ; + GRAMMAR + + grammar.finalize_lexical_ties!(Lrama::ScannerFSA.new(grammar.token_patterns)) + + expect(grammar.lex_tie.tied?("ID", "KW_IF")).to be true + expect(grammar.lex_tie.tied?("ID", "KW_WHILE")).to be true + expect(grammar.lex_tie.tied?("PLUS", "KW_IF")).to be false + expect(grammar.lex_tie.tied?("PLUS", "KW_WHILE")).to be false + end + + it "limits yyall ties to scanner-conflicting pairs" do + grammar = build_pslr_grammar(<<~GRAMMAR) + %define lr.type pslr + %token-pattern PLUS /\\+/ + %token-pattern PLUSPLUS /\\+\\+/ + %token-pattern DOT /\\./ + %token-pattern SLASH /\\// + %lex-tie yyall yyall + %% + start: PLUS | PLUSPLUS | DOT | SLASH ; + 
GRAMMAR + + grammar.finalize_lexical_ties!(Lrama::ScannerFSA.new(grammar.token_patterns)) + + expect(grammar.lex_tie.tied?("PLUS", "PLUSPLUS")).to be true + expect(grammar.lex_tie.tied?("DOT", "SLASH")).to be false + expect(grammar.lex_tie.tied?("PLUS", "DOT")).to be false + expect(grammar.lex_tie.tied?("SLASH", "PLUSPLUS")).to be false + end + + it "lets a specific tie override generic yyall no-tie" do + grammar = build_pslr_grammar(<<~GRAMMAR) + %define lr.type pslr + %token-pattern IF /if/ + %token-pattern ID /[a-z]+/ + %symbol-set keywords IF + %lex-no-tie yyall yyall + %lex-tie ID keywords + %% + start: IF | ID ; + GRAMMAR + + grammar.finalize_lexical_ties!(Lrama::ScannerFSA.new(grammar.token_patterns)) + + expect(grammar.lex_tie.tied?("ID", "IF")).to be true + expect(grammar.lex_tie.no_tie?("ID", "IF")).to be false + end + + it "rejects a direct no-tie that conflicts with transitive ties" do + grammar = build_pslr_grammar(<<~GRAMMAR) + %define lr.type pslr + %token-pattern A /a/ + %token-pattern B /a/ + %token-pattern C /a/ + %lex-tie A B + %lex-tie B C + %lex-no-tie A C + %% + start: A | B | C ; + GRAMMAR + + expect do + grammar.finalize_lexical_ties!(Lrama::ScannerFSA.new(grammar.token_patterns)) + end.to raise_error(RuntimeError, /%lex-no-tie A C conflicts/) + end + end + + describe "#synthesize_implicit_literal_token_patterns!" do + it "adds exact-match token patterns for character literal terminals" do + grammar = Lrama::Parser.new(<<~GRAMMAR, "implicit_literal.y").parse + %define lr.type pslr + %token-pattern ID /[a-z]+/ + %% + start: ID ';' ; + GRAMMAR + grammar.prepare + grammar.validate! + + grammar.synthesize_implicit_literal_token_patterns! 
+ + literal_pattern = grammar.token_patterns.find {|pattern| pattern.name == "';'" } + expect(literal_pattern).not_to be_nil + expect(Lrama::ScannerFSA.new(grammar.token_patterns).scan(";").map {|result| result[:token].name }).to include("';'") + end + + it "escapes regex metacharacters in character literal patterns" do + grammar = Lrama::Parser.new(<<~GRAMMAR, "implicit_escape.y").parse + %define lr.type pslr + %token-pattern ID /[a-z]+/ + %% + start + : ID '/' + | ID '[' + | ID ']' + | ID '+' + ; + GRAMMAR + grammar.prepare + grammar.validate! + + grammar.synthesize_implicit_literal_token_patterns! + + slash = grammar.token_patterns.find {|p| p.name == "'/'" } + lbracket = grammar.token_patterns.find {|p| p.name == "'['" } + rbracket = grammar.token_patterns.find {|p| p.name == "']'" } + plus = grammar.token_patterns.find {|p| p.name == "'+'" } + + expect(slash).not_to be_nil + expect(slash.regex_pattern).to eq("\\/") + expect(lbracket).not_to be_nil + expect(lbracket.regex_pattern).to eq("\\[") + expect(rbracket).not_to be_nil + expect(rbracket.regex_pattern).to eq("\\]") + expect(plus).not_to be_nil + expect(plus.regex_pattern).to eq("\\+") + end + + it "handles backslash and control character literals" do + grammar = Lrama::Parser.new(<<~GRAMMAR, "implicit_ctrl.y").parse + %define lr.type pslr + %token-pattern ID /[a-z]+/ + %% + start + : ID '\\\\' + | ID '\\n' + | ID '\\t' + ; + GRAMMAR + grammar.prepare + grammar.validate! + + grammar.synthesize_implicit_literal_token_patterns! 
+ + backslash = grammar.token_patterns.find {|p| p.name == "'\\\\'" } + newline = grammar.token_patterns.find {|p| p.name == "'\\n'" } + tab = grammar.token_patterns.find {|p| p.name == "'\\t'" } + + expect(backslash).not_to be_nil + expect(backslash.regex_pattern).to eq("\\\\") + expect(newline).not_to be_nil + expect(newline.regex_pattern).to eq("\\n") + expect(tab).not_to be_nil + expect(tab.regex_pattern).to eq("\\t") + end end end diff --git a/spec/lrama/integration_spec.rb b/spec/lrama/integration_spec.rb index c0bba17f4..66ed9f35a 100644 --- a/spec/lrama/integration_spec.rb +++ b/spec/lrama/integration_spec.rb @@ -130,6 +130,160 @@ def generate_object(grammar_file_path, c_path, obj_path, command_args: []) end end + describe "PSLR parser and lexer integration" do + it "selects the longer token in shift contexts" do + test_parser("pslr_context", "foo>>bar", "shift\n") + end + + it "can prefer the shorter match in template contexts" do + test_parser("pslr_context", "foo>", "template\n") + end + + it "splits keyword and identifier contexts with the same lexeme" do + test_parser("pslr_keyword_context", "p x if", "kw\n") + test_parser("pslr_keyword_context", "q x if", "id\n") + end + + it "keeps chained shift and template contexts distinct" do + test_parser("pslr_shift_chain", "< # >", "template\n") + test_parser("pslr_shift_chain", "@ # >> foo", "shift\n") + end + + it "handles mixed keyword, identifier, template, and shift contexts in one grammar" do + test_parser("pslr_mixed_context", "p # if", "kw\n") + test_parser("pslr_mixed_context", "q # if", "id\n") + test_parser("pslr_mixed_context", "< # >", "template\n") + test_parser("pslr_mixed_context", "@ # >> foo", "shift\n") + end + + it "handles template argument lists without a short-token lex-prec override" do + test_parser("pslr_template_argument_lists", "vector> v;", "decl\n") + test_parser("pslr_template_argument_lists", "a >> b;", "expr\n") + end + + it "discards layout comments that share a prefix with 
normal tokens" do + test_parser("pslr_layout_comment", "a/* comment */ / b;", "ok\n") + end + + it "reports unresolved layout scanner conflicts without an explicit length rule" do + grammar_text = File.read(fixture_path("integration/pslr_layout_comment.y")) + .sub("%lex-prec DIV -~ YYLAYOUT_COMMENT\n", "") + grammar = Lrama::Parser.new(grammar_text, "integration/pslr_layout_comment_no_prec.y").parse + grammar.prepare + grammar.validate! + states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + states.compute + states.compute_pslr + logger = Lrama::Logger.new + allow(logger).to receive(:error) + + expect { states.validate!(logger) }.to raise_error(SystemExit) + expect(logger).to have_received(:error).with(a_string_including("DIV", "YYLAYOUT_COMMENT")) + end + + it "honors fallback shortest-match length precedence" do + test_parser("pslr_fallback_precedence", "/*a*/ b /*c*/", "COM 5\n", expect_success: false) + end + + it "uses traditional longest match for unspecified fallback length conflicts" do + test_parser("pslr_fallback_precedence", "ab", "B 2\n", expect_success: false) + end + + it "honors fallback right-token length precedence" do + test_parser("pslr_fallback_precedence", "non-euclidean", "NON 4\n", expect_success: false) + end + + it "uses declaration order for unspecified fallback identity conflicts" do + test_parser("pslr_fallback_precedence", "z", "ZA 1\n", expect_success: false) + end + + it "uses explicit identity precedence before fallback declaration order" do + test_parser("pslr_fallback_precedence", "@", "PB 1\n", expect_success: false) + end + + it "preserves explicit identity precedence when fallback length precedence is needed" do + test_parser("pslr_fallback_precedence", "cd", "IB 2\n", expect_success: false) + end + + it "uses declaration order for fallback-only explicit identity cycles" do + test_parser("pslr_fallback_precedence", "~", "CYC_A 1\n", expect_success: false) + end + + it "returns YYEOF for empty input in PSLR 
pseudo-scan helpers" do + test_parser("pslr_fallback_precedence", "__empty__", "EOF 0 0 0 0\n") + end + + it "synthesizes implicit literal token patterns for parser-state rows" do + test_parser("pslr_implicit_literal", "x;", "ok\n") + end + + it "uses implicit literal token patterns in the fallback row" do + test_parser("pslr_implicit_literal", "__fallback_semi__", "SEMI 1 0\n") + end + + it "does not use fallback longest rules for normal parser-state rows" do + grammar_text = <<~GRAMMAR + %define lr.type pslr + %token-pattern A /a/ + %token-pattern B /ab/ + + %% + + start + : A + | B + ; + GRAMMAR + grammar = Lrama::Parser.new(grammar_text, "integration/pslr_normal_conflict.y").parse + grammar.prepare + grammar.validate! + states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + states.compute + states.compute_pslr + logger = Lrama::Logger.new + allow(logger).to receive(:error) + + expect { states.validate!(logger) }.to raise_error(SystemExit) + expect(logger).to have_received(:error).with(a_string_including("unresolved PSLR scanner conflict", "A", "B")) + end + + it "does not use fallback declaration order for normal parser-state identity cycles" do + grammar_text = <<~GRAMMAR + %define lr.type pslr + %token-pattern A /a/ + %token-pattern B /a/ + %token-pattern C /a/ + + %lex-prec A <- B + %lex-prec B <- C + %lex-prec C <- A + + %% + + start + : A + | B + | C + ; + GRAMMAR + grammar = Lrama::Parser.new(grammar_text, "integration/pslr_normal_identity_cycle.y").parse + grammar.prepare + grammar.validate! 
+ states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + states.compute + states.compute_pslr + logger = Lrama::Logger.new + allow(logger).to receive(:error) + + expect { states.validate!(logger) }.to raise_error(SystemExit) + expect(logger).to have_received(:error).with(a_string_including("unresolved PSLR scanner conflict", "A", "B", "C")) + end + + it "consumes an unmatched PSLR character token before reporting an error" do + test_parser("pslr_template_argument_lists", "$", "", expect_success: false) + end + end + describe "user defined parameterized rules" do it "prints messages corresponding to rules" do expected = <<~STR @@ -307,4 +461,232 @@ def generate_object(grammar_file_path, c_path, obj_path, command_args: []) end end end + + describe "PSLR context-dependent lexing" do + describe "Scanner FSA with overlapping patterns" do + let(:rangle) do + id = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + end + + let(:rshift) do + id = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>>/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 1 + ) + end + + let(:scanner_fsa) { Lrama::ScannerFSA.new([rangle, rshift]) } + + it "recognizes both RANGLE and RSHIFT as possible matches for '>>'" do + results = scanner_fsa.scan(">>") + + token_names = results.map { |r| r[:token].name } + expect(token_names).to include("RANGLE") + expect(token_names).to include("RSHIFT") + end + + it "RANGLE matches at position 1, RSHIFT matches at position 2" do + results = scanner_fsa.scan(">>") + + rangle_match = results.find { |r| r[:token].name == "RANGLE" } + rshift_match = results.find { |r| r[:token].name == "RSHIFT" } + + expect(rangle_match[:position]).to eq(1) + expect(rshift_match[:position]).to eq(2) + 
end + end + + describe "Length precedence resolution" do + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + + before do + left = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + right = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + lex_prec.add_rule( + left_token: left, + operator: Lrama::Grammar::LexPrec::SHORTEST, + right_token: right, + lineno: 1 + ) + end + + let(:length_prec) { Lrama::LengthPrecedences.new(lex_prec) } + + it "indicates RANGLE (shorter) should be preferred over RSHIFT (longer)" do + expect(length_prec.prefer_shorter?("RANGLE", "RSHIFT")).to be true + end + + it "returns :left precedence for RANGLE vs RSHIFT" do + expect(length_prec.precedence("RANGLE", "RSHIFT")).to eq(:left) + end + end + + describe "Keyword vs identifier precedence" do + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + + before do + left = Lrama::Lexer::Token::Ident.new(s_value: "ID") + right = Lrama::Lexer::Token::Ident.new(s_value: "IF") + lex_prec.add_rule( + left_token: left, + operator: Lrama::Grammar::LexPrec::IDENTITY_RIGHT_LONGEST, + right_token: right, + lineno: 1 + ) + end + + it "indicates IF has higher priority than ID" do + expect(lex_prec.identity_precedes?("IF", "ID")).to be true + end + + it "indicates ID does not have higher priority than IF" do + expect(lex_prec.identity_precedes?("ID", "IF")).to be false + end + end + + describe "Full PSLR grammar compilation" do + let(:grammar_text) do + <<~GRAMMAR + %define lr.type pslr + %token-pattern RSHIFT />>/ "right shift" + %token-pattern RANGLE />/ "right angle" + %token-pattern LANGLE />") + token_names = results.map { |r| r[:token].name } + + expect(token_names).to include("RANGLE") + expect(token_names).to include("RSHIFT") + end + + describe "context-dependent token selection" do + it "scanner_accepts table is built" do + expect(states.scanner_accepts_table).not_to be_nil + end + + it "different parser states may accept different tokens for same FSA state" do + scanner_accepts = 
states.scanner_accepts_table + scanner_fsa = states.scanner_fsa + + results = scanner_fsa.scan(">>") + rshift_result = results.find { |r| r[:token].name == "RSHIFT" } + rangle_result = results.find { |r| r[:token].name == "RANGLE" } + + expect(rshift_result).not_to be_nil + expect(rangle_result).not_to be_nil + expect(scanner_accepts.table).to be_a(Hash) + end + end + + describe "generated C code output" do + let(:out) { StringIO.new } + let(:context) { Lrama::Context.new(states) } + let(:output) do + Lrama::Output.new( + out: out, + output_file_path: "pslr_test.c", + template_name: "bison/yacc.c", + grammar_file_path: "pslr_test.y", + context: context, + grammar: grammar + ) + end + + before do + output.render + out.rewind + end + + let(:rendered) { out.read } + + it "includes yy_scanner_transition table" do + expect(rendered).to include("yy_scanner_transition") + expect(rendered).to include("YY_SCANNER_NUM_STATES") + end + + it "includes yy_state_to_accepting mapping" do + expect(rendered).to include("yy_state_to_accepting") + expect(rendered).to include("YY_ACCEPTING_NONE") + end + + it "includes yy_pslr_length_precedes table" do + expect(rendered).to include("yy_pslr_length_precedes") + end + + it "includes yy_pseudo_scan function" do + expect(rendered).to include("yy_pseudo_scan") + expect(rendered).to include("parser_state") + expect(rendered).to include("match_length") + end + + it "pseudo_scan function uses length precedences for token selection" do + expect(rendered).to include("yy_pslr_length_precedes[pbest][pattern_index]") + end + end + end + end end diff --git a/spec/lrama/length_precedences_spec.rb b/spec/lrama/length_precedences_spec.rb new file mode 100644 index 000000000..c8eef01d0 --- /dev/null +++ b/spec/lrama/length_precedences_spec.rb @@ -0,0 +1,129 @@ +# frozen_string_literal: true + +RSpec.describe Lrama::LengthPrecedences do + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + + def ident(name) + Lrama::Lexer::Token::Ident.new(s_value: name) + 
end + + def add_rule(left, operator, right, lineno) + lex_prec.add_rule( + left_token: ident(left), + operator: operator, + right_token: ident(right), + lineno: lineno + ) + end + + describe "#resolution" do + it "defaults same-token autolength conflicts to longest match" do + length_prec = Lrama::LengthPrecedences.new(lex_prec) + + expect(length_prec.resolution("ID", "ID")).to eq(Lrama::LengthPrecedences::PREFER_NEW) + expect(length_prec.precedes?("ID", "ID")).to be true + end + + it "leaves different-token length conflicts unresolved without a rule" do + length_prec = Lrama::LengthPrecedences.new(lex_prec) + + expect(length_prec.resolution("A", "B")).to eq(Lrama::LengthPrecedences::UNRESOLVED) + expect(length_prec.precedence("A", "B")).to eq(:undefined) + end + + it "supports explicit shortest-match precedence" do + lex_prec.add_rule( + left_token: ident("COM"), + operator: Lrama::Grammar::LexPrec::SHORTEST, + right_token: ident("COM"), + lineno: 1 + ) + length_prec = Lrama::LengthPrecedences.new(lex_prec) + + expect(length_prec.resolution("COM", "COM")).to eq(Lrama::LengthPrecedences::PREFER_OLD) + expect(length_prec.prefer_shorter?("COM", "COM")).to be true + end + + it "supports explicit longest-match precedence" do + lex_prec.add_rule( + left_token: ident("ID"), + operator: Lrama::Grammar::LexPrec::LONGEST, + right_token: ident("IF"), + lineno: 1 + ) + length_prec = Lrama::LengthPrecedences.new(lex_prec) + + expect(length_prec.resolution("ID", "IF")).to eq(Lrama::LengthPrecedences::PREFER_NEW) + expect(length_prec.resolution("IF", "ID")).to eq(Lrama::LengthPrecedences::PREFER_NEW) + end + + it "supports right-token length precedence" do + lex_prec.add_rule( + left_token: ident("WORD"), + operator: Lrama::Grammar::LexPrec::TOKEN_RIGHT_LENGTH, + right_token: ident("NON"), + lineno: 1 + ) + length_prec = Lrama::LengthPrecedences.new(lex_prec) + + expect(length_prec.resolution("WORD", "NON")).to eq(Lrama::LengthPrecedences::PREFER_NEW) + 
expect(length_prec.resolution("NON", "WORD")).to eq(Lrama::LengthPrecedences::PREFER_OLD) + end + end + + describe "#fallback_precedes?" do + it "uses traditional longest match for unspecified fallback length conflicts" do + length_prec = Lrama::LengthPrecedences.new(lex_prec) + + expect(length_prec.normal_precedes?("A", "B")).to be false + expect(length_prec.fallback_precedes?("A", "B")).to be true + end + + it "respects explicit same-token shortest-match precedence" do + add_rule("COM", Lrama::Grammar::LexPrec::SHORTEST, "COM", 1) + length_prec = Lrama::LengthPrecedences.new(lex_prec) + + expect(length_prec.normal_precedes?("COM", "COM")).to be false + expect(length_prec.fallback_precedes?("COM", "COM")).to be false + end + + it "respects explicit right-token length precedence" do + add_rule("WORD", Lrama::Grammar::LexPrec::TOKEN_RIGHT_LENGTH, "NON", 1) + length_prec = Lrama::LengthPrecedences.new(lex_prec) + + expect(length_prec.fallback_precedes?("WORD", "NON")).to be true + expect(length_prec.fallback_precedes?("NON", "WORD")).to be false + end + end + + describe "#initialize" do + it "rejects contradictory shortest and longest rules for the same scan direction" do + add_rule("RANGLE", Lrama::Grammar::LexPrec::SHORTEST, "RSHIFT", 10) + add_rule("RANGLE", Lrama::Grammar::LexPrec::LONGEST, "RSHIFT", 12) + + expect { Lrama::LengthPrecedences.new(lex_prec) } + .to raise_error( + Lrama::LengthPrecedences::LexicalPrecedenceConflictError, + /RANGLE -> RSHIFT.*-s at line 10.*-~ at line 12/m + ) + end + + it "rejects contradictory right-token length winners in reverse declarations" do + add_rule("RANGLE", Lrama::Grammar::LexPrec::TOKEN_RIGHT_LENGTH, "RSHIFT", 20) + add_rule("RSHIFT", Lrama::Grammar::LexPrec::TOKEN_RIGHT_LENGTH, "RANGLE", 21) + + expect { Lrama::LengthPrecedences.new(lex_prec) } + .to raise_error( + Lrama::LengthPrecedences::LexicalPrecedenceConflictError, + /RSHIFT -> RANGLE.*-< at line 20.*-< at line 21/m + ) + end + + it "allows repeated declarations 
with the same length resolution" do + add_rule("RANGLE", Lrama::Grammar::LexPrec::LONGEST, "RSHIFT", 30) + add_rule("RSHIFT", Lrama::Grammar::LexPrec::LONGEST, "RANGLE", 31) + + expect { Lrama::LengthPrecedences.new(lex_prec) }.not_to raise_error + end + end +end diff --git a/spec/lrama/lexer/token/regex_spec.rb b/spec/lrama/lexer/token/regex_spec.rb new file mode 100644 index 000000000..bc4fb38ea --- /dev/null +++ b/spec/lrama/lexer/token/regex_spec.rb @@ -0,0 +1,37 @@ +# frozen_string_literal: true + +RSpec.describe Lrama::Lexer::Token::Regex do + describe "#pattern" do + it "returns the pattern without surrounding slashes" do + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>>>/") + expect(regex.pattern).to eq(">>>") + end + + it "handles character class patterns" do + regex = Lrama::Lexer::Token::Regex.new(s_value: "/[a-zA-Z_][a-zA-Z0-9_]*/") + expect(regex.pattern).to eq("[a-zA-Z_][a-zA-Z0-9_]*") + end + + it "handles escape sequences" do + regex = Lrama::Lexer::Token::Regex.new(s_value: "/\\+/") + expect(regex.pattern).to eq("\\+") + end + + it "handles empty pattern" do + regex = Lrama::Lexer::Token::Regex.new(s_value: "//") + expect(regex.pattern).to eq("") + end + + it "handles single character pattern" do + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>/") + expect(regex.pattern).to eq(">") + end + end + + describe "#s_value" do + it "returns the original value including slashes" do + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>>>/") + expect(regex.s_value).to eq("/>>>/") + end + end +end diff --git a/spec/lrama/lexer_context_classifier_spec.rb b/spec/lrama/lexer_context_classifier_spec.rb new file mode 100644 index 000000000..0ad703ca1 --- /dev/null +++ b/spec/lrama/lexer_context_classifier_spec.rb @@ -0,0 +1,457 @@ +# frozen_string_literal: true + +RSpec.describe Lrama::LexerContextClassifier do + include PslrFamilyHelper + + # Helper to build a classifier with standard CRuby-like contexts + def build_classifier_with_contexts + 
lexer_contexts = {} + [ + ["BEG", %w[keyword_if keyword_unless keyword_while keyword_do tPLUS tMINUS tLPAREN tLBRACK tLBRACE]], + ["CMDARG", %w[tIDENTIFIER tFID tCONSTANT]], + ["END", %w[tINTEGER tFLOAT tSTRING_END keyword_end tRPAREN tRBRACK tRBRACE]], + ["ENDFN", %w[keyword_def]], + ["DOT", %w[tDOT tCOLON2 tANDDOT]], + ].each_with_index do |(name, syms), idx| + lc = Lrama::Grammar::LexerContext.new(name: name, index: idx) + syms.each do |s| + lc.add_symbols([double("token", s_value: s)]) + end + lexer_contexts[name] = lc + end + described_class.new(lexer_contexts) + end + + describe "context bitmask assignment" do + it "assigns non-overlapping bitmask flags by definition order" do + classifier = build_classifier_with_contexts + bitmasks = classifier.contexts.map(&:bitmask) + + # All bitmasks should be powers of 2 + bitmasks.each do |bm| + expect(bm).to be > 0 + expect(bm & (bm - 1)).to eq(0), "#{bm} is not a power of 2" + end + + # No two should overlap + bitmasks.combination(2).each do |a, b| + expect(a & b).to eq(0), "Bitmasks #{a} and #{b} overlap" + end + end + end + + describe ".context_name" do + let(:lexer_contexts) do + lcs = {} + lc = Lrama::Grammar::LexerContext.new(name: "BEG", index: 0) + lcs["BEG"] = lc + lc2 = Lrama::Grammar::LexerContext.new(name: "CMDARG", index: 1) + lcs["CMDARG"] = lc2 + lcs + end + + it "returns UNKNOWN for 0" do + expect(described_class.context_name(0, lexer_contexts)).to eq("UNKNOWN") + end + + it "returns single context name for single flag" do + expect(described_class.context_name(0x01, lexer_contexts)).to eq("BEG") + expect(described_class.context_name(0x02, lexer_contexts)).to eq("CMDARG") + end + + it "returns combined name for multiple flags" do + name = described_class.context_name(0x01 | 0x02, lexer_contexts) + expect(name).to include("BEG") + expect(name).to include("CMDARG") + end + end + + describe "#classify_symbol_context" do + let(:classifier) { build_classifier_with_contexts } + + it "classifies operator-like 
terminals" do + %w[tPLUS tMINUS].each do |name| + sym = double("symbol", id: double("id", s_value: name), term?: true) + ctx = classifier.classify_symbol_context(sym) + # BEG = 1 << 0 = 0x01 + expect(ctx).to eq(0x01), "Expected #{name} to be BEG" + end + end + + it "classifies identifier terminals as CMDARG" do + %w[tIDENTIFIER tFID tCONSTANT].each do |name| + sym = double("symbol", id: double("id", s_value: name), term?: true) + ctx = classifier.classify_symbol_context(sym) + # CMDARG = 1 << 1 = 0x02 + expect(ctx).to eq(0x02), "Expected #{name} to be CMDARG" + end + end + + it "classifies literal terminals as END" do + %w[tINTEGER tFLOAT tSTRING_END].each do |name| + sym = double("symbol", id: double("id", s_value: name), term?: true) + ctx = classifier.classify_symbol_context(sym) + # END = 1 << 2 = 0x04 + expect(ctx).to eq(0x04), "Expected #{name} to be END" + end + end + + it "classifies keyword_def as ENDFN" do + sym = double("symbol", id: double("id", s_value: "keyword_def"), term?: true) + ctx = classifier.classify_symbol_context(sym) + # ENDFN = 1 << 3 = 0x08 + expect(ctx).to eq(0x08) + end + + it "classifies dot tokens as DOT" do + %w[tDOT tCOLON2 tANDDOT].each do |name| + sym = double("symbol", id: double("id", s_value: name), term?: true) + ctx = classifier.classify_symbol_context(sym) + # DOT = 1 << 4 = 0x10 + expect(ctx).to eq(0x10), "Expected #{name} to be DOT" + end + end + + it "classifies open brackets as BEG" do + %w[tLPAREN tLBRACK tLBRACE].each do |name| + sym = double("symbol", id: double("id", s_value: name), term?: true) + ctx = classifier.classify_symbol_context(sym) + expect(ctx).to eq(0x01), "Expected #{name} to be BEG" + end + end + + it "classifies close brackets as END" do + %w[tRPAREN tRBRACK tRBRACE].each do |name| + sym = double("symbol", id: double("id", s_value: name), term?: true) + ctx = classifier.classify_symbol_context(sym) + expect(ctx).to eq(0x04), "Expected #{name} to be END" + end + end + + it "returns 0 for unknown 
symbols" do + sym = double("symbol", id: double("id", s_value: "unknown_token"), term?: true) + ctx = classifier.classify_symbol_context(sym) + expect(ctx).to eq(0) + end + end + + describe "#classify with grammar-defined contexts" do + context "with %lexer-context directives" do + let(:grammar) do + build_grammar(<<~GRAMMAR, "lexer_context/basic.y") + %define lr.type pslr + %token-pattern IF /if/ + %token-pattern ID /[a-z]+/ + %lex-prec ID <~ IF + + %lexer-context BEG IF + %lexer-context CMDARG ID + + %% + + program + : expr + ; + + expr + : ID + | expr '+' expr + ; + GRAMMAR + end + + it "classifies states without errors" do + states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + states.compute + states.compute_pslr + + states.states.each do |state| + expect(state.lexer_context).not_to be_nil + end + end + end + + context "with operator-heavy grammar" do + let(:grammar) do + build_grammar(<<~GRAMMAR, "lexer_context/operators.y") + %define lr.type pslr + %token-pattern PLUS /\\+/ + %token-pattern STAR /\\*/ + %token-pattern ID /[a-z]+/ + %token-pattern NUM /[0-9]+/ + + %lexer-context BEG PLUS STAR + %lexer-context CMDARG ID + %lexer-context END NUM + + %% + + program + : expr + ; + + expr + : NUM + | ID + | expr PLUS expr + | expr STAR expr + ; + GRAMMAR + end + + it "classifies all states" do + states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + states.compute + states.compute_pslr + + states.states.each do |state| + expect(state.lexer_context).not_to be_nil + end + end + end + end + + describe "integration with States" do + context "lexer_context_enabled?" 
do + it "returns false when no %lexer-context directives" do + grammar = build_grammar(<<~GRAMMAR, "lexer_context/no_ctx.y") + %define lr.type pslr + %token-pattern ID /[a-z]+/ + + %% + + program : ID ; + GRAMMAR + + states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + states.compute + states.compute_pslr + + expect(states.lexer_context_enabled?).to eq(false) + end + + it "returns true when %lexer-context directives are present" do + grammar = build_grammar(<<~GRAMMAR, "lexer_context/with_ctx.y") + %define lr.type pslr + %token-pattern ID /[a-z]+/ + + %lexer-context BEG ID + + %% + + program : ID ; + GRAMMAR + + states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + states.compute + states.compute_pslr + + expect(states.lexer_context_enabled?).to eq(true) + end + end + + context "lexer_context_table" do + it "returns an array with one entry per state" do + grammar = build_grammar(<<~GRAMMAR, "lexer_context/table.y") + %define lr.type pslr + %token-pattern ID /[a-z]+/ + %token-pattern NUM /[0-9]+/ + + %lexer-context BEG ID + %lexer-context END NUM + + %% + + program + : expr + ; + + expr + : NUM + | ID + | expr '+' expr + ; + GRAMMAR + + states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + states.compute + states.compute_pslr + + table = states.lexer_context_table + expect(table.size).to eq(states.states_count) + expect(table.all? 
{ |v| v.is_a?(Integer) }).to eq(true) + end + end + end + + describe "context-based state splitting" do + context "with operator vs identifier predecessor contexts" do + let(:grammar) do + build_grammar(<<~GRAMMAR, "lexer_context/split_expr.y") + %define lr.type pslr + %token-pattern tPLUS /\\+/ + %token-pattern tSTAR /\\*/ + %token-pattern tIDENTIFIER /[a-z]+/ + %token-pattern tINTEGER /[0-9]+/ + + %lexer-context BEG tPLUS tSTAR + %lexer-context CMDARG tIDENTIFIER + %lexer-context END tINTEGER + + %% + + program + : expr + ; + + expr + : tINTEGER + | tIDENTIFIER + | expr tPLUS expr + | expr tSTAR expr + ; + GRAMMAR + end + + it "classifies all states with non-nil context" do + states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + states.compute + states.compute_pslr + + states.states.each do |state| + expect(state.lexer_context).not_to be_nil, + "State #{state.id} has nil lexer_context" + end + end + + it "has BEG context after operators" do + states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + states.compute + states.compute_pslr + + # Find states after tPLUS or tSTAR + operator_target_states = [] + states.states.each do |state| + state.term_transitions.each do |shift| + name = shift.next_sym.id.s_value + if name == "tPLUS" || name == "tSTAR" + operator_target_states << shift.to_state + end + end + end + + lexer_contexts = grammar.lexer_contexts + beg_mask = lexer_contexts["BEG"].bitmask + + operator_target_states.each do |target| + ctx = target.lexer_context || 0 + ctx_name = described_class.context_name(ctx, lexer_contexts) + expect(ctx & beg_mask).not_to eq(0), + "State #{target.id} after operator should have BEG context, got #{ctx_name}" + end + end + end + + context "with def keyword creating ENDFN context" do + let(:grammar) do + build_grammar(<<~GRAMMAR, "lexer_context/endfn.y") + %define lr.type pslr + %token-pattern keyword_def /def/ + %token-pattern keyword_end /end/ + %token-pattern tIDENTIFIER /[a-z]+/ + 
%token-pattern tINTEGER /[0-9]+/ + + %lexer-context ENDFN keyword_def + %lexer-context END keyword_end tINTEGER + %lexer-context CMDARG tIDENTIFIER + + %% + + program + : defn + ; + + defn + : keyword_def tIDENTIFIER keyword_end + ; + GRAMMAR + end + + it "marks state after keyword_def as ENDFN" do + states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + states.compute + states.compute_pslr + + lexer_contexts = grammar.lexer_contexts + endfn_mask = lexer_contexts["ENDFN"].bitmask + + # Find state reached after keyword_def + def_target = nil + states.states.each do |state| + state.term_transitions.each do |shift| + if shift.next_sym.id.s_value == "keyword_def" + def_target = shift.to_state + end + end + end + + expect(def_target).not_to be_nil + ctx = def_target.lexer_context || 0 + ctx_name = described_class.context_name(ctx, lexer_contexts) + expect(ctx & endfn_mask).not_to eq(0), + "State after keyword_def should have ENDFN context, got #{ctx_name}" + end + end + end + + describe "existing PSLR tests still pass" do + context "pure reduce profile" do + let(:grammar) do + build_grammar(<<~GRAMMAR, "states/pslr_pure_reduce.y") + %define lr.type pslr + %token-pattern RSHIFT />>/ + %token-pattern RANGLE />/ + %token-pattern ID /[a-z]+/ + %lex-prec RANGLE -~ RSHIFT + + %% + + program + : templ + | rshift_expr + ; + + templ + : a RANGLE + ; + + rshift_expr + : a RSHIFT ID + ; + + a + : ID + ; + GRAMMAR + end + + it "does not break PSLR" do + _, pslr_states = compute_ielr_and_pslr(grammar) + expect(pslr_states.pslr_inadequacies).to be_empty + end + end + + context "chained keyword split" do + let(:grammar) do + build_grammar(keyword_context_source(depth: 2), "states/pslr_keyword_ctx.y") + end + + it "does not break PSLR split" do + ielr_states, pslr_states = compute_ielr_and_pslr(grammar) + + expect(pslr_states.states_count).to be > ielr_states.states_count + expect(pslr_states.pslr_inadequacies).to be_empty + end + end + end +end diff --git 
a/spec/lrama/option_parser_spec.rb b/spec/lrama/option_parser_spec.rb index 7675c4daa..0efaa9008 100644 --- a/spec/lrama/option_parser_spec.rb +++ b/spec/lrama/option_parser_spec.rb @@ -80,6 +80,7 @@ lookaheads explicitly associate lookahead tokens to items solved describe shift/reduce conflicts solving counterexamples, cex generate conflict counterexamples + pslr report PSLR split and scanner metrics rules list unused rules terms list unused terminals verbose report detailed internal state and analysis results @@ -128,12 +129,19 @@ end end + context "when pslr is passed" do + it "returns option hash pslr flag enabled" do + opts = option_parser.send(:validate_report, ["pslr"]) + expect(opts).to eq({grammar: true, pslr: true}) + end + end + context "when all is passed" do it "returns option hash all flags enabled" do opts = option_parser.send(:validate_report, ["all"]) expect(opts).to eq({ grammar: true, states: true, itemsets: true, - lookaheads: true, solved: true, counterexamples: true, + lookaheads: true, solved: true, counterexamples: true, pslr: true, rules: true, terms: true, verbose: true }) end diff --git a/spec/lrama/output_spec.rb b/spec/lrama/output_spec.rb index 5fa1d04f0..a88c93fa8 100644 --- a/spec/lrama/output_spec.rb +++ b/spec/lrama/output_spec.rb @@ -203,4 +203,316 @@ end end end + + describe "PSLR methods" do + let(:token_pattern) do + id = Lrama::Lexer::Token::Ident.new(s_value: "ID") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/[a-z]+/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + end + + let(:scanner_fsa) { Lrama::ScannerFSA.new([token_pattern]) } + + let(:mock_states) do + instance_double( + Lrama::States, + scanner_fsa: scanner_fsa, + scanner_accepts_table: nil, + length_precedences: nil, + token_patterns: [token_pattern], + states: [], + find_symbol_by_s_value!: instance_double(Lrama::Grammar::Symbol, token_id: 301), + lexer_context_enabled?: false + ) + end + + 
let(:mock_context) do + instance_double(Lrama::Context, states: mock_states) + end + + let(:mock_grammar) do + instance_double( + Lrama::Grammar, + eof_symbol: nil, + error_symbol: nil, + undef_symbol: nil, + accept_symbol: nil, + locations: false, + parse_param: "struct parse_params *p", + lex_param: "struct parse_params *p", + pslr_defined?: true, + pslr_state_member: "current_state", + token_actions: [] + ) + end + + let(:pslr_output) do + out = StringIO.new + Lrama::Output.new( + out: out, + output_file_path: "test.c", + template_name: "bison/yacc.c", + grammar_file_path: "test.y", + context: mock_context, + grammar: mock_grammar + ) + end + + describe "#pslr_enabled?" do + it "returns true when grammar requested PSLR output" do + expect(pslr_output.pslr_enabled?).to be true + end + + it "returns false when grammar did not request PSLR output" do + allow(mock_grammar).to receive(:pslr_defined?).and_return(false) + expect(pslr_output.pslr_enabled?).to be false + end + end + + describe "#pslr_scanner_enabled?" 
do + it "returns true when scanner FSA is built with states" do + expect(pslr_output.pslr_scanner_enabled?).to be true + end + + it "returns false when scanner FSA is nil" do + allow(mock_states).to receive(:scanner_fsa).and_return(nil) + expect(pslr_output.pslr_scanner_enabled?).to be false + end + + it "returns false when scanner FSA has no states" do + empty_fsa = Lrama::ScannerFSA.new([]) + allow(mock_states).to receive(:scanner_fsa).and_return(empty_fsa) + expect(pslr_output.pslr_scanner_enabled?).to be false + end + end + + describe "#scanner_transition_table" do + it "generates C code for scanner transitions" do + result = pslr_output.scanner_transition_table + expect(result).to include("YY_SCANNER_NUM_STATES") + expect(result).to include("yy_scanner_transition") + end + end + + describe "#pseudo_scan_function" do + it "generates the pseudo_scan C function" do + result = pslr_output.pseudo_scan_function + expect(result).to include("yy_pseudo_scan") + expect(result).to include("parser_state") + expect(result).to include("match_length") + expect(result).to include("yy_token_pattern_to_token_id") + expect(result).to include("yy_pslr_fallback_length_precedes") + expect(result).to include("result->token = YYEOF") + end + end + + describe "#pslr_tables_and_functions" do + it "generates all PSLR C code" do + result = pslr_output.pslr_tables_and_functions + expect(result).to include("PSLR(1) Scanner Tables and Functions") + expect(result).to include("YY_SCANNER_NUM_STATES") + expect(result).to include("yy_scanner_transition") + expect(result).to include("yy_pseudo_scan") + expect(result).to include("yy_token_pattern_to_token_id") + end + + it "generates layout accumulation code" do + result = pslr_output.pslr_tables_and_functions + expect(result).to include("yypslr_layout_buffer") + expect(result).to include("YYPSLR_LAYOUT_TEXT") + expect(result).to include("YYPSLR_LAYOUT_LENGTH") + expect(result).to include("yypslr_scan_with_layout") + end + end + + describe 
"#pslr_lac_function" do + it "generates a LAC checker for PSLR parsers" do + result = pslr_output.pslr_lac_function + + expect(result).to include("yy_lac_check_") + expect(result).to include("yydefact") + end + end + + describe "#state_to_accepting_table" do + it "generates state to accepting mapping" do + result = pslr_output.state_to_accepting_table + expect(result).to include("yy_state_to_accepting") + expect(result).to include("YY_ACCEPTING_NONE") + end + end + + describe "#lexer_context_table_code" do + context "when lexer context is not enabled" do + it "returns empty string" do + result = pslr_output.lexer_context_table_code + expect(result).to eq("") + end + end + + context "when lexer context is enabled" do + let(:lexer_contexts) do + lcs = {} + lcs["BEG"] = Lrama::Grammar::LexerContext.new(name: "BEG", index: 0) + lcs["END"] = Lrama::Grammar::LexerContext.new(name: "END", index: 1) + lcs + end + + let(:classifier) { Lrama::LexerContextClassifier.new(lexer_contexts) } + + before do + allow(mock_states).to receive(:lexer_context_enabled?).and_return(true) + allow(mock_states).to receive(:lexer_context_table).and_return([0x01, 0x02, 0x01, 0x00]) + allow(mock_states).to receive(:lexer_context_classifier).and_return(classifier) + allow(mock_grammar).to receive(:lexer_contexts).and_return(lexer_contexts) + end + + it "generates the C context table" do + result = pslr_output.lexer_context_table_code + expect(result).to include("yy_lexer_context") + expect(result).to include("yy_lexer_context_is") + expect(result).to include("/* state 0 */") + expect(result).to include("/* state 3 */") + end + + it "generates defines via lexer_context_defines_code" do + result = pslr_output.lexer_context_defines_code + expect(result).to include("YY_CTX_BEG") + expect(result).to include("YY_CTX_END") + end + end + end + + describe "#pslr_function_declarations" do + it "declares the PSLR helper entry points" do + result = pslr_output.pslr_function_declarations + expect(result).to 
include("int yy_state_accepts_token") + expect(result).to include("int yy_pseudo_scan") + expect(result).to include("YYPSLR_PSEUDO_SCAN_STATE") + expect(result).to include("YYPSLR_PSEUDO_SCAN") + expect(result).to include("YYSETSTATE_CONTEXT(CurrentState)") + end + end + + describe "#length_precedences_table_code" do + let(:mock_length_prec) { Lrama::LengthPrecedences.new(Lrama::Grammar::LexPrec.new) } + + before do + allow(mock_states).to receive(:length_precedences).and_return(mock_length_prec) + end + + it "generates length precedences table" do + result = pslr_output.length_precedences_table_code + expect(result).to include("yy_pslr_length_precedes") + expect(result).to include("yy_pslr_fallback_length_precedes") + expect(result).to include("old_token") + end + + it "keeps traditional longest defaults out of the normal row table" do + a = Lrama::Grammar::TokenPattern.new( + id: Lrama::Lexer::Token::Ident.new(s_value: "A"), + pattern: Lrama::Lexer::Token::Regex.new(s_value: "/a/"), + lineno: 1, + definition_order: 0 + ) + b = Lrama::Grammar::TokenPattern.new( + id: Lrama::Lexer::Token::Ident.new(s_value: "B"), + pattern: Lrama::Lexer::Token::Regex.new(s_value: "/ab/"), + lineno: 1, + definition_order: 1 + ) + allow(mock_states).to receive(:token_patterns).and_return([a, b]) + + result = pslr_output.length_precedences_table_code + + expect(result).to include("static const int yy_pslr_length_precedes[2][2] = {\n /* A */ {1, 0},") + expect(result).to include("static const int yy_pslr_fallback_length_precedes[2][2] = {\n /* A */ {1, 1},") + end + end + + describe "#accepting_tokens_table" do + it "generates accepting tokens information" do + result = pslr_output.accepting_tokens_table + expect(result).to include("Accepting state token IDs") + end + end + end + + describe "PSLR integration in render" do + let(:pslr_grammar_text) do + <<~GRAMMAR + %code requires { + struct parse_params { + int current_state; + }; + } + %define lr.type pslr + %define 
api.pslr.state-member current_state + %parse-param {struct parse_params *p} + %lex-param {struct parse_params *p} + %token-pattern RSHIFT />>/ "right shift" + %token-pattern RANGLE />/ "right angle" + %lex-prec RANGLE -s RSHIFT + %% + program: RSHIFT | RANGLE + GRAMMAR + end + + let(:pslr_grammar) do + grammar = Lrama::Parser.new(pslr_grammar_text, "pslr_test.y").parse + grammar.prepare + grammar.validate! + grammar + end + + let(:pslr_states) do + s = Lrama::States.new(pslr_grammar, Lrama::Tracer.new(Lrama::Logger.new)) + s.compute + s.compute_pslr + s + end + + let(:pslr_context) { Lrama::Context.new(pslr_states) } + let(:pslr_out) { StringIO.new } + + let(:pslr_full_output) do + Lrama::Output.new( + out: pslr_out, + output_file_path: "pslr_test.c", + template_name: "bison/yacc.c", + grammar_file_path: "pslr_test.y", + header_out: header_out, + header_file_path: "pslr_test.h", + context: pslr_context, + grammar: pslr_grammar + ) + end + + it "includes PSLR tables in rendered output" do + pslr_full_output.render + pslr_out.rewind + header_out.rewind + rendered = pslr_out.read + rendered_header = header_out.read + + expect(rendered).to include("PSLR(1) Scanner Tables and Functions") + expect(rendered).to include("YY_SCANNER_NUM_STATES") + expect(rendered).to include("yy_scanner_transition") + expect(rendered).to include("yy_pseudo_scan") + expect(rendered).to include("yy_token_pattern_to_token_id") + expect(rendered).to include("yy_scanner_fallback_accepts") + expect(rendered).to include("yy_token_pattern_is_layout") + expect(rendered).to include("yy_lac_check_") + expect(rendered_header).to include("int yy_state_accepts_token") + expect(rendered_header).to include("int yy_pseudo_scan") + expect(rendered_header).to include("yypslr_scan_result") + expect(rendered_header).to include("YYSETSTATE_CONTEXT(CurrentState)") + expect(rendered_header).to include("YYPSLR_PSEUDO_SCAN(Context, Input, MatchLength)") + end + end end diff --git a/spec/lrama/parser_spec.rb 
b/spec/lrama/parser_spec.rb index c2115ee27..5341d5a47 100644 --- a/spec/lrama/parser_spec.rb +++ b/spec/lrama/parser_spec.rb @@ -4598,4 +4598,219 @@ class : keyword_class tSTRING keyword_end end end end + + describe "PSLR directives" do + describe "%token-pattern" do + it "parses a single token pattern" do + y = <<~GRAMMAR + %token-pattern RSHIFT />>/ "right shift" + %% + program: RSHIFT + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! + + expect(grammar.token_patterns.size).to eq(1) + token_pattern = grammar.token_patterns.first + expect(token_pattern.name).to eq("RSHIFT") + expect(token_pattern.regex_pattern).to eq(">>") + expect(token_pattern.alias_name).to eq("\"right shift\"") + end + + it "parses multiple token patterns" do + y = <<~GRAMMAR + %token-pattern RSHIFT />>/ "right shift" + %token-pattern RANGLE />/ "right angle" + %token-pattern LANGLE / RSHIFT />>/ "right shift" + %% + program: RSHIFT + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! + + expect(grammar.token_patterns.size).to eq(1) + token_pattern = grammar.token_patterns.first + expect(token_pattern.tag.s_value).to eq("") + end + + it "parses escaped slashes inside token patterns" do + y = <<~GRAMMAR + %token-pattern SLASH /\\// + %% + program: SLASH + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! + + expect(grammar.token_patterns.first.regex_pattern).to eq("\\/") + end + + it "keeps empty token patterns available for scanner diagnostics" do + y = <<~GRAMMAR + %token-pattern EMPTY // + %% + program: EMPTY + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! 
+ + expect(grammar.token_patterns.first.regex_pattern).to eq("") + end + end + + describe "%lex-prec" do + it "parses shorter priority rule" do + y = <<~GRAMMAR + %token RANGLE RSHIFT + %lex-prec RANGLE -s RSHIFT + %% + program: RANGLE | RSHIFT + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! + + expect(grammar.lex_prec.declarations.size).to eq(1) + grammar.finalize_lexical_declarations! + expect(grammar.lex_prec.shortest_pair?("RANGLE", "RSHIFT")).to be true + end + + it "parses identity-right longest rule" do + y = <<~GRAMMAR + %token IF ID + %lex-prec ID <~ IF + %% + program: IF | ID + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! + + expect(grammar.lex_prec.declarations.size).to eq(1) + grammar.finalize_lexical_declarations! + expect(grammar.lex_prec.identity_precedes?("IF", "ID")).to be true + expect(grammar.lex_prec.longest_pair?("ID", "IF")).to be true + end + + it "parses chained lex-prec rules" do + y = <<~GRAMMAR + %token IF ELSE WHILE ID + %lex-prec ID <- WHILE <- ELSE <- IF + %% + program: IF | ELSE | WHILE | ID + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! + + expect(grammar.lex_prec.declarations.size).to eq(3) + grammar.finalize_lexical_declarations! + expect(grammar.lex_prec.identity_precedes?("WHILE", "ID")).to be true + expect(grammar.lex_prec.identity_precedes?("ELSE", "WHILE")).to be true + expect(grammar.lex_prec.identity_precedes?("IF", "ELSE")).to be true + expect(grammar.lex_prec.identity_precedes?("IF", "ID")).to be false + end + + it "parses symbol sets and lexical ties" do + y = <<~GRAMMAR + %token IF WHILE ID RANGLE RSHIFT + %symbol-set keywords IF WHILE + %lex-tie ID keywords + %lex-no-tie RANGLE RSHIFT + %% + program: IF | WHILE | ID | RANGLE | RSHIFT + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! 
+ + expect(grammar.symbol_sets.fetch("keywords").map(&:s_value)).to eq(["IF", "WHILE"]) + expect(grammar.lex_tie.declarations.map(&:kind)).to eq([:tie, :no_tie]) + expect(grammar.lex_tie.no_tie?("RANGLE", "RSHIFT")).to be true + end + end + + describe "%define lr.type pslr" do + it "recognizes pslr lr.type" do + y = <<~GRAMMAR + %define lr.type pslr + %token ID + %% + program: ID + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! + + expect(grammar.pslr_defined?).to be true + end + end + + describe "%token-action" do + it "parses a single token action" do + y = <<~GRAMMAR + %token ID + %token-action ID { printf("matched ID"); } + %% + program: ID + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! + + expect(grammar.token_actions.size).to eq(1) + action = grammar.token_actions.first + expect(action.token_name).to eq("ID") + expect(action.code.s_value).to include("printf") + end + + it "parses multiple token actions" do + y = <<~GRAMMAR + %token ID NUM + %token-action ID { handle_id(); } NUM { handle_num(); } + %% + program: ID | NUM + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! 
+ + expect(grammar.token_actions.size).to eq(2) + expect(grammar.token_actions.map(&:token_name)).to eq(["ID", "NUM"]) + end + end + + end end diff --git a/spec/lrama/pslr_family_regressions_spec.rb b/spec/lrama/pslr_family_regressions_spec.rb new file mode 100644 index 000000000..0001e911e --- /dev/null +++ b/spec/lrama/pslr_family_regressions_spec.rb @@ -0,0 +1,394 @@ +# frozen_string_literal: true + +RSpec.describe "PSLR family regressions" do + include PslrFamilyHelper + + describe "pure-reduce profile" do + let(:grammar) do + build_grammar(<<~GRAMMAR, "states/pslr_pure_reduce.y") + %define lr.type pslr + %token-pattern RSHIFT />>/ + %token-pattern RANGLE />/ + %token-pattern ID /[a-z]+/ + %lex-prec RANGLE -~ RSHIFT + + %% + + program + : templ + | rshift_expr + ; + + templ + : a RANGLE + ; + + rshift_expr + : a RSHIFT ID + ; + + a + : ID + ; + GRAMMAR + end + + it "keeps pure reduce states scanner-compatible without forcing a split" do + ielr_states, pslr_states = compute_ielr_and_pslr(grammar) + + reduce_state = pslr_states.states.find do |state| + state.reduces.any? 
{ |reduce| reduce.rule.display_name == "a -> ID" } + end + + expect(pslr_states.states_count).to eq(ielr_states.states_count) + expect(pslr_states.pslr_inadequacies).to be_empty + expect(acceptable_tokens(pslr_states, reduce_state)).to contain_exactly("RANGLE", "RSHIFT") + end + end + + describe "chained keyword split" do + let(:grammar) do + build_grammar(<<~GRAMMAR, "states/pslr_keyword_context.y") + %define lr.type pslr + %token-pattern P /p/ + %token-pattern Q /q/ + %token-pattern X /x/ + %token-pattern IF /if/ + %token-pattern ID /[a-z]+/ + %lex-prec ID <~ IF + + %% + + program + : kw_context + | id_context + ; + + kw_context + : P shared IF + ; + + id_context + : Q shared ID + ; + + shared + : n1 + ; + + n1 + : n2 + ; + + n2 + : X + ; + GRAMMAR + end + + it "splits every chained reduce state by scanner profile" do + ielr_states, pslr_states = compute_ielr_and_pslr(grammar) + + reduce_states = pslr_states.states + .select { |state| state.reduces.any? } + .group_by { |state| state.reduces.first.rule.display_name } + + expect(pslr_states.states_count).to be > ielr_states.states_count + expect(pslr_states.pslr_inadequacies).to be_empty + + ["shared -> n1", "n1 -> n2", "n2 -> X"].each do |rule_name| + states_for_rule = reduce_states.fetch(rule_name) + token_sets = states_for_rule.map { |state| acceptable_tokens(pslr_states, state) } + + expect(states_for_rule.size).to eq(2) + expect(states_for_rule.count(&:split_state?)).to eq(1) + expect(token_sets.any? { |set| set.include?("IF") && !set.include?("ID") }).to be(true) + expect(token_sets.any? 
{ |set| set.include?("ID") && !set.include?("IF") }).to be(true) + end + end + end + + describe "chained shift/angle split" do + let(:grammar) do + build_grammar(<<~GRAMMAR, "states/pslr_shift_chain.y") + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %token-pattern ID /[a-z]+/ + %lex-no-tie RANGLE RSHIFT + + %% + + program + : template_expr + | shift_expr + ; + + template_expr + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + shared + : n1 + ; + + n1 + : n2 + ; + + n2 + : MARK + ; + GRAMMAR + end + + it "splits every chained reduce state by shift/angle scanner profile" do + ielr_states, pslr_states = compute_ielr_and_pslr(grammar) + + reduce_states = pslr_states.states + .select { |state| state.reduces.any? } + .group_by { |state| state.reduces.first.rule.display_name } + + expect(pslr_states.states_count).to be > ielr_states.states_count + expect(pslr_states.pslr_inadequacies).to be_empty + + ["shared -> n1", "n1 -> n2", "n2 -> MARK"].each do |rule_name| + states_for_rule = reduce_states.fetch(rule_name) + token_sets = states_for_rule.map { |state| acceptable_tokens(pslr_states, state) } + + expect(states_for_rule.size).to eq(2) + expect(states_for_rule.count(&:split_state?)).to eq(1) + expect(token_sets.any? { |set| set.include?("RANGLE") && !set.include?("RSHIFT") }).to be(true) + expect(token_sets.any? 
{ |set| set.include?("RSHIFT") && !set.include?("RANGLE") }).to be(true) + end + end + end + + describe "mixed families" do + { + "empty shared wrapper" => { + path: "states/pslr_mixed_empty.y", + grows: true, + grammar: <<~GRAMMAR, + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %lex-prec ID <~ IF + %lex-no-tie RANGLE RSHIFT + + %% + + program + : kw + | ident + | templ + | shift_expr + ; + + kw + : P shared IF + ; + + ident + : Q shared ID + ; + + templ + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + shared + : opt n1 + ; + + opt + : + ; + + n1 + : MARK + ; + GRAMMAR + }, + "chain2 shared wrapper" => { + path: "states/pslr_mixed_chain2.y", + grows: true, + grammar: <<~GRAMMAR, + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %lex-prec ID <~ IF + %lex-no-tie RANGLE RSHIFT + + %% + + program + : kw + | ident + | templ + | shift_expr + ; + + kw + : P shared IF + ; + + ident + : Q shared ID + ; + + templ + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + shared + : n1 + ; + + n1 + : n2 + ; + + n2 + : MARK + ; + GRAMMAR + } + }.each do |label, attrs| + it "keeps #{label} scanner-compatible" do + grammar = build_grammar(attrs[:grammar], attrs[:path]) + ielr_states, pslr_states = compute_ielr_and_pslr(grammar) + + if attrs[:grows] + expect(pslr_states.states_count).to be > ielr_states.states_count + else + expect(pslr_states.states_count).to eq(ielr_states.states_count) + end + expect(pslr_states.pslr_inadequacies).to be_empty + end + end + end + + describe "lexical tie candidates" do + it "reports scanner-conflicting tokens that appear one-sided in parser states" do + grammar = build_grammar(<<~GRAMMAR, "states/pslr_lexical_tie_candidate.y") + %define lr.type pslr + %token-pattern P /p/ + %token-pattern Q /q/ + %token-pattern MARK /#/ + %token-pattern IF /if/ + %token-pattern ID /[a-z]+/ + %lex-prec ID <~ IF + + %% + + start + : P MARK IF + | Q MARK ID + ; + 
GRAMMAR + + _ielr_states, pslr_states = compute_ielr_and_pslr(grammar) + + expect(pslr_states.lexical_tie_candidates).to include(["ID", "IF"]) + end + + it "suppresses candidates with yyall no-tie" do + grammar = build_grammar(<<~GRAMMAR, "states/pslr_lexical_tie_no_tie.y") + %define lr.type pslr + %token-pattern P /p/ + %token-pattern Q /q/ + %token-pattern MARK /#/ + %token-pattern IF /if/ + %token-pattern ID /[a-z]+/ + %lex-prec ID <~ IF + %lex-no-tie yyall yyall + + %% + + start + : P MARK IF + | Q MARK ID + ; + GRAMMAR + + _ielr_states, pslr_states = compute_ielr_and_pslr(grammar) + + expect(pslr_states.lexical_tie_candidates).to be_empty + end + end + + describe "template argument lists" do + it "splits RANGLE and RSHIFT contexts without short-token precedence" do + grammar = build_grammar(<<~GRAMMAR, "states/pslr_template_argument_lists.y") + %define lr.type pslr + %token-pattern ID /[a-zA-Z][a-zA-Z0-9_]*/ + %token-pattern LT /</ + %token-pattern RANGLE />/ + %token-pattern RSHIFT />>/ + %token-pattern SEMI /;/ + %token-pattern YYLAYOUT /[ \\t\\r\\n]+/ + %lex-no-tie RANGLE RSHIFT + + %% + + start + : decl SEMI + | expr SEMI + ; + + decl + : type id + ; + + type + : id LT type RANGLE + | ID + ; + + expr + : id RSHIFT id + ; + + id + : ID + ; + GRAMMAR + + ielr_states, pslr_states = compute_ielr_and_pslr(grammar) + + expect(pslr_states.states_count).to be >= ielr_states.states_count + expect(pslr_states.pslr_inadequacies).to be_empty + expect(pslr_states.lexical_tie_candidates).not_to include(["RANGLE", "RSHIFT"]) + end + end +end diff --git a/spec/lrama/pslr_generated_families_spec.rb b/spec/lrama/pslr_generated_families_spec.rb new file mode 100644 index 000000000..795d25d69 --- /dev/null +++ b/spec/lrama/pslr_generated_families_spec.rb @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +RSpec.describe "PSLR generated family coverage" do + include PslrFamilyHelper + + families = [ + { + label: "keyword/id", + builder: :keyword_context_source, + path_prefix: 
"generated/pslr_keyword_depth", + split_expected: true + }, + { + label: "shift/angle", + builder: :shift_angle_source, + path_prefix: "generated/pslr_shift_depth", + split_expected: true + }, + { + label: "mixed", + builder: :mixed_context_source, + path_prefix: "generated/pslr_mixed_depth", + split_expected: true + } + ].freeze + + families.each do |family| + (0..3).each do |depth| + it "#{family[:label]} depth=#{depth} keeps PSLR inadequacies resolved" do + grammar = build_grammar( + public_send(family[:builder], depth: depth), + "#{family[:path_prefix]}_#{depth}.y" + ) + ielr_states, pslr_states = compute_ielr_and_pslr(grammar) + + aggregate_failures do + expect(pslr_states.pslr_inadequacies).to be_empty + expect(pslr_states.states_count).to be >= ielr_states.states_count + expect(pslr_states.pslr_metrics[:growth_count]).to eq(pslr_states.states_count - pslr_states.pslr_metrics[:base_states_count]) + + next unless family[:split_expected] + next unless 1 <= depth + + expect(pslr_states.states_count).to be > ielr_states.states_count + end + end + end + end +end diff --git a/spec/lrama/scanner_fsa_spec.rb b/spec/lrama/scanner_fsa_spec.rb new file mode 100644 index 000000000..9912945ee --- /dev/null +++ b/spec/lrama/scanner_fsa_spec.rb @@ -0,0 +1,322 @@ +# frozen_string_literal: true + +RSpec.describe Lrama::ScannerFSA do + def token_pattern(name, regex, lineno: 1, order: 0) + Lrama::Grammar::TokenPattern.new( + id: Lrama::Lexer::Token::Ident.new(s_value: name), + pattern: Lrama::Lexer::Token::Regex.new(s_value: "/#{regex}/"), + lineno: lineno, + definition_order: order + ) + end + + describe "initialization" do + it "creates an empty FSA for no patterns" do + fsa = Lrama::ScannerFSA.new([]) + expect(fsa.states).to be_empty + end + + it "creates FSA for single literal pattern" do + id = Lrama::Lexer::Token::Ident.new(s_value: "PLUS") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/\\+/") + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: 
regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([token_pattern]) + + expect(fsa.states).not_to be_empty + expect(fsa.initial_state).not_to be_nil + end + end + + describe "#scan" do + it "matches a single character pattern" do + id = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>/") + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([token_pattern]) + + results = fsa.scan(">") + expect(results.size).to eq(1) + expect(results[0][:token].name).to eq("RANGLE") + expect(results[0][:position]).to eq(1) + end + + it "matches a multi-character pattern" do + id = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>>/") + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([token_pattern]) + + results = fsa.scan(">>") + expect(results.size).to eq(1) + expect(results[0][:token].name).to eq("RSHIFT") + expect(results[0][:position]).to eq(2) + end + + it "returns multiple matches for overlapping patterns" do + rangle_id = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + rangle_regex = Lrama::Lexer::Token::Regex.new(s_value: "/>/") + rangle = Lrama::Grammar::TokenPattern.new( + id: rangle_id, + pattern: rangle_regex, + lineno: 1, + definition_order: 0 + ) + + rshift_id = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + rshift_regex = Lrama::Lexer::Token::Regex.new(s_value: "/>>/") + rshift = Lrama::Grammar::TokenPattern.new( + id: rshift_id, + pattern: rshift_regex, + lineno: 1, + definition_order: 1 + ) + + fsa = Lrama::ScannerFSA.new([rangle, rshift]) + + results = fsa.scan(">>") + + # Should match both RANGLE at position 1 and RSHIFT at position 2 + expect(results.size).to eq(2) + positions = results.map { |r| [r[:token].name, r[:position]] } + 
expect(positions).to include(["RANGLE", 1]) + expect(positions).to include(["RSHIFT", 2]) + end + + it "matches character class patterns" do + id = Lrama::Lexer::Token::Ident.new(s_value: "ID") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/[a-zA-Z_][a-zA-Z0-9_]*/") + id_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([id_pattern]) + + results = fsa.scan("hello_world123") + expect(results).not_to be_empty + # Should have matches at each position as the identifier grows + end + + it "matches digit patterns" do + id = Lrama::Lexer::Token::Ident.new(s_value: "INT") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/[0-9]+/") + int_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([int_pattern]) + + results = fsa.scan("12345") + expect(results).not_to be_empty + end + + it "matches escaped whitespace inside character classes" do + id = Lrama::Lexer::Token::Ident.new(s_value: "YYLAYOUT") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/[ \\t\\r\\n]+/") + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([token_pattern]) + + expect(fsa.scan("\t").map { |result| result[:token].name }).to include("YYLAYOUT") + expect(fsa.scan("\n").map { |result| result[:token].name }).to include("YYLAYOUT") + end + + it "matches escaped literals in and outside character classes" do + slash = token_pattern("SLASH", "\\/", order: 0) + rbrack = token_pattern("RBRACK", "[\\]]", order: 1) + backslash = token_pattern("BACKSLASH", "[\\\\]", order: 2) + + expect(Lrama::ScannerFSA.new([slash]).scan("/").map { |result| result[:token].name }).to include("SLASH") + expect(Lrama::ScannerFSA.new([rbrack]).scan("]").map { |result| result[:token].name }).to include("RBRACK") + 
expect(Lrama::ScannerFSA.new([backslash]).scan("\\").map { |result| result[:token].name }).to include("BACKSLASH") + end + + it "matches ranges and negated character classes over ASCII" do + not_star = token_pattern("NOT_STAR", "[^*]+") + fsa = Lrama::ScannerFSA.new([not_star]) + + expect(fsa.scan("abc/]").map { |result| result[:token].name }).to include("NOT_STAR") + expect(fsa.scan("a\nb").map { |result| result[:token].name }).to include("NOT_STAR") + expect(fsa.scan("*")).to be_empty + end + end + + describe "pattern validation" do + it "rejects empty token patterns" do + expect { Lrama::ScannerFSA.new([token_pattern("EMPTY", "", lineno: 42)]) } + .to raise_error(Lrama::ScannerFSA::PatternError, /EMPTY at line 42.*empty patterns/m) + end + + it "rejects dangling escapes" do + expect { Lrama::ScannerFSA.new([token_pattern("BAD_ESCAPE", "abc\\", lineno: 7)]) } + .to raise_error(Lrama::ScannerFSA::PatternError, /BAD_ESCAPE at line 7.*dangling escape/m) + end + + it "rejects unclosed character classes" do + expect { Lrama::ScannerFSA.new([token_pattern("BAD_CLASS", "[abc", lineno: 6)]) } + .to raise_error(Lrama::ScannerFSA::PatternError, /BAD_CLASS at line 6.*unclosed character class/m) + end + + it "rejects unsupported alphabetic escapes" do + expect { Lrama::ScannerFSA.new([token_pattern("BAD_ESCAPE", "\\q", lineno: 8)]) } + .to raise_error(Lrama::ScannerFSA::PatternError, /BAD_ESCAPE at line 8.*unsupported escape \\q/m) + end + + it "rejects malformed character class ranges" do + expect { Lrama::ScannerFSA.new([token_pattern("BAD_RANGE", "[z-a]", lineno: 9)]) } + .to raise_error(Lrama::ScannerFSA::PatternError, /BAD_RANGE at line 9.*invalid character class range z-a/m) + end + + it "rejects nullable token patterns" do + expect { Lrama::ScannerFSA.new([token_pattern("NULLABLE", "a*", lineno: 10)]) } + .to raise_error(Lrama::ScannerFSA::PatternError, /NULLABLE at line 10.*nullable patterns/m) + expect { Lrama::ScannerFSA.new([token_pattern("NULLABLE", "a?", lineno: 
10)]) } + .to raise_error(Lrama::ScannerFSA::PatternError, /NULLABLE at line 10.*nullable patterns/m) + expect { Lrama::ScannerFSA.new([token_pattern("NULLABLE", "()", lineno: 10)]) } + .to raise_error(Lrama::ScannerFSA::PatternError, /NULLABLE at line 10.*empty groups/m) + end + + it "rejects empty alternatives" do + expect { Lrama::ScannerFSA.new([token_pattern("EMPTY_ALT", "a|", lineno: 11)]) } + .to raise_error(Lrama::ScannerFSA::PatternError, /EMPTY_ALT at line 11.*empty alternatives/m) + end + end + + describe "#pairwise_conflict_pairs" do + it "detects identity and length conflicts" do + rangle = Lrama::Grammar::TokenPattern.new( + id: Lrama::Lexer::Token::Ident.new(s_value: "RANGLE"), + pattern: Lrama::Lexer::Token::Regex.new(s_value: "/>/"), + lineno: 1, + definition_order: 0 + ) + rshift = Lrama::Grammar::TokenPattern.new( + id: Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT"), + pattern: Lrama::Lexer::Token::Regex.new(s_value: "/>>/"), + lineno: 1, + definition_order: 1 + ) + keyword = Lrama::Grammar::TokenPattern.new( + id: Lrama::Lexer::Token::Ident.new(s_value: "IF"), + pattern: Lrama::Lexer::Token::Regex.new(s_value: "/if/"), + lineno: 1, + definition_order: 2 + ) + identifier = Lrama::Grammar::TokenPattern.new( + id: Lrama::Lexer::Token::Ident.new(s_value: "ID"), + pattern: Lrama::Lexer::Token::Regex.new(s_value: "/[a-z]+/"), + lineno: 1, + definition_order: 3 + ) + + pairs = Lrama::ScannerFSA.new([rangle, rshift, keyword, identifier]).pairwise_conflict_pairs + + expect(pairs).to include(["RANGLE", "RSHIFT"]) + expect(pairs).to include(["ID", "IF"]) + end + + it "checks pairwise conflicts for sorted token pairs" do + rangle = token_pattern("RANGLE", ">", order: 0) + rshift = token_pattern("RSHIFT", ">>", order: 1) + dot = token_pattern("DOT", "\\.", order: 2) + comma = token_pattern("COMMA", ",", order: 3) + fsa = Lrama::ScannerFSA.new([rangle, rshift, dot, comma]) + + expect(fsa.pairwise_conflict?("RSHIFT", "RANGLE")).to be true + 
expect(fsa.pairwise_conflict?("DOT", "COMMA")).to be false + end + end + + describe "#acc_ss" do + it "returns empty array for non-accepting state" do + id = Lrama::Lexer::Token::Ident.new(s_value: "AB") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/ab/") + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([token_pattern]) + + # Initial state shouldn't be accepting for non-empty pattern + tokens = fsa.acc_ss(0) + expect(tokens).to be_empty + end + + it "returns accepting tokens for accepting state" do + id = Lrama::Lexer::Token::Ident.new(s_value: "A") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/a/") + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([token_pattern]) + + # Scan to reach accepting state + results = fsa.scan("a") + expect(results).not_to be_empty + + accepting_state = results[0][:state] + tokens = fsa.acc_ss(accepting_state.id) + expect(tokens.map(&:name)).to include("A") + end + end + + describe "#state_to_accepting_state" do + it "returns nil for non-accepting state" do + id = Lrama::Lexer::Token::Ident.new(s_value: "AB") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/ab/") + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([token_pattern]) + + expect(fsa.state_to_accepting_state(0)).to be_nil + end + + it "returns the state itself for accepting state" do + id = Lrama::Lexer::Token::Ident.new(s_value: "A") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/a/") + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([token_pattern]) + + results = fsa.scan("a") + accepting_state = results[0][:state] + + 
expect(fsa.state_to_accepting_state(accepting_state.id)).to eq(accepting_state) + end + end +end diff --git a/spec/lrama/state/pslr_inadequacy_spec.rb b/spec/lrama/state/pslr_inadequacy_spec.rb new file mode 100644 index 000000000..cef7a5d62 --- /dev/null +++ b/spec/lrama/state/pslr_inadequacy_spec.rb @@ -0,0 +1,234 @@ +# frozen_string_literal: true + +RSpec.describe Lrama::State::PslrInadequacy do + let(:mock_state) do + instance_double(Lrama::State, id: 0) + end + + let(:mock_conflicting_states) do + [ + instance_double(Lrama::State, id: 1), + instance_double(Lrama::State, id: 2) + ] + end + + describe "#initialize" do + it "creates an LR-relative inadequacy" do + inadequacy = Lrama::State::PslrInadequacy.new( + type: Lrama::State::PslrInadequacy::LR_RELATIVE, + state: mock_state, + conflicting_states: mock_conflicting_states, + details: { reason: "test" } + ) + + expect(inadequacy.type).to eq(:lr_relative) + expect(inadequacy.state).to eq(mock_state) + expect(inadequacy.conflicting_states).to eq(mock_conflicting_states) + expect(inadequacy.details[:reason]).to eq("test") + end + + it "creates a PSLR-relative inadequacy" do + inadequacy = Lrama::State::PslrInadequacy.new( + type: Lrama::State::PslrInadequacy::PSLR_RELATIVE, + state: mock_state, + conflicting_states: mock_conflicting_states, + details: {} + ) + + expect(inadequacy.type).to eq(:pslr_relative) + end + end + + describe "#to_s" do + it "returns a human-readable description" do + inadequacy = Lrama::State::PslrInadequacy.new( + type: Lrama::State::PslrInadequacy::PSLR_RELATIVE, + state: mock_state, + conflicting_states: mock_conflicting_states, + details: {} + ) + + expect(inadequacy.to_s).to include("PSLR Inadequacy") + expect(inadequacy.to_s).to include("pslr_relative") + expect(inadequacy.to_s).to include("state 0") + expect(inadequacy.to_s).to include("1, 2") + end + end + + describe "constants" do + it "defines LR_RELATIVE constant" do + expect(Lrama::State::PslrInadequacy::LR_RELATIVE).to 
eq(:lr_relative) + end + + it "defines PSLR_RELATIVE constant" do + expect(Lrama::State::PslrInadequacy::PSLR_RELATIVE).to eq(:pslr_relative) + end + end +end + +RSpec.describe Lrama::State::PslrCompatibilityChecker do + let(:rangle) do + id = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + end + + let(:rshift) do + id = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>>/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 1 + ) + end + + let(:scanner_fsa) { Lrama::ScannerFSA.new([rangle, rshift]) } + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + let(:length_prec) { Lrama::LengthPrecedences.new(lex_prec) } + let(:accepting_state_ids) { scanner_fsa.states.select(&:accepting?).map(&:id) } + let(:short_state_id) { accepting_state_ids.min } + let(:long_state_id) { accepting_state_ids.max } + + describe "#initialize" do + it "creates a compatibility checker" do + scanner_accepts = instance_double(Lrama::State::ScannerAccepts) + checker = Lrama::State::PslrCompatibilityChecker.new( + scanner_accepts, + length_prec + ) + + expect(checker).to be_a(Lrama::State::PslrCompatibilityChecker) + end + end + + describe "#compatible?" 
do + context "when both states select same tokens" do + it "returns true" do + scanner_accepts = instance_double(Lrama::State::ScannerAccepts) + allow(scanner_accepts).to receive(:[]).and_return(rangle) + + checker = Lrama::State::PslrCompatibilityChecker.new( + scanner_accepts, + length_prec + ) + + state1 = instance_double(Lrama::State, id: 0) + state2 = instance_double(Lrama::State, id: 1) + + expect(checker.compatible?(state1, state2, scanner_fsa)).to be true + end + end + + context "when both states have no tokens (nil)" do + it "returns true" do + scanner_accepts = instance_double(Lrama::State::ScannerAccepts) + allow(scanner_accepts).to receive(:[]).and_return(nil) + + checker = Lrama::State::PslrCompatibilityChecker.new( + scanner_accepts, + length_prec + ) + + state1 = instance_double(Lrama::State, id: 0) + state2 = instance_double(Lrama::State, id: 1) + + expect(checker.compatible?(state1, state2, scanner_fsa)).to be true + end + end + + context "when states select different tokens" do + it "returns false" do + scanner_accepts = instance_double(Lrama::State::ScannerAccepts) + + # State 0 selects RANGLE, State 1 selects RSHIFT + allow(scanner_accepts).to receive(:[]) do |state_id, _fsa_state_id| + if state_id == 0 + rangle + else + rshift + end + end + + checker = Lrama::State::PslrCompatibilityChecker.new( + scanner_accepts, + length_prec + ) + + state1 = instance_double(Lrama::State, id: 0) + state2 = instance_double(Lrama::State, id: 1) + + expect(checker.compatible?(state1, state2, scanner_fsa)).to be false + end + end + end + + describe "#profile" do + it "returns a stable accepting-state profile" do + scanner_accepts = instance_double(Lrama::State::ScannerAccepts) + allow(scanner_accepts).to receive(:[]) do |state_id, fsa_state_id| + if state_id == 0 + fsa_state_id == short_state_id ? rangle : rshift + else + fsa_state_id == short_state_id ? 
rangle : nil + end + end + + checker = Lrama::State::PslrCompatibilityChecker.new( + scanner_accepts, + length_prec + ) + + state = instance_double(Lrama::State, id: 0) + + expect(checker.profile(state, scanner_fsa)).to eq([ + [short_state_id, "RANGLE"], + [long_state_id, "RSHIFT"], + ]) + end + end + + describe "#group_by_profile" do + it "partitions states by scanner behavior" do + scanner_accepts = instance_double(Lrama::State::ScannerAccepts) + allow(scanner_accepts).to receive(:[]) do |state_id, fsa_state_id| + case [state_id, fsa_state_id] + when [0, short_state_id], [1, short_state_id] + rangle + when [0, long_state_id] + rshift + when [1, long_state_id] + nil + when [2, short_state_id] + rshift + when [2, long_state_id] + rshift + end + end + + checker = Lrama::State::PslrCompatibilityChecker.new( + scanner_accepts, + length_prec + ) + + state1 = instance_double(Lrama::State, id: 0) + state2 = instance_double(Lrama::State, id: 1) + state3 = instance_double(Lrama::State, id: 2) + + grouped = checker.group_by_profile([state1, state2, state3], scanner_fsa) + + expect(grouped.values.map(&:size)).to contain_exactly(1, 1, 1) + expect(grouped.keys).to include( + [[short_state_id, "RANGLE"], [long_state_id, "RSHIFT"]], + [[short_state_id, "RANGLE"], [long_state_id, nil]], + [[short_state_id, "RSHIFT"], [long_state_id, "RSHIFT"]], + ) + end + end +end diff --git a/spec/lrama/state/scanner_accepts_spec.rb b/spec/lrama/state/scanner_accepts_spec.rb new file mode 100644 index 000000000..56a0bbcf8 --- /dev/null +++ b/spec/lrama/state/scanner_accepts_spec.rb @@ -0,0 +1,394 @@ +# frozen_string_literal: true + +RSpec.describe Lrama::State::ScannerAccepts do + def ident(name) + Lrama::Lexer::Token::Ident.new(s_value: name) + end + + def token_pattern(name, regex, order) + Lrama::Grammar::TokenPattern.new( + id: ident(name), + pattern: Lrama::Lexer::Token::Regex.new(s_value: "/#{regex}/"), + lineno: 1, + definition_order: order + ) + end + + def shift_for(name) + symbol = 
instance_double( + Lrama::Grammar::Symbol, + term?: true, + id: ident(name) + ) + instance_double(Lrama::State::Action::Shift, next_sym: symbol) + end + + def parser_state(id, token_names) + instance_double( + Lrama::State, + id: id, + term_transitions: token_names.map {|name| shift_for(name) }, + reduces: [] + ) + end + + describe "#build and #[]" do + let(:rangle) { token_pattern("RANGLE", ">", 0) } + let(:rshift) { token_pattern("RSHIFT", ">>", 1) } + let(:scanner_fsa) { Lrama::ScannerFSA.new([rangle, rshift]) } + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + let(:length_prec) { Lrama::LengthPrecedences.new(lex_prec) } + + it "builds scanner_accepts from parser acceptable tokens" do + state = parser_state(0, ["RANGLE"]) + scanner_accepts = Lrama::State::ScannerAccepts.new( + [state], + scanner_fsa, + lex_prec, + length_prec + ) + + scanner_accepts.build + + accepting = scanner_fsa.states.find {|s| s.accepting_tokens.map(&:name).include?("RANGLE") } + expect(scanner_accepts[0, accepting.id].name).to eq("RANGLE") + end + + it "includes layout tokens in every parser-state accept set" do + div = token_pattern("DIV", "/", 0) + layout = token_pattern("YYLAYOUT_WS", "[ \\t]+", 1) + scanner_fsa = Lrama::ScannerFSA.new([div, layout]) + state = parser_state(0, ["DIV"]) + scanner_accepts = Lrama::State::ScannerAccepts.new( + [state], + scanner_fsa, + lex_prec, + Lrama::LengthPrecedences.new(lex_prec), + layout_token_names: Set["YYLAYOUT_WS"] + ) + + scanner_accepts.build + + accepting = scanner_fsa.states.find {|s| s.accepting_tokens.map(&:name).include?("YYLAYOUT_WS") } + expect(scanner_accepts[0, accepting.id].name).to eq("YYLAYOUT_WS") + end + end + + describe "complete conflict resolution" do + it "does not use declaration order for unresolved identity conflicts" do + tokens = [ + token_pattern("A", "a", 0), + token_pattern("B", "a", 1), + token_pattern("C", "a", 2) + ] + scanner_fsa = Lrama::ScannerFSA.new(tokens) + lex_prec = Lrama::Grammar::LexPrec.new + 
scanner_accepts = Lrama::State::ScannerAccepts.new( + [parser_state(0, ["A", "B", "C"])], + scanner_fsa, + lex_prec, + Lrama::LengthPrecedences.new(lex_prec) + ) + + scanner_accepts.build + + expect(scanner_accepts.unresolved_conflicts?).to be true + parser_rows = scanner_accepts.table.reject do |(parser_state_id, _scanner_state_id), _token| + parser_state_id == Lrama::State::ScannerAccepts::FALLBACK_ROW_ID + end + expect(parser_rows).to be_empty + expect(scanner_accepts.fallback_table.values.map(&:name)).to contain_exactly("A") + end + + it "selects a unique explicitly declared identity winner" do + tokens = [ + token_pattern("A", "a", 0), + token_pattern("B", "a", 1), + token_pattern("C", "a", 2) + ] + scanner_fsa = Lrama::ScannerFSA.new(tokens) + lex_prec = Lrama::Grammar::LexPrec.new + lex_prec.add_rule(left_token: ident("A"), operator: Lrama::Grammar::LexPrec::IDENTITY_RIGHT, right_token: ident("C"), lineno: 1) + lex_prec.add_rule(left_token: ident("B"), operator: Lrama::Grammar::LexPrec::IDENTITY_RIGHT, right_token: ident("C"), lineno: 1) + scanner_accepts = Lrama::State::ScannerAccepts.new( + [parser_state(0, ["A", "B", "C"])], + scanner_fsa, + lex_prec, + Lrama::LengthPrecedences.new(lex_prec) + ) + + scanner_accepts.build + + accepting = scanner_fsa.states.find(&:accepting?) 
+ expect(scanner_accepts[0, accepting.id].name).to eq("C") + expect(scanner_accepts.unresolved_conflicts?).to be false + end + + it "keeps conflicts finite for looped scanner states" do + id = token_pattern("ID", "[a-z]+", 0) + kw = token_pattern("IF", "if", 1) + scanner_fsa = Lrama::ScannerFSA.new([id, kw]) + lex_prec = Lrama::Grammar::LexPrec.new + scanner_accepts = Lrama::State::ScannerAccepts.new( + [parser_state(0, ["ID", "IF"])], + scanner_fsa, + lex_prec, + Lrama::LengthPrecedences.new(lex_prec) + ) + + scanner_accepts.build + + expect(scanner_accepts.conflicts.size).to be < 10 + end + + it "keeps an empty outcome distinct from an unresolved conflict" do + resolver = Lrama::State::ScannerAccepts::ProfileResolver.new( + Lrama::Grammar::LexPrec.new, + Lrama::LengthPrecedences.new(Lrama::Grammar::LexPrec.new) + ) + + outcome = resolver.resolve(Set.new, nil, Set.new) + + expect(outcome).to be_empty + expect(outcome).not_to be_unresolved + end + + it "uses same-token autolength without fallback mode" do + resolver = Lrama::State::ScannerAccepts::ProfileResolver.new( + Lrama::Grammar::LexPrec.new, + Lrama::LengthPrecedences.new(Lrama::Grammar::LexPrec.new) + ) + + outcome = resolver.resolve(Set["ID"], "ID", Set["ID"]) + + expect(outcome).to be_resolved + expect(outcome.token_name).to eq("ID") + end + + it "uses declaration order only in fallback mode" do + lex_prec = Lrama::Grammar::LexPrec.new + length_prec = Lrama::LengthPrecedences.new(lex_prec) + normal = Lrama::State::ScannerAccepts::ProfileResolver.new( + lex_prec, + length_prec, + token_order: { "A" => 1, "B" => 0 } + ) + fallback = Lrama::State::ScannerAccepts::ProfileResolver.new( + lex_prec, + length_prec, + fallback: true, + token_order: { "A" => 1, "B" => 0 } + ) + + expect(normal.resolve(Set.new, nil, Set["A", "B"])).to be_unresolved + expect(fallback.resolve(Set.new, nil, Set["A", "B"]).token_name).to eq("B") + end + + it "uses explicit identity precedence before fallback declaration order" do + 
lex_prec = Lrama::Grammar::LexPrec.new + lex_prec.add_rule(left_token: ident("A"), operator: Lrama::Grammar::LexPrec::IDENTITY_RIGHT, right_token: ident("B"), lineno: 1) + fallback = Lrama::State::ScannerAccepts::ProfileResolver.new( + lex_prec, + Lrama::LengthPrecedences.new(lex_prec), + fallback: true, + token_order: { "A" => 0, "B" => 1 } + ) + + expect(fallback.resolve(Set.new, nil, Set["A", "B"]).token_name).to eq("B") + end + + it "preserves explicit identity precedence when fallback length precedence is needed" do + tokens = [ + token_pattern("X", "x", 0), + token_pattern("SHORT", "a", 1), + token_pattern("A", "ab", 2), + token_pattern("B", "ab", 3) + ] + scanner_fsa = Lrama::ScannerFSA.new(tokens) + lex_prec = Lrama::Grammar::LexPrec.new + lex_prec.add_rule(left_token: ident("A"), operator: Lrama::Grammar::LexPrec::IDENTITY_RIGHT, right_token: ident("B"), lineno: 1) + scanner_accepts = Lrama::State::ScannerAccepts.new( + [parser_state(0, ["X"])], + scanner_fsa, + lex_prec, + Lrama::LengthPrecedences.new(lex_prec) + ) + + scanner_accepts.build + + accepting_ab = scanner_fsa.states.find do |state| + state.accepting_tokens.map(&:name).sort == ["A", "B"] + end + expect(scanner_accepts.fallback_table.fetch(accepting_ab.id).name).to eq("B") + end + + it "uses declaration order for fallback-only explicit identity cycles" do + tokens = [ + token_pattern("X", "x", 0), + token_pattern("A", "a", 1), + token_pattern("B", "a", 2), + token_pattern("C", "a", 3) + ] + scanner_fsa = Lrama::ScannerFSA.new(tokens) + lex_prec = Lrama::Grammar::LexPrec.new + lex_prec.add_rule(left_token: ident("A"), operator: Lrama::Grammar::LexPrec::IDENTITY_RIGHT, right_token: ident("B"), lineno: 1) + lex_prec.add_rule(left_token: ident("B"), operator: Lrama::Grammar::LexPrec::IDENTITY_RIGHT, right_token: ident("C"), lineno: 2) + lex_prec.add_rule(left_token: ident("C"), operator: Lrama::Grammar::LexPrec::IDENTITY_RIGHT, right_token: ident("A"), lineno: 3) + scanner_accepts = 
Lrama::State::ScannerAccepts.new( + [parser_state(0, ["X"])], + scanner_fsa, + lex_prec, + Lrama::LengthPrecedences.new(lex_prec) + ) + + scanner_accepts.build + + accepting = scanner_fsa.states.find {|state| state.accepting_tokens.map(&:name).sort == ["A", "B", "C"] } + expect(scanner_accepts.fallback_table.fetch(accepting.id).name).to eq("A") + end + + it "keeps normal parser-state rows strict for explicit identity cycles" do + tokens = [ + token_pattern("A", "a", 0), + token_pattern("B", "a", 1), + token_pattern("C", "a", 2) + ] + scanner_fsa = Lrama::ScannerFSA.new(tokens) + lex_prec = Lrama::Grammar::LexPrec.new + lex_prec.add_rule(left_token: ident("A"), operator: Lrama::Grammar::LexPrec::IDENTITY_RIGHT, right_token: ident("B"), lineno: 1) + lex_prec.add_rule(left_token: ident("B"), operator: Lrama::Grammar::LexPrec::IDENTITY_RIGHT, right_token: ident("C"), lineno: 2) + lex_prec.add_rule(left_token: ident("C"), operator: Lrama::Grammar::LexPrec::IDENTITY_RIGHT, right_token: ident("A"), lineno: 3) + scanner_accepts = Lrama::State::ScannerAccepts.new( + [parser_state(0, ["A", "B", "C"])], + scanner_fsa, + lex_prec, + Lrama::LengthPrecedences.new(lex_prec) + ) + + scanner_accepts.build + + accepting = scanner_fsa.states.find {|state| state.accepting_tokens.map(&:name).sort == ["A", "B", "C"] } + expect(scanner_accepts[0, accepting.id]).to be_nil + expect(scanner_accepts.unresolved_conflicts?).to be true + end + end + + describe Lrama::State::ScannerAccepts::CompatibilityChecker do + let(:rangle) { token_pattern("RANGLE", ">", 0) } + let(:rshift) { token_pattern("RSHIFT", ">>", 1) } + let(:scanner_fsa) { Lrama::ScannerFSA.new([rangle, rshift]) } + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + let(:checker) do + described_class.new(scanner_fsa, lex_prec, Lrama::LengthPrecedences.new(lex_prec)) + end + + it "treats a missing match on one side as irrelevant" do + a = token_pattern("A", "a", 0) + b = token_pattern("B", "b", 1) + fsa = Lrama::ScannerFSA.new([a, b]) 
+ checker = described_class.new(fsa, lex_prec, Lrama::LengthPrecedences.new(lex_prec)) + + expect(checker.compatible?(Set["A"], Set["B"])).to be true + end + + it "rejects different resolved outcomes when both sides match" do + expect(checker.compatible?(Set["RANGLE"], Set["RSHIFT"])).to be false + end + + it "rejects resolved versus unresolved outcomes" do + a = token_pattern("A", "a", 0) + b = token_pattern("B", "a", 1) + fsa = Lrama::ScannerFSA.new([a, b]) + checker = described_class.new(fsa, lex_prec, Lrama::LengthPrecedences.new(lex_prec)) + + expect(checker.compatible?(Set["A"], Set["A", "B"])).to be false + end + + it "accepts unresolved outcomes on both sides" do + a = token_pattern("A", "a", 0) + b = token_pattern("B", "a", 1) + fsa = Lrama::ScannerFSA.new([a, b]) + checker = described_class.new(fsa, lex_prec, Lrama::LengthPrecedences.new(lex_prec)) + + expect(checker.compatible?(Set["A", "B"], Set["A", "B"])).to be true + end + end + + describe "lexical ties" do + it "expands acc(sp) through tie closure" do + id = token_pattern("ID", "[a-z]+", 0) + kw = token_pattern("IF", "if", 1) + scanner_fsa = Lrama::ScannerFSA.new([id, kw]) + lex_prec = Lrama::Grammar::LexPrec.new + lex_prec.add_rule(left_token: ident("ID"), operator: Lrama::Grammar::LexPrec::IDENTITY_RIGHT_LONGEST, right_token: ident("IF"), lineno: 1) + lex_tie = Lrama::Grammar::LexTie.new + lex_tie.add_tie("ID", "IF") + scanner_accepts = Lrama::State::ScannerAccepts.new( + [parser_state(0, ["ID"])], + scanner_fsa, + lex_prec, + Lrama::LengthPrecedences.new(lex_prec), + lex_tie + ) + + scanner_accepts.build + + accepting = scanner_fsa.states.find {|state| state.accepting_tokens.map(&:name).include?("IF") } + expect(scanner_accepts[0, accepting.id].name).to eq("IF") + end + + it "does not expand tokens tied only through layout injection" do + div = token_pattern("DIV", "/", 0) + layout = token_pattern("YYLAYOUT_WS", "[ \\t]+", 1) + layout_alias = token_pattern("LAYOUT_ALIAS", "[ \\t]+", 2) + 
scanner_fsa = Lrama::ScannerFSA.new([div, layout, layout_alias]) + lex_prec = Lrama::Grammar::LexPrec.new + lex_tie = Lrama::Grammar::LexTie.new + lex_tie.add_tie("YYLAYOUT_WS", "LAYOUT_ALIAS") + scanner_accepts = Lrama::State::ScannerAccepts.new( + [parser_state(0, ["DIV"])], + scanner_fsa, + lex_prec, + Lrama::LengthPrecedences.new(lex_prec), + lex_tie, + layout_token_names: Set["YYLAYOUT_WS"] + ) + + acc_sp = scanner_accepts.send(:compute_acc_sp, parser_state(0, ["DIV"])) + + expect(acc_sp).to contain_exactly("DIV", "YYLAYOUT_WS") + end + end + + describe "pure reduce states" do + let(:rangle) { token_pattern("RANGLE", ">", 0) } + let(:rshift) { token_pattern("RSHIFT", ">>", 1) } + let(:scanner_fsa) { Lrama::ScannerFSA.new([rangle, rshift]) } + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + let(:length_prec) { Lrama::LengthPrecedences.new(lex_prec) } + let(:reduce) { instance_double(Lrama::State::Action::Reduce) } + let(:parser_state) do + instance_double( + Lrama::State, + term_transitions: [], + reduces: [reduce], + ) + end + + it "uses propagated item lookaheads when explicit reduce lookahead is absent" do + allow(parser_state).to receive(:acceptable_pslr_reduce_lookahead).with(reduce).and_return([ + instance_double(Lrama::Grammar::Symbol, id: ident("RANGLE")), + instance_double(Lrama::Grammar::Symbol, id: ident("RSHIFT")), + ]) + + scanner_accepts = Lrama::State::ScannerAccepts.new( + [parser_state], + scanner_fsa, + lex_prec, + length_prec + ) + + expect(scanner_accepts.send(:compute_acc_sp, parser_state).to_a).to contain_exactly("RANGLE", "RSHIFT") + end + end +end diff --git a/spec/lrama/states_spec.rb b/spec/lrama/states_spec.rb index 28c217e2a..d864b0323 100644 --- a/spec/lrama/states_spec.rb +++ b/spec/lrama/states_spec.rb @@ -3158,5 +3158,502 @@ class : keyword_class tSTRING keyword_end %prec tPLUS expect(logger).not_to have_received(:error) end end + + context "when unresolved PSLR inadequacies remain" do + let(:header) do + <<~STR + %define 
lr.type pslr + %token-pattern RSHIFT />>/ + %token-pattern RANGLE />/ + %lex-no-tie RANGLE RSHIFT + + %% + + program: RSHIFT | RANGLE + STR + end + + it "fails fast instead of silently generating a parser" do + grammar = Lrama::Parser.new(header, "states/pslr_inadequacy.y").parse + grammar.prepare + grammar.validate! + states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + states.compute + states.instance_variable_set( + :@pslr_inadequacies, + [ + Lrama::State::PslrInadequacy.new( + type: Lrama::State::PslrInadequacy::PSLR_RELATIVE, + state: instance_double(Lrama::State, id: 3), + conflicting_states: [instance_double(Lrama::State, id: 3), instance_double(Lrama::State, id: 4)], + details: { reason: "Scanner behavior differs between isocore states" } + ) + ] + ) + logger = Lrama::Logger.new + allow(logger).to receive(:error) + + expect { states.validate!(logger) }.to raise_error(SystemExit) + expect(logger).to have_received(:error).with(include("PSLR Inadequacy")) + end + end + end + + describe "PSLR split helpers" do + let(:y) do + <<~GRAMMAR + %define lr.type pslr + %token-pattern RSHIFT />>/ + %token-pattern RANGLE />/ + %lex-no-tie RANGLE RSHIFT + + %% + + program: RSHIFT | RANGLE + GRAMMAR + end + + let(:grammar) do + g = Lrama::Parser.new(y, "states/pslr_split.y").parse + g.prepare + g.validate! 
+ g + end + + let(:states) { Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) } + let(:kernel_item) { instance_double(Lrama::State::Item, end_of_rule?: true) } + let(:reduce) { instance_double(Lrama::State::Action::Reduce, item: kernel_item, look_ahead: [grammar.find_symbol_by_s_value!("RSHIFT")]) } + let(:mock_state) do + instance_double( + Lrama::State, + is_compatible?: true, + kernels: [kernel_item], + term_transitions: [], + reduces: [reduce], + acceptable_reduce_lookahead: [grammar.find_symbol_by_s_value!("RSHIFT")], + acceptable_pslr_reduce_lookahead: [grammar.find_symbol_by_s_value!("RSHIFT")], + ) + end + + before do + states.instance_variable_set(:@scanner_fsa, Lrama::ScannerFSA.new(grammar.token_patterns)) + states.instance_variable_set(:@pslr_split_enabled, true) + end + + it "derives different PSLR signatures from different propagated lookaheads" do + current = states.send(:pslr_state_signature, mock_state) + filtered = states.send( + :pslr_state_signature, + mock_state, + { kernel_item => [grammar.find_symbol_by_s_value!("RANGLE")] }, + ) + + expect(current.map(&:last)).to include("RSHIFT") + expect(current.map(&:last)).not_to include("RANGLE") + expect(filtered.map(&:last)).to include("RANGLE") + expect(filtered.map(&:last)).not_to include("RSHIFT") + end + + it "treats states with different PSLR signatures as incompatible during splitting" do + filtered_lookaheads = { kernel_item => [grammar.find_symbol_by_s_value!("RANGLE")] } + expect(states.send(:compatible_split_state?, mock_state, filtered_lookaheads)).to be false + end + + it "detects unresolved PSLR inadequacies per transition" do + propagated = { kernel_item => [grammar.find_symbol_by_s_value!("RANGLE")] } + matching_state = instance_double(Lrama::State, id: 8) + next_state = instance_double(Lrama::State, id: 4) + transition_symbol = instance_double( + Lrama::Grammar::Symbol, + id: instance_double(Lrama::Lexer::Token::Ident, s_value: "RSHIFT"), + ) + transition = 
instance_double(Lrama::State::Action::Shift, to_state: next_state, next_sym: transition_symbol) + from_state = instance_double( + Lrama::State, + id: 1, + transitions: [transition], + propagate_lookaheads_without_filter: propagated, + ) + + allow(next_state).to receive(:lalr_isocore).and_return(next_state) + allow(next_state).to receive(:ielr_isocores).and_return([next_state, matching_state]) + allow(states).to receive(:pslr_state_signature).with(next_state, propagated).and_return([[1, "RANGLE"]]) + allow(states).to receive(:pslr_state_signature).with(next_state).and_return([[1, "RSHIFT"]]) + allow(states).to receive(:pslr_state_signature).with(matching_state).and_return([[1, "RANGLE"]]) + allow(states).to receive(:acceptable_tokens_for_pslr).with(next_state, propagated).and_return(Set["RANGLE"]) + allow(states).to receive(:acceptable_tokens_for_pslr).with(next_state).and_return(Set["RSHIFT"]) + allow(states).to receive(:acceptable_tokens_for_pslr).with(matching_state).and_return(Set["RANGLE"]) + states.instance_variable_set(:@states, [from_state]) + + inadequacies = states.send(:detect_pslr_inadequacies) + + expect(inadequacies.size).to eq(1) + expect(inadequacies.first.details[:matching_state_id]).to eq(8) + expect(inadequacies.first.details[:transition_symbol]).to eq("RSHIFT") + end + + it "merges propagated lookaheads into an existing split state" do + current_lookaheads = { kernel_item => [grammar.find_symbol_by_s_value!("RSHIFT")] } + incoming_lookaheads = { kernel_item => [grammar.find_symbol_by_s_value!("RANGLE")] } + target_state = instance_double(Lrama::State, lookaheads_recomputed: true) + transition = instance_double(Lrama::State::Action::Shift, to_state: target_state) + split_state = instance_double( + Lrama::State, + kernels: [kernel_item], + item_lookahead_set: current_lookaheads, + transitions: [transition], + ) + + allow(split_state).to receive(:item_lookahead_set=) + + states.send(:merge_lookaheads, split_state, incoming_lookaheads) + + 
expect(split_state).to have_received(:item_lookahead_set=).with( + kernel_item => [grammar.find_symbol_by_s_value!("RSHIFT"), grammar.find_symbol_by_s_value!("RANGLE")], + ) + end + end + + describe "PSLR pure-reduce profile regression" do + let(:y) do + <<~GRAMMAR + %define lr.type pslr + %token-pattern RSHIFT />>/ + %token-pattern RANGLE />/ + %token-pattern ID /[a-z]+/ + %lex-prec RANGLE -~ RSHIFT + + %% + + program + : templ + | rshift_expr + ; + + templ + : a RANGLE + ; + + rshift_expr + : a RSHIFT ID + ; + + a + : ID + ; + GRAMMAR + end + + let(:grammar) do + g = Lrama::Parser.new(y, "states/pslr_pure_reduce.y").parse + g.prepare + g.validate! + g + end + + it "keeps pure reduce states scanner-compatible without forcing a split" do + ielr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + ielr_states.compute + ielr_states.compute_ielr + + pslr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + pslr_states.compute + pslr_states.compute_pslr + + reduce_state = pslr_states.states.find do |state| + state.reduces.any? { |reduce| reduce.rule.display_name == "a -> ID" } + end + + expect(pslr_states.states_count).to eq(ielr_states.states_count) + expect(pslr_states.pslr_inadequacies).to be_empty + expect(pslr_states.send(:acceptable_tokens_for_pslr, reduce_state).to_a).to contain_exactly("RANGLE", "RSHIFT") + end + end + + describe "PSLR chained keyword split regression" do + let(:y) do + <<~GRAMMAR + %define lr.type pslr + %token-pattern P /p/ + %token-pattern Q /q/ + %token-pattern X /x/ + %token-pattern IF /if/ + %token-pattern ID /[a-z]+/ + %lex-prec ID <~ IF + + %% + + program + : kw_context + | id_context + ; + + kw_context + : P shared IF + ; + + id_context + : Q shared ID + ; + + shared + : n1 + ; + + n1 + : n2 + ; + + n2 + : X + ; + GRAMMAR + end + + let(:grammar) do + g = Lrama::Parser.new(y, "states/pslr_keyword_context.y").parse + g.prepare + g.validate! 
+ g + end + + it "splits every chained reduce state by scanner profile" do + ielr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + ielr_states.compute + ielr_states.compute_ielr + + pslr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + pslr_states.compute + pslr_states.compute_pslr + + reduce_states = pslr_states.states + .select { |state| state.reduces.any? } + .group_by { |state| state.reduces.first.rule.display_name } + + expect(pslr_states.states_count).to be > ielr_states.states_count + expect(pslr_states.pslr_inadequacies).to be_empty + + ["shared -> n1", "n1 -> n2", "n2 -> X"].each do |rule_name| + states_for_rule = reduce_states.fetch(rule_name) + token_sets = states_for_rule.map { |state| pslr_states.send(:acceptable_tokens_for_pslr, state) } + + expect(states_for_rule.size).to eq(2) + expect(states_for_rule.count(&:split_state?)).to eq(1) + expect(token_sets.any? { |set| set.include?("IF") && !set.include?("ID") }).to be(true) + expect(token_sets.any? { |set| set.include?("ID") && !set.include?("IF") }).to be(true) + end + end + end + + describe "PSLR chained shift/angle split regression" do + let(:y) do + <<~GRAMMAR + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %token-pattern ID /[a-z]+/ + %lex-no-tie RANGLE RSHIFT + + %% + + program + : template_expr + | shift_expr + ; + + template_expr + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + shared + : n1 + ; + + n1 + : n2 + ; + + n2 + : MARK + ; + GRAMMAR + end + + let(:grammar) do + g = Lrama::Parser.new(y, "states/pslr_shift_chain.y").parse + g.prepare + g.validate! 
+ g + end + + it "splits every chained reduce state by shift/angle scanner profile" do + ielr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + ielr_states.compute + ielr_states.compute_ielr + + pslr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + pslr_states.compute + pslr_states.compute_pslr + + reduce_states = pslr_states.states + .select { |state| state.reduces.any? } + .group_by { |state| state.reduces.first.rule.display_name } + + expect(pslr_states.states_count).to be > ielr_states.states_count + expect(pslr_states.pslr_inadequacies).to be_empty + + ["shared -> n1", "n1 -> n2", "n2 -> MARK"].each do |rule_name| + states_for_rule = reduce_states.fetch(rule_name) + token_sets = states_for_rule.map { |state| pslr_states.send(:acceptable_tokens_for_pslr, state) } + + expect(states_for_rule.size).to eq(2) + expect(states_for_rule.count(&:split_state?)).to eq(1) + expect(token_sets.any? { |set| set.include?("RANGLE") && !set.include?("RSHIFT") }).to be(true) + expect(token_sets.any? 
{ |set| set.include?("RSHIFT") && !set.include?("RANGLE") }).to be(true) + end + end + end + + describe "PSLR mixed family regressions" do + { + "empty shared wrapper" => { + path: "states/pslr_mixed_empty.y", + grows: true, + grammar: <<~GRAMMAR, + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %lex-prec ID <~ IF + %lex-no-tie RANGLE RSHIFT + + %% + + program + : kw + | ident + | templ + | shift_expr + ; + + kw + : P shared IF + ; + + ident + : Q shared ID + ; + + templ + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + shared + : opt n1 + ; + + opt + : + ; + + n1 + : MARK + ; + GRAMMAR + }, + "chain2 shared wrapper" => { + path: "states/pslr_mixed_chain2.y", + grows: true, + grammar: <<~GRAMMAR, + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %lex-prec ID <~ IF + %lex-no-tie RANGLE RSHIFT + + %% + + program + : kw + | ident + | templ + | shift_expr + ; + + kw + : P shared IF + ; + + ident + : Q shared ID + ; + + templ + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + shared + : n1 + ; + + n1 + : n2 + ; + + n2 + : MARK + ; + GRAMMAR + } + }.each do |label, attrs| + it "keeps #{label} scanner-compatible" do + grammar = Lrama::Parser.new(attrs[:grammar], attrs[:path]).parse + grammar.prepare + grammar.validate! 
+ + ielr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + ielr_states.compute + ielr_states.compute_ielr + + pslr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + pslr_states.compute + pslr_states.compute_pslr + + if attrs[:grows] + expect(pslr_states.states_count).to be > ielr_states.states_count + else + expect(pslr_states.states_count).to eq(ielr_states.states_count) + end + expect(pslr_states.pslr_inadequacies).to be_empty + end + end end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index c832a1cea..338404c19 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -26,6 +26,9 @@ end require "lrama" +Dir[File.expand_path("support/**/*.rb", __dir__)].sort.each do |file| + require file +end module RSpecHelper def fixture_path(file_name) diff --git a/spec/support/pslr_family_helper.rb b/spec/support/pslr_family_helper.rb new file mode 100644 index 000000000..77934b9b8 --- /dev/null +++ b/spec/support/pslr_family_helper.rb @@ -0,0 +1,159 @@ +# frozen_string_literal: true + +module PslrFamilyHelper + def build_grammar(source, path) + grammar = Lrama::Parser.new(source, path).parse + grammar.prepare + grammar.validate! + grammar + end + + def compute_ielr_and_pslr(grammar) + ielr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + ielr_states.compute + ielr_states.compute_ielr + + pslr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + pslr_states.compute + pslr_states.compute_pslr + + [ielr_states, pslr_states] + end + + def acceptable_tokens(states, state) + states.send(:acceptable_tokens_for_pslr, state).to_a + end + + def shared_chain_rules(name:, terminal:, depth:, prefix: "n") + return <<~RULES if depth.zero? 
+ #{name} + : #{terminal} + ; + RULES + + rules = [<<~RULE] + #{name} + : #{prefix}1 + ; + RULE + + 1.upto(depth - 1) do |index| + rules << <<~RULE + #{prefix}#{index} + : #{prefix}#{index + 1} + ; + RULE + end + + rules << <<~RULE + #{prefix}#{depth} + : #{terminal} + ; + RULE + + rules.join("\n") + end + + def keyword_context_source(depth:) + <<~GRAMMAR + %define lr.type pslr + %token-pattern P /p/ + %token-pattern Q /q/ + %token-pattern X /x/ + %token-pattern IF /if/ + %token-pattern ID /[a-z]+/ + %lex-prec ID <~ IF + + %% + + program + : kw_context + | id_context + ; + + kw_context + : P shared IF + ; + + id_context + : Q shared ID + ; + + #{shared_chain_rules(name: "shared", terminal: "X", depth: depth)} + GRAMMAR + end + + def shift_angle_source(depth:) + <<~GRAMMAR + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %token-pattern ID /[a-z]+/ + %lex-no-tie RANGLE RSHIFT + + %% + + program + : template_expr + | shift_expr + ; + + template_expr + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + #{shared_chain_rules(name: "shared", terminal: "MARK", depth: depth)} + GRAMMAR + end + + def mixed_context_source(depth:) + <<~GRAMMAR + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %lex-prec ID <~ IF + %lex-no-tie RANGLE RSHIFT + + %% + + program + : kw_context + | id_context + | template_expr + | shift_expr + ; + + kw_context + : P shared IF + ; + + id_context + : Q shared ID + ; + + template_expr + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + #{shared_chain_rules(name: "shared", terminal: "MARK", depth: depth)} + GRAMMAR + end +end diff --git a/template/bison/_yacc.h b/template/bison/_yacc.h index 3e270c917..7cf4b6b21 100644 --- a/template/bison/_yacc.h +++ b/template/bison/_yacc.h @@ -71,6 +71,10 @@ struct YYLTYPE <%-# b4_declare_yyparse -%> int yyparse (<%= output.parse_param %>); +<%- if output.pslr_enabled? 
-%> +<%= output.pslr_function_declarations %> +<%- end -%> + <%= output.percent_code("provides") %> <%-# b4_cpp_guard_close([b4_spec_mapped_header_file]) -%> diff --git a/template/bison/yacc.c b/template/bison/yacc.c index 6edd59a0d..d33950691 100644 --- a/template/bison/yacc.c +++ b/template/bison/yacc.c @@ -68,6 +68,10 @@ #define YYPULL 1 +<%- if output.lexer_context_enabled? -%> +<%= output.lexer_context_defines_code %> +<%- end -%> + <%# b4_user_pre_prologue -%> <%- if output.aux.prologue -%> /* First part of user prologue. */ @@ -582,6 +586,206 @@ static const <%= output.int_type_for(output.context.yyr2) %> yyr2[] = <%= output.int_array_to_string(output.context.yyr2) %> };
+<%- if output.pslr_enabled? -%>
+<%= output.pslr_function_declarations %>
+<%- end -%>
+
+<%- if output.pslr_enabled? -%>
+<%= output.pslr_tables_and_functions %>
+<%- end -%>
+
+<%- if output.lexer_context_enabled? -%>
+<%= output.lexer_context_table_code %>
+<%- end -%>
+
+<%- if output.pslr_enabled? -%>
+#ifndef YYSETSTATE_CONTEXT
+# define YYSETSTATE_CONTEXT(CurrentState) ((void) 0)
+#endif
+<%- end -%>
+
+/* Return 1 when the action tables hold an explicit, non-error entry for
+   token YYCHAR in state YYSTATE, and 0 otherwise.  */
+int
+yy_state_accepts_token (int yystate, int yychar)
+{
+  yysymbol_kind_t yysym = YYTRANSLATE (yychar);
+  int yyidx = yypact[yystate];
+  int yyact;
+
+  /* A default row or a yycheck mismatch means there is no explicit entry.  */
+  if (yypact_value_is_default (yyidx))
+    return 0;
+  yyidx += yysym;
+  if (yyidx < 0 || YYLAST < yyidx || yycheck[yyidx] != yysym)
+    return 0;
+
+  yyact = yytable[yyidx];
+  return yyact > 0 || !yytable_value_is_error (yyact);
+}
+
+/*
+ * Like yy_state_accepts_token, but also follows chains of default reductions
+ * where the rule has zero symbols on the right-hand side (yyr2 == 0).
+ * This allows the lexer to see tokens that become visible only after
+ * empty productions are reduced (e.g., opt_terms -> epsilon).
+ *
+ * Returns 1 if the token would be accepted in the current state or in a
+ * state reachable via a chain of empty default reductions; 0 otherwise.
+ */
+int
+yy_state_eventually_accepts_token (int yystate, int yychar)
+{
+  yysymbol_kind_t yytoken = YYTRANSLATE (yychar);
+  /* States whose empty default reduction was already followed; bounds
+     the walk when empty reductions form a cycle.  */
+  int visited[64];
+  int visited_count = 0;
+
+  for (;;)
+    {
+      int yyn = yypact[yystate];
+      int rule, lhs, goto_state;
+      int i;
+
+      /* 1. An explicit action-table entry for the token settles the
+         question: a non-error entry accepts it, and an explicit error
+         entry rejects it, exactly as yy_state_accepts_token does.  */
+      if (!yypact_value_is_default (yyn))
+        {
+          yyn += yytoken;
+          if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == yytoken)
+            {
+              yyn = yytable[yyn];
+              return yyn > 0 || !yytable_value_is_error (yyn);
+            }
+        }
+
+      /* 2. Otherwise follow the default reduction, but only when it is
+         an empty rule: nothing is popped, so no stack is needed.  */
+      rule = yydefact[yystate];
+      if (rule == 0 || yyr2[rule] != 0)
+        return 0;
+
+      /* Cycle guard: give up on a revisited state or after 64 states.  */
+      for (i = 0; i < visited_count; i++)
+        if (visited[i] == yystate)
+          return 0;
+      if (visited_count == 64)
+        return 0;
+      visited[visited_count++] = yystate;
+
+      /* GOTO on the rule's left-hand side from the current state: use
+         the packed table when its check entry matches, else the
+         default GOTO.  */
+      lhs = yyr1[rule] - YYNTOKENS;
+      goto_state = yypgoto[lhs] + yystate;
+      if (0 <= goto_state && goto_state <= YYLAST
+          && yycheck[goto_state] == yystate)
+        yystate = yytable[goto_state];
+      else
+        yystate = yydefgoto[lhs];
+    }
+}
+
+/*
+ * Like yy_state_eventually_accepts_token, but also follows non-empty
+ * default reductions by using the actual parser stack to determine
+ * GOTO states. This allows the lexer to see tokens that become visible
+ * after reductions like stmt -> expr (yyr2 > 0).
+ *
+ * stack_base and stack_top point to the parser's state stack (yy_state_t).
+ * Returns 1 if the token is reachable; 0 otherwise.
+ */
+int
+yy_state_deep_accepts_token (int yystate, int yychar,
+                             const void *stack_base_v, const void *stack_top_v)
+{
+  /* yyparse stores yy_state_t elements on the state stack (see its yyssp
+     store), so read the stack through that exact type instead of
+     assuming a fixed width such as short.  */
+  const yy_state_t *stack_base = (const yy_state_t *) stack_base_v;
+  const yy_state_t *stack_top = (const yy_state_t *) stack_top_v;
+  yysymbol_kind_t yytoken = YYTRANSLATE (yychar);
+  int visited[64];        /* states already expanded (cycle guard) */
+  int pushed[64];         /* states pushed by simulated reductions */
+  int visited_count = 0;
+  int pushed_count = 0;
+  int popped = 0;         /* real entries popped, counted from stack_top */
+
+  if (!stack_base || !stack_top)
+    return 0;
+
+  for (;;)
+    {
+      int yyn = yypact[yystate];
+      int rule, rhs_len, lhs;
+      int goto_state, uncovered_state;
+      int i;
+
+      /* 1. An explicit action-table entry for the token settles the
+         question: a non-error entry accepts it, and an explicit error
+         entry rejects it, exactly as yy_state_accepts_token does.  */
+      if (!yypact_value_is_default (yyn))
+        {
+          yyn += yytoken;
+          if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == yytoken)
+            {
+              yyn = yytable[yyn];
+              return yyn > 0 || !yytable_value_is_error (yyn);
+            }
+        }
+
+      /* 2. Otherwise simulate the default reduction, if there is one.  */
+      rule = yydefact[yystate];
+      if (rule == 0)
+        return 0;
+
+      /* Cycle guard: expand each state at most once and at most 64 states
+         in total, which also caps pushed[] at one entry per iteration.  */
+      for (i = 0; i < visited_count; i++)
+        if (visited[i] == yystate)
+          return 0;
+      if (visited_count == 64)
+        return 0;
+      visited[visited_count++] = yystate;
+
+      /* Pop the right-hand side off the simulated stack: states pushed by
+         earlier simulated reductions go first, then real stack entries.  */
+      rhs_len = yyr2[rule];
+      if (rhs_len <= pushed_count)
+        pushed_count -= rhs_len;
+      else
+        {
+          popped += rhs_len - pushed_count;
+          pushed_count = 0;
+        }
+
+      /* The GOTO is taken from the state left on top after the pop.  An
+         empty rule pops nothing, so that state is the current one.  */
+      if (rhs_len == 0)
+        uncovered_state = yystate;
+      else if (pushed_count > 0)
+        uncovered_state = pushed[pushed_count - 1];
+      else if (popped <= stack_top - stack_base)
+        uncovered_state = (int) stack_top[-popped];
+      else
+        return 0;           /* Stack too shallow.  */
+
+      lhs = yyr1[rule] - YYNTOKENS;
+      goto_state = yypgoto[lhs] + uncovered_state;
+      if (0 <= goto_state && goto_state <= YYLAST
+          && yycheck[goto_state] == uncovered_state)
+        yystate = yytable[goto_state];
+      else
+        yystate = yydefgoto[lhs];
+
+      /* Record the push: a later reduction must pop this state, and may
+         take its GOTO from it, rather than from a stale entry of the
+         real stack.  */
+      pushed[pushed_count++] = yystate;
+    }
+}
enum { YYENOMEM = -2 }; @@ -1582,6 +1786,9 @@ YYLTYPE yylloc = yyloc_default; YY_IGNORE_USELESS_CAST_BEGIN *yyssp = YY_CAST (yy_state_t, yystate); YY_IGNORE_USELESS_CAST_END +<%- if output.pslr_enabled? -%> + YYSETSTATE_CONTEXT (yystate); +<%- end -%> YY_STACK_PRINT (yyss, yyssp<%= output.user_args %>); if (yyss + yystacksize - 1 <= yyssp) @@ -2065,4 +2272,3 @@ YYLTYPE yylloc = yyloc_default; #line <%= output.aux.epilogue_first_lineno - 1 %> "<%= output.grammar_file_path %>" <%= output.aux.epilogue -%> <%- end -%> -