diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..b8cf526 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,14 @@ +*.json.gz binary + +# Keep csmith corpus + plugin sources as LF on all platforms so the +# byte-exact fixtures (built on Linux) match the parser's output on +# Windows checkouts too. Without this, git's autocrlf would rewrite +# .c files to CRLF on Windows and the parser's token spans/sources +# would diverge from the committed LF fixtures. +*.c text eol=lf +*.h text eol=lf +*.ts text eol=lf +*.json text eol=lf +*.jsonic text eol=lf +*.md text eol=lf +*.yml text eol=lf diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 86381d6..dfa7b49 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -26,25 +26,3 @@ jobs: - run: npm i - run: npm run build --if-present - run: npm test - - build-go: - - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - - runs-on: ${{ matrix.os }} - - steps: - - uses: actions/checkout@v4 - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.24' - - name: Build - working-directory: go - run: go build ./... - - name: Test - working-directory: go - run: go test -v ./... diff --git a/.gitignore b/.gitignore index aa45c2f..90bb357 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ lib-cov *.out *.pid *.gz +!test/csmith-fixtures/*.json.gz pids logs @@ -34,3 +35,4 @@ yarn.lock # Go go/vendor/ +platform.info diff --git a/Makefile b/Makefile index 3558e3d..137b5db 100644 --- a/Makefile +++ b/Makefile @@ -1,55 +1,15 @@ -.PHONY: all build test clean embed build-ts build-go test-ts test-go clean-ts clean-go publish-go tags-go tidy-go reset +.PHONY: all build test clean reset all: build test -build: build-ts build-go - -test: test-ts test-go - -clean: clean-ts clean-go - -# Embed jsonc-grammar.jsonic into src/jsonc.ts and go/jsonc.go. 
-embed: - node embed-grammar.js - -# TypeScript -build-ts: +build: npm run build -test-ts: +test: npm test -clean-ts: +clean: rm -rf dist dist-test -# Go -build-go: embed - cd go && go build ./... - -test-go: - cd go && go test ./... - -clean-go: - cd go && go clean -cache - -# Publish Go module: make publish-go V=0.1.1 -publish-go: test-go - @test -n "$(V)" || (echo "Usage: make publish-go V=x.y.z" && exit 1) - sed -i '' 's/^const Version = ".*"/const Version = "$(V)"/' go/jsonc.go - git add go/jsonc.go - git commit -m "go: v$(V)" - git tag go/v$(V) - git push origin main go/v$(V) - if command -v gh >/dev/null 2>&1; then gh release create go/v$(V) --title "go/v$(V)" --notes "Go module release v$(V)"; fi - -tidy-go: - cd go && go mod tidy - -tags-go: - git tag -l 'go/v*' --sort=-version:refname - reset: npm run reset - cd go && go clean -cache - cd go && go build ./... - cd go && go test -v ./... diff --git a/README.md b/README.md index c722c0e..e9237f9 100644 --- a/README.md +++ b/README.md @@ -1,243 +1,201 @@ -# @jsonic/jsonc +# @jsonic/c -This plugin allows the [Jsonic](https://jsonic.senecajs.org) JSON parser -to parse [JSONC](https://github.com/microsoft/node-jsonc-parser) format -files (JSON with Comments). - -JSONC is a strict superset of JSON that adds single-line (`//`) and -block (`/* */`) comments. Trailing commas in objects and arrays can be -optionally enabled. 
- -[![npm version](https://img.shields.io/npm/v/@jsonic/jsonc.svg)](https://npmjs.com/package/@jsonic/jsonc) -[![build](https://github.com/jsonicjs/jsonc/actions/workflows/build.yml/badge.svg)](https://github.com/jsonicjs/jsonc/actions/workflows/build.yml) -[![Coverage Status](https://coveralls.io/repos/github/jsonicjs/jsonc/badge.svg?branch=main)](https://coveralls.io/github/jsonicjs/jsonc?branch=main) -[![Known Vulnerabilities](https://snyk.io/test/github/jsonicjs/jsonc/badge.svg)](https://snyk.io/test/github/jsonicjs/jsonc) - - -| ![Voxgig](https://www.voxgig.com/res/img/vgt01r.png) | This open source module is sponsored and supported by [Voxgig](https://www.voxgig.com). | -| ---------------------------------------------------- | --------------------------------------------------------------------------------------- | - - -The documentation below is organized along the -[Diátaxis](https://diataxis.fr) quadrants: - -- [Quick start](#quick-start) — tutorial -- [How-to guides](#how-to-guides) — task recipes -- [Reference](#reference) — API surface -- [JSONC format](#jsonc-format) — explanation +A [Jsonic](https://jsonic.senecajs.org) plugin that parses **C source code** +into a **concrete syntax tree** — preserving every token, comment, macro +definition, macro use, and compiler extension as-is. +Targets **C23** plus the common **GCC / Clang / MSVC** extensions, with +best-effort handling of preprocessor conditional groups. ## Quick start -### TypeScript +```ts +import { Jsonic } from 'jsonic' +import { C } from '@jsonic/c' -Install: +const j = Jsonic.make().use(C) -```bash -npm install @jsonic/jsonc @jsonic/jsonic-next +const cst = j(` + typedef int T; + T x = 1; +`) +// cst.kind === 'translation_unit' +// cst.children = [external_declaration{declKind:'declaration'}, ...] 
``` -Parse: - -```typescript -import { Jsonic } from '@jsonic/jsonic-next' -import { Jsonc } from '@jsonic/jsonc' - -const j = Jsonic.make().use(Jsonc) +Walk the tree depth-first and concatenate every `kind:'token'` `src` to +recover the original source byte-for-byte (modulo whitespace, whose +positions are preserved on token spans). -const result = j('{ "name": "app", /* version */ "version": "1.0" }') -// => { name: 'app', version: '1.0' } -``` +## Architecture -### Go +- **Focused lex matchers** (`src/matchers.ts`): one matcher per concept — + whitespace, line continuation, line/block comments, preprocessor + directive opener (line-start gated), directive newline, header name, + identifier (with keyword/typedef-name/macro-name reclassification), + integer/float/char/string literals, and longest-match punctuator + dispatch. -Install: +- **Symbol & macro tables** (`src/symbols.ts`): scope stack and macro + lookup live on `ctx.meta.cmeta` so both lex matchers and rule + actions share state. Lex matchers consult the tables when + classifying identifiers; rule actions register names when they + finalize a `typedef` or `#define`. Pre-lexed lookahead tokens are + reclassified in place so the very next match sees the updated + classification immediately. -```bash -go get github.com/jsonicjs/jsonc/go -``` +- **Token catalog** (`src/tokens.ts`): every C23 keyword, every + compiler-extension keyword, and every punctuator gets its own named + token. Grammar rules and structuring code reference these names + directly. -Parse: +- **Coarse-grained jsonic grammar** (`src/c.ts`): `translation_unit` + opens an `extdecl_loop` that absorbs tokens into per-declaration + chomp nodes terminating at top-level `;` or `}` (with PP_NEWLINE + for directives). Directive lines get terminated separately so each + `#…` is its own external_declaration. 
-```go -package main +- **Recursive-descent structuring** (`src/structure.ts`, + `src/expr.ts`): a post-pass over each chomped token list produces + the structured concrete-syntax tree. Walking depth-first yields the + original tokens in source order. -import ( - "fmt" - jsonic "github.com/jsonicjs/jsonic/go" - jsonc "github.com/jsonicjs/jsonc/go" -) +## Concrete-syntax shapes -func main() { - j := jsonic.Make() - j.Use(jsonc.Jsonc) +Every node carries `{kind, span, children, trivia}` plus per-kind +fields. Highlights: - result, err := j.Parse(`{ "name": "app", /* version */ "version": "1.0" }`) - if err != nil { - panic(err) - } - fmt.Println(result) - // => map[name:app version:1.0] -} ``` - - -## How-to guides - -### Allow trailing commas - -TypeScript: - -```typescript -const j = Jsonic.make().use(Jsonc, { allowTrailingComma: true }) -j('{ "debug": true, "verbose": false, }') -// => { debug: true, verbose: false } +translation_unit + conditional_group (#if … #elif … #else … #endif folded) + branches: conditional_branch { branchKind, directive, body } + endif + external_declaration { declKind: 'declaration'|'function_definition' } + declaration_specifiers + attribute_spec, struct_specifier, union_specifier, enum_specifier + member_decl_list / enumerator_list (typed members, bitfields, + static_assert, enumerators) + init_declarator_list + init_declarator { declaredName } + declarator + pointer (with qualifiers + attribute_spec) + direct_declarator + array_postfix + function_postfix + parameter_type_list { variadic? } + parameter_declaration { declaredName } + identifier_list (K&R) + asm_label?, attribute_spec? + '=' initializer + static_assert_declaration { condition, message? } + define_directive { macroName, macroKind, macroParams?, macroVariadic? 
} + include_directive { includeForm, headerKind, headerName } + conditional_directive { directive } + pragma_directive / error_directive / warning_directive / undef_directive + compound_statement + declaration | statement + if_statement, switch_statement, while_statement, do_statement, + for_statement (for_controls), labeled_statement + { labelKind, labelName? }, jump_statement { jumpKind }, + expression_statement, asm_statement, preprocessor_line ``` -Go: +### Expression shapes (Pratt-parsed, full C precedence) -```go -j := jsonic.Make() -j.Use(jsonc.Jsonc, map[string]any{"allowTrailingComma": true}) -result, _ := j.Parse(`{ "debug": true, "verbose": false, }`) ``` - -### Parse strict JSON (disable comments) - -TypeScript: - -```typescript -const j = Jsonic.make().use(Jsonc, { disallowComments: true }) -j('{ "foo": /* not allowed */ true }') // throws +literal_expression { literalKind, value } +identifier_expression { name } +paren_expression +call_expression { callee, isMacro } + argument_list +subscript_expression { target, index_list } +member_expression { object, op ('.'|'->'), memberName } +postfix_unary_expression { target, op } +unary_expression { op, operand } // ++/--/+/-/!/~/*/&/sizeof/_Alignof/... +cast_expression { typeName, operand } +binary_expression { op, left, right } // 11 precedence levels +conditional_expression { cond, then, else } +assignment_expression { left, op, right } // right-assoc +comma_expression +generic_selection + generic_controlling_expression { expression } + generic_association { associationKind, typeName?, value } +statement_expression // GCC ({ ... 
}) +compound_literal { typeName, initializer_list } +initializer_list + initializer_item { designation?, value } + designation + member_designator { memberName } + index_designator ``` -Go: - -```go -j := jsonic.Make() -j.Use(jsonc.Jsonc, map[string]any{"disallowComments": true}) -``` +## Disambiguation strategy -### Handle parse errors +C's classic ambiguity (an identifier may name a typedef OR a variable) +is resolved at lex time. The identifier matcher consults +`SymbolTable.isTypedef(word)` and emits **TYPEDEF_NAME** instead of +**ID** for every typedef'd name. After a `typedef int T;` declaration +finalizes, the symbol table is updated AND any pre-fetched lookahead +tokens carrying that name are reclassified in place, so the next +declaration sees the new classification regardless of jsonic's +arbitrary-lookahead. -TypeScript — parse errors throw: +A parallel **macro table** records `#define`d names. Identifiers seen +earlier in a `#define` lex as **MACRO_NAME**, and `call_expression` +nodes carrying such a callee get `isMacro: true` so consumers can +distinguish a macro invocation from a real function call without +re-querying any table. `#undef` removes the entry. -```typescript -try { - j('{ "bad": }') -} catch (err) { - console.error(err.message) -} -``` +Full **nested scoping** (file / function-prototype / function-body / +block / struct-or-union / enum / for-init) is implemented in +`SymbolTable`. Inner non-typedef bindings shadow outer typedefs. -Go — errors are returned: +## Preprocessor -```go -if _, err := j.Parse(`{ "bad": }`); err != nil { - fmt.Println(err) -} -``` +Each `#-line` is its own structured directive node (see shapes above). +A translation-unit-level post-pass folds the flat sequence of +`#if`/`#ifdef`/`#ifndef` … (`#elif`…)\* (`#else`)? … `#endif` into a +single `conditional_group` containing typed branches. Best-effort: +unmatched `#endif` or unterminated `#if` leaves the surrounding +sequence flat. 
Nested `#if … #endif` inside a branch is recursively +grouped. -### Parse a file +`#define` directives populate `ctx.meta.cmeta.macros`; `#undef` +removes. The macro table is the single source of truth used by lex-time +**MACRO_NAME** tagging. -TypeScript: +## Attributes (all three forms structured) -```typescript -import { readFileSync } from 'node:fs' -const j = Jsonic.make().use(Jsonc, { allowTrailingComma: true }) -const config = j(readFileSync('tsconfig.json', 'utf8')) ``` - -Go: - -```go -src, _ := os.ReadFile("tsconfig.json") -j := jsonic.Make() -j.Use(jsonc.Jsonc, map[string]any{"allowTrailingComma": true}) -config, _ := j.Parse(string(src)) +attribute_spec { attributeForm: 'gcc'|'msvc'|'c23', items } + attribute_item { attributeName, attributePrefix?, argumentList? } + attribute_argument_list // Pratt-parsed args ``` +`__attribute__((noreturn, format(printf, 1, 2)))`, +`__declspec(dllexport)`, and C23 `[[gnu::pure]]` / +`[[deprecated("reason")]]` all produce the same item shape. -## Reference - -### TypeScript - -```typescript -function Jsonc(jsonic: Jsonic, options?: JsoncOptions): void +## GCC inline assembly -type JsoncOptions = { - allowTrailingComma?: boolean // default: false - disallowComments?: boolean // default: false -} ``` - -Register with `jsonic.use(Jsonc, options?)`. After registration, invoke -the jsonic instance as a function on a source string; it returns the -parsed value or throws on syntax errors. - -| Option | Type | Default | Effect | -|--------|------|---------|--------| -| `allowTrailingComma` | `boolean` | `false` | Permit a trailing comma before `}` and `]` | -| `disallowComments` | `boolean` | `false` | Reject `//` and `/* */` comments (strict JSON) | - -### Go - -```go -func Jsonc(j *jsonic.Jsonic, pluginOpts map[string]any) error +asm_statement { qualifiers } + asm_template { expression } + asm_outputs? asm_operand { asmName?, constraint, value { expression } } + asm_inputs? + asm_clobbers? asm_clobber { value } + asm_labels? 
asm_label_ref { labelName } ``` -Register with `j.Use(jsonc.Jsonc)` or `j.Use(jsonc.Jsonc, opts)` where -`opts` is a `map[string]any`. `Parse` then returns `(any, error)` — -`map[string]any` for objects, `[]any` for arrays, `float64` for numbers, -`string`, `bool`, or `nil`. - -| Key | Type | Default | Effect | -|-----|------|---------|--------| -| `allowTrailingComma` | `bool` | `false` | Permit a trailing comma before `}` and `]` | -| `disallowComments` | `bool` | `false` | Reject `//` and `/* */` comments (strict JSON) | - - -## JSONC format - -JSONC follows [RFC 8259](https://tools.ietf.org/html/rfc8259) (JSON) -with these extensions: - -- **Line comments**: `//` to end of line -- **Block comments**: `/* */` (non-nesting) -- **Trailing commas**: optional, in objects and arrays - -All other JSON rules apply: - -- Strings must be double-quoted -- Standard escapes only: `\"` `\\` `\/` `\b` `\f` `\n` `\r` `\t` `\uXXXX` -- Numbers: integer, decimal, scientific notation (no hex, octal, binary) -- Keywords: `true`, `false`, `null` (case-sensitive) -- Property names must be double-quoted strings - -### Conformance notes - -The plugin layers JSONC rules on top of jsonic, which is intentionally -lenient in some places vs. strict RFC 8259. The test suite runs the -[nst/JSONTestSuite](https://github.com/nst/JSONTestSuite) corpus in -strict mode (`disallowComments: true`) and pins the known-lenient -cases in `test/jsontestsuite.test.ts` (see `N_KNOWN_LENIENT`). Examples -of accepted-but-non-RFC input include numbers with leading zeros and -unquoted object keys. Use an RFC-strict parser if byte-perfect RFC 8259 -rejection is required. - - -## Acknowledgments - -Conformance testing uses third-party corpora under MIT License: - -- [nst/JSONTestSuite](https://github.com/nst/JSONTestSuite) by Nicolas - Seriot — vendored at `test/JSONTestSuite/` (the `test_parsing/` corpus). 
-- [microsoft/node-jsonc-parser](https://github.com/microsoft/node-jsonc-parser) — - parse-level test cases ported into `test/jsonc.test.ts`. - -See [THIRD_PARTY_NOTICES.md](./THIRD_PARTY_NOTICES.md) for details. +## for-loop controls +``` +for_controls + for_init { value: declaration | expression | empty } + for_cond { value: expression | empty } + for_iter { value: expression | empty } +``` ## License -MIT. Copyright (c) 2021-2025 Richard Rodger and contributors. +MIT. Copyright (c) 2026 Richard Rodger and contributors. diff --git a/THIRD_PARTY_NOTICES.md b/THIRD_PARTY_NOTICES.md deleted file mode 100644 index 3362e95..0000000 --- a/THIRD_PARTY_NOTICES.md +++ /dev/null @@ -1,50 +0,0 @@ -# Third-Party Notices - -This project incorporates material from the projects listed below. The -original copyright notices and license texts are preserved as required. - -## nst/JSONTestSuite - -Vendored at `test/JSONTestSuite/` for RFC 8259 conformance testing via -`test/jsontestsuite.test.ts`. Only the `test_parsing/` corpus, `LICENSE`, -and `README.md` from the upstream project are included; the upstream -`LICENSE` is preserved in place. - -- Project: https://github.com/nst/JSONTestSuite -- License: MIT -- Copyright (c) 2016 Nicolas Seriot - -## microsoft/node-jsonc-parser - -Parse-level test cases in `test/jsonc.test.ts` were ported from -`src/test/json.test.ts` of `microsoft/node-jsonc-parser`. - -- Project: https://github.com/microsoft/node-jsonc-parser -- License: MIT -- Copyright (c) Microsoft Corporation - ---- - -Both projects are distributed under the MIT License. 
Full license text: - -``` -MIT License - -Permission is hereby granted, free of charge, to any person obtaining a -copy of this software and associated documentation files (the "Software"), -to deal in the Software without restriction, including without limitation -the rights to use, copy, modify, merge, publish, distribute, sublicense, -and/or sell copies of the Software, and to permit persons to whom the -Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included -in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -OTHER DEALINGS IN THE SOFTWARE. -``` diff --git a/embed-grammar.js b/embed-grammar.js deleted file mode 100644 index 1188a35..0000000 --- a/embed-grammar.js +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env node - -// Embed jsonc-grammar.jsonic into TypeScript and Go source files. 
-// Run via: npm run embed (or: node embed-grammar.js) - -const fs = require('fs') -const path = require('path') - -const GRAMMAR_FILE = path.join(__dirname, 'jsonc-grammar.jsonic') -const TS_FILE = path.join(__dirname, 'src', 'jsonc.ts') -const GO_FILE = path.join(__dirname, 'go', 'jsonc.go') - -const BEGIN = '// --- BEGIN EMBEDDED jsonc-grammar.jsonic ---' -const END = '// --- END EMBEDDED jsonc-grammar.jsonic ---' - -const grammar = fs.readFileSync(GRAMMAR_FILE, 'utf8') - -// --- TypeScript embedding --- -function embedTS() { - let src = fs.readFileSync(TS_FILE, 'utf8') - const startIdx = src.indexOf(BEGIN) - const endIdx = src.indexOf(END) - if (startIdx === -1 || endIdx === -1) { - console.error('TS markers not found in', TS_FILE) - process.exit(1) - } - - // Escape backticks and template expressions for a JS template literal. - const escaped = grammar - .replace(/\\/g, '\\\\') - .replace(/`/g, '\\`') - .replace(/\$\{/g, '\\${') - - const replacement = - BEGIN + - '\nconst grammarText = `\n' + - escaped + - '`\n' + - END - - src = src.substring(0, startIdx) + replacement + src.substring(endIdx + END.length) - fs.writeFileSync(TS_FILE, src) - console.log('Embedded grammar into', TS_FILE) -} - -// --- Go embedding --- -function embedGo() { - let src = fs.readFileSync(GO_FILE, 'utf8') - const startIdx = src.indexOf(BEGIN) - const endIdx = src.indexOf(END) - if (startIdx === -1 || endIdx === -1) { - console.error('Go markers not found in', GO_FILE) - process.exit(1) - } - - if (grammar.includes('`')) { - console.error('Grammar contains backticks, incompatible with Go raw strings') - process.exit(1) - } - - const replacement = - BEGIN + - '\nconst grammarText = `\n' + - grammar + - '`\n' + - END - - src = src.substring(0, startIdx) + replacement + src.substring(endIdx + END.length) - fs.writeFileSync(GO_FILE, src) - console.log('Embedded grammar into', GO_FILE) -} - -embedTS() -embedGo() diff --git a/go/go.mod b/go/go.mod deleted file mode 100644 index 
f58d839..0000000 --- a/go/go.mod +++ /dev/null @@ -1,5 +0,0 @@ -module github.com/jsonicjs/jsonc/go - -go 1.24.7 - -require github.com/jsonicjs/jsonic/go v0.1.22 diff --git a/go/go.sum b/go/go.sum deleted file mode 100644 index 5c34bfc..0000000 --- a/go/go.sum +++ /dev/null @@ -1,2 +0,0 @@ -github.com/jsonicjs/jsonic/go v0.1.22 h1:sam238fTyjDq0nby9TYS+aCCHprLl91ArQPWLCg2O0Y= -github.com/jsonicjs/jsonic/go v0.1.22/go.mod h1:ObNKlCG7esWoi4AHCpdgkILvPINV8bpvkbCd4llGGUg= diff --git a/go/jsonc.go b/go/jsonc.go deleted file mode 100644 index 3bc74e5..0000000 --- a/go/jsonc.go +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2021-2025 Richard Rodger, MIT License */ - -package jsonc - -import ( - jsonic "github.com/jsonicjs/jsonic/go" -) - -const Version = "0.1.2" - -// --- BEGIN EMBEDDED jsonc-grammar.jsonic --- -const grammarText = ` -# JSONC Grammar Definition -# Parsed by a standard Jsonic instance and passed to jsonic.grammar() -# Extends standard JSON grammar with end-of-input value handling. - -{ - options: text: { lex: false } - options: number: { hex: false oct: false bin: false sep: null exclude: "@/^\\./" } - options: string: { chars: '"' multiChars: '' allowUnknown: false } - options: string: escape: { v: null } - options: map: { extend: false } - options: lex: { empty: false } - options: rule: { finish: false } - - rule: val: open: { - alts: [ - { s: '#ZZ' g: jsonc } - ] - inject: { append: true } - } - - rule: pair: close: { - alts: [ - { s: '#CA #CB' b: 1 g: comma } - ] - inject: {} - } - - rule: elem: close: { - alts: [ - { s: '#CA #CS' b: 1 g: comma } - ] - inject: {} - } -} -` -// --- END EMBEDDED jsonc-grammar.jsonic --- - -// Jsonc configures a jsonic instance for JSONC parsing. 
-func Jsonc(j *jsonic.Jsonic, pluginOpts map[string]any) error { - commentLex := true != toBool(pluginOpts["disallowComments"]) - ruleExclude := "comma" - if toBool(pluginOpts["allowTrailingComma"]) { - ruleExclude = "" - } - - // Apply grammar: static options, rules, and trailing comma alts. - if err := j.GrammarText(grammarText, &jsonic.GrammarSetting{ - Rule: &jsonic.GrammarSettingRule{ - Alt: &jsonic.GrammarSettingAlt{G: "jsonc"}, - }, - }); err != nil { - return err - } - - // Runtime options that depend on plugin arguments. - j.SetOptions(jsonic.Options{ - Comment: &jsonic.CommentOptions{Lex: &commentLex}, - Rule: &jsonic.RuleOptions{ - Include: "jsonc,json", - Exclude: ruleExclude, - }, - }) - - return nil -} - -func toBool(v any) bool { - b, _ := v.(bool) - return b -} diff --git a/go/jsonc_test.go b/go/jsonc_test.go deleted file mode 100644 index fd93e3f..0000000 --- a/go/jsonc_test.go +++ /dev/null @@ -1,634 +0,0 @@ -/* Copyright (c) 2021-2025 Richard Rodger, MIT License */ - -package jsonc - -import ( - "reflect" - "strings" - "testing" - - jsonic "github.com/jsonicjs/jsonic/go" -) - -func makeJsonc(opts ...map[string]any) *jsonic.Jsonic { - j := jsonic.Make() - if len(opts) > 0 { - j.Use(Jsonc, opts[0]) - } else { - j.Use(Jsonc) - } - return j -} - -func assert(t *testing.T, name string, got, want any) { - t.Helper() - if !reflect.DeepEqual(got, want) { - t.Errorf("%s:\n got: %#v\n want: %#v", name, got, want) - } -} - -func assertError(t *testing.T, name string, err error, contains string) { - t.Helper() - if err == nil { - t.Errorf("%s: expected error containing %q, got nil", name, contains) - return - } - if !strings.Contains(err.Error(), contains) { - t.Errorf("%s: expected error containing %q, got: %v", name, contains, err) - } -} - -var j = makeJsonc() - -func parse(src string) (any, error) { return j.Parse(src) } - -func TestHappy(t *testing.T) { - r, err := parse(`{"a":1}`) - if err != nil { - t.Fatal(err) - } - assert(t, "basic", r, 
map[string]any{"a": float64(1)}) -} - -func TestComments(t *testing.T) { - r, err := parse("// this is a comment") - if err != nil { - t.Fatal(err) - } - assert(t, "single-line", r, nil) - - r, err = parse("// this is a comment\n") - if err != nil { - t.Fatal(err) - } - assert(t, "single-line-newline", r, nil) - - r, err = parse("/* this is a comment*/") - if err != nil { - t.Fatal(err) - } - assert(t, "block", r, nil) - - r, err = parse("/* this is a \r\ncomment*/") - if err != nil { - t.Fatal(err) - } - assert(t, "block-crlf", r, nil) - - r, err = parse("/* this is a \ncomment*/") - if err != nil { - t.Fatal(err) - } - assert(t, "block-lf", r, nil) - - _, err = parse("/* this is a") - assertError(t, "unterminated-block", err, "unterminated_comment") - - _, err = parse("/* this is a \ncomment") - assertError(t, "unterminated-block-multiline", err, "unterminated_comment") - - _, err = parse("/ ttt") - assertError(t, "invalid-comment", err, "unexpected") -} - -func TestStrings(t *testing.T) { - r, err := parse(`"test"`) - if err != nil { - t.Fatal(err) - } - assert(t, "simple", r, "test") - - r, _ = parse(`"\""`) - assert(t, "escape-quote", r, `"`) - - r, _ = parse(`"\/"`) - assert(t, "escape-slash", r, "/") - - r, _ = parse(`"\b"`) - assert(t, "escape-backspace", r, "\b") - - r, _ = parse(`"\f"`) - assert(t, "escape-formfeed", r, "\f") - - r, _ = parse(`"\n"`) - assert(t, "escape-newline", r, "\n") - - r, _ = parse(`"\r"`) - assert(t, "escape-return", r, "\r") - - r, _ = parse(`"\t"`) - assert(t, "escape-tab", r, "\t") - - r, _ = parse(`"\u00DC"`) - assert(t, "unicode", r, "\u00DC") - - // Note: \v is accepted by the jsonic Go string matcher as a built-in escape. - // This is a minor deviation from strict JSONC spec which only allows - // \", \\, \/, \b, \f, \n, \r, \t, and \uXXXX. 
- - _, err = parse(`"test`) - assertError(t, "unterminated", err, "unterminated_string") -} - -func TestNumbers(t *testing.T) { - r, _ := parse("0") - assert(t, "zero", r, float64(0)) - - r, _ = parse("0.1") - assert(t, "decimal", r, 0.1) - - r, _ = parse("-0.1") - assert(t, "neg-decimal", r, -0.1) - - r, _ = parse("-1") - assert(t, "neg", r, float64(-1)) - - r, _ = parse("1") - assert(t, "one", r, float64(1)) - - r, _ = parse("123456789") - assert(t, "large", r, float64(123456789)) - - r, _ = parse("10") - assert(t, "ten", r, float64(10)) - - r, _ = parse("90") - assert(t, "ninety", r, float64(90)) - - r, _ = parse("90E+123") - assert(t, "sci-upper-plus", r, 90E+123) - - r, _ = parse("90e+123") - assert(t, "sci-lower-plus", r, 90e+123) - - r, _ = parse("90e-123") - assert(t, "sci-lower-minus", r, 90e-123) - - r, _ = parse("90E-123") - assert(t, "sci-upper-minus", r, 90E-123) - - r, _ = parse("90E123") - assert(t, "sci-upper", r, 90E123) - - r, _ = parse("90e123") - assert(t, "sci-lower", r, 90e123) - - _, err := parse("-") - if err == nil { - t.Error("expected error for bare minus") - } - - _, err = parse(".0") - if err == nil { - t.Error("expected error for leading dot number") - } -} - -func TestKeywords(t *testing.T) { - r, _ := parse("true") - assert(t, "true", r, true) - - r, _ = parse("false") - assert(t, "false", r, false) - - r, _ = parse("null") - assert(t, "null", r, nil) - - _, err := parse("True") - if err == nil { - t.Error("expected error for capitalized True") - } - - r, _ = parse("false//hello") - assert(t, "value-with-comment", r, false) -} - -func TestTrivia(t *testing.T) { - r, _ := parse(" ") - assert(t, "space", r, nil) - - r, _ = parse(" \t ") - assert(t, "tabs", r, nil) - - r, _ = parse(" \t \n \t ") - assert(t, "tabs-newlines", r, nil) - - r, _ = parse("\r\n") - assert(t, "crlf", r, nil) - - r, _ = parse("\r") - assert(t, "cr", r, nil) - - r, _ = parse("\n") - assert(t, "lf", r, nil) - - r, _ = parse("\n\r") - assert(t, "lfcr", r, nil) - - 
r, _ = parse("\n \n") - assert(t, "newlines-spaces", r, nil) -} - -func TestLiterals(t *testing.T) { - r, _ := parse("true") - assert(t, "true", r, true) - - r, _ = parse("false") - assert(t, "false", r, false) - - r, _ = parse("null") - assert(t, "null", r, nil) - - r, _ = parse(`"foo"`) - assert(t, "string", r, "foo") - - r, _ = parse(`"\"-\\-\/-\b-\f-\n-\r-\t"`) - assert(t, "escapes", r, "\"-\\-/-\b-\f-\n-\r-\t") - - r, _ = parse(`"\u00DC"`) - assert(t, "unicode", r, "\u00DC") - - r, _ = parse("9") - assert(t, "nine", r, float64(9)) - - r, _ = parse("-9") - assert(t, "neg-nine", r, float64(-9)) - - r, _ = parse("0.129") - assert(t, "decimal", r, 0.129) - - r, _ = parse("23e3") - assert(t, "sci", r, 23e3) - - r, _ = parse("1.2E+3") - assert(t, "sci-plus", r, 1.2E+3) - - r, _ = parse("1.2E-3") - assert(t, "sci-minus", r, 1.2E-3) - - r, _ = parse("1.2E-3 // comment") - assert(t, "num-comment", r, 1.2E-3) -} - -func TestObjects(t *testing.T) { - r, _ := parse("{}") - assert(t, "empty", r, map[string]any{}) - - r, _ = parse(`{ "foo": true }`) - assert(t, "one-field", r, map[string]any{"foo": true}) - - r, _ = parse(`{ "bar": 8, "xoo": "foo" }`) - assert(t, "two-fields", r, map[string]any{"bar": float64(8), "xoo": "foo"}) - - r, _ = parse(`{ "hello": [], "world": {} }`) - assert(t, "empty-nested", r, map[string]any{"hello": []any{}, "world": map[string]any{}}) - - r, _ = parse(`{ "a": false, "b": true, "c": [ 7.4 ] }`) - assert(t, "mixed", r, map[string]any{"a": false, "b": true, "c": []any{7.4}}) - - r, _ = parse(`{ "hello": { "again": { "inside": 5 }, "world": 1 }}`) - assert(t, "deep-nested", r, map[string]any{ - "hello": map[string]any{ - "again": map[string]any{"inside": float64(5)}, - "world": float64(1), - }, - }) - - r, _ = parse(`{ "foo": /*hello*/true }`) - assert(t, "comment-in-obj", r, map[string]any{"foo": true}) - - r, _ = parse(`{ "": true }`) - assert(t, "empty-key", r, map[string]any{"": true}) -} - -func TestArrays(t *testing.T) { - r, _ := 
parse("[]") - assert(t, "empty", r, []any{}) - - r, _ = parse("[ [], [ [] ]]") - assert(t, "nested-empty", r, []any{[]any{}, []any{[]any{}}}) - - r, _ = parse("[ 1, 2, 3 ]") - assert(t, "numbers", r, []any{float64(1), float64(2), float64(3)}) - - r, _ = parse(`[ { "a": null } ]`) - assert(t, "obj-in-array", r, []any{map[string]any{"a": nil}}) -} - -func TestObjectErrors(t *testing.T) { - _, err := parse("{,}") - if err == nil { - t.Error("expected error for leading comma in object") - } - - _, err = parse(`{ "foo": true, }`) - if err == nil { - t.Error("expected error for trailing comma in object (default)") - } - - _, err = parse(`{ "bar": 8 "xoo": "foo" }`) - if err == nil { - t.Error("expected error for missing comma in object") - } - - _, err = parse(`{ ,"bar": 8 }`) - if err == nil { - t.Error("expected error for leading comma") - } - - _, err = parse(`{ "bar": 8, "foo": }`) - if err == nil { - t.Error("expected error for missing value") - } - - _, err = parse(`{ 8, "foo": 9 }`) - if err == nil { - t.Error("expected error for number as key") - } -} - -func TestArrayErrors(t *testing.T) { - _, err := parse("[,]") - if err == nil { - t.Error("expected error for leading comma in array") - } - - _, err = parse("[ 1 2, 3 ]") - if err == nil { - t.Error("expected error for missing comma in array") - } - - _, err = parse("[ ,1, 2, 3 ]") - if err == nil { - t.Error("expected error for leading comma in array") - } - - _, err = parse("[ ,1, 2, 3, ]") - if err == nil { - t.Error("expected error for commas in array") - } -} - -func TestErrors(t *testing.T) { - _, err := parse("1,1") - if err == nil { - t.Error("expected error for extra content after value") - } - - _, err = parse("") - if err == nil { - t.Error("expected error for empty input") - } -} - -func TestDisallowComments(t *testing.T) { - nc := makeJsonc(map[string]any{"disallowComments": true}) - - r, err := nc.Parse(`[ 1, 2, null, "foo" ]`) - if err != nil { - t.Fatal(err) - } - assert(t, "array", r, 
[]any{float64(1), float64(2), nil, "foo"}) - - r, err = nc.Parse(`{ "hello": [], "world": {} }`) - if err != nil { - t.Fatal(err) - } - assert(t, "object", r, map[string]any{"hello": []any{}, "world": map[string]any{}}) - - _, err = nc.Parse(`{ "foo": /*comment*/ true }`) - if err == nil { - t.Error("expected error for comment when comments are disallowed") - } -} - -func TestTrailingComma(t *testing.T) { - jc := makeJsonc(map[string]any{"allowTrailingComma": true}) - - r, err := jc.Parse(`{ "hello": [], }`) - if err != nil { - t.Fatal(err) - } - assert(t, "obj-trailing", r, map[string]any{"hello": []any{}}) - - r, err = jc.Parse(`{ "hello": [] }`) - if err != nil { - t.Fatal(err) - } - assert(t, "obj-no-trailing", r, map[string]any{"hello": []any{}}) - - r, err = jc.Parse(`{ "hello": [], "world": {}, }`) - if err != nil { - t.Fatal(err) - } - assert(t, "obj-multi-trailing", r, map[string]any{"hello": []any{}, "world": map[string]any{}}) - - r, err = jc.Parse(`[ 1, 2, ]`) - if err != nil { - t.Fatal(err) - } - assert(t, "arr-trailing", r, []any{float64(1), float64(2)}) - - r, err = jc.Parse(`[ 1, 2 ]`) - if err != nil { - t.Fatal(err) - } - assert(t, "arr-no-trailing", r, []any{float64(1), float64(2)}) - - // Default parser should reject trailing commas. 
- _, err = j.Parse(`{ "hello": [], }`) - if err == nil { - t.Error("expected error for trailing comma with default options") - } - - _, err = j.Parse(`[ 1, 2, ]`) - if err == nil { - t.Error("expected error for trailing comma in array with default options") - } -} - -func TestMisc(t *testing.T) { - r, _ := j.Parse(`{ "foo": "bar" }`) - assert(t, "simple-obj", r, map[string]any{"foo": "bar"}) - - r, _ = j.Parse(`{ "foo": {"bar": 1, "car": 2 } }`) - assert(t, "nested-obj", r, map[string]any{ - "foo": map[string]any{"bar": float64(1), "car": float64(2)}, - }) - - r, _ = j.Parse(`{ "foo": {"bar": 1, "car": 8 }, "goo": {} }`) - assert(t, "multi-nested", r, map[string]any{ - "foo": map[string]any{"bar": float64(1), "car": float64(8)}, - "goo": map[string]any{}, - }) - - _, err := j.Parse(`{ "dep": {"bar": 1, "car": `) - if err == nil { - t.Error("expected error for unterminated object") - } - - _, err = j.Parse(`{ "dep": {"bar": 1,, "car": `) - if err == nil { - t.Error("expected error for double comma") - } - - _, err = j.Parse(`{ "dep": {"bar": "na", "dar": "ma", "car": } }`) - if err == nil { - t.Error("expected error for missing value") - } - - r, _ = j.Parse(`["foo", null ]`) - assert(t, "arr-mixed", r, []any{"foo", nil}) - - _, err = j.Parse(`["foo", null, ]`) - if err == nil { - t.Error("expected error for trailing comma in array") - } - - _, err = j.Parse(`["foo", null,, ]`) - if err == nil { - t.Error("expected error for double comma in array") - } - - r, _ = j.Parse("true") - assert(t, "bare-true", r, true) - - r, _ = j.Parse("false") - assert(t, "bare-false", r, false) - - r, _ = j.Parse("null") - assert(t, "bare-null", r, nil) - - r, _ = j.Parse("23") - assert(t, "bare-num", r, float64(23)) - - r, _ = j.Parse("-1.93e-19") - assert(t, "sci-notation", r, -1.93e-19) - - r, _ = j.Parse(`"hello"`) - assert(t, "bare-string", r, "hello") - - r, _ = j.Parse("[]") - assert(t, "empty-arr", r, []any{}) - - r, _ = j.Parse("[ 1 ]") - assert(t, "single-arr", r, 
[]any{float64(1)}) - - r, _ = j.Parse(`[ 1, "x"]`) - assert(t, "mixed-arr", r, []any{float64(1), "x"}) - - r, _ = j.Parse("[[]]") - assert(t, "nested-arr", r, []any{[]any{}}) - - r, _ = j.Parse("{ }") - assert(t, "empty-obj", r, map[string]any{}) - - r, _ = j.Parse(`{ "val": 1 }`) - assert(t, "val-obj", r, map[string]any{"val": float64(1)}) - - r, _ = j.Parse(`{"id": "$", "v": [ null, null] }`) - assert(t, "complex-obj", r, map[string]any{"id": "$", "v": []any{nil, nil}}) - - _, err = j.Parse(`{ "id": { "foo": { } } , }`) - if err == nil { - t.Error("expected error for trailing comma") - } - - r, _ = j.Parse(`{ "foo": { "goo": 3 } }`) - assert(t, "nested-num", r, map[string]any{"foo": map[string]any{"goo": float64(3)}}) - - r, _ = j.Parse("[\r\n0,\r\n1,\r\n2\r\n]") - assert(t, "crlf-arr", r, []any{float64(0), float64(1), float64(2)}) - - r, _ = j.Parse(`/* g */ { "foo": //f` + "\n" + `"bar" }`) - assert(t, "comments-mixed", r, map[string]any{"foo": "bar"}) - - r, _ = j.Parse("/* g\r\n */ { \"foo\": //f\n\"bar\" }") - assert(t, "comments-crlf", r, map[string]any{"foo": "bar"}) - - r, _ = j.Parse("/* g\n */ { \"foo\": //f\n\"bar\"\n}") - assert(t, "comments-lf", r, map[string]any{"foo": "bar"}) - - r, _ = j.Parse(`{ "key1": { "key11": [ "val111", "val112" ] }, "key2": [ { "key21": false, "key22": 221 }, null, [{}] ] }`) - assert(t, "complex", r, map[string]any{ - "key1": map[string]any{"key11": []any{"val111", "val112"}}, - "key2": []any{ - map[string]any{"key21": false, "key22": float64(221)}, - nil, - []any{map[string]any{}}, - }, - }) -} - -func TestUsePlugin(t *testing.T) { - j := makeJsonc() - result, err := j.Parse(`{"a": 1, "b": "hello"}`) - if err != nil { - t.Fatal(err) - } - m, ok := result.(map[string]any) - if !ok { - t.Fatalf("expected map, got %T", result) - } - assert(t, "plugin", m, map[string]any{"a": float64(1), "b": "hello"}) -} - -// TestAltGJsoncTag verifies the GrammarText setting {Rule:{Alt:{G:'jsonc'}}} -// tagged every alt installed by the 
jsonc plugin with 'jsonc'. -func TestAltGJsoncTag(t *testing.T) { - // Fresh instance with allowTrailingComma so pair/elem close alts survive - // the rule-exclude filter and can be inspected here. - jc := jsonic.Make() - if err := jc.Use(Jsonc, map[string]any{"allowTrailingComma": true}); err != nil { - t.Fatal(err) - } - - rsm := jc.RSM() - checks := []struct { - rule string - isOpen bool - tokSig string - }{ - {"val", true, "#ZZ"}, - {"pair", false, "#CA #CB"}, - {"elem", false, "#CA #CS"}, - } - - for _, c := range checks { - rs, ok := rsm[c.rule] - if !ok { - t.Errorf("rule %q missing", c.rule) - continue - } - alts := rs.Close - if c.isOpen { - alts = rs.Open - } - - found := false - for _, a := range alts { - if !strings.Contains(a.G, "jsonc") { - continue - } - // Match the alt introduced by this plugin by its group tag(s). - // We only need to confirm 'jsonc' is present on at least one alt - // per rule; more importantly, every alt tagged as coming from - // the plugin must carry 'jsonc'. - found = true - tags := strings.Split(a.G, ",") - has := false - for _, t := range tags { - if strings.TrimSpace(t) == "jsonc" { - has = true - break - } - } - if !has { - t.Errorf("rule %q alt missing 'jsonc' tag: g=%q", c.rule, a.G) - } - } - if !found { - t.Errorf("rule %q: no alt with 'jsonc' tag found", c.rule) - } - } -} diff --git a/jsonc-grammar.jsonic b/jsonc-grammar.jsonic deleted file mode 100644 index 092da2f..0000000 --- a/jsonc-grammar.jsonic +++ /dev/null @@ -1,34 +0,0 @@ -# JSONC Grammar Definition -# Parsed by a standard Jsonic instance and passed to jsonic.grammar() -# Extends standard JSON grammar with end-of-input value handling. 
- -{ - options: text: { lex: false } - options: number: { hex: false oct: false bin: false sep: null exclude: "@/^\\./" } - options: string: { chars: '"' multiChars: '' allowUnknown: false } - options: string: escape: { v: null } - options: map: { extend: false } - options: lex: { empty: false } - options: rule: { finish: false } - - rule: val: open: { - alts: [ - { s: '#ZZ' g: jsonc } - ] - inject: { append: true } - } - - rule: pair: close: { - alts: [ - { s: '#CA #CB' b: 1 g: comma } - ] - inject: {} - } - - rule: elem: close: { - alts: [ - { s: '#CA #CS' b: 1 g: comma } - ] - inject: {} - } -} diff --git a/package.json b/package.json index 47f2670..e305c68 100644 --- a/package.json +++ b/package.json @@ -1,37 +1,39 @@ { - "name": "@jsonic/jsonc", - "version": "0.8.0", - "description": "This plugin allows the Jsonic JSON parser to support JSONC syntax.", + "name": "@jsonic/c", + "version": "0.1.0", + "description": "Jsonic plugin that parses C source into a concrete syntax tree, preserving macros and compiler extensions.", "author": "Richard Rodger (http://richardrodger.com)", "license": "MIT", - "main": "dist/jsonc.js", - "types": "dist/jsonc.d.ts", - "homepage": "https://github.com/jsonicjs/jsonc", - "repository": "github:jsonicjs/jsonc", + "main": "dist/c.js", + "types": "dist/c.d.ts", + "homepage": "https://github.com/jsonicjs/c", + "repository": "github:jsonicjs/c", "keywords": [ - "jsonc", - "json", - "comments", + "c", "parser", + "concrete-syntax-tree", + "cst", "jsonic" ], "scripts": { "test": "node --enable-source-maps --test \"dist-test/*.test.js\"", "test-some": "node --enable-source-maps --test-name-pattern=\"$npm_config_pattern\" --test \"dist-test/*.test.js\"", - "embed": "node embed-grammar.js", "watch": "tsc --build src test -w", - "build": "node embed-grammar.js && tsc --build src test", + "build": "tsc --build src test", "clean": "rm -rf dist dist-test node_modules yarn.lock package-lock.json", - "reset": "npm run clean && npm i && npm run 
build && npm test", - "repo-tag": "REPO_VERSION=`node -e \"console.log(require('./package').version)\"` && echo TAG: v$REPO_VERSION && git commit -a -m v$REPO_VERSION && git push && git tag v$REPO_VERSION && git push --tags;", - "repo-publish": "npm run clean && npm i && npm run repo-publish-quick", - "repo-publish-quick": "npm run build && npm run test && npm run repo-tag && npm publish --access public --registry https://registry.npmjs.org " + "reset": "npm run clean && npm i && npm run build && npm test" }, "peerDependencies": { "jsonic": ">=2" }, + "dependencies": { + "@jsonic/path": "^2.1.0", + "@jsonic/directive": "^2.2.0", + "@jsonic/expr": "^2.2.0" + }, "devDependencies": { "@types/node": "25.6.0", + "jsonic": "^2.28.0", "typescript": "6.0.3" }, "files": [ diff --git a/src/c.ts b/src/c.ts new file mode 100644 index 0000000..a2087ff --- /dev/null +++ b/src/c.ts @@ -0,0 +1,738 @@ +/* Copyright (c) 2026 Richard Rodger and contributors, MIT License */ + +// @jsonic/c — Jsonic plugin that parses C source into a concrete syntax +// tree, preserving macros and compiler extensions. +// +// First-slice scope: +// - Focused lex matchers (./matchers.ts) for whitespace, line +// continuation, comments, preprocessor directive boundaries, header +// names, identifiers (with keyword/typedef-name reclassification), +// integer/float/char/string literals, and punctuators. +// - SymbolTable + MacroTable installed on ctx.meta.cmeta for shared +// access from lex matchers and rule actions. +// - A coarse top-level grammar that splits the translation unit into +// external-declaration units terminated by `;` or by a brace-balanced +// block. Each unit captures its tokens verbatim. When the unit looks +// like `typedef ;` the trailing identifier is registered +// as a typedef-name in the symbol table — this is exactly what the +// identifier matcher needs to reclassify subsequent occurrences as +// TYPEDEF_NAME. 
+// +// Subsequent slices will refine each unit into the full C grammar +// (declarators, statements, expressions via @jsonic/expr, full +// preprocessor handling) without disturbing this foundation. + +import type { Jsonic, Rule, Context, RuleSpec, Token } from 'jsonic' +import { allMatchers } from './matchers.js' +import { makeCMeta, type CMeta } from './symbols.js' +import { + C23_KEYWORDS, + EXT_KEYWORDS, + PUNCTUATORS, + keywordTokenName, +} from './tokens.js' +import { structureExternalDeclaration, structureConditionalGroups } from './structure.js' + +export interface COptions { + // Reserved for future flags (strict mode, dialect selection, etc.) +} + +// ---- AST types ------------------------------------------------------ + +export interface Span { start: number; end: number; line: number; col: number } + +export interface CTokenRef { + kind: 'token' + tname: string + src: string + span: Span +} + +export interface CNode { + kind: string + span: Span + children: Array + trivia: { leading: CTokenRef[]; trailing: CTokenRef[] } + [extra: string]: any +} + +function tokenSpan(tkn: Token): Span { + return { start: tkn.sI, end: tkn.sI + tkn.len, line: tkn.rI, col: tkn.cI } +} + +function tokenRef(tkn: Token): CTokenRef { + return { kind: 'token', tname: tkn.name, src: tkn.src, span: tokenSpan(tkn) } +} + +function makeNode(kind: string, startTkn?: Token): CNode { + return { + kind, + span: startTkn ? tokenSpan(startTkn) : { start: 0, end: 0, line: 1, col: 1 }, + children: [], + trivia: { leading: [], trailing: [] }, + } +} + +function getCMeta(ctx: Context): CMeta { + return (ctx.meta as any).cmeta as CMeta +} + +// ---- Plugin --------------------------------------------------------- + +const C: any = function C(jsonic: Jsonic, _options: COptions): void { + + // 1. Register punctuator token names with their fixed sources, and + // keyword token names. We disable jsonic's built-in fixed-token + // matcher so identifier boundaries (e.g. 
`int_value`) aren't broken + // by a `int` keyword cut. + const fixedTokens: Record = {} + for (const [name, src] of PUNCTUATORS) fixedTokens[name] = src + for (const kw of [...C23_KEYWORDS, ...EXT_KEYWORDS]) { + fixedTokens[keywordTokenName(kw)!] = kw + } + + jsonic.options({ + fixed: { lex: false, token: fixedTokens }, + space: { lex: false }, + line: { lex: false }, + text: { lex: false }, + number: { lex: false }, + string: { lex: false }, + comment: { lex: false }, + value: { lex: false }, + match: { lex: true }, + // Trivia tokens are skipped by the parser (so grammar alts stay + // free of trivia clutter) but the sub-lex hook below still sees + // them and stashes them on the next non-trivia token's use.leading + // so source fidelity is preserved. + tokenSet: { + IGNORE: [ + '#SP', '#LN', '#CM', + 'TRIVIA_LINE_COMMENT', 'TRIVIA_BLOCK_COMMENT', 'TRIVIA_LINE_CONT', + ], + }, + rule: { + start: 'translation_unit', + finish: false, + }, + }) + + const matchEntries: Record = {} + for (const m of allMatchers()) { + matchEntries[m.name] = { order: m.order, make: () => m.make() } + } + jsonic.options({ lex: { match: matchEntries as any } }) + + // Register all special token names so they have stable Tins. + for (const name of [ + 'ID', 'TYPEDEF_NAME', 'MACRO_NAME', + 'LIT_INT', 'LIT_FLOAT', 'LIT_CHAR', 'LIT_STRING', 'LIT_HEADER_NAME', + 'PP_HASH', 'PP_NEWLINE', 'PP_RAW', + 'TRIVIA_LINE_COMMENT', 'TRIVIA_BLOCK_COMMENT', 'TRIVIA_LINE_CONT', + ]) { + jsonic.token(name as any) + } + + // Install CMeta on ctx.meta before parsing. + jsonic.options({ + parse: { + prepare: { + cmeta: ((_jsonic: Jsonic, ctx: Context, meta?: any) => { + const m = ctx.meta as any + if (!m.cmeta) m.cmeta = makeCMeta() + if (meta && meta.cmeta) m.cmeta = meta.cmeta + }) as any, + }, + }, + }) + + // Sub-lex hook: every emitted token (including ignored ones) flows + // through here. Trivia tokens get pushed onto cmeta.pendingTrivia; + // the next non-trivia token receives them via use.leading. 
The + // chomper (and future grammar rules) drain use.leading into the AST + // so comments survive in source order even though IGNORE'd at + // parse time. + jsonic.sub({ + lex: (tkn: Token, _rule: Rule, ctx: Context) => { + if (!tkn || !tkn.isToken) return + const m = (ctx.meta as any).cmeta as CMeta + if (!m) return + // Comments and line continuations are preserved; whitespace and + // jsonic's #LN/#CM are silently dropped. + if (PRESERVE_TRIVIA_NAMES.has(tkn.name)) { + m.pendingTrivia.push(tkn) + return + } + if (DROP_TRIVIA_NAMES.has(tkn.name)) return + if (m.pendingTrivia.length > 0) { + ;(tkn as any).use = (tkn as any).use || {} + ;(tkn as any).use.leading = m.pendingTrivia + m.pendingTrivia = [] + } + }, + }) + + // 2. Grammar. + // + // The translation unit holds a list of external_declaration nodes. + // extdecl_loop is the iterating rule that inherits translation_unit's + // node and accumulates children. Each external_declaration is itself + // an iteration over tokens (no helper sub-rule) — it appends one token + // per cycle to its own node and uses r.k for state that survives + // jsonic's r:-recursion (r.k is propagated, r.u is not). + + jsonic.rule('translation_unit', (rs: RuleSpec) => { + rs + .bo((rule: Rule) => { rule.node = makeNode('translation_unit') }) + .open([ + { s: ['#ZZ'], b: 1, g: 'tu-empty' }, + { p: 'extdecl_loop', g: 'tu-loop' }, + ]) + .bc((rule: Rule) => { + // After all external_declarations have accumulated, fold + // #if … #endif sequences into conditional_group nodes. + structureConditionalGroups(rule.node) + }) + .close([ + { s: ['#ZZ'], g: 'tu-end' }, + ]) + }) + + // extdecl_loop iterates external_declaration units. Its r.node is the + // translation_unit node (inherited). bc pushes each completed child. 
+ jsonic.rule('extdecl_loop', (rs: RuleSpec) => { + rs + .open([ + { p: 'external_declaration', g: 'loop-one' }, + ]) + .bc((rule: Rule) => { + const child = rule.child + if (child && child.node && child.node.kind === 'external_declaration') { + rule.node.children.push(child.node) + } + }) + .close([ + { s: ['#ZZ'], b: 1, g: 'loop-end' }, + { r: 'extdecl_loop', g: 'loop-more' }, + ]) + }) + + // external_declaration absorbs one token per iteration into its node, + // then either terminates (top-level `;` or closing `}` at depth 0) or + // recurses with r:'external_declaration'. State (token list, depth) + // travels via r.k since u is not propagated across r:. + // + // bo guards against reset on r: by checking r.node.kind. + jsonic.rule('external_declaration', (rs: RuleSpec) => { + rs + .bo((rule: Rule) => { + if (!rule.node || rule.node.kind !== 'external_declaration') { + rule.node = makeNode('external_declaration') + } + if (!rule.k.tokens) rule.k.tokens = [] + if (rule.k.depth === undefined) rule.k.depth = 0 + if (rule.k.terminated === undefined) rule.k.terminated = false + }) + .open([ + // Terminate on EOF without consuming. + { s: ['#ZZ'], b: 1, g: 'extdecl-eof' }, + // Otherwise consume any single token. + { + s: [anyTokenSet(jsonic)], + a: (rule: Rule) => { + // The matched token lives in rule.o0 once the open-state alt + // has fired; ctx.t0 at this point is the next lookahead token. + const tkn = rule.o0 as Token + // Emit any leading trivia (comments, line continuations) the + // sub-lex hook stashed on tkn.use.leading, in source order, + // before the token itself. 
+ const leading = (tkn as any).use && (tkn as any).use.leading + if (Array.isArray(leading)) { + for (const lt of leading) { + rule.node.children.push(tokenRef(lt)) + rule.k.tokens.push(lt) + } + } + rule.k.tokens.push(tkn) + rule.node.children.push(tokenRef(tkn)) + rule.k.justClosedBrace = false + if (tkn.name === 'PUNC_LBRACE') rule.k.depth++ + else if (tkn.name === 'PUNC_RBRACE') { + rule.k.depth-- + if (rule.k.depth <= 0) { + // Don't auto-terminate. A closing top-level brace ends a + // function body, but for a struct/union/enum definition or + // compound literal it's followed by tokens (`S;`, `var,…;`, + // `;` alone). The close-alts decide based on lookahead. + rule.k.justClosedBrace = true + } + } + else if (tkn.name === 'PUNC_SEMI' && rule.k.depth === 0) { + rule.k.terminated = true + } + else if (tkn.name === 'PP_NEWLINE' && rule.k.depth === 0 && + firstNonTriviaIs(rule.k.tokens, 'PP_HASH')) { + // Directive line ends here — each preprocessor directive + // is its own external_declaration. + rule.k.terminated = true + } + }, + g: 'extdecl-tok', + }, + ]) + .close([ + // EOF — wrap up. + { + s: ['#ZZ'], + b: 1, + a: (rule: Rule, ctx: Context) => { + finalizeExternalDeclaration(rule, ctx) + }, + g: 'extdecl-finish-eof', + }, + // We just consumed a top-level `}` and the next non-trivia token + // looks like the start of a brand-new external declaration — + // terminate this one (function-definition body case). + { + c: (rule: Rule, ctx: Context) => + rule.k.justClosedBrace === true && + startsNewExternalDeclaration(ctx), + a: (rule: Rule, ctx: Context) => { + finalizeExternalDeclaration(rule, ctx) + }, + g: 'extdecl-finish-block', + }, + // Hit `;` at depth 0 — terminate. + { + c: (rule: Rule) => rule.k.terminated === true, + a: (rule: Rule, ctx: Context) => { + finalizeExternalDeclaration(rule, ctx) + }, + g: 'extdecl-finish', + }, + // Continue absorbing. 
+ { r: 'external_declaration', g: 'extdecl-more' }, + ]) + }) +} + +C.defaults = {} as any + +// ---- Helpers -------------------------------------------------------- + +// Set of every token tin we want one_token to accept. We compute it lazily +// on first call (after token registration is complete). +let _anyTokenSetCache: number[] | null = null +function anyTokenSet(jsonic: Jsonic): number[] { + if (_anyTokenSetCache) return _anyTokenSetCache + const names: string[] = [ + 'ID', 'TYPEDEF_NAME', 'MACRO_NAME', + 'LIT_INT', 'LIT_FLOAT', 'LIT_CHAR', 'LIT_STRING', 'LIT_HEADER_NAME', + 'PP_HASH', 'PP_NEWLINE', 'PP_RAW', + // TRIVIA_* are IGNORE'd, not matched by alts. They flow into the + // AST via use.leading attachment in tokenRef expansion. + ] + for (const [pn] of PUNCTUATORS) names.push(pn) + for (const kw of [...C23_KEYWORDS, ...EXT_KEYWORDS]) names.push(keywordTokenName(kw)!) + _anyTokenSetCache = names.map((n) => jsonic.token(n as any) as unknown as number) + return _anyTokenSetCache +} + +// Trivia whose source we want to keep in the AST (comments, line +// continuations) — captured by the sub-lex hook and re-emitted as token +// refs ahead of the next non-trivia token. +const PRESERVE_TRIVIA_NAMES = new Set([ + 'TRIVIA_LINE_COMMENT', 'TRIVIA_BLOCK_COMMENT', 'TRIVIA_LINE_CONT', +]) + +// Trivia we drop entirely from the AST (whitespace, raw newlines). +// Spans on real tokens still carry positional info. +const DROP_TRIVIA_NAMES = new Set([ + '#SP', '#LN', '#CM', +]) + +// Union, used by helpers that need to recognise any trivia regardless of +// whether it survives in the tree. +const TRIVIA_TOKEN_NAMES = new Set([ + ...PRESERVE_TRIVIA_NAMES, + ...DROP_TRIVIA_NAMES, +]) + +// Type qualifiers in declarator pointer position. Used to skip past +// `* const`, `* volatile`, etc. when locating a declared name. 
+const PTR_QUALIFIER_TOKEN_NAMES = new Set([ + 'KW_CONST', 'KW_VOLATILE', 'KW_RESTRICT', 'KW__ATOMIC', + 'KW___CONST__', 'KW___CONST', + 'KW___VOLATILE__', 'KW___VOLATILE', + 'KW___RESTRICT__', 'KW___RESTRICT', +]) + +// Tokens that begin a type-specifier in declaration_specifiers. We use +// this to find the boundary between specifiers and declarators, by +// stopping at a non-specifier token (i.e. *, ID, ( as start-of-decl, +// etc). +const TYPE_SPEC_KEYWORD_NAMES = new Set([ + 'KW_VOID', 'KW_CHAR', 'KW_SHORT', 'KW_INT', 'KW_LONG', 'KW_FLOAT', + 'KW_DOUBLE', 'KW_SIGNED', 'KW_UNSIGNED', 'KW_BOOL', 'KW__BOOL', + 'KW__COMPLEX', 'KW__IMAGINARY', + 'KW___SIGNED__', 'KW___SIGNED', + 'KW___INT8', 'KW___INT16', 'KW___INT32', 'KW___INT64', + 'KW_STRUCT', 'KW_UNION', 'KW_ENUM', + 'KW_TYPEOF', 'KW_TYPEOF_UNQUAL', + 'KW___TYPEOF__', 'KW___TYPEOF', + 'KW__BITINT', +]) +const STORAGE_CLASS_NAMES = new Set([ + 'KW_TYPEDEF', 'KW_EXTERN', 'KW_STATIC', 'KW_AUTO', 'KW_REGISTER', + 'KW__THREAD_LOCAL', 'KW_THREAD_LOCAL', 'KW_CONSTEXPR', + 'KW___THREAD', +]) +const TYPE_QUALIFIER_NAMES = new Set(PTR_QUALIFIER_TOKEN_NAMES) +const FUNCTION_SPECIFIER_NAMES = new Set([ + 'KW_INLINE', 'KW___INLINE__', 'KW___INLINE', + 'KW__NORETURN', +]) + +function isSpecifierKw(name: string): boolean { + return STORAGE_CLASS_NAMES.has(name) || + TYPE_SPEC_KEYWORD_NAMES.has(name) || + TYPE_QUALIFIER_NAMES.has(name) || + FUNCTION_SPECIFIER_NAMES.has(name) || + name === 'TYPEDEF_NAME' +} + +// Find index of the matching closing punctuator for the opener at `from`. +// `open`/`close` are token names (e.g. 'PUNC_LPAREN', 'PUNC_RPAREN'). +// Returns -1 if unbalanced. 
+function matchClose( + tokens: Token[], from: number, open: string, close: string, +): number { + let depth = 0 + for (let i = from; i < tokens.length; i++) { + const n = tokens[i].name + if (n === open) depth++ + else if (n === close) { + depth-- + if (depth === 0) return i + } + } + return -1 +} + +// Locate the declared name within a declarator token slice. +// A declarator is: pointer* direct_declarator postfix* +// pointer := '*' qualifier* +// direct_decl := ID | '(' declarator ')' +// postfix := '[' ... ']' | '(' params ')' +// The first ID encountered after stripping pointers/qualifiers is the +// declared name; if a parenthesised subdeclarator opens first, recurse. +// Returns the name's source string, or null if no name is found. +function findDeclaredName(tokens: Token[]): string | null { + let i = 0 + while (i < tokens.length) { + const t = tokens[i] + if (TRIVIA_TOKEN_NAMES.has(t.name)) { i++; continue } + if (t.name === 'PUNC_STAR') { i++; continue } + if (PTR_QUALIFIER_TOKEN_NAMES.has(t.name)) { i++; continue } + // Compiler attribute or asm label inside declarators — skip + // balanced-paren attribute groups. + if (t.name === 'KW___ATTRIBUTE__' || t.name === 'KW___ATTRIBUTE' || + t.name === 'KW___ASM__' || t.name === 'KW___ASM' || t.name === 'KW_ASM' || + t.name === 'KW___DECLSPEC') { + // Expect '(' next; skip the balanced group. + let j = i + 1 + while (j < tokens.length && TRIVIA_TOKEN_NAMES.has(tokens[j].name)) j++ + if (j < tokens.length && tokens[j].name === 'PUNC_LPAREN') { + const close = matchClose(tokens, j, 'PUNC_LPAREN', 'PUNC_RPAREN') + if (close < 0) return null + i = close + 1 + continue + } + i++ + continue + } + if (t.name === 'PUNC_LPAREN') { + const close = matchClose(tokens, i, 'PUNC_LPAREN', 'PUNC_RPAREN') + if (close < 0) return null + // Distinguish a parenthesised subdeclarator from a function + // parameter list. 
A function parameter list starts with a type + // specifier or `void` or `)` (empty); a subdeclarator starts with + // `*`, `(`, an attribute spec, or an ordinary ID that ISN'T a + // typedef-name. + const inner = tokens.slice(i + 1, close) + const firstNonTrivia = inner.find((x) => !TRIVIA_TOKEN_NAMES.has(x.name)) + const looksLikeSubdeclarator = + !!firstNonTrivia && ( + firstNonTrivia.name === 'PUNC_STAR' || + firstNonTrivia.name === 'PUNC_LPAREN' || + firstNonTrivia.name === 'KW___ATTRIBUTE__' || + firstNonTrivia.name === 'KW___ATTRIBUTE' || + firstNonTrivia.name === 'ID' // ordinary ID is the declared name + ) + if (looksLikeSubdeclarator) { + const innerName = findDeclaredName(inner) + if (innerName) return innerName + } + // Otherwise treat as function postfix — skip past it. + i = close + 1 + continue + } + if (t.name === 'PUNC_LBRACKET') { + const close = matchClose(tokens, i, 'PUNC_LBRACKET', 'PUNC_RBRACKET') + if (close < 0) return null + i = close + 1 + continue + } + if (t.name === 'ID' || t.name === 'TYPEDEF_NAME') { + return t.src + } + return null + } + return null +} + +// Split the init-declarator-list portion of the token stream by top-level +// commas. Returns one token slice per declarator (initializer included). 
+function splitDeclarators(tokens: Token[]): Token[][] { + const out: Token[][] = [] + let start = 0 + let parenDepth = 0 + let bracketDepth = 0 + let braceDepth = 0 + for (let i = 0; i < tokens.length; i++) { + const n = tokens[i].name + if (n === 'PUNC_LPAREN') parenDepth++ + else if (n === 'PUNC_RPAREN') parenDepth-- + else if (n === 'PUNC_LBRACKET') bracketDepth++ + else if (n === 'PUNC_RBRACKET') bracketDepth-- + else if (n === 'PUNC_LBRACE') braceDepth++ + else if (n === 'PUNC_RBRACE') braceDepth-- + else if (n === 'PUNC_COMMA' && + parenDepth === 0 && bracketDepth === 0 && braceDepth === 0) { + out.push(tokens.slice(start, i)) + start = i + 1 + } + } + out.push(tokens.slice(start)) + return out +} + +// Slice an init-declarator at the first top-level `=`, returning just the +// declarator part (initializer is dropped from name-search). +function declaratorPart(tokens: Token[]): Token[] { + let parenDepth = 0 + let bracketDepth = 0 + for (let i = 0; i < tokens.length; i++) { + const n = tokens[i].name + if (n === 'PUNC_LPAREN') parenDepth++ + else if (n === 'PUNC_RPAREN') parenDepth-- + else if (n === 'PUNC_LBRACKET') bracketDepth++ + else if (n === 'PUNC_RBRACKET') bracketDepth-- + else if (n === 'PUNC_ASSIGN' && parenDepth === 0 && bracketDepth === 0) { + return tokens.slice(0, i) + } + } + return tokens +} + +// Identify the boundary between declaration-specifiers and the first +// declarator. Returns the index of the first non-specifier token. +// +// Specifiers are storage-classes, type-specifiers, type-qualifiers, +// function-specifiers, and a single TYPEDEF_NAME (after which any further +// ID is a declarator). struct/union/enum specifiers may also include a +// brace-balanced body — those are absorbed wholesale. 
+function findSpecBoundary(tokens: Token[]): number { + let i = 0 + let sawTypedefName = false + let sawTagSpec = false + while (i < tokens.length) { + const t = tokens[i] + if (TRIVIA_TOKEN_NAMES.has(t.name)) { i++; continue } + // After a TYPEDEF_NAME, a following ID belongs to the declarator. + if (t.name === 'TYPEDEF_NAME') { + if (sawTypedefName) return i + sawTypedefName = true + i++ + continue + } + if (t.name === 'KW_STRUCT' || t.name === 'KW_UNION' || t.name === 'KW_ENUM') { + sawTagSpec = true + i++ + // Optional tag name (ID). + while (i < tokens.length && TRIVIA_TOKEN_NAMES.has(tokens[i].name)) i++ + if (i < tokens.length && (tokens[i].name === 'ID' || tokens[i].name === 'TYPEDEF_NAME')) { + i++ + } + // Optional body. + while (i < tokens.length && TRIVIA_TOKEN_NAMES.has(tokens[i].name)) i++ + if (i < tokens.length && tokens[i].name === 'PUNC_LBRACE') { + const close = matchClose(tokens, i, 'PUNC_LBRACE', 'PUNC_RBRACE') + if (close < 0) return tokens.length + i = close + 1 + } + continue + } + if (isSpecifierKw(t.name) && t.name !== 'TYPEDEF_NAME') { + i++ + continue + } + // `__attribute__((...))` / `__declspec(...)` attached to the + // declaration: absorb as part of specifiers. + if (t.name === 'KW___ATTRIBUTE__' || t.name === 'KW___ATTRIBUTE' || + t.name === 'KW___DECLSPEC') { + i++ + while (i < tokens.length && TRIVIA_TOKEN_NAMES.has(tokens[i].name)) i++ + if (i < tokens.length && tokens[i].name === 'PUNC_LPAREN') { + const close = matchClose(tokens, i, 'PUNC_LPAREN', 'PUNC_RPAREN') + if (close < 0) return tokens.length + i = close + 1 + } + continue + } + return i + } + return i +} + +function registerTypedefIfApplicable(tokens: Token[], ctx: Context): void { + // Strip trivia for analysis (the original tokens still live on the AST). 
+ const filtered = tokens.filter((t) => !TRIVIA_TOKEN_NAMES.has(t.name)) + if (filtered.length < 3) return + if (filtered[0].name !== 'KW_TYPEDEF') return + const last = filtered[filtered.length - 1] + if (last.name !== 'PUNC_SEMI') return + // Drop the trailing `;` from the body we examine. + const body = filtered.slice(0, filtered.length - 1) + const specEnd = findSpecBoundary(body) + const declList = body.slice(specEnd) + if (declList.length === 0) return + const cmeta = getCMeta(ctx) + for (const decl of splitDeclarators(declList)) { + const justDecl = declaratorPart(decl) + const name = findDeclaredName(justDecl) + if (name) { + cmeta.symbols.bindTypedef(name) + reclassifyAsTypedef(ctx, name) + } + } +} + +// Run after the chomper terminates an external declaration: register +// typedef-names and try to upgrade the flat token-ref list to a +// structured tree (declaration / function_definition / preprocessor). +function finalizeExternalDeclaration(rule: Rule, ctx: Context): void { + const tokens = rule.k.tokens as Token[] + registerTypedefIfApplicable(tokens, ctx) + const structured = structureExternalDeclaration(tokens) + if (structured) { + rule.node.children = structured.children + rule.node.declKind = structured.declKind + registerMacrosFromTree(rule.node, ctx) + } else { + rule.node.declKind = 'unknown' + } +} + +// Walk a freshly-structured node and register any define_directive +// macros into cmeta.macros (and #undef removes them). The walk only +// touches the top-level external_declaration's tree — surrounding +// translation_unit accumulation visits each child in order, so macro +// state evolves as the parse progresses. 
// Record the macro effects of a freshly-structured subtree: every
// define_directive with a macroName is added to cmeta.macros and every
// undef_directive with a macroName is removed. Nodes are visited
// parent-first, children in order, so a later #undef wins over an
// earlier #define of the same name. Does nothing when ctx.meta carries
// no cmeta.
function registerMacrosFromTree(node: any, ctx: Context): void {
  const cmeta = (ctx.meta as any).cmeta as CMeta
  if (!cmeta) return
  walk(node)

  function walk(current: any): void {
    if (!current) return
    const macroName = current.macroName
    if (macroName) {
      if (current.kind === 'define_directive') {
        cmeta.macros.define({
          name: macroName,
          isFunctionLike: current.macroKind === 'function-like',
          params: current.macroParams,
          variadic: !!current.macroVariadic,
        })
        // Tokens already fetched into the lookahead were lexed before
        // this define existed, so they still say ID; patch them up.
        reclassifyAsMacro(ctx, macroName)
      } else if (current.kind === 'undef_directive') {
        cmeta.macros.undefine(macroName)
      }
    }
    const kids = current.children
    if (Array.isArray(kids)) {
      for (const kid of kids) walk(kid)
    }
  }
}

// Shared worker for reclassifyAsMacro / reclassifyAsTypedef: retag
// every already-lexed ID token spelled `name` (in ctx.t and in
// lex.pnt.token) with the token kind `tokenName`. No-op when the
// context has no lexer attached.
function retagLookaheadIds(ctx: Context, name: string, tokenName: string): void {
  const lex = (ctx as any).lex
  if (!lex) return
  const fromTin = (ctx.cfg as any).t['ID']
  const toTin = (ctx.cfg as any).t[tokenName]
  const retag = (tkn: any): void => {
    if (tkn && tkn.isToken && tkn.tin === fromTin && tkn.src === name) {
      tkn.tin = toTin
      tkn.name = tokenName
    }
  }
  if (Array.isArray(ctx.t)) {
    for (const tkn of ctx.t) retag(tkn)
  }
  const pending = lex.pnt && lex.pnt.token
  if (Array.isArray(pending)) {
    for (const tkn of pending) retag(tkn)
  }
}

// Turn pending ID tokens spelled `name` into MACRO_NAME tokens.
function reclassifyAsMacro(ctx: Context, name: string): void {
  retagLookaheadIds(ctx, name, 'MACRO_NAME')
}

// True iff the first non-trivia entry in `tokens` has the given name.
// An all-trivia (or empty) list yields false.
function firstNonTriviaIs(tokens: Token[], name: string): boolean {
  const first = tokens.find((t) => !TRIVIA_TOKEN_NAMES.has(t.name))
  return first !== undefined && first.name === name
}

// True when the first non-trivia token in the lookahead buffer (ctx.t)
// is one that unambiguously begins a new external declaration. Used by
// the chomper to decide that a top-level `}` was the end of a function
// body.
function startsNewExternalDeclaration(ctx: Context): boolean {
  for (const tkn of ctx.t) {
    if (!tkn) break
    const n = tkn.name
    if (TRIVIA_TOKEN_NAMES.has(n)) continue
    // Anything else — notably a plain ID, which could be a macro that
    // expands to a declaration or the declared name from
    // `typedef struct { } S;` — is assumed to be a continuation.
    return (
      n === '#ZZ' ||
      n === 'PP_HASH' ||
      n === 'PUNC_HASH' ||
      STORAGE_CLASS_NAMES.has(n) ||
      TYPE_SPEC_KEYWORD_NAMES.has(n) ||
      TYPE_QUALIFIER_NAMES.has(n) ||
      FUNCTION_SPECIFIER_NAMES.has(n) ||
      n === 'KW___ATTRIBUTE__' ||
      n === 'KW___ATTRIBUTE' ||
      n === 'KW___DECLSPEC' ||
      n === 'KW___EXTENSION__' ||
      n === 'TYPEDEF_NAME'
    )
  }
  return false
}

// Turn pending ID tokens spelled `name` into TYPEDEF_NAME tokens.
function reclassifyAsTypedef(ctx: Context, name: string): void {
  retagLookaheadIds(ctx, name, 'TYPEDEF_NAME')
}

export { C }
export default C

// NOTE(review): the rest of this span is unified-diff metadata that
// opens src/expr.ts; it is preserved verbatim below as comments.
// diff --git a/src/expr.ts b/src/expr.ts
// new file mode 100644
// index 0000000..926ab71
// --- /dev/null
// +++ b/src/expr.ts
// @@ -0,0 +1,564 @@

/* Copyright (c) 2026 Richard Rodger and contributors, MIT License */

// Hand-rolled Pratt-style parser for C expressions, with the full
// operator-precedence table from C23 §6.5. Used by structure.ts in
// every expression context (expression_statement bodies, initializer
// values, jump-statement return values, condition headers).
//
// Output: a tree of nodes whose leaves are token-refs and whose
// branches are kind-tagged shapes.
Walking depth-first still yields +// the original token sequence, so source fidelity is preserved. +// +// What's covered: +// - literal, identifier, parenthesised, generic_selection, sizeof, +// _Alignof, statement-expression `({ ... })` (GCC), compound +// literal `(type){ ... }` +// - postfix: call_expression, subscript_expression, +// member_expression, postfix_unary_expression +// - prefix: unary_expression (++ -- + - ! ~ * & sizeof _Alignof +// __real__ __imag__) +// - cast: cast_expression `( type-name ) operand` — only when +// the parenthesised head is unambiguously a type-name +// (typedef-name or simple type keyword). +// - binary: 11 levels (multiplicative through logical-or) +// - ternary: conditional_expression (right-assoc) +// - assignment: assignment_expression (right-assoc, all =/+=/-=/...) +// - comma: comma_expression +// +// Missing (left for future slices): +// - GCC type compound expressions like `__builtin_choose_expr(...)` +// beyond plain identifier-call recognition (they parse as ordinary +// calls today). + +import type { Token } from 'jsonic' +import type { TokenStream, CNode, CTokenRef } from './structure.js' + +// We intentionally re-import the helpers we need rather than coupling +// structure.ts to expr.ts in both directions. 
import { } from './structure.js'

// Trivia token names that are kept in the tree.
const PRESERVED_TRIVIA = new Set<string>([
  'TRIVIA_LINE_COMMENT',
  'TRIVIA_BLOCK_COMMENT',
  'TRIVIA_LINE_CONT',
])

// Token names accepted by looksLikeTypeName as the start of a
// type-name: basic type keywords, qualifiers, tag keywords and the
// typeof / _BitInt forms.
const TYPE_KEYWORDS = new Set<string>([
  'KW_VOID', 'KW_CHAR', 'KW_SHORT', 'KW_INT', 'KW_LONG', 'KW_FLOAT',
  'KW_DOUBLE', 'KW_SIGNED', 'KW_UNSIGNED', 'KW_BOOL', 'KW__BOOL',
  'KW__COMPLEX', 'KW__IMAGINARY',
  'KW___SIGNED__', 'KW___SIGNED',
  'KW___INT8', 'KW___INT16', 'KW___INT32', 'KW___INT64',
  'KW_CONST', 'KW_VOLATILE', 'KW_RESTRICT', 'KW__ATOMIC',
  'KW___CONST__', 'KW___CONST',
  'KW___VOLATILE__', 'KW___VOLATILE',
  'KW___RESTRICT__', 'KW___RESTRICT',
  'KW_STRUCT', 'KW_UNION', 'KW_ENUM',
  'KW_TYPEOF', 'KW_TYPEOF_UNQUAL',
  'KW___TYPEOF__', 'KW___TYPEOF',
  'KW__BITINT',
])

// Create an empty tree node of the given kind: no children and empty
// leading/trailing trivia lists.
function makeNode(kind: string, span: any): CNode {
  const node = {
    kind,
    span,
    children: [],
    trivia: { leading: [], trailing: [] },
  }
  return node as CNode
}

// Wrap a lexer token as a leaf token-ref (name, source text, span).
function tokenRef(t: Token): CTokenRef {
  return { kind: 'token', tname: t.name, src: t.src, span: spanOf(t) }
}

// Source span of a token: [sI, sI + len) plus its row and column.
function spanOf(t: Token) {
  const start = t.sI
  return { start, end: start + t.len, line: t.rI, col: t.cI }
}

// Thin wrapper over ts.takeInto(node); returns whatever it returns
// (the Token, or null per the declared signature).
function takeTokenInto(ts: TokenStream, node: CNode): Token | null {
  return ts.takeInto(node)
}

// ---- Operator tables ------------------------------------------------

interface BinaryOp { name: string; prec: number; rightAssoc?: boolean }

// Precedence levels — higher number binds tighter.
//   16: unary / postfix (handled separately)
//   13: multiplicative  * / %
//   12: additive        + -
//   11: shift           << >>
//   10: relational      < <= > >=
//    9: equality        == !=
//    8: bitand          &
//    7: bitxor          ^
//    6: bitor           |
//    5: logical-and     &&
//    4: logical-or      ||
//    3: ternary         (handled separately as a post-step)
//    2: assignment      (right-assoc, handled separately)
//    1: comma           (handled separately at the top)
//
// Keyed by token name; `name` is the operator's source spelling.
const BINARY_OPS: Record<string, BinaryOp> = {
  PUNC_STAR: { name: '*', prec: 13 },
  PUNC_SLASH: { name: '/', prec: 13 },
  PUNC_PERCENT: { name: '%', prec: 13 },
  PUNC_PLUS: { name: '+', prec: 12 },
  PUNC_MINUS: { name: '-', prec: 12 },
  PUNC_LSHIFT: { name: '<<', prec: 11 },
  PUNC_RSHIFT: { name: '>>', prec: 11 },
  PUNC_LT: { name: '<', prec: 10 },
  PUNC_LE: { name: '<=', prec: 10 },
  PUNC_GT: { name: '>', prec: 10 },
  PUNC_GE: { name: '>=', prec: 10 },
  PUNC_EQ: { name: '==', prec: 9 },
  PUNC_NE: { name: '!=', prec: 9 },
  PUNC_AMP: { name: '&', prec: 8 },
  PUNC_CARET: { name: '^', prec: 7 },
  PUNC_PIPE: { name: '|', prec: 6 },
  PUNC_AND_AND: { name: '&&', prec: 5 },
  PUNC_OR_OR: { name: '||', prec: 4 },
}

// Token names of the simple and compound assignment operators.
const ASSIGN_OPS = new Set<string>([
  'PUNC_ASSIGN', 'PUNC_PLUS_ASSIGN', 'PUNC_MINUS_ASSIGN',
  'PUNC_STAR_ASSIGN', 'PUNC_SLASH_ASSIGN', 'PUNC_PERCENT_ASSIGN',
  'PUNC_LSHIFT_ASSIGN', 'PUNC_RSHIFT_ASSIGN',
  'PUNC_AMP_ASSIGN', 'PUNC_CARET_ASSIGN', 'PUNC_PIPE_ASSIGN',
])

// Token names that may open a unary_expression.
const PREFIX_OPS = new Set<string>([
  'PUNC_PLUS_PLUS', 'PUNC_MINUS_MINUS',
  'PUNC_PLUS', 'PUNC_MINUS', 'PUNC_BANG', 'PUNC_TILDE',
  'PUNC_STAR', 'PUNC_AMP',
  'KW_SIZEOF', 'KW__ALIGNOF', 'KW_ALIGNOF', 'KW___ALIGNOF__', 'KW___ALIGNOF',
  'KW___REAL__', 'KW___IMAG__', 'KW___EXTENSION__',
])

// Token names of the postfix increment / decrement operators.
const POSTFIX_OPS = new Set<string>(['PUNC_PLUS_PLUS', 'PUNC_MINUS_MINUS'])

// ---- Stoppers helpers ----------------------------------------------

// True at end-of-stream (`name === null`) or when `name` is one of the
// caller-supplied stopper token names.
function isStop(name: string | null, stoppers: Set<string>): boolean {
  return name === null || stoppers.has(name)
}

// ---- Entry
// ---------------------------------------------------------

// Parse a full C expression (comma level) from `ts`. Token names in
// `stoppers` end the expression at the levels that consult them.
// Returns null when no primary expression can be started at the
// current position.
export function parseExpression(
  ts: TokenStream, stoppers: Set<string>,
): CNode | null {
  return parseComma(ts, stoppers)
}

// Comma: lowest precedence. Right-assoc isn't needed semantically;
// model as a left-grown list so consumers see operands in source order.
// When ',' is itself a stopper the first assignment-expression is
// returned unwrapped.
function parseComma(ts: TokenStream, stoppers: Set<string>): CNode | null {
  const first = parseAssignment(ts, stoppers)
  if (!first) return null
  if (ts.peekName() !== 'PUNC_COMMA' || stoppers.has('PUNC_COMMA')) return first
  const node = makeNode('comma_expression', first.span)
  node.children.push(first)
  while (ts.peekName() === 'PUNC_COMMA' && !stoppers.has('PUNC_COMMA')) {
    takeTokenInto(ts, node) // ','
    const next = parseAssignment(ts, stoppers)
    if (!next) break
    node.children.push(next)
  }
  return node
}

// Assignment: right-associative; one of the assignment operators.
// A missing right-hand side still yields the (incomplete) node so the
// consumed operator token stays in the tree.
function parseAssignment(ts: TokenStream, stoppers: Set<string>): CNode | null {
  const left = parseConditional(ts, stoppers)
  if (!left) return null
  const opName = ts.peekName()
  if (opName && ASSIGN_OPS.has(opName)) {
    const node = makeNode('assignment_expression', left.span)
    node.children.push(left)
    node.left = left
    const opTkn = takeTokenInto(ts, node)!
    node.op = opTkn.src
    const right = parseAssignment(ts, stoppers) // right-assoc
    if (right) {
      node.children.push(right)
      node.right = right
    }
    return node
  }
  return left
}

// Conditional / ternary: right-associative.
function parseConditional(ts: TokenStream, stoppers: Set<string>): CNode | null {
  const cond = parseBinary(ts, stoppers, 0)
  if (!cond) return null
  if (ts.peekName() !== 'PUNC_QUESTION') return cond
  const node = makeNode('conditional_expression', cond.span)
  node.children.push(cond)
  node.cond = cond
  takeTokenInto(ts, node) // '?'
  // Middle: full expression up to ':'.
  const then = parseExpression(ts, new Set<string>([...stoppers, 'PUNC_COLON']))
  if (then) {
    node.children.push(then)
    node.then = then
  }
  if (ts.peekName() === 'PUNC_COLON') takeTokenInto(ts, node)
  const els = parseAssignment(ts, stoppers)
  if (els) {
    node.children.push(els)
    node.else = els
  }
  return node
}

// Binary operators with precedence. `minPrec` is the lowest precedence
// we'll keep absorbing.
function parseBinary(
  ts: TokenStream, stoppers: Set<string>, minPrec: number,
): CNode | null {
  let left = parseUnary(ts, stoppers)
  if (!left) return null
  while (true) {
    const n = ts.peekName()
    if (!n || stoppers.has(n)) break
    const op = BINARY_OPS[n]
    if (!op || op.prec < minPrec) break
    const node = makeNode('binary_expression', left.span)
    node.children.push(left)
    node.left = left
    const opTkn = takeTokenInto(ts, node)!
    node.op = opTkn.src
    const right = parseBinary(ts, stoppers, op.prec + 1)
    if (!right) {
      // Recovery: the operator has already been consumed into `node`,
      // so return the incomplete node (no `right`) rather than the bare
      // left operand — otherwise the operator token would vanish from
      // the tree and a depth-first walk would no longer reproduce the
      // source. This mirrors parseAssignment / parseConditional.
      left = node
      break
    }
    node.children.push(right)
    node.right = right
    left = node
  }
  return left
}

// Prefix unary operators, including sizeof/_Alignof/typeof in their
// expression forms. Recurses into the operand.
function parseUnary(ts: TokenStream, stoppers: Set<string>): CNode | null {
  const n = ts.peekName()
  if (n && PREFIX_OPS.has(n)) {
    const startTkn = ts.peek()!
    const node = makeNode('unary_expression', spanOf(startTkn))
    const opTkn = takeTokenInto(ts, node)!
    node.op = opTkn.src
    // sizeof / _Alignof can take a parenthesised type-name, not an
    // expression. We detect by peeking: `sizeof ( <type-start> ...` is
    // the type form; its parenthesised body is kept as opaque tokens.
    if ((n === 'KW_SIZEOF' || n === 'KW__ALIGNOF' || n === 'KW_ALIGNOF' ||
         n === 'KW___ALIGNOF__' || n === 'KW___ALIGNOF') &&
        ts.peekName() === 'PUNC_LPAREN' &&
        looksLikeTypeName(ts, 1)) {
      const tn = makeNode('type_name', spanOf(ts.peek()!))
      consumeBalanced(ts, tn, 'PUNC_LPAREN', 'PUNC_RPAREN')
      node.children.push(tn)
      node.operand = tn
      return node
    }
    const operand = parseUnary(ts, stoppers)
    if (operand) {
      node.children.push(operand)
      node.operand = operand
    }
    return node
  }
  return parsePostfix(ts, stoppers)
}

// Postfix loop: subscript, call, member access, increment/decrement.
function parsePostfix(ts: TokenStream, stoppers: Set<string>): CNode | null {
  let target = parsePrimary(ts, stoppers)
  if (!target) return null
  while (true) {
    const n = ts.peekName()
    if (!n || stoppers.has(n)) break
    if (n === 'PUNC_LBRACKET') {
      const node = makeNode('subscript_expression', target.span)
      node.children.push(target)
      node.target = target
      // The bracketed index is kept as an opaque balanced token run.
      const idx = makeNode('index_list', spanOf(ts.peek()!))
      consumeBalanced(ts, idx, 'PUNC_LBRACKET', 'PUNC_RBRACKET')
      node.children.push(idx)
      target = node
      continue
    }
    if (n === 'PUNC_LPAREN') {
      const node = makeNode('call_expression', target.span)
      node.children.push(target)
      // Tag isMacro when the immediate target is an identifier_expression
      // whose token was MACRO_NAME.
      const callee = unwrapCallee(target)
      if (callee) {
        node.callee = callee.src
        node.isMacro = callee.tname === 'MACRO_NAME'
      }
      const args = makeNode('argument_list', spanOf(ts.peek()!))
      // Parse comma-separated assignment-expressions as arguments.
      takeTokenInto(ts, args) // '('
      while (!ts.done() && ts.peekName() !== 'PUNC_RPAREN') {
        const a = parseAssignment(ts, new Set<string>(['PUNC_COMMA', 'PUNC_RPAREN']))
        if (a) args.children.push(a)
        else {
          // Defensive: avoid infinite loop on something we don't grok.
          takeTokenInto(ts, args)
        }
        if (ts.peekName() === 'PUNC_COMMA') takeTokenInto(ts, args)
      }
      if (ts.peekName() === 'PUNC_RPAREN') takeTokenInto(ts, args)
      node.children.push(args)
      target = node
      continue
    }
    if (n === 'PUNC_DOT' || n === 'PUNC_ARROW') {
      const node = makeNode('member_expression', target.span)
      node.children.push(target)
      node.object = target
      const opTkn = takeTokenInto(ts, node)!
      node.op = opTkn.src
      // The member name is the next ID (or possibly a TYPEDEF_NAME /
      // MACRO_NAME, in exceptional code).
      const memTkn = ts.peek()
      if (memTkn && (memTkn.name === 'ID' || memTkn.name === 'TYPEDEF_NAME' ||
                     memTkn.name === 'MACRO_NAME')) {
        const taken = ts.take()!
        for (const tr of taken.trivia) node.children.push(tr)
        node.children.push(taken.ref)
        node.memberName = taken.tkn.src
      }
      target = node
      continue
    }
    if (POSTFIX_OPS.has(n)) {
      const node = makeNode('postfix_unary_expression', target.span)
      node.children.push(target)
      node.target = target
      const opTkn = takeTokenInto(ts, node)!
      node.op = opTkn.src
      target = node
      continue
    }
    break
  }
  return target
}

// Return the identifier token-ref of an identifier_expression, or null
// when `node` is any other kind (or has no token child).
function unwrapCallee(node: CNode): CTokenRef | null {
  if (node.kind !== 'identifier_expression') return null
  const t = node.children.find((c: any) => c.kind === 'token')
  return (t as CTokenRef) || null
}

// Primary: literal, identifier, parenthesised, generic, statement-expr,
// compound literal (and cast, which is recognised here because it
// shares the leading '(').
function parsePrimary(ts: TokenStream, stoppers: Set<string>): CNode | null {
  const t = ts.peek()
  if (!t) return null
  const n = t.name
  if (n === 'LIT_INT' || n === 'LIT_FLOAT' ||
      n === 'LIT_CHAR' || n === 'LIT_STRING') {
    const node = makeNode('literal_expression', spanOf(t))
    const taken = ts.take()!
    for (const tr of taken.trivia) node.children.push(tr)
    node.children.push(taken.ref)
    node.literalKind = n
    node.value = taken.tkn.src
    // Adjacent string literals concatenate ("foo" "bar") — keep all in
    // one literal_expression node. `value` stays the first piece.
    if (n === 'LIT_STRING') {
      while (ts.peekName() === 'LIT_STRING') {
        const more = ts.take()!
        for (const tr of more.trivia) node.children.push(tr)
        node.children.push(more.ref)
      }
    }
    return node
  }
  if (n === 'ID' || n === 'MACRO_NAME' || n === 'TYPEDEF_NAME') {
    const node = makeNode('identifier_expression', spanOf(t))
    const taken = ts.take()!
    for (const tr of taken.trivia) node.children.push(tr)
    node.children.push(taken.ref)
    node.name = taken.tkn.src
    return node
  }
  if (n === 'KW__GENERIC') {
    return parseGenericSelection(ts)
  }
  if (n === 'PUNC_LPAREN') {
    // GCC statement-expression `({ ... })`.
    if (ts.peekName(1) === 'PUNC_LBRACE') {
      const node = makeNode('statement_expression', spanOf(t))
      consumeBalanced(ts, node, 'PUNC_LPAREN', 'PUNC_RPAREN')
      return node
    }
    // Cast vs parenthesised expression vs compound literal: peek ahead
    // one token. If it begins a type-name, the form is `( type-name ) X`
    // (cast) or `( type-name ) { … }` (compound literal).
    if (looksLikeTypeName(ts, 1)) {
      // Remember the position so we can back out if this is not a cast.
      const m = ts.mark()
      const opener = ts.take()! // '('
      const tn = makeNode('type_name', spanOf(opener.tkn))
      // Keep the '(' together with its leading trivia, exactly as every
      // other ts.take() call site does; pushing only the ref would drop
      // comments preceding the parenthesis from the tree.
      for (const tr of opener.trivia) tn.children.push(tr)
      tn.children.push(opener.ref)
      // Consume balanced contents up to and including the matching ')'
      // (we treat the type-name's body as opaque tokens for now).
      let depth = 1
      while (!ts.done() && depth > 0) {
        const nn = ts.peekName()
        if (nn === 'PUNC_LPAREN') depth++
        else if (nn === 'PUNC_RPAREN') {
          depth--
          if (depth === 0) {
            takeTokenInto(ts, tn) // closing ')'
            break
          }
        }
        takeTokenInto(ts, tn)
      }
      // Compound literal: followed by `{`.
      if (ts.peekName() === 'PUNC_LBRACE') {
        const cl = makeNode('compound_literal', tn.span)
        cl.children.push(tn)
        cl.typeName = tn
        const init = makeNode('initializer_list', spanOf(ts.peek()!))
        consumeBalanced(ts, init, 'PUNC_LBRACE', 'PUNC_RBRACE')
        cl.children.push(init)
        return cl
      }
      // Cast: followed by an expression.
      const operand = parseUnary(ts, stoppers)
      if (operand) {
        const cast = makeNode('cast_expression', tn.span)
        cast.children.push(tn)
        cast.children.push(operand)
        cast.typeName = tn
        cast.operand = operand
        return cast
      }
      // Couldn't parse as cast — restore and fall through to plain
      // parenthesised expression.
      ts.restore(m)
    }
    // Plain parenthesised expression.
    const node = makeNode('paren_expression', spanOf(t))
    takeTokenInto(ts, node) // '('
    const inner = parseExpression(ts, new Set<string>(['PUNC_RPAREN']))
    if (inner) node.children.push(inner)
    if (ts.peekName() === 'PUNC_RPAREN') takeTokenInto(ts, node)
    return node
  }
  return null
}

// Lookahead helper: does the token at offset `off` begin a type-name?
// True for type keywords and TYPEDEF_NAMEs. (Enough for the common
// cast / sizeof / compound-literal cases; full type-name detection is
// more involved and lives in structure.ts's specifier path.)
function looksLikeTypeName(ts: TokenStream, off: number): boolean {
  const t = ts.peek(off)
  if (!t) return false
  if (t.name === 'TYPEDEF_NAME') return true
  if (TYPE_KEYWORDS.has(t.name)) return true
  return false
}

// _Generic ( ctrl-expr , association ( , association )* )
// association:
//   type-name : assignment-expr | 'default' : assignment-expr
function parseGenericSelection(ts: TokenStream): CNode {
  const startTkn = ts.peek()!
  const node = makeNode('generic_selection', spanOf(startTkn))
  takeTokenInto(ts, node) // '_Generic'
  if (ts.peekName() !== 'PUNC_LPAREN') return node
  takeTokenInto(ts, node) // '('

  // Controlling expression: stops at the first top-level ',' or ')'.
  const ctrl = parseExpression(ts, new Set<string>(['PUNC_COMMA', 'PUNC_RPAREN']))
  if (ctrl) {
    const wrap = makeNode('generic_controlling_expression', ctrl.span)
    wrap.children.push(ctrl)
    wrap.expression = ctrl
    node.children.push(wrap)
    node.controlling = wrap
  }

  node.associations = [] as any[]
  while (ts.peekName() === 'PUNC_COMMA') {
    takeTokenInto(ts, node) // ','
    const ga = parseGenericAssociation(ts)
    if (ga) {
      node.children.push(ga)
      node.associations.push(ga)
    } else break
  }
  if (ts.peekName() === 'PUNC_RPAREN') takeTokenInto(ts, node)
  return node
}

// Parse one `_Generic` association. Returns null only at end-of-stream.
function parseGenericAssociation(ts: TokenStream): CNode | null {
  const startTkn = ts.peek()
  if (!startTkn) return null
  const node = makeNode('generic_association', spanOf(startTkn))

  // Either 'default' or a type-name. We model the type-name as an
  // opaque-balanced sequence up to ':' (a real type-name parser belongs
  // in structure.ts and is mid-flight; the contents are still
  // preserved verbatim).
  if (startTkn.name === 'KW_DEFAULT') {
    takeTokenInto(ts, node)
    node.associationKind = 'default'
  } else {
    const tn = makeNode('type_name', spanOf(startTkn))
    let parenD = 0, bracketD = 0
    while (!ts.done()) {
      const n = ts.peekName()
      if (n === 'PUNC_LPAREN') { parenD++; takeTokenInto(ts, tn); continue }
      if (n === 'PUNC_RPAREN') {
        // An unmatched ')' closes the enclosing _Generic( ... ).
        if (parenD === 0) break
        parenD--; takeTokenInto(ts, tn); continue
      }
      if (n === 'PUNC_LBRACKET') { bracketD++; takeTokenInto(ts, tn); continue }
      if (n === 'PUNC_RBRACKET') {
        if (bracketD === 0) break
        bracketD--; takeTokenInto(ts, tn); continue
      }
      if (parenD === 0 && bracketD === 0 &&
          (n === 'PUNC_COLON' || n === 'PUNC_COMMA' || n === 'PUNC_RPAREN')) break
      takeTokenInto(ts, tn)
    }
    node.children.push(tn)
    node.typeName = tn
    node.associationKind = 'type'
  }
  if (ts.peekName() === 'PUNC_COLON') takeTokenInto(ts, node)
  const expr = parseExpression(ts, new Set<string>(['PUNC_COMMA', 'PUNC_RPAREN']))
  if (expr) {
    node.children.push(expr)
    node.value = expr
  }
  return node
}

// Move a balanced `open … close` token run (including both delimiters
// and anything nested) from `ts` into `node`. Returns false without
// consuming anything when the next token is not `open`, and false when
// the stream ends before the run is closed; true otherwise.
function consumeBalanced(
  ts: TokenStream, node: CNode, open: string, close: string,
): boolean {
  if (ts.peekName() !== open) return false
  takeTokenInto(ts, node)
  let depth = 1
  while (depth > 0 && !ts.done()) {
    const n = ts.peekName()
    if (n === open) depth++
    else if (n === close) depth--
    takeTokenInto(ts, node)
  }
  return depth === 0
}

// NOTE(review): the rest of this span is unified-diff metadata for the
// deleted src/jsonc.ts; it is preserved verbatim below as comments.
// diff --git a/src/jsonc.ts b/src/jsonc.ts
// deleted file mode 100644
// index 00d0cb8..0000000
// --- a/src/jsonc.ts
// +++ /dev/null
// @@ -1,71 +0,0 @@
// -/* Copyright (c) 2021-2025 Richard Rodger, MIT License */
// -
// -// Import Jsonic types used by plugin.
-import { Jsonic } from 'jsonic' - -type JsoncOptions = { - allowTrailingComma?: boolean - disallowComments?: boolean -} - -// --- BEGIN EMBEDDED jsonc-grammar.jsonic --- -const grammarText = ` -# JSONC Grammar Definition -# Parsed by a standard Jsonic instance and passed to jsonic.grammar() -# Extends standard JSON grammar with end-of-input value handling. - -{ - options: text: { lex: false } - options: number: { hex: false oct: false bin: false sep: null exclude: "@/^\\\\./" } - options: string: { chars: '"' multiChars: '' allowUnknown: false } - options: string: escape: { v: null } - options: map: { extend: false } - options: lex: { empty: false } - options: rule: { finish: false } - - rule: val: open: { - alts: [ - { s: '#ZZ' g: jsonc } - ] - inject: { append: true } - } - - rule: pair: close: { - alts: [ - { s: '#CA #CB' b: 1 g: comma } - ] - inject: {} - } - - rule: elem: close: { - alts: [ - { s: '#CA #CS' b: 1 g: comma } - ] - inject: {} - } -} -` -// --- END EMBEDDED jsonc-grammar.jsonic --- - -function Jsonc(jsonic: Jsonic, options: JsoncOptions) { - const comment_lex = true !== options.disallowComments - const rule_exclude = options.allowTrailingComma ? '' : 'comma' - - // Apply grammar: static options and val ZZ rule alt. - jsonic.grammar(Jsonic.make()(grammarText), { rule: { alt: { g: 'jsonc' } } }) - - // Runtime options that depend on plugin arguments. - jsonic.options({ - comment: { - lex: comment_lex, - }, - rule: { - include: 'jsonc,json', - exclude: rule_exclude, - }, - }) -} - -export { Jsonc } - -export type { JsoncOptions } diff --git a/src/matchers.ts b/src/matchers.ts new file mode 100644 index 0000000..7cb36ab --- /dev/null +++ b/src/matchers.ts @@ -0,0 +1,513 @@ +/* Copyright (c) 2026 Richard Rodger and contributors, MIT License */ + +// Focused lex matchers for the C parser. Each matcher does one job and +// returns either a Token or undefined (signalling "not my prefix"). 
//
// All matchers share access to the symbol/macro/mode state via
// ctx.meta.cmeta; see ./symbols.ts for the shape.
//
// Matcher contract (jsonic LexMatcher):
//   (lex, rule, tI?) => Token | undefined
// On a hit:
//   - Call lex.token(name, val, src) to construct the Token.
//   - Advance lex.pnt.sI / rI / cI by the consumed length.

import type { Lex, Rule, Token } from 'jsonic'
import {
  C23_KEYWORDS,
  EXT_KEYWORDS,
  PUNCTUATORS,
  keywordTokenName,
} from './tokens.js'
import type { CMeta } from './symbols.js'

// Every word that lexes as a keyword: C23 keywords plus extensions.
const RESERVED = new Set([...C23_KEYWORDS, ...EXT_KEYWORDS])

// Helpers --------------------------------------------------------------

// Shared C-parser state stored on the parse context.
function getMeta(lex: Lex): CMeta {
  const meta = lex.ctx.meta as any
  return meta.cmeta as CMeta
}

// Full source text being lexed.
function source(lex: Lex): string {
  return lex.ctx.src()
}

// Move the lex point forward over `len` characters of `src`, keeping
// the row/column counters in step: each '\n' bumps the row and resets
// the column to 1, any other character bumps the column.
function advance(lex: Lex, src: string, len: number): void {
  const pnt = lex.pnt
  const from = pnt.sI
  const to = from + len
  for (let at = from; at < to; at++) {
    if (src.charCodeAt(at) === 10 /* \n */) {
      pnt.rI += 1
      pnt.cI = 1
    } else {
      pnt.cI += 1
    }
  }
  pnt.sI = to
}

// Construct the token first — lex.token captures the current point, so
// the token's position is where it starts — and only then step the
// point past the `consumed` characters.
function emit(
  lex: Lex,
  name: string,
  val: any,
  src: string,
  consumed: number,
): Token {
  const made = lex.token(name as any, val, src)
  advance(lex, source(lex), consumed)
  return made
}

// Whitespace ------------------------------------------------------------
// C whitespace excludes newline when we are inside a preprocessor directive
// (newline terminates the directive). Outside directives newline is just
// space.
// Build the whitespace matcher. It consumes a maximal run of space,
// tab, vertical tab, form feed and carriage return; line feeds join the
// run only while no preprocessor directive is open, because inside a
// directive the line feed is the terminator. The run is emitted as a
// '#SP' token whose value and source are the consumed text, so the
// whitespace is preserved verbatim.
export function makeWhitespaceMatcher() {
  // Space, tab, vertical tab, form feed, carriage return.
  const isInlineSpace = (code: number): boolean =>
    code === 32 || code === 9 || code === 11 || code === 12 || code === 13

  return function whitespace(lex: Lex, _rule: Rule): Token | undefined {
    const text = source(lex)
    const begin = lex.pnt.sI
    if (begin >= text.length) return undefined

    // Cheap rejection before touching the shared state.
    const head = text.charCodeAt(begin)
    if (!isInlineSpace(head) && head !== 10) return undefined

    const meta = getMeta(lex)
    let end = begin
    for (; end < text.length; end++) {
      const code = text.charCodeAt(end)
      if (isInlineSpace(code)) continue
      // A line feed is ordinary whitespace only outside directives.
      if (code === 10 && !meta.mode.inDirective) continue
      break
    }
    // Happens when a directive is open and we are sitting on its '\n'.
    if (end === begin) return undefined

    const run = text.substring(begin, end)
    return emit(lex, '#SP', run, run, end - begin)
  }
}

// Line continuation (backslash + newline) ------------------------------
// In C this is logical-line splicing performed by the preprocessor before
// tokenisation. We treat it as trivia so it survives in the CST.

// Build the line-continuation matcher: a backslash immediately followed
// by LF, CR LF or a lone CR becomes one TRIVIA_LINE_CONT token.
export function makeLineContMatcher() {
  return function lineCont(lex: Lex, _rule: Rule): Token | undefined {
    const text = source(lex)
    const at = lex.pnt.sI
    if (text.charCodeAt(at) !== 92 /* \ */) return undefined

    const next = text.charCodeAt(at + 1)
    let width: number
    if (next === 10) {
      width = 2
    } else if (next === 13) {
      width = text.charCodeAt(at + 2) === 10 ? 3 : 2
    } else {
      return undefined
    }

    const splice = text.substring(at, at + width)
    return emit(lex, 'TRIVIA_LINE_CONT', splice, splice, width)
  }
}

// Line comment // ...
// ---------------------------------------------------

// Index just past the last character of a `//` comment whose text is
// scanned from `from`. The comment ends at the first LF or CR that is
// not spliced away by a preceding backslash: C joins backslash-newline
// pairs before comments are recognised, so `// foo \` continues onto
// the next physical line. The splice shapes accepted here are the same
// three the line-continuation matcher recognises (\ LF, \ CR LF, \ CR).
function lineCommentEnd(src: string, from: number): number {
  let i = from
  while (i < src.length) {
    const c = src.charCodeAt(i)
    if (c === 92 /* \ */) {
      const c1 = src.charCodeAt(i + 1)
      if (c1 === 10) { i += 2; continue }
      if (c1 === 13) {
        i += src.charCodeAt(i + 2) === 10 ? 3 : 2
        continue
      }
    }
    if (c === 10 || c === 13) break
    i++
  }
  return i
}

// Build the `//` comment matcher. The emitted TRIVIA_LINE_COMMENT holds
// the comment text without its terminating newline; spliced
// continuation lines are part of the comment.
export function makeLineCommentMatcher() {
  return function lineComment(lex: Lex, _rule: Rule): Token | undefined {
    const src = source(lex)
    const sI = lex.pnt.sI
    if (src.charCodeAt(sI) !== 47 /* / */) return undefined
    if (src.charCodeAt(sI + 1) !== 47) return undefined
    const end = lineCommentEnd(src, sI + 2)
    const text = src.substring(sI, end)
    return emit(lex, 'TRIVIA_LINE_COMMENT', text, text, end - sI)
  }
}

// Block comment /* ... */ ----------------------------------------------

// Build the block-comment matcher. The emitted TRIVIA_BLOCK_COMMENT
// includes both delimiters. A comment that is still open at the end of
// the source is reported through lex.bad('unterminated_comment', …).
export function makeBlockCommentMatcher() {
  return function blockComment(lex: Lex, _rule: Rule): Token | undefined {
    const src = source(lex)
    const sI = lex.pnt.sI
    if (src.charCodeAt(sI) !== 47 /* / */) return undefined
    if (src.charCodeAt(sI + 1) !== 42 /* * */) return undefined
    // Start after the opener so that `/*/` is not taken as a complete
    // comment; stop one short of the end because the closer is two
    // characters wide.
    let i = sI + 2
    while (i < src.length - 1) {
      if (src.charCodeAt(i) === 42 && src.charCodeAt(i + 1) === 47) {
        i += 2
        const text = src.substring(sI, i)
        return emit(lex, 'TRIVIA_BLOCK_COMMENT', text, text, i - sI)
      }
      i++
    }
    return lex.bad('unterminated_comment', sI, src.length)
  }
}

// Preprocessor directive opener ----------------------------------------
// Emits PP_HASH only when '#' (or '%:') appears at the start of a logical
// line — i.e. preceded only by whitespace since the last unspliced newline.
// True when position `sI` is at the start of a *logical* line, i.e.
// only horizontal whitespace separates it from the previous unspliced
// line terminator or from the start of the source.
//
//  - Horizontal whitespace is space, tab, vertical tab and form feed,
//    matching what the whitespace matcher consumes within a line.
//  - A line terminator (LF, CR LF or lone CR) directly preceded by a
//    backslash is a line splice, not a line start: scanning continues
//    on the previous physical line. Without this, the `#` of `#x` on a
//    continuation line of a `#define` would be taken for a directive
//    opener.
function atLineStart(src: string, sI: number): boolean {
  let i = sI - 1
  while (i >= 0) {
    const c = src.charCodeAt(i)
    if (c === 32 || c === 9 || c === 11 || c === 12) { i--; continue }
    if (c === 10 || c === 13) {
      // Step over the terminator (treating CR LF as one unit) and look
      // at the character in front of it.
      let j = i - 1
      if (c === 10 && j >= 0 && src.charCodeAt(j) === 13) j--
      if (j >= 0 && src.charCodeAt(j) === 92 /* \ */) {
        // Spliced newline: keep scanning before the backslash.
        i = j - 1
        continue
      }
      return true
    }
    return false
  }
  return true
}

// Build the directive-opener matcher. A '#' (or the digraph '%:') at
// the start of a logical line becomes PP_HASH and switches the shared
// mode state into "inside a directive" with no directive name yet and
// header-name lexing disarmed. A '#' anywhere else is left for another
// matcher.
export function makePPDirectiveOpenerMatcher() {
  return function ppDirective(lex: Lex, _rule: Rule): Token | undefined {
    const src = source(lex)
    const sI = lex.pnt.sI
    const c0 = src.charCodeAt(sI)
    let consumed = 0
    if (c0 === 35 /* # */) {
      consumed = 1
    } else if (c0 === 37 /* % */ && src.charCodeAt(sI + 1) === 58 /* : */) {
      consumed = 2
    } else {
      return undefined
    }
    if (!atLineStart(src, sI)) return undefined
    const meta = getMeta(lex)
    meta.mode.inDirective = true
    meta.mode.directiveName = null
    meta.mode.expectHeaderName = false
    const text = src.substring(sI, sI + consumed)
    return emit(lex, 'PP_HASH', text, text, consumed)
  }
}

// Directive newline terminator -----------------------------------------
// Emits PP_NEWLINE only when in directive mode; resets mode flags.
/**
 * Build the identifier matcher.
 *
 * Recognises `[A-Za-z_$][A-Za-z0-9_$]*` at the current lex position (`$` is
 * accepted as a gcc extension) and classifies the word, in priority order:
 *
 *   1. reserved word              -> the keyword token from `keywordTokenName`
 *   2. first word of a directive  -> `ID`; also records the directive name and
 *                                    arms header-name lexing for
 *                                    `include` / `embed` / `include_next`
 *   3. typedef name in scope      -> `TYPEDEF_NAME` (never inside a directive)
 *   4. previously #define'd name  -> `MACRO_NAME`   (never inside a directive)
 *   5. anything else              -> `ID`
 *
 * The word is scanned in place with `charCodeAt`, so no copy of the remaining
 * input is made per token.
 *
 * @returns a matcher yielding the emitted token, or `undefined` when the
 *          current character cannot start an identifier.
 */
export function makeIdentifierMatcher() {
  // [A-Za-z_$]
  const isIdentStart = (c: number): boolean =>
    (c >= 65 && c <= 90) || (c >= 97 && c <= 122) || c === 95 || c === 36
  // [A-Za-z0-9_$]
  const isIdentPart = (c: number): boolean =>
    isIdentStart(c) || (c >= 48 && c <= 57)

  return function identifier(lex: Lex, _rule: Rule): Token | undefined {
    const src = source(lex)
    const sI = lex.pnt.sI
    // charCodeAt past the end yields NaN, which fails every range test.
    if (!isIdentStart(src.charCodeAt(sI))) return undefined

    let end = sI + 1
    while (end < src.length && isIdentPart(src.charCodeAt(end))) end++
    const word = src.substring(sI, end)

    const meta = getMeta(lex)
    // True when this word is the first identifier after the directive opener.
    const namesDirective =
      meta.mode.inDirective && meta.mode.directiveName === null

    // Reserved word?
    if (RESERVED.has(word)) {
      const tname = keywordTokenName(word)!
      // A reserved word can itself name the directive (e.g. `if`, `else`).
      // Only the name is recorded here; no reserved word arms header-name
      // lexing.
      if (namesDirective) meta.mode.directiveName = word
      return emit(lex, tname, word, word, word.length)
    }

    // Inside a directive, the first identifier names the directive.
    if (namesDirective) {
      meta.mode.directiveName = word
      if (word === 'include' || word === 'embed' || word === 'include_next') {
        meta.mode.expectHeaderName = true
      }
      // Common directives are not C reserved words; emit a normal ID and
      // let the directive grammar dispatch on the value.
      return emit(lex, 'ID', word, word, word.length)
    }

    // Typedef-name disambiguation: the parser uses TYPEDEF_NAME vs ID to pick
    // between declaration and expression alternatives. Directive bodies keep
    // plain tokens; their semantics are deferred.
    if (!meta.mode.inDirective && meta.symbols.isTypedef(word)) {
      return emit(lex, 'TYPEDEF_NAME', word, word, word.length)
    }

    // Macro-name tagging: names seen in a #define surface as MACRO_NAME so
    // call sites can be told apart from ordinary function calls.
    if (!meta.mode.inDirective && meta.macros.has(word)) {
      return emit(lex, 'MACRO_NAME', word, word, word.length)
    }

    return emit(lex, 'ID', word, word, word.length)
  }
}
/**
 * Build the floating-literal matcher.
 *
 * A candidate must begin with a digit, or with `.` immediately followed by a
 * digit. The hexadecimal form (FLOAT_HEX_RE) is tried before the decimal form
 * (FLOAT_DEC_RE). A match carrying no float marker at all is declined so the
 * integer matcher can claim it.
 *
 * @returns a matcher yielding a `LIT_FLOAT` token, or `undefined` when the
 *          input at the current position is not a floating literal.
 */
export function makeFloatMatcher() {
  const isDigit = (code: number): boolean => code >= 48 && code <= 57

  return function float(lex: Lex, _rule: Rule): Token | undefined {
    const input = source(lex)
    const at = lex.pnt.sI
    const first = input.charCodeAt(at)

    // Gate: digit, or '.' followed by a digit.
    if (!isDigit(first)) {
      const dotThenDigit = first === 46 && isDigit(input.charCodeAt(at + 1))
      if (!dotThenDigit) return undefined
    }

    const tail = input.substring(at)
    const hit = FLOAT_HEX_RE.exec(tail) ?? FLOAT_DEC_RE.exec(tail)
    if (!hit) return undefined

    const lexeme = hit[0]
    // Pure integers (no dot, exponent or float suffix, and not hex-prefixed)
    // are left for the integer matcher.
    const hexPrefixed = lexeme.startsWith('0x') || lexeme.startsWith('0X')
    const hasFloatMarker = /[.eEpPfFlL]/.test(lexeme)
    if (!hasFloatMarker && !hexPrefixed) return undefined

    return emit(lex, 'LIT_FLOAT', lexeme, lexeme, lexeme.length)
  }
}
/**
 * Build the string-literal matcher.
 *
 * Handles the encoding prefixes `u8`, `u`, `U`, `L` and, as an extension, raw
 * strings `R"delim( ... )delim"`.
 *
 * Raw strings: the delimiter between `"` and `(` must be at most 16
 * characters and may not contain whitespace/control characters, `"`, `\`,
 * `(` or `)`. When the text after `R"` is not a well-formed delimiter the
 * matcher declines (returns `undefined`) instead of scanning ahead to some
 * unrelated `(`. That lets `R` lex as an ordinary identifier followed by a
 * normal string, which is how plain C reads `R"abc"`.
 *
 * Ordinary strings: a backslash skips the next character; a newline before
 * the closing quote is an `unterminated_string` error.
 *
 * @returns a matcher yielding `LIT_STRING`, a bad token for unterminated
 *          literals, or `undefined` when no string starts here.
 */
export function makeStringLiteralMatcher() {
  const MAX_RAW_DELIM = 16

  return function stringLit(lex: Lex, _rule: Rule): Token | undefined {
    const src = source(lex)
    const sI = lex.pnt.sI
    const rest = src.substring(sI)
    const pm = STR_PREFIX_RE.exec(rest)
    if (!pm) return undefined
    const isRaw = pm[2] === 'R'
    let i = sI + pm[0].length

    if (isRaw) {
      // R"delim(...)delim": locate the '(' that ends the delimiter,
      // validating every delimiter character on the way.
      let delimEnd = i
      for (;;) {
        if (delimEnd >= src.length || delimEnd - i > MAX_RAW_DELIM) {
          return undefined // no '(' within a legal delimiter: not a raw string
        }
        const c = src.charCodeAt(delimEnd)
        if (c === 40 /* ( */) break
        if (c <= 32 || c === 34 /* " */ || c === 41 /* ) */ || c === 92 /* \ */) {
          return undefined // illegal delimiter character: not a raw string
        }
        delimEnd++
      }
      const delim = src.substring(i, delimEnd)
      const closer = ')' + delim + '"'
      const close = src.indexOf(closer, delimEnd + 1)
      if (close < 0) return lex.bad('unterminated_string', sI, src.length)
      const end = close + closer.length
      const text = src.substring(sI, end)
      return emit(lex, 'LIT_STRING', text, text, end - sI)
    }

    while (i < src.length) {
      const c = src.charCodeAt(i)
      if (c === 10) return lex.bad('unterminated_string', sI, i)
      if (c === 92 /* \ */) { i += 2; continue } // skip the escaped character
      if (c === 34 /* " */) {
        i++
        const text = src.substring(sI, i)
        return emit(lex, 'LIT_STRING', text, text, i - sI)
      }
      i++
    }
    return lex.bad('unterminated_string', sI, src.length)
  }
}
/**
 * Catalogue of every lexer matcher, listed in dispatch order.
 *
 * A lower `order` is tried first, so trivia and the special-mode matchers
 * (line continuation, comments, preprocessor, header names) win against the
 * generic literal / identifier / punctuator matchers. Each `make` is lazy:
 * the underlying factory only runs when `make()` is called.
 */
export function allMatchers(): Array<{ name: string; order: number; make: () => any }> {
  const catalogue: Array<[string, number, () => any]> = [
    ['c_line_cont', 100, () => makeLineContMatcher()],
    ['c_block_comment', 110, () => makeBlockCommentMatcher()],
    ['c_line_comment', 120, () => makeLineCommentMatcher()],
    ['c_pp_newline', 130, () => makePPNewlineMatcher()],
    ['c_pp_open', 140, () => makePPDirectiveOpenerMatcher()],
    ['c_header_name', 150, () => makeHeaderNameMatcher()],
    ['c_whitespace', 160, () => makeWhitespaceMatcher()],
    ['c_string', 200, () => makeStringLiteralMatcher()],
    ['c_char', 210, () => makeCharLiteralMatcher()],
    ['c_float', 220, () => makeFloatMatcher()],
    ['c_int', 230, () => makeIntegerMatcher()],
    ['c_identifier', 240, () => makeIdentifierMatcher()],
    ['c_punctuator', 900, () => makePunctuatorMatcher()],
  ]
  return catalogue.map(([name, order, make]) => ({ name, order, make }))
}
/** Source extent of a token or node. */
export interface Span {
  /** Offset of the first character. */
  start: number
  /** Offset one past the last character. */
  end: number
  /** Row of the first character, as reported by the lexer. */
  line: number
  /** Column of the first character, as reported by the lexer. */
  col: number
}

/** Leaf of the concrete-syntax tree: one lexer token, trivia included. */
export interface CTokenRef {
  kind: 'token'
  /** Lexer token name, e.g. `ID`, `PUNC_SEMI`, `TRIVIA_LINE_COMMENT`. */
  tname: string
  /** Exact source text of the token. */
  src: string
  span: Span
}

/**
 * Interior node of the concrete-syntax tree.
 *
 * `children` holds nested nodes and token leaves in source order; parsers
 * push both kinds, so the element type is the union of the two.
 * Construct-specific fields (`declaredName`, `tagName`, `items`, ...) ride on
 * the open index signature.
 */
export interface CNode {
  kind: string
  span: Span
  children: Array<CNode | CTokenRef>
  trivia: { leading: CTokenRef[]; trailing: CTokenRef[] }
  [extra: string]: any
}
/**
 * Create an empty CST node of the given kind.
 *
 * @param kind       node kind label
 * @param startSpan  span to attach (stored by reference); when omitted the
 *                   node gets a zero-length span at line 1, column 1
 * @returns a node with no children and empty leading/trailing trivia
 */
function makeNode(kind: string, startSpan?: Span): CNode {
  const span: Span = startSpan ?? { start: 0, end: 0, line: 1, col: 1 }
  const fresh: CNode = {
    kind,
    span,
    children: [],
    trivia: { leading: [], trailing: [] },
  }
  return fresh
}
// Specifier keywords that may be followed by a parenthesised operand which
// is folded into the specifier node: typeof(...), _BitInt(...), alignas(...).
const PAREN_OPERAND_SPECIFIERS = new Set([
  'KW_TYPEOF', 'KW_TYPEOF_UNQUAL', 'KW___TYPEOF__', 'KW___TYPEOF',
  'KW__BITINT', 'KW_ALIGNAS', 'KW__ALIGNAS',
])

/**
 * Parse a run of declaration specifiers: storage classes, qualifiers,
 * function specifiers, type specifiers, struct/union/enum specifiers and
 * attributes in GCC, MSVC or C23 `[[...]]` form.
 *
 * At most one TYPEDEF_NAME is accepted; a second one ends the run, because
 * after a typedef name any further name belongs to the declarator.
 *
 * @returns a `declaration_specifiers` node, or null when the head token
 *          cannot start a specifier or nothing was collected.
 */
export function parseDeclarationSpecifiers(ts: TokenStream): CNode | null {
  const head = ts.peek()
  if (!head) return null
  // A C23 [[ ... ]] attribute at the head also opens a declaration.
  const attrHead = isC23AttributeOpen(ts)
  if (!attrHead && !isSpecifierStart(head.name)) return null

  const specs = makeNode('declaration_specifiers', spanOf(head))
  let typedefNameTaken = false

  for (let cur = ts.peek(); cur; cur = ts.peek()) {
    const name = cur.name

    if (name === 'TYPEDEF_NAME') {
      if (typedefNameTaken) break
      typedefNameTaken = true
      ts.takeInto(specs)
    } else if (PAREN_OPERAND_SPECIFIERS.has(name)) {
      ts.takeInto(specs)
      if (ts.peekName() === 'PUNC_LPAREN') {
        consumeBalanced(ts, specs, 'PUNC_LPAREN', 'PUNC_RPAREN')
      }
    } else if (
      name === 'KW___EXTENSION__' ||
      STORAGE_CLASS.has(name) ||
      TYPE_QUALIFIER.has(name) ||
      FUNCTION_SPECIFIER.has(name) ||
      SIMPLE_TYPE_SPEC.has(name)
    ) {
      ts.takeInto(specs)
    } else if (ATTRIBUTE_OPENERS.has(name)) {
      const attr = parseAttributeSpec(ts)
      if (attr) specs.children.push(attr)
      else ts.takeInto(specs)
    } else if (isC23AttributeOpen(ts)) {
      const attr = parseC23AttributeSpec(ts)
      if (attr) specs.children.push(attr)
      else ts.takeInto(specs)
    } else if (name === 'KW_STRUCT' || name === 'KW_UNION') {
      const record = parseStructOrUnionSpec(ts)
      if (record) specs.children.push(record)
    } else if (name === 'KW_ENUM') {
      const enumeration = parseEnumSpec(ts)
      if (enumeration) specs.children.push(enumeration)
    } else {
      break
    }
  }

  return specs.children.length > 0 ? specs : null
}
/**
 * Parse an attribute specifier in whichever form starts here.
 *
 * Keyword-introduced forms (tokens in ATTRIBUTE_OPENERS, i.e. GCC
 * `__attribute__` and MSVC `__declspec`) go to `parseAttributeSpec`; the C23
 * `[[ ... ]]` form goes to `parseC23AttributeSpec`.
 *
 * @returns the attribute node, or null when no attribute starts at the head
 *          of the stream (including at end of input).
 */
export function parseAnyAttributeSpec(ts: TokenStream): CNode | null {
  const lead = ts.peek()
  if (!lead) return null

  const keywordForm = ATTRIBUTE_OPENERS.has(lead.name)
  if (keywordForm) return parseAttributeSpec(ts)

  return isC23AttributeOpen(ts) ? parseC23AttributeSpec(ts) : null
}
/**
 * Parse one attribute item: a name, an optional `prefix :: name`
 * qualification, and an optional parenthesised argument list.
 *
 * The name slot accepts identifiers, typedef names, macro names and any
 * keyword token, since attributes such as `const` or `__const__` are lexed
 * as reserved words.
 *
 * Sets `attributeName`, plus `attributePrefix` for the namespaced form and
 * `argumentList` when arguments are present.
 *
 * @returns an `attribute_item` node, or null when the head token cannot be
 *          an attribute name.
 */
function parseAttributeItem(ts: TokenStream): CNode | null {
  const acceptsAsName = (tname: string): boolean =>
    tname === 'ID' ||
    tname === 'TYPEDEF_NAME' ||
    tname === 'MACRO_NAME' ||
    tname.startsWith('KW_')

  const head = ts.peek()
  if (!head) return null
  if (!acceptsAsName(head.name)) return null

  const item = makeNode('attribute_item', spanOf(head))
  const first = ts.take()!
  item.children.push(...first.trivia, first.ref)
  item.attributeName = first.tkn.src

  // C23 namespaced form: `prefix :: name`, seen as two adjacent colon tokens.
  const qualified =
    ts.peekName() === 'PUNC_COLON' && ts.peekName(1) === 'PUNC_COLON'
  if (qualified) {
    ts.takeInto(item)
    ts.takeInto(item)
    const after = ts.peek()
    if (after && acceptsAsName(after.name)) {
      const second = ts.take()!
      item.children.push(...second.trivia, second.ref)
      item.attributePrefix = item.attributeName
      item.attributeName = second.tkn.src
    }
  }

  if (ts.peekName() !== 'PUNC_LPAREN') return item

  // Argument list: expressions separated by commas, up to the closing ')'.
  const argList = makeNode('attribute_argument_list', spanOf(ts.peek()!))
  ts.takeInto(argList) // '('
  while (!ts.done() && ts.peekName() !== 'PUNC_RPAREN') {
    const arg = parseExpression(ts, new Set(['PUNC_COMMA', 'PUNC_RPAREN']))
    if (arg) {
      argList.children.push(arg)
    } else {
      ts.takeInto(argList)
    }
    if (ts.peekName() === 'PUNC_COMMA') ts.takeInto(argList)
  }
  if (ts.peekName() === 'PUNC_RPAREN') ts.takeInto(argList)
  item.children.push(argList)
  item.argumentList = argList

  return item
}
/**
 * Parse one member declaration inside a struct/union body.
 *
 *   struct_declaration:
 *       specifier_qualifier_list struct_declarator_list? ';'
 *     | static_assert_declaration
 *     | ';'                          (empty member, GCC extension)
 *
 * The specifier list reuses `parseDeclarationSpecifiers` and is relabelled
 * `specifier_qualifier_list`; a storage class inside a struct is a semantic
 * error, not a parse error.
 *
 * Progress guarantee: when the head token starts none of the forms above
 * (for example `=` in `struct S { int x = 5; }`), nothing is consumed and
 * null is returned. Callers loop until `}` and rely on null to skip the
 * offending token; an empty non-null node here would stall them forever.
 *
 * @returns a `struct_declaration` node (or whatever
 *          `parseStaticAssertDeclaration` yields), or null at end of input
 *          or when no token could be consumed.
 */
export function parseStructDeclaration(ts: TokenStream): CNode | null {
  const startTkn = ts.peek()
  if (!startTkn) return null

  // static_assert at member level.
  const n0 = startTkn.name
  if (n0 === 'KW_STATIC_ASSERT' || n0 === 'KW__STATIC_ASSERT') {
    return parseStaticAssertDeclaration(ts)
  }

  const node = makeNode('struct_declaration', spanOf(startTkn))

  // Empty member: just `;`.
  if (n0 === 'PUNC_SEMI') {
    ts.takeInto(node)
    return node
  }

  const entry = ts.mark()

  const sql = parseDeclarationSpecifiers(ts)
  if (sql) {
    sql.kind = 'specifier_qualifier_list'
    node.children.push(sql)
  }

  // Optional struct_declarator_list.
  if (ts.peekName() !== 'PUNC_SEMI' && !ts.done()) {
    const sdl = parseStructDeclaratorList(ts)
    if (sdl) node.children.push(sdl)
  }

  if (ts.peekName() === 'PUNC_SEMI') ts.takeInto(node)

  // Nothing recognised and nothing consumed: report failure so the caller
  // can step over the token instead of retrying at the same position.
  if (ts.mark() === entry) return null
  return node
}
/**
 * Parse a comma-separated list of struct declarators.
 *
 *   struct_declarator_list:
 *       struct_declarator (',' struct_declarator)*
 *
 * Each separating comma is kept as a child token. A comma that is not
 * followed by a declarator ends the list, with the comma already consumed.
 *
 * @returns a `struct_declarator_list` node, or null at end of input or when
 *          the first declarator cannot be parsed.
 */
export function parseStructDeclaratorList(ts: TokenStream): CNode | null {
  const head = ts.peek()
  if (!head) return null

  const list = makeNode('struct_declarator_list', spanOf(head))
  let entry = parseStructDeclarator(ts)
  if (!entry) return null

  for (;;) {
    list.children.push(entry)
    if (ts.peekName() !== 'PUNC_COMMA') break
    ts.takeInto(list) // ','
    entry = parseStructDeclarator(ts)
    if (!entry) break
  }
  return list
}
/**
 * Parse one enumerator.
 *
 *   enumerator:
 *       enumeration-constant attribute-specifier-seq? ('=' const-expr)?
 *
 * The constant may be lexed as ID, TYPEDEF_NAME or MACRO_NAME. Its text is
 * stored as `declaredName`. The value expression is kept opaque: tokens are
 * collected into an `initializer` node up to the first `,` or `}` outside
 * parentheses and brackets, or an unmatched `)` / `]`.
 *
 * @returns an `enumerator` node, or null when the head token is not a name.
 */
export function parseEnumerator(ts: TokenStream): CNode | null {
  const nameTkn = ts.peek()
  if (!nameTkn) return null
  switch (nameTkn.name) {
    case 'ID':
    case 'TYPEDEF_NAME':
    case 'MACRO_NAME':
      break
    default:
      return null
  }

  const enumerator = makeNode('enumerator', spanOf(nameTkn))
  const taken = ts.take()!
  enumerator.children.push(...taken.trivia, taken.ref)
  enumerator.declaredName = taken.tkn.src

  // Optional attributes: C23 [[...]] or GCC __attribute__.
  for (let attr = parseAnyAttributeSpec(ts); attr; attr = parseAnyAttributeSpec(ts)) {
    enumerator.children.push(attr)
  }

  if (ts.peekName() !== 'PUNC_ASSIGN') return enumerator

  ts.takeInto(enumerator) // '='
  const value = makeNode('initializer', spanOf(ts.peek() || nameTkn))
  let parens = 0
  let brackets = 0
  scan: while (!ts.done()) {
    switch (ts.peekName()) {
      case 'PUNC_LPAREN':
        parens++
        break
      case 'PUNC_RPAREN':
        if (parens === 0) break scan
        parens--
        break
      case 'PUNC_LBRACKET':
        brackets++
        break
      case 'PUNC_RBRACKET':
        if (brackets === 0) break scan
        brackets--
        break
      case 'PUNC_COMMA':
      case 'PUNC_RBRACE':
        if (parens === 0 && brackets === 0) break scan
        break
    }
    ts.takeInto(value)
  }
  enumerator.children.push(value)
  return enumerator
}
/**
 * Parse a direct declarator, or a direct abstract declarator when
 * `abstract` is true.
 *
 * Primary part, one of:
 *   - an identifier (ID or MACRO_NAME), recorded as `declaredName`
 *   - `(` declarator `)`, a parenthesised sub-declarator
 *   - nothing (abstract declarators only)
 * followed by any number of postfixes: `[ ... ]` (array) or `( ... )`
 * (function parameter list).
 *
 * A leading `(` is ambiguous between a sub-declarator and a parameter list.
 * The token after it is inspected before anything is consumed: `*`, `(`, an
 * identifier or an attribute opener means sub-declarator. Otherwise the `(`
 * is left in the stream for the postfix loop, so it is recorded once, inside
 * the `function_postfix` node.
 *
 * @returns the declarator node; null at end of input, or for a concrete
 *          declarator with no recognisable primary or postfix. An abstract
 *          declarator may come back with no children.
 */
function parseDirectDeclarator(ts: TokenStream, abstract: boolean): CNode | null {
  const startTkn = ts.peek()
  if (!startTkn) return null
  const node = makeNode(
    abstract ? 'direct_abstract_declarator' : 'direct_declarator',
    spanOf(startTkn),
  )

  // Primary: ID, or '(' declarator ')', or empty (abstract).
  const n0 = ts.peekName()
  if (isIdLike(n0)) {
    const taken = ts.take()!
    for (const tr of taken.trivia) node.children.push(tr)
    node.children.push(taken.ref)
    node.declaredName = taken.tkn.src
  } else if (n0 === 'PUNC_LPAREN') {
    // Pure lookahead: the first non-trivia token inside the parenthesis.
    const inner = ts.peek(1)
    const opensSubdeclarator = inner !== null && (
      inner.name === 'PUNC_STAR' ||
      inner.name === 'PUNC_LPAREN' ||
      isIdLike(inner.name) ||
      ATTRIBUTE_OPENERS.has(inner.name)
    )
    if (opensSubdeclarator) {
      ts.takeInto(node) // '('
      const sub = parseDeclarator(ts, abstract)
      if (sub) {
        node.children.push(sub)
        if (sub.declaredName) node.declaredName = sub.declaredName
      }
      // Expect ')'
      if (ts.peekName() === 'PUNC_RPAREN') ts.takeInto(node)
    }
    // Otherwise this is a parameter list; the postfix loop picks it up.
  } else if (!abstract) {
    return null
  }

  // Postfixes: '[' ... ']' or '(' parameter_list ')'.
  while (!ts.done()) {
    const n = ts.peekName()
    if (n === 'PUNC_LBRACKET') {
      const arr = makeNode('array_postfix', spanOf(ts.peek()!))
      consumeBalanced(ts, arr, 'PUNC_LBRACKET', 'PUNC_RBRACKET')
      node.children.push(arr)
      continue
    }
    if (n === 'PUNC_LPAREN') {
      const fn = parseFunctionPostfix(ts)
      if (!fn) break // cannot advance; stop rather than spin
      node.children.push(fn)
      continue
    }
    break
  }

  if (node.children.length === 0 && !abstract) return null
  return node
}
+ const ptl = makeNode('parameter_type_list', spanOf(ts.peek()!)) + while (!ts.done() && ts.peekName() !== 'PUNC_RPAREN') { + if (ts.peekName() === 'PUNC_ELLIPSIS') { + const ell = makeNode('parameter_variadic', spanOf(ts.peek()!)) + ts.takeInto(ell) + ptl.children.push(ell) + ptl.variadic = true + break + } + const p = parseParameterDeclaration(ts) + if (p) ptl.children.push(p) + else { + // Defensive: avoid infinite loop on tokens we don't recognise. + ts.takeInto(ptl) + } + if (ts.peekName() === 'PUNC_COMMA') ts.takeInto(ptl) + } + node.children.push(ptl) + if (ts.peekName() === 'PUNC_RPAREN') ts.takeInto(node) + return node +} + +function looksLikeKRIdentifierList(ts: TokenStream): boolean { + // The current position is the first token after `(`. K&R: every + // comma-separated item is exactly one ID and the closing ')' follows + // the last ID. + let i = 0 + let expectId = true + while (true) { + const t = ts.peek(i) + if (!t) return false + const n = t.name + if (expectId) { + if (!isIdLike(n)) return false + expectId = false + } else { + if (n === 'PUNC_RPAREN') return i > 0 + if (n === 'PUNC_COMMA') { expectId = true } else { return false } + } + i++ + if (i > 256) return false // safety + } +} + +// parameter_declaration: +// declaration_specifiers (declarator | abstract_declarator)? +export function parseParameterDeclaration(ts: TokenStream): CNode | null { + const startTkn = ts.peek() + if (!startTkn) return null + const node = makeNode('parameter_declaration', spanOf(startTkn)) + const specs = parseDeclarationSpecifiers(ts) + if (specs) node.children.push(specs) + + // Optional declarator. Decide concrete vs abstract: if the next + // non-trivia token is `,` or `)` (no declarator at all) we still emit + // an empty parameter (just the specs). Otherwise try a concrete + // declarator first, fall back to abstract. + const next = ts.peekName() + if (next === 'PUNC_COMMA' || next === 'PUNC_RPAREN' || next === null) { + return node.children.length > 0 ? 
node : null + } + + const m = ts.mark() + let d = parseDeclarator(ts, false) + if (!d || (!d.declaredName && !findKind(d, 'declaredName'))) { + // No identifier — fall back to abstract declarator. + ts.restore(m) + d = parseDeclarator(ts, true) + } + if (d) { + node.children.push(d) + if (d.declaredName) node.declaredName = d.declaredName + } + return node.children.length > 0 ? node : null +} + +// Tiny helper used above to detect whether a (possibly-abstract) +// declarator has any concrete name in it. Searches recursively. +function findKind(node: any, key: string): any { + if (!node) return null + if (node[key] !== undefined) return node + if (Array.isArray(node.children)) { + for (const c of node.children) { + const hit = findKind(c, key) + if (hit) return hit + } + } + return null +} + +// ---- init-declarator-list ------------------------------------------- + +export function parseInitDeclaratorList(ts: TokenStream): CNode | null { + const startTkn = ts.peek() + if (!startTkn) return null + const node = makeNode('init_declarator_list', spanOf(startTkn)) + + const first = parseInitDeclarator(ts) + if (!first) return null + node.children.push(first) + + while (ts.peekName() === 'PUNC_COMMA') { + ts.takeInto(node) // ',' + const next = parseInitDeclarator(ts) + if (!next) break + node.children.push(next) + } + return node +} + +export function parseInitDeclarator(ts: TokenStream): CNode | null { + const startTkn = ts.peek() + if (!startTkn) return null + const decl = parseDeclarator(ts, false) + if (!decl) return null + // Optional asm-label and attribute-specifiers between declarator and `=`. 
+ const node = makeNode('init_declarator', spanOf(startTkn)) + node.children.push(decl) + if (decl.declaredName) node.declaredName = decl.declaredName + + while (true) { + const n = ts.peekName() + if (!n) break + if (n === 'KW___ASM__' || n === 'KW___ASM' || n === 'KW_ASM') { + const asmNode = makeNode('asm_label', spanOf(ts.peek()!)) + ts.takeInto(asmNode) + if (ts.peekName() === 'PUNC_LPAREN') { + consumeBalanced(ts, asmNode, 'PUNC_LPAREN', 'PUNC_RPAREN') + } + node.children.push(asmNode) + continue + } + if (ATTRIBUTE_OPENERS.has(n)) { + const a = parseAttributeSpec(ts) + if (a) node.children.push(a); else break + continue + } + break + } + + if (ts.peekName() === 'PUNC_ASSIGN') { + ts.takeInto(node) // '=' + const init = parseInitializer(ts) + if (init) node.children.push(init) + } + return node +} + +// Promote `ID(args)` and `MACRO_NAME(args)` patterns inside `node`'s +// flat children list into nested call_expression nodes. Recurses into +// any non-token children. The grammatical context is "anywhere an +// expression can appear" — call sites have the same shape regardless +// of the surrounding statement form. +// +// Sets isMacro: true on calls whose callee token was MACRO_NAME. +function structureCallsInPlace(node: CNode): void { + if (!Array.isArray(node.children)) return + const ch = node.children + const out: Array = [] + let i = 0 + while (i < ch.length) { + const c = ch[i] as any + // Recurse into existing nested nodes first. + if (c.kind !== 'token') { + structureCallsInPlace(c) + out.push(c) + i++ + continue + } + // Identifier-ish callee token followed by '(' (skipping trivia)? + if ((c.tname === 'ID' || c.tname === 'MACRO_NAME')) { + let j = i + 1 + // skip trivia between callee and '(' + while (j < ch.length && (ch[j] as any).kind === 'token' && + PRESERVED_TRIVIA.has((ch[j] as any).tname)) j++ + if (j < ch.length && (ch[j] as any).kind === 'token' && + (ch[j] as any).tname === 'PUNC_LPAREN') { + // Find matching ')' in flat children. 
+ let depth = 1 + let k = j + 1 + while (k < ch.length && depth > 0) { + const cl = ch[k] as any + if (cl.kind === 'token') { + if (cl.tname === 'PUNC_LPAREN') depth++ + else if (cl.tname === 'PUNC_RPAREN') { + depth-- + if (depth === 0) break + } + } + k++ + } + if (depth === 0) { + const callNode = makeNode('call_expression', c.span) + callNode.callee = c.src + callNode.isMacro = c.tname === 'MACRO_NAME' + // The callee token + any leading trivia after it (between + // callee and `(`). + callNode.children.push(c) + for (let m = i + 1; m < j; m++) callNode.children.push(ch[m]) + // Argument-list node carrying `(` … `)` plus structured + // sub-call recursion. + const argList = makeNode('argument_list', (ch[j] as any).span) + argList.children.push(ch[j]) // '(' + // Slice out the inner tokens, recurse on a synthetic node to + // structure nested calls, then flatten back. + const inner: any[] = [] + for (let m = j + 1; m < k; m++) inner.push(ch[m]) + const innerNode: CNode = { + kind: '__inner__', + span: argList.span, + children: inner, + trivia: { leading: [], trailing: [] }, + } + structureCallsInPlace(innerNode) + for (const ic of innerNode.children) argList.children.push(ic) + argList.children.push(ch[k]) // ')' + callNode.children.push(argList) + out.push(callNode) + i = k + 1 + continue + } + } + } + out.push(c) + i++ + } + node.children = out +} + +// Initializer: assignment-expression OR brace-enclosed initializer-list. +export function parseInitializer(ts: TokenStream): CNode | null { + const startTkn = ts.peek() + if (!startTkn) return null + const node = makeNode('initializer', spanOf(startTkn)) + if (ts.peekName() === 'PUNC_LBRACE') { + const il = parseInitializerList(ts) + if (il) node.children.push(il) + return node + } + // Plain expression initializer — assignment-precedence (no top-level + // comma). 
+ const expr = parseExpression( + ts, new Set(['PUNC_COMMA', 'PUNC_SEMI', 'PUNC_RBRACE']), + ) + if (expr) node.children.push(expr) + return node +} + +// initializer-list: +// '{' (designation? initializer (',' designation? initializer)* ','?)? '}' +// +// Each item becomes an initializer_item node. designation is captured +// as a leading designator_list child when present. +export function parseInitializerList(ts: TokenStream): CNode | null { + const startTkn = ts.peek() + if (!startTkn || startTkn.name !== 'PUNC_LBRACE') return null + const node = makeNode('initializer_list', spanOf(startTkn)) + ts.takeInto(node) // '{' + while (!ts.done() && ts.peekName() !== 'PUNC_RBRACE') { + const item = parseInitializerItem(ts) + if (item) node.children.push(item) + else ts.takeInto(node) // defensive — preserve unrecognised tokens + if (ts.peekName() === 'PUNC_COMMA') ts.takeInto(node) + } + if (ts.peekName() === 'PUNC_RBRACE') ts.takeInto(node) + return node +} + +function parseInitializerItem(ts: TokenStream): CNode | null { + const startTkn = ts.peek() + if (!startTkn) return null + const node = makeNode('initializer_item', spanOf(startTkn)) + + // Designation — one or more `. ID` / `[ const-expr ]` followed by `=`. + if (ts.peekName() === 'PUNC_DOT' || ts.peekName() === 'PUNC_LBRACKET') { + const desig = parseDesignation(ts) + if (desig) { + node.children.push(desig) + node.designation = desig + } + } + + // The value: a nested initializer-list or an assignment-expression. + if (ts.peekName() === 'PUNC_LBRACE') { + const sub = parseInitializerList(ts) + if (sub) { + const init = makeNode('initializer', sub.span) + init.children.push(sub) + node.children.push(init) + node.value = init + } + } else { + const expr = parseExpression( + ts, new Set(['PUNC_COMMA', 'PUNC_RBRACE']), + ) + if (expr) { + node.children.push(expr) + node.value = expr + } + } + return node.children.length > 0 ? 
node : null +} + +// _Static_assert ( const-expression [, string-literal] ) ; +// +// Splits the parenthesised arguments into a condition expression and +// an optional message. The condition uses the Pratt parser (so binary +// operators show up structured) and the message is the trailing +// string literal preserved verbatim. +export function parseStaticAssertDeclaration(ts: TokenStream): CNode { + const startTkn = ts.peek()! + const node = makeNode('static_assert_declaration', spanOf(startTkn)) + ts.takeInto(node) // 'static_assert' / '_Static_assert' + if (ts.peekName() !== 'PUNC_LPAREN') { + if (ts.peekName() === 'PUNC_SEMI') ts.takeInto(node) + return node + } + ts.takeInto(node) // '(' + // Condition: an assignment-expression up to ',' or ')'. + const cond = parseExpression( + ts, new Set(['PUNC_COMMA', 'PUNC_RPAREN']), + ) + if (cond) { + node.children.push(cond) + node.condition = cond + } + if (ts.peekName() === 'PUNC_COMMA') { + ts.takeInto(node) // ',' + const msg = parseExpression(ts, new Set(['PUNC_RPAREN'])) + if (msg) { + node.children.push(msg) + node.message = msg + } + } + if (ts.peekName() === 'PUNC_RPAREN') ts.takeInto(node) + if (ts.peekName() === 'PUNC_SEMI') ts.takeInto(node) + return node +} + +function parseDesignation(ts: TokenStream): CNode | null { + const startTkn = ts.peek() + if (!startTkn) return null + const node = makeNode('designation', spanOf(startTkn)) + let any = false + while (true) { + const n = ts.peekName() + if (n === 'PUNC_DOT') { + const d = makeNode('member_designator', spanOf(ts.peek()!)) + ts.takeInto(d) // '.' + const memTkn = ts.peek() + if (memTkn && (memTkn.name === 'ID' || memTkn.name === 'TYPEDEF_NAME' || + memTkn.name === 'MACRO_NAME')) { + const taken = ts.take()! 
+ for (const tr of taken.trivia) d.children.push(tr) + d.children.push(taken.ref) + d.memberName = taken.tkn.src + } + node.children.push(d) + any = true + continue + } + if (n === 'PUNC_LBRACKET') { + const d = makeNode('index_designator', spanOf(ts.peek()!)) + consumeBalanced(ts, d, 'PUNC_LBRACKET', 'PUNC_RBRACKET') + node.children.push(d) + any = true + continue + } + break + } + if (!any) return null + if (ts.peekName() === 'PUNC_ASSIGN') ts.takeInto(node) + return node +} + +// ---- compound statement & statements -------------------------------- + +// '{' (declaration | statement)* '}' +export function parseCompoundStatement(ts: TokenStream): CNode | null { + const startTkn = ts.peek() + if (!startTkn || startTkn.name !== 'PUNC_LBRACE') return null + const node = makeNode('compound_statement', spanOf(startTkn)) + ts.takeInto(node) // '{' + while (!ts.done() && ts.peekName() !== 'PUNC_RBRACE') { + const item = parseBlockItem(ts) + if (item) { + node.children.push(item) + } else { + // Defensive: avoid infinite loop on unrecognised tokens. + ts.takeInto(node) + } + } + if (ts.peekName() === 'PUNC_RBRACE') ts.takeInto(node) // '}' + return node +} + +// Either a declaration or a statement. +export function parseBlockItem(ts: TokenStream): CNode | null { + const startTkn = ts.peek() + if (!startTkn) return null + + // Preprocessor lines aren't structured at this level — fall through + // to a raw_token to preserve them as opaque siblings. + if (startTkn.name === 'PP_HASH') { + return takePreprocessorLine(ts) + } + + // Declaration if the head is a specifier, attribute, or the C23 + // `static_assert` keyword (or its underscore form), or a C23 `[[` + // attribute spec. 
+ const n0 = startTkn.name + if (isSpecifierStart(n0) || + n0 === 'KW_STATIC_ASSERT' || n0 === 'KW__STATIC_ASSERT' || + isC23AttributeOpen(ts)) { + const decl = parseDeclaration(ts) + if (decl) return decl + } + + return parseStatement(ts) +} + +// Re-usable inner-declaration parser: declaration_specifiers +// init_declarator_list? `;`. Returns a `declaration` node, mirroring +// the shape produced for top-level declarations by structureExternal- +// Declaration. +export function parseDeclaration(ts: TokenStream): CNode | null { + const startTkn = ts.peek() + if (!startTkn) return null + + // static_assert is its own declaration shape. + if (startTkn.name === 'KW_STATIC_ASSERT' || + startTkn.name === 'KW__STATIC_ASSERT') { + return parseStaticAssertDeclaration(ts) + } + + const node = makeNode('declaration', spanOf(startTkn)) + const specs = parseDeclarationSpecifiers(ts) + if (specs) node.children.push(specs) + if (ts.peekName() !== 'PUNC_SEMI' && !ts.done()) { + const idl = parseInitDeclaratorList(ts) + if (idl) node.children.push(idl) + } + if (ts.peekName() === 'PUNC_SEMI') ts.takeInto(node) + return node.children.length > 0 ? node : null +} + +// Top-level statement dispatch. 
/**
 * Chooses the statement parser from the first token: blocks, the empty
 * statement, selection / iteration / jump keywords, `case` / `default`
 * and `ID :` labels, GCC asm statements, and finally expression
 * statements as the catch-all.
 *
 * @returns The parsed statement node, or `null` at end of input (or
 *   whatever the delegated parser returns).
 */
export function parseStatement(ts: TokenStream): CNode | null {
  const head = ts.peek()
  if (!head) return null

  switch (head.name) {
    case 'PUNC_LBRACE':
      return parseCompoundStatement(ts)
    case 'PUNC_SEMI': {
      // A lone `;` is an empty expression statement.
      const empty = makeNode('expression_statement', spanOf(head))
      ts.takeInto(empty)
      return empty
    }
    case 'KW_IF':
      return parseIfStatement(ts)
    case 'KW_SWITCH':
      return parseSwitchStatement(ts)
    case 'KW_WHILE':
      return parseWhileStatement(ts)
    case 'KW_DO':
      return parseDoStatement(ts)
    case 'KW_FOR':
      return parseForStatement(ts)
    case 'KW_GOTO':
    case 'KW_CONTINUE':
    case 'KW_BREAK':
    case 'KW_RETURN':
      return parseJumpStatement(ts)
    case 'KW_CASE':
    case 'KW_DEFAULT':
      return parseLabeledStatement(ts)
  }

  // `ID :` introduces a label; any other identifier starts an expression.
  if (isIdLike(head.name) && ts.peekName(1) === 'PUNC_COLON') {
    return parseLabeledStatement(ts)
  }

  // GCC: `__asm__ (…);` used as a statement.
  const isAsmKeyword =
    head.name === 'KW___ASM__' || head.name === 'KW___ASM' || head.name === 'KW_ASM'
  return isAsmKeyword ? parseAsmStatement(ts) : parseExpressionStatement(ts)
}

// If the next token is `(`, wraps the parenthesised tokens collected by
// consumeBalanced in a `paren_condition` node and appends it to `parent`.
function appendParenCondition(ts: TokenStream, parent: CNode): void {
  if (ts.peekName() !== 'PUNC_LPAREN') return
  const cond = makeNode('paren_condition', spanOf(ts.peek()!))
  consumeBalanced(ts, cond, 'PUNC_LPAREN', 'PUNC_RPAREN')
  parent.children.push(cond)
}

// Parses one statement and appends it to `parent` when one was produced.
function appendStatement(ts: TokenStream, parent: CNode): void {
  const stmt = parseStatement(ts)
  if (stmt) parent.children.push(stmt)
}

// Parses an expression bounded by `stops`; on success stores it both as a
// child of `slot` and as `slot.value`.
function fillExpressionSlot(ts: TokenStream, slot: CNode, stops: string[]): void {
  const parsed = parseExpression(ts, new Set(stops))
  if (!parsed) return
  slot.children.push(parsed)
  slot.value = parsed
}

/** `if` paren_condition? statement (`else` statement)? */
function parseIfStatement(ts: TokenStream): CNode {
  const ifNode = makeNode('if_statement', spanOf(ts.peek()!))
  ts.takeInto(ifNode) // 'if'
  appendParenCondition(ts, ifNode)
  appendStatement(ts, ifNode) // then-branch
  if (ts.peekName() !== 'KW_ELSE') return ifNode
  ts.takeInto(ifNode) // 'else'
  appendStatement(ts, ifNode) // else-branch
  return ifNode
}

/** `switch` paren_condition? statement */
function parseSwitchStatement(ts: TokenStream): CNode {
  const switchNode = makeNode('switch_statement', spanOf(ts.peek()!))
  ts.takeInto(switchNode) // 'switch'
  appendParenCondition(ts, switchNode)
  appendStatement(ts, switchNode)
  return switchNode
}

/** `while` paren_condition? statement */
function parseWhileStatement(ts: TokenStream): CNode {
  const whileNode = makeNode('while_statement', spanOf(ts.peek()!))
  ts.takeInto(whileNode) // 'while'
  appendParenCondition(ts, whileNode)
  appendStatement(ts, whileNode)
  return whileNode
}

/** `do` statement `while`? paren_condition? `;`? — each part is optional. */
function parseDoStatement(ts: TokenStream): CNode {
  const doNode = makeNode('do_statement', spanOf(ts.peek()!))
  ts.takeInto(doNode) // 'do'
  appendStatement(ts, doNode)
  if (ts.peekName() === 'KW_WHILE') ts.takeInto(doNode)
  appendParenCondition(ts, doNode)
  if (ts.peekName() === 'PUNC_SEMI') ts.takeInto(doNode)
  return doNode
}

/**
 * `for` `(` init cond iter `)` statement.
 *
 * The parenthesised header becomes a `for_controls` node with three
 * slot children (`for_init`, `for_cond`, `for_iter`), also reachable
 * through its `init`, `cond` and `iter` properties. Each slot records
 * its parsed content as `value`.
 */
function parseForStatement(ts: TokenStream): CNode {
  const forTkn = ts.peek()!
  const forNode = makeNode('for_statement', spanOf(forTkn))
  ts.takeInto(forNode) // 'for'

  if (ts.peekName() === 'PUNC_LPAREN') {
    const controls = makeNode('for_controls', spanOf(ts.peek()!))
    ts.takeInto(controls) // '('

    // Init clause: nothing but `;`, a declaration, or an expression.
    const initSlot = makeNode('for_init', spanOf(ts.peek() || forTkn))
    if (ts.peekName() === 'PUNC_SEMI') {
      ts.takeInto(initSlot) // empty init
    } else if (!ts.done()) {
      const headName = ts.peek()!.name
      const startsDeclaration =
        isSpecifierStart(headName) ||
        headName === 'KW_STATIC_ASSERT' ||
        headName === 'KW__STATIC_ASSERT' ||
        isC23AttributeOpen(ts)
      if (startsDeclaration) {
        // parseDeclaration takes the closing `;` into the declaration
        // node, so no separator is consumed here.
        const decl = parseDeclaration(ts)
        if (decl) {
          initSlot.children.push(decl)
          initSlot.value = decl
        }
      } else {
        fillExpressionSlot(ts, initSlot, ['PUNC_SEMI'])
        if (ts.peekName() === 'PUNC_SEMI') ts.takeInto(initSlot)
      }
    }
    controls.children.push(initSlot)
    controls.init = initSlot

    // Condition clause: optional expression, then its `;`.
    const condSlot = makeNode('for_cond', spanOf(ts.peek() || forTkn))
    const condHead = ts.peekName()
    if (condHead !== 'PUNC_SEMI' && condHead !== 'PUNC_RPAREN') {
      fillExpressionSlot(ts, condSlot, ['PUNC_SEMI', 'PUNC_RPAREN'])
    }
    if (ts.peekName() === 'PUNC_SEMI') ts.takeInto(condSlot)
    controls.children.push(condSlot)
    controls.cond = condSlot

    // Iteration clause: optional expression up to `)`.
    const iterSlot = makeNode('for_iter', spanOf(ts.peek() || forTkn))
    if (ts.peekName() !== 'PUNC_RPAREN') {
      fillExpressionSlot(ts, iterSlot, ['PUNC_RPAREN'])
    }
    controls.children.push(iterSlot)
    controls.iter = iterSlot

    if (ts.peekName() === 'PUNC_RPAREN') ts.takeInto(controls)
    forNode.children.push(controls)
  }

  appendStatement(ts, forNode) // loop body
  return forNode
}

function parseJumpStatement(ts: TokenStream): CNode {
  const startTkn = ts.peek()!
  const node = makeNode('jump_statement', spanOf(startTkn))
  node.jumpKind = startTkn.src
  ts.takeInto(node) // jump keyword
  // For `return ;` and `goto