diff --git a/poc/wasm-treesitter/README.md b/poc/wasm-treesitter/README.md new file mode 100644 index 0000000..8d147c0 --- /dev/null +++ b/poc/wasm-treesitter/README.md @@ -0,0 +1,70 @@ +# PoC: tree-sitter via WASM/wazero (pure-Go, no cgo) + +An alternative to the cgo backend on `feat/chunker-cgo-treesitter`. The **official** +tree-sitter C runtime + the official TypeScript grammar are compiled to a single +standalone `wasm32-wasi` reactor module (`build.sh`, via `zig cc`) and driven +from Go through [wazero](https://github.com/tetratelabs/wazero) — **no cgo, no +JavaScript, no third-party parser**. Only `wasmts.go` (our wazero host) is +bespoke; the parser itself is the unmodified upstream C. + +Goal: give us real **speed + stability** numbers to choose between cgo and wasm. + +## Results — same 852-file vscode TypeScript corpus, full-tree walk + +| backend | wall | files/s | MB/s | ERROR trees | `editorOptions.ts` | +|---|---|---|---|---|---| +| gotreesitter (pure-Go GLR) | 13.83 s | 62 | 0.8 | **13** | 8.77 s → ERROR | +| **WASM (wazero, pure-Go host)** | **~2.5 s** | **~330** | **~4.1** | **0** | **49 ms** | +| cgo (native tree-sitter) | 1.26 s | 675 | 11.5 | 0 | 17 ms | + +- **WASM is ~2× slower than cgo, ~5× faster than gotreesitter, and correct** (0 ERROR trees vs gotreesitter's 13). +- The WASM overhead is the **host↔guest call boundary**, not memory: each of the + 2.68 M nodes costs ~3 wazero calls (`ts_node_type`, `ts_node_child_count`, + `ts_node_child`). Reusing node slots instead of `malloc`/`free` per node moved + the number only 328→357 files/s — so it's the calls. A single batched + "serialize subtree" export would close most of the remaining gap vs cgo + (future work; not done here). 
+ +## Stability (`cmd/stability`) + +- tree-sitter is **robust**: 6 adversarial inputs (100–200 k-deep nesting, 5 MB + single token, invalid UTF-8, unbalanced templates) all parsed without crashing + — this is true of cgo too, so it is **not** a bug WASM uniquely fixes. +- What WASM **adds** is containment: a guest-side fault (resource limit, and in + principle any C bug that traps — call-stack overflow, out-of-bounds access past + the end of linear memory) surfaces as a **recoverable Go + error**; the host process stays alive. The memory-capped run demonstrates this. + (An out-of-bounds access that stays inside linear memory does not trap; it can + corrupt guest state only, never host memory.) +- Under cgo the equivalent fault is a native **SIGSEGV/abort that kills the whole + cix-server**. So crash-isolation is **insurance against unknown C bugs in + grammars/scanners**, not a fix for an observed crash. + +## Trade-off summary + +| | cgo (current) | WASM/wazero (this PoC) | +|---|---|---| +| Parse speed | 🟢 fastest | 🟡 ~2× slower (≈invisible end-to-end: embeddings dominate) | +| Correctness | 🟢 official | 🟢 official (identical parser) | +| Build | 🟡 needs C toolchain (musl-static solved it) | 🟢 `CGO_ENABLED=0`, trivial cross-compile; `zig` only at wasm-build time (one-off, artifact committed) | +| Crash isolation | 🔴 C fault kills process | 🟢 contained → Go error | +| Binary size | 🔴 ~78 MB (grammar tables linked natively) | 🟢 likely smaller: pure-Go host (~41 MB) + embedded `.wasm` (1.4 MB / grammar, brotli-compressible) | +| Maturity / effort | 🟢 drop-in (official binding + 31 grammar modules) | 🔴 bespoke host; must build/bundle 31 grammar `.wasm` + flesh out node API + batched walk | + +## Honest read + +It's close. cgo is done and fastest. WASM costs ~2× on **parsing**, but since +**embeddings dominate end-to-end indexing time**, that 2× is largely invisible in +production — while WASM's upsides (no cgo, crash-isolation, smaller binary, +toolchain-free server builds) are real. 
The price of WASM is **engineering +effort** to productionize: build all 31 grammars into the module, write the full +node-walk API the chunker needs (with a batched-walk export to recover speed), +and wire it behind the same `tsgrammars`-style registry. + +## Build & run + +```bash +brew install zig # provides clang + wasi-libc cross-compile +./build.sh # → ts-ts.wasm (official tree-sitter v0.25.10 + tree-sitter-typescript v0.23.2) +go run ./cmd/bench /path/to/vscode/src/vs/editor +go run ./cmd/stability +``` + +`ts-ts.wasm` is committed so the benchmarks run without zig. diff --git a/poc/wasm-treesitter/build.sh b/poc/wasm-treesitter/build.sh new file mode 100755 index 0000000..0a134b3 --- /dev/null +++ b/poc/wasm-treesitter/build.sh @@ -0,0 +1,40 @@
#!/usr/bin/env bash
# build.sh — produce ts-ts.wasm.
#
# Compiles the OFFICIAL tree-sitter C runtime together with the official
# TypeScript grammar into one standalone wasm32-wasi reactor module. There is
# no emscripten and no JS glue; the module is driven from Go via wazero
# (see wasmts.go).
#
# Prerequisites: git, and zig (`zig cc` supplies clang plus wasi-libc
# cross-compilation):
#   brew install zig
#
# Why no stubs are needed: wasm_store.c is the only wasmtime-dependent part of
# the runtime and it is guarded by `#ifdef TREE_SITTER_FEATURE_WASM`. That
# macro is deliberately left undefined, so the stock amalgamation
# (lib/src/lib.c) compiles for wasi as-is.
set -euo pipefail
cd "$(dirname "$0")"

# Pinned upstream tags; both can be overridden from the environment.
TS_VERSION="${TS_VERSION:-v0.25.10}"      # tree-sitter runtime
TS_TS_VERSION="${TS_TS_VERSION:-v0.23.2}" # tree-sitter-typescript grammar

# Scratch directory for the two checkouts; removed when the script exits.
WORK="$(mktemp -d)"
trap 'rm -rf "$WORK"' EXIT

runtime_dir="$WORK/tree-sitter"
grammar_dir="$WORK/ts-typescript"

git clone --depth 1 --branch "$TS_VERSION" https://github.com/tree-sitter/tree-sitter "$runtime_dir"
git clone --depth 1 --branch "$TS_TS_VERSION" https://github.com/tree-sitter/tree-sitter-typescript "$grammar_dir"

# Symbols exported from the module: allocator, the tree-sitter C API subset,
# and the grammar entry point. Each becomes one -Wl,--export=<symbol> flag.
exported_symbols=(
  malloc free
  ts_parser_new ts_parser_delete
  ts_parser_set_language ts_parser_parse_string
  ts_tree_root_node ts_tree_delete
  ts_node_child_count ts_node_child
  ts_node_type ts_node_start_byte
  ts_node_end_byte ts_node_has_error
  tree_sitter_typescript
)
export_flags=()
for symbol in "${exported_symbols[@]}"; do
  export_flags+=("-Wl,--export=$symbol")
done

zig cc --target=wasm32-wasi-musl -mexec-model=reactor \
  -I "$runtime_dir/lib/include" -I "$runtime_dir/lib/src" \
  -I "$grammar_dir/typescript/src" \
  "$runtime_dir/lib/src/lib.c" \
  "$grammar_dir/typescript/src/parser.c" \
  "$grammar_dir/typescript/src/scanner.c" \
  -o ts-ts.wasm -Oz -fPIC -Wl,--no-entry -Wl,--strip-debug \
  "${export_flags[@]}"

echo "built ts-ts.wasm ($(du -h ts-ts.wasm | cut -f1)) from tree-sitter $TS_VERSION + tree-sitter-typescript $TS_TS_VERSION"
diff --git a/poc/wasm-treesitter/cmd/bench/main.go b/poc/wasm-treesitter/cmd/bench/main.go new file mode 100644 index 0000000..d416940 --- /dev/null +++ b/poc/wasm-treesitter/cmd/bench/main.go @@ -0,0 +1,82 @@
// Command bench parses every .ts file under a directory with the WASM backend
// and reports throughput — apples-to-apples with the cgo/gotreesitter numbers
// in ../../README.md (same corpus, same full-tree walk).
+// +// go run ./cmd/bench /path/to/vscode/src/vs/editor +package main + +import ( + "context" + "fmt" + "os" + "path/filepath" + "sort" + "time" + + wasmts "github.com/dvcdsys/code-index/poc/wasm-treesitter" +) + +func main() { + if len(os.Args) < 2 { + fmt.Println("usage: bench ") + os.Exit(2) + } + root := os.Args[1] + ctx := context.Background() + eng, err := wasmts.New(ctx, 0) + if err != nil { + panic(err) + } + defer eng.Close() + + var files []string + filepath.WalkDir(root, func(p string, d os.DirEntry, e error) error { + if e == nil && !d.IsDir() && filepath.Ext(p) == ".ts" { + files = append(files, p) + } + return nil + }) + fmt.Printf("corpus: %d .ts files\n\n", len(files)) + + type res struct { + path string + dur time.Duration + isErr bool + } + var all []res + var totalParse time.Duration + var totalBytes, errFiles, totalNodes int + start := time.Now() + for _, f := range files { + src, _ := os.ReadFile(f) + t0 := time.Now() + r, err := eng.Parse("tree_sitter_typescript", src) + d := time.Since(t0) + if err != nil { + fmt.Printf(" trap on %s: %v\n", filepath.Base(f), err) + continue + } + totalParse += d + totalBytes += len(src) + totalNodes += r.Nodes + if r.HasError { + errFiles++ + } + all = append(all, res{f, d, r.HasError}) + } + wall := time.Since(start) + sort.Slice(all, func(i, j int) bool { return all[i].dur > all[j].dur }) + mb := float64(totalBytes) / 1e6 + fmt.Println("=== WASM: official tree-sitter via wazero (pure-Go host, no cgo) ===") + fmt.Printf(" wall: %v parse+walk: %v\n", wall.Round(time.Millisecond), totalParse.Round(time.Millisecond)) + fmt.Printf(" throughput: %.0f files/s, %.1f MB/s\n", float64(len(files))/wall.Seconds(), mb/totalParse.Seconds()) + fmt.Printf(" ERROR trees: %d / %d nodes walked: %d\n", errFiles, len(files), totalNodes) + fmt.Printf(" slowest 5:\n") + for i := 0; i < 5 && i < len(all); i++ { + e := "" + if all[i].isErr { + e = " [ERROR]" + } + fmt.Printf(" %8v %s%s\n", all[i].dur.Round(time.Millisecond), 
filepath.Base(all[i].path), e) + } +} diff --git a/poc/wasm-treesitter/cmd/stability/main.go b/poc/wasm-treesitter/cmd/stability/main.go new file mode 100644 index 0000000..011f21a --- /dev/null +++ b/poc/wasm-treesitter/cmd/stability/main.go @@ -0,0 +1,77 @@ +// Command stability feeds adversarial inputs to the WASM backend and shows the +// host survives every one, and that a hard resource-limit fault surfaces as a +// recoverable Go error rather than a process crash — the crash-isolation +// property that cgo cannot offer (a C segfault/abort kills the whole process). +// +// go run ./cmd/stability +package main + +import ( + "context" + "fmt" + "strings" + + wasmts "github.com/dvcdsys/code-index/poc/wasm-treesitter" +) + +func main() { + ctx := context.Background() + eng, err := wasmts.New(ctx, 0) + if err != nil { + panic(err) + } + defer eng.Close() + + random := make([]byte, 200000) + for i := range random { + random[i] = byte(i*37 + 11) + } + adversarial := []struct { + name string + src []byte + }{ + {"deeply nested [ ] x100000", []byte(strings.Repeat("[", 100000) + strings.Repeat("]", 100000))}, + {"deeply nested ( x200000", []byte(strings.Repeat("(", 200000))}, + {"deep ternary x50000", []byte("let x=" + strings.Repeat("a?b:", 50000) + "c")}, + {"huge single token 5MB", []byte(strings.Repeat("z", 5_000_000))}, + {"invalid UTF-8 / random 200KB", random}, + {"unbalanced template `${ x80000", []byte(strings.Repeat("`${", 80000))}, + } + + fmt.Println("=== STABILITY: adversarial inputs — host must survive every one ===") + survived := 0 + for _, a := range adversarial { + r, err := eng.Parse("tree_sitter_typescript", a.src) + status := fmt.Sprintf("parsed-ok (nodes=%d, hasError=%v)", r.Nodes, r.HasError) + if err != nil { + status = "TRAP CONTAINED → " + err.Error() + } + fmt.Printf(" %-30s (%7d B): host ALIVE, %s\n", a.name, len(a.src), status) + survived++ + } + fmt.Printf("\n → host survived ALL %d adversarial inputs.\n", survived) + + if r, err := 
eng.Parse("tree_sitter_typescript", []byte("function f(x: number) { return x + 1; }")); err == nil { + fmt.Printf(" → normal file after barrage: parsed-ok (nodes=%d) — host fully functional\n", r.Nodes) + } + + // Hard resource containment: a memory-capped instance turns an over-limit + // parse into a Go error, not a host crash. + capped, err := wasmts.New(ctx, 24) // ~1.5 MB cap, below the module's static needs + if err != nil { + fmt.Printf(" → memory-capped (1.5MB) instance: over-limit surfaced as a Go error (contained): %s\n", firstline(err.Error())) + } else { + _, perr := capped.Parse("tree_sitter_typescript", []byte(strings.Repeat("const x=[1,2,3];\n", 50000))) + fmt.Printf(" → memory-capped big parse: contained err=%v — host ALIVE\n", perr != nil) + capped.Close() + } + + fmt.Println("\n Contrast: under cgo, an equivalent guest fault (C stack overflow / OOM abort)\n is a native SIGSEGV/abort that kills the whole cix-server process.") +} + +func firstline(s string) string { + if i := strings.IndexByte(s, '\n'); i >= 0 { + return s[:i] + } + return s +} diff --git a/poc/wasm-treesitter/go.mod b/poc/wasm-treesitter/go.mod new file mode 100644 index 0000000..dec921e --- /dev/null +++ b/poc/wasm-treesitter/go.mod @@ -0,0 +1,8 @@ +module github.com/dvcdsys/code-index/poc/wasm-treesitter + +go 1.25.3 + +require ( + github.com/tetratelabs/wazero v1.12.0 // indirect + golang.org/x/sys v0.44.0 // indirect +) diff --git a/poc/wasm-treesitter/go.sum b/poc/wasm-treesitter/go.sum new file mode 100644 index 0000000..a10a279 --- /dev/null +++ b/poc/wasm-treesitter/go.sum @@ -0,0 +1,4 @@ +github.com/tetratelabs/wazero v1.12.0 h1:DuWcpNu/FzgEXgGBDp8J1Spc+CWOvvtvVyjKlaZopYU= +github.com/tetratelabs/wazero v1.12.0/go.mod h1:LvKtzl2RqO4gyF27BiXU+nKAjcV8f38U+kP/q2vgxh0= +golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ= +golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= diff --git a/poc/wasm-treesitter/ts-ts.wasm 
b/poc/wasm-treesitter/ts-ts.wasm new file mode 100755 index 0000000..9251fb8 Binary files /dev/null and b/poc/wasm-treesitter/ts-ts.wasm differ diff --git a/poc/wasm-treesitter/wasmts.go b/poc/wasm-treesitter/wasmts.go new file mode 100644 index 0000000..5e4db13 --- /dev/null +++ b/poc/wasm-treesitter/wasmts.go @@ -0,0 +1,174 @@ +// Package wasmts is a proof-of-concept pure-Go tree-sitter backend: the official +// tree-sitter C runtime + a grammar, compiled to a standalone wasm32-wasi +// reactor module (see build.sh) and driven from Go via wazero — no cgo, no JS. +// +// It exists to compare the WASM approach against the cgo backend on +// feat/chunker-cgo-treesitter (speed + crash-isolation). It is NOT wired into +// the chunker; see README.md for the benchmark/stability results. +// +// The wasm module exports the tree-sitter C API. TSNode is a 24-byte struct +// passed/returned by value; over the wasm C ABI clang lowers a by-value struct +// return to a hidden "sret" pointer first argument, and a by-value struct +// argument to a pointer into linear memory. So every node is a 24-byte slot in +// guest memory and we pass/receive pointers to those slots. +package wasmts + +import ( + "context" + _ "embed" + "fmt" + + "github.com/tetratelabs/wazero" + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/imports/wasi_snapshot_preview1" +) + +//go:embed ts-ts.wasm +var wasmBinary []byte + +const nodeSize = 24 // sizeof(TSNode) on wasm32: uint32[4] + ptr + ptr + +// Engine is a single wazero instance hosting the tree-sitter runtime + grammars. +type Engine struct { + ctx context.Context + rt wazero.Runtime + mod api.Module + mem api.Memory + + malloc, free api.Function + parserNew, setLang, parse, treeDelete api.Function + rootNode, childCount, child api.Function + nodeType, hasError api.Function + + pool []uint32 // reused node slots, indexed by tree depth +} + +// New compiles and instantiates the wasm module. 
memLimitPages caps guest linear +// memory (0 = wazero default); a runaway/oversized parse then traps and is +// returned as a Go error instead of taking the process down. +func New(ctx context.Context, memLimitPages uint32) (*Engine, error) { + cfg := wazero.NewRuntimeConfigCompiler() + if memLimitPages > 0 { + cfg = cfg.WithMemoryLimitPages(memLimitPages) + } + rt := wazero.NewRuntimeWithConfig(ctx, cfg) + wasi_snapshot_preview1.MustInstantiate(ctx, rt) + mod, err := rt.InstantiateWithConfig(ctx, wasmBinary, + wazero.NewModuleConfig().WithName("ts").WithStartFunctions("_initialize")) + if err != nil { + rt.Close(ctx) + return nil, fmt.Errorf("instantiate: %w", err) + } + e := &Engine{ + ctx: ctx, rt: rt, mod: mod, mem: mod.Memory(), + malloc: mod.ExportedFunction("malloc"), + free: mod.ExportedFunction("free"), + parserNew: mod.ExportedFunction("ts_parser_new"), + setLang: mod.ExportedFunction("ts_parser_set_language"), + parse: mod.ExportedFunction("ts_parser_parse_string"), + treeDelete: mod.ExportedFunction("ts_tree_delete"), + rootNode: mod.ExportedFunction("ts_tree_root_node"), + childCount: mod.ExportedFunction("ts_node_child_count"), + child: mod.ExportedFunction("ts_node_child"), + nodeType: mod.ExportedFunction("ts_node_type"), + hasError: mod.ExportedFunction("ts_node_has_error"), + } + return e, nil +} + +func (e *Engine) Close() { e.rt.Close(e.ctx) } + +// call invokes a wasm export, surfacing a guest trap as a Go error (panic) so +// the caller's recover() can contain it. +func (e *Engine) call(f api.Function, args ...uint64) uint64 { + r, err := f.Call(e.ctx, args...) + if err != nil { + panic(err) + } + if len(r) == 0 { + return 0 + } + return r[0] +} + +// Language returns the TSLanguage pointer for an exported grammar (e.g. +// "tree_sitter_typescript"). +func (e *Engine) Language(export string) uint32 { + return uint32(e.call(e.mod.ExportedFunction(export))) +} + +// ParseResult holds the counts from a parse + full tree walk. 
+type ParseResult struct { + HasError bool + Nodes int + Errors int +} + +// Parse parses src under the given grammar export and walks the whole tree, +// counting nodes and ERROR nodes. A guest-side trap is returned as an error; +// the Engine (and the host process) stay alive. +func (e *Engine) Parse(langExport string, src []byte) (res ParseResult, err error) { + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("wasm trap (contained): %v", r) + } + }() + + parser := e.call(e.parserNew) + lang := e.call(e.mod.ExportedFunction(langExport)) + e.call(e.setLang, parser, lang) + defer e.call(e.mod.ExportedFunction("ts_parser_delete"), parser) + + sp := uint32(e.call(e.malloc, uint64(len(src)+1))) + e.mem.Write(sp, src) + e.mem.WriteByte(sp+uint32(len(src)), 0) + defer e.call(e.free, uint64(sp)) + + tree := e.call(e.parse, parser, 0, uint64(sp), uint64(len(src))) + defer e.call(e.treeDelete, tree) + + root := uint32(e.call(e.malloc, nodeSize)) + defer e.call(e.free, uint64(root)) + e.call(e.rootNode, uint64(root), tree) + + res.HasError = e.call(e.hasError, uint64(root)) != 0 + e.walk(root, 0, &res) + return res, nil +} + +// walk recurses the tree, reusing one node slot per depth (no per-node malloc). +// Each node costs ~3 host<->guest calls (type, child_count, child) — the +// dominant WASM overhead vs cgo. 
+func (e *Engine) walk(nodePtr uint32, depth int, res *ParseResult) { + res.Nodes++ + if e.readCStr(uint32(e.call(e.nodeType, uint64(nodePtr)))) == "ERROR" { + res.Errors++ + } + n := uint32(e.call(e.childCount, uint64(nodePtr))) + if n == 0 { + return + } + for len(e.pool) <= depth { + e.pool = append(e.pool, uint32(e.call(e.malloc, nodeSize))) + } + slot := e.pool[depth] + for i := uint32(0); i < n; i++ { + e.call(e.child, uint64(slot), uint64(nodePtr), uint64(i)) + e.walk(slot, depth+1, res) + } +} + +func (e *Engine) readCStr(ptr uint32) string { + if ptr == 0 { + return "" + } + var b []byte + for off := ptr; ; off++ { + c, ok := e.mem.ReadByte(off) + if !ok || c == 0 { + break + } + b = append(b, c) + } + return string(b) +}