-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyzer.go
More file actions
205 lines (188 loc) · 5.28 KB
/
analyzer.go
File metadata and controls
205 lines (188 loc) · 5.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
package analyzer
import (
"fmt"
"os"
"runtime"
"sync"
"sync/atomic"
"time"
"github.com/randomcodespace/codeiq/internal/cache"
"github.com/randomcodespace/codeiq/internal/detector"
"github.com/randomcodespace/codeiq/internal/parser"
)
const (
	// DefaultBatchSize is the batch size NewAnalyzer applies when
	// Options.BatchSize is unset or non-positive. The value matches the Java
	// side's tuned default (see the gotcha noted in CLAUDE.md).
	DefaultBatchSize = 500
)
// Options configures an Analyzer. NewAnalyzer replaces a non-positive
// BatchSize or Workers and a nil Registry with their defaults.
type Options struct {
	// Cache is the analysis cache. It may be nil; Run and processFile then
	// skip every cache lookup, purge and write.
	Cache *cache.Cache
	// Registry supplies the detectors to run for a file's language.
	// NewAnalyzer substitutes detector.Default when it is nil.
	Registry *detector.Registry
	// BatchSize defaults to DefaultBatchSize.
	BatchSize int
	// Workers is the number of goroutines that process files concurrently.
	// Defaults to 2 * GOMAXPROCS.
	Workers int
	// Force bypasses the cache early-exit and re-parses every file.
	Force bool
}
// Analyzer orchestrates the index pipeline. Construct it with NewAnalyzer so
// that option defaults are applied.
type Analyzer struct {
	// opts is the configuration the analyzer was created with.
	opts Options
	// counter holds per-run counters; Run resets it on entry.
	counter runCounter
}
// runCounter holds the counters that are updated while a run is in progress.
type runCounter struct {
	// cacheHits counts files whose results were replayed from the cache.
	// It is atomic because processFile increments it from the worker
	// goroutines started by Run.
	cacheHits atomic.Int64
}
// NewAnalyzer builds an Analyzer from opts, filling in defaults for settings
// the caller left unset:
//   - a nil Registry becomes detector.Default,
//   - a non-positive Workers becomes twice runtime.GOMAXPROCS(0),
//   - a non-positive BatchSize becomes DefaultBatchSize.
//
// opts is taken by value, so the caller's copy is never modified.
func NewAnalyzer(opts Options) *Analyzer {
	a := &Analyzer{opts: opts}
	cfg := &a.opts
	if cfg.Registry == nil {
		cfg.Registry = detector.Default
	}
	if cfg.Workers < 1 {
		cfg.Workers = 2 * runtime.GOMAXPROCS(0)
	}
	if cfg.BatchSize < 1 {
		cfg.BatchSize = DefaultBatchSize
	}
	return a
}
// Stats reports per-run counts.
//
// Plan §1.5 — DedupedNodes, DedupedEdges and DroppedEdges expose dedup
// activity so operators can see "graph collapsed 312 duplicate nodes, dropped
// 14 phantom edges" — the visibility is what makes "meaningful" diagnosable.
//
// Added, Modified, Deleted, Unchanged and CacheHits are incremental counters;
// they are zero on full `--force` runs.
type Stats struct {
	// Run totals: discovered files, and nodes/edges in the graph snapshot.
	Files, Nodes, Edges int
	// Dedup activity taken from the graph snapshot.
	DedupedNodes, DedupedEdges, DroppedEdges int
	// Incremental classification of files, plus the number of cache hits.
	Added, Modified, Deleted, Unchanged, CacheHits int
}
// Run executes FileDiscovery → parse → detectors → GraphBuilder → cache writes
// for the tree rooted at root and returns aggregate stats.
//
// Errors from individual file processing are logged to stderr but do not stop
// the run — partial output is better than no output (matches Java's per-file
// try/catch behaviour). Run itself returns an error only when Diff or file
// discovery fails; the returned Stats is the zero value in that case.
//
// On non-Force runs with a cache present, Run first calls Diff to classify
// files and purges cache rows for deleted files (purge failures are logged,
// not returned), then proceeds. processFile skips parse+detect for files whose
// content hash is already in the cache.
//
// Run resets the analyzer's per-run counters on entry.
func (a *Analyzer) Run(root string) (Stats, error) {
	a.counter.cacheHits.Store(0)

	// Honour the documented defaults even when the Analyzer was not built by
	// NewAnalyzer (e.g. a zero value). Without this, Workers == 0 starts no
	// receivers and the send loop below blocks forever on the unbuffered
	// channel.
	if a.opts.Workers <= 0 {
		a.opts.Workers = runtime.GOMAXPROCS(0) * 2
	}
	if a.opts.Registry == nil {
		a.opts.Registry = detector.Default
	}

	// d stays at its zero value on Force runs or when no cache is configured,
	// which is what makes the incremental counters in Stats zero.
	var d Delta
	if a.opts.Cache != nil && !a.opts.Force {
		var err error
		d, err = a.Diff(root)
		if err != nil {
			return Stats{}, err
		}
		for _, path := range d.Deleted {
			if err := a.opts.Cache.PurgeByPath(path); err != nil {
				fmt.Fprintf(os.Stderr, "codeiq: purge %s: %v\n", path, err)
			}
		}
	}

	disc := NewFileDiscovery()
	files, err := disc.Discover(root)
	if err != nil {
		return Stats{}, fmt.Errorf("file discovery: %w", err)
	}
	gb := NewGraphBuilder()

	// Bounded worker pool. There is no point in starting more workers than
	// there are files to hand out.
	workers := a.opts.Workers
	if workers > len(files) {
		workers = len(files)
	}
	jobs := make(chan DiscoveredFile)
	var wg sync.WaitGroup
	for i := 0; i < workers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for f := range jobs {
				if err := a.processFile(f, gb); err != nil {
					fmt.Fprintf(os.Stderr, "codeiq: %s: %v\n", f.RelPath, err)
				}
			}
		}()
	}
	for _, f := range files {
		jobs <- f
	}
	// Closing the channel ends each worker's range loop; Wait then guarantees
	// every file has been processed before the snapshot is taken.
	close(jobs)
	wg.Wait()

	snap := gb.Snapshot()
	return Stats{
		Files:        len(files),
		Nodes:        len(snap.Nodes),
		Edges:        len(snap.Edges),
		DedupedNodes: snap.DedupedNodes,
		DedupedEdges: snap.DedupedEdges,
		DroppedEdges: snap.DroppedEdges,
		Added:        len(d.Added),
		Modified:     len(d.Modified),
		Deleted:      len(d.Deleted),
		Unchanged:    len(d.Unchanged),
		CacheHits:    int(a.counter.cacheHits.Load()),
	}, nil
}
// processFile analyses one discovered file and adds its results to gb.
//
// The file is read and hashed. When a cache is configured, Force is off and
// the cache holds an entry for the hash, the cached nodes and edges are added
// to gb, the cache-hit counter is incremented and parsing is skipped.
//
// Otherwise the file is parsed, every detector the registry returns for the
// file's language is run, and each non-nil result is added to gb. When a cache
// is configured the combined result is then stored under the content hash,
// after purging any earlier row for the same path. Force only disables the
// lookup; results are still written.
//
// It returns the read error if the file cannot be read, or a wrapped error if
// the cache write fails. A failed purge is logged to stderr and does not stop
// processing.
func (a *Analyzer) processFile(f DiscoveredFile, gb *GraphBuilder) error {
	content, err := os.ReadFile(f.AbsPath)
	if err != nil {
		return err
	}
	// Convert once: each []byte→string conversion copies the whole file, and
	// both the hash and the detector context need the string form.
	text := string(content)
	hash := cache.HashString(text)

	// Fast path: cache hit. Reuse the previous emissions; skip parse+detect.
	if a.opts.Cache != nil && !a.opts.Force && a.opts.Cache.Has(hash) {
		entry, gerr := a.opts.Cache.Get(hash)
		if gerr == nil && entry != nil {
			gb.Add(&detector.Result{Nodes: entry.Nodes, Edges: entry.Edges})
			a.counter.cacheHits.Add(1)
			return nil
		}
		// Has() true but Get() failed — pathological. Fall through to re-parse.
	}

	tree, err := parser.Parse(f.Language, content)
	if err != nil {
		// Continue with regex-only detectors when the parser bails — matches
		// Java behaviour for non-fatal parse errors.
		tree = nil
	}
	if tree != nil {
		defer tree.Close()
	}
	// NOTE(review): the ParseStructured error is discarded and whatever value
	// came back is handed to the detectors — presumably structured data is
	// optional for them; confirm.
	parsed, _ := parser.ParseStructured(f.Language, content)

	lang := f.Language.String()
	ctx := &detector.Context{
		FilePath:   f.RelPath,
		Language:   lang,
		Content:    text,
		Tree:       tree,
		ParsedData: parsed,
	}
	entry := &cache.Entry{
		ContentHash: hash,
		Path:        f.RelPath,
		Language:    lang,
		ParsedAt:    time.Now().UTC().Format(time.RFC3339),
	}
	for _, d := range a.opts.Registry.For(lang) {
		r := d.Detect(ctx)
		if r == nil {
			continue
		}
		gb.Add(r)
		entry.Nodes = append(entry.Nodes, r.Nodes...)
		entry.Edges = append(entry.Edges, r.Edges...)
	}

	if a.opts.Cache == nil {
		return nil
	}
	// MODIFIED files: purge prior (path, old_hash) row so a single path
	// never has two cache entries. A purge failure is reported (same format
	// as in Run) but is not fatal: the fresh entry is still written.
	if err := a.opts.Cache.PurgeByPath(f.RelPath); err != nil {
		fmt.Fprintf(os.Stderr, "codeiq: purge %s: %v\n", f.RelPath, err)
	}
	if err := a.opts.Cache.Put(entry); err != nil {
		return fmt.Errorf("cache put: %w", err)
	}
	return nil
}