frameworkBreakdown,
- Duration elapsed) {
- this(totalFiles, filesAnalyzed, nodeCount, edgeCount,
- languageBreakdown, nodeBreakdown, edgeBreakdown, frameworkBreakdown,
- elapsed, null);
- }
-}
diff --git a/src/main/java/io/github/randomcodespace/iq/analyzer/Analyzer.java b/src/main/java/io/github/randomcodespace/iq/analyzer/Analyzer.java
deleted file mode 100644
index 707f03b3..00000000
--- a/src/main/java/io/github/randomcodespace/iq/analyzer/Analyzer.java
+++ /dev/null
@@ -1,1844 +0,0 @@
-package io.github.randomcodespace.iq.analyzer;
-
-import java.nio.charset.StandardCharsets;
-import io.github.randomcodespace.iq.analyzer.linker.Linker;
-import io.github.randomcodespace.iq.cache.AnalysisCache;
-import io.github.randomcodespace.iq.cache.FileHasher;
-import io.github.randomcodespace.iq.cli.VersionCommand;
-import io.github.randomcodespace.iq.config.CliStartupConfigOverrides;
-import io.github.randomcodespace.iq.config.CodeIqConfig;
-import io.github.randomcodespace.iq.config.unified.CodeIqUnifiedConfig;
-import io.github.randomcodespace.iq.detector.AbstractAntlrDetector;
-import io.github.randomcodespace.iq.detector.Detector;
-import io.github.randomcodespace.iq.detector.DetectorContext;
-import io.github.randomcodespace.iq.detector.DetectorEmissionDefaults;
-import io.github.randomcodespace.iq.detector.DetectorRegistry;
-import io.github.randomcodespace.iq.detector.DetectorResult;
-import io.github.randomcodespace.iq.detector.DetectorUtils;
-import io.github.randomcodespace.iq.grammar.AntlrParserFactory;
-import io.github.randomcodespace.iq.intelligence.RepositoryIdentity;
-import io.github.randomcodespace.iq.intelligence.resolver.EmptyResolved;
-import io.github.randomcodespace.iq.intelligence.resolver.ResolutionException;
-import io.github.randomcodespace.iq.intelligence.resolver.Resolved;
-import io.github.randomcodespace.iq.intelligence.resolver.ResolverRegistry;
-import io.github.randomcodespace.iq.intelligence.resolver.SymbolResolver;
-import io.github.randomcodespace.iq.model.CodeEdge;
-import io.github.randomcodespace.iq.model.CodeNode;
-import io.github.randomcodespace.iq.model.NodeKind;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.springframework.beans.factory.annotation.Autowired;
-import org.springframework.stereotype.Service;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.time.Duration;
-import java.time.Instant;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeMap;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.function.Consumer;
-
-/**
- * Main analysis pipeline orchestrator.
- *
- * Steps:
- *
- * - Discover files (FileDiscovery)
- * - For each file (virtual threads): read, parse, run detectors
- * - Build graph (batched via GraphBuilder)
- * - Run cross-file linkers
- * - Classify layers
- * - Return AnalysisResult
- *
- *
- * Determinism: files are sorted before processing and results are
- * collected in indexed slots to avoid ordering non-determinism from
- * parallel execution.
- */
-@Service
-public class Analyzer {
- private static final String PROP_FRAMEWORK = "framework";
- private static final String PROP_ROOT = "root";
- private static final String PROP_SERVICE = "service";
-
-
- private static final Logger log = LoggerFactory.getLogger(Analyzer.class);
-
- /** Languages whose content should be fed through the structured parser. */
- private static final Set STRUCTURED_LANGUAGES = Set.of(
- "yaml", "json", "xml", "toml", "ini", "properties"
- );
-
- /** Module boundary file names — presence signals a new module root. */
- static final Set MODULE_BOUNDARY_MARKERS = Set.of(
- "pom.xml", "build.gradle", "build.gradle.kts",
- "package.json", "go.mod", "__init__.py",
- "Cargo.toml", "setup.py", "pyproject.toml"
- );
-
- private final DetectorRegistry registry;
- private final StructuredParser parser;
- private final FileDiscovery fileDiscovery;
- private final LayerClassifier layerClassifier;
- private final List linkers;
- private final CodeIqConfig config;
- private final CodeIqUnifiedConfig unifiedConfig;
- private final ConfigScanner configScanner;
- private final ArchitectureKeywordFilter keywordFilter;
- private final ResolverRegistry resolverRegistry;
-
- /**
- * Projection of the injected {@link CodeIqUnifiedConfig} tree into the flat
- * shape the pipeline consumes: detector category/include filters, language +
- * exclude filters, and a parallelism override ({@code null} = auto-detect).
- *
- * Lists are always non-null; an empty list means "no filter" (same
- * semantics as the pre-Phase-B legacy {@code ProjectConfig.empty()} path).
- */
- private record PipelineFilters(
- List categories,
- List include,
- List languages,
- List exclude,
- Integer parallelism) {}
-
- private PipelineFilters pipelineFilters() {
- var indexing = unifiedConfig.indexing();
- var detectors = unifiedConfig.detectors();
- return new PipelineFilters(
- detectors.categories() == null ? List.of() : detectors.categories(),
- detectors.include() == null ? List.of() : detectors.include(),
- indexing.languages() == null ? List.of() : indexing.languages(),
- indexing.exclude() == null ? List.of() : indexing.exclude(),
- indexing.parallelism());
- }
-
- /** Primary constructor — used by Spring Boot dependency injection. */
- @Autowired
- public Analyzer(
- DetectorRegistry registry,
- StructuredParser parser,
- FileDiscovery fileDiscovery,
- LayerClassifier layerClassifier,
- List linkers,
- CodeIqConfig config,
- CodeIqUnifiedConfig unifiedConfig,
- ConfigScanner configScanner,
- ArchitectureKeywordFilter keywordFilter,
- ResolverRegistry resolverRegistry
- ) {
- this.registry = registry;
- this.parser = parser;
- this.fileDiscovery = fileDiscovery;
- this.layerClassifier = layerClassifier;
- this.linkers = linkers;
- this.config = config;
- this.unifiedConfig = unifiedConfig;
- this.configScanner = configScanner;
- this.keywordFilter = keywordFilter;
- this.resolverRegistry = resolverRegistry;
- }
-
- /**
- * Backward-compatible constructor for tests that don't need smart indexing.
- *
- * Defaults the unified-config overlay to {@link CodeIqUnifiedConfig#empty()} —
- * equivalent to the "no {@code codeiq.yml} present" path
- * (no detector filters, no language filter, auto parallelism). Tests that
- * need to exercise filters should use the primary constructor with a
- * hand-rolled {@link CodeIqUnifiedConfig}. The {@link ResolverRegistry} is
- * defaulted to an empty registry — every {@code resolverFor(...)} call
- * returns the no-op resolver and every {@code resolved()} reads back as
- * {@link EmptyResolved#INSTANCE}, which is the same observable behaviour as
- * the pre-resolver pipeline.
- */
- public Analyzer(
- DetectorRegistry registry,
- StructuredParser parser,
- FileDiscovery fileDiscovery,
- LayerClassifier layerClassifier,
- List linkers,
- CodeIqConfig config
- ) {
- this(registry, parser, fileDiscovery, layerClassifier, linkers, config,
- CodeIqUnifiedConfig.empty(),
- new ConfigScanner(), new ArchitectureKeywordFilter(),
- new ResolverRegistry(List.of()));
- }
-
- /**
- * Bootstrap every registered {@link SymbolResolver} against the project
- * root. Called exactly once per pipeline entry point (run / runBatchedIndex
- * / runSmartIndex), before any file iteration. Per-resolver failures are
- * logged inside {@link ResolverRegistry#bootstrap(Path)} and do not abort
- * the pass — a misbehaving resolver simply returns {@link EmptyResolved}
- * for its language for the rest of the run.
- */
- private void bootstrapResolvers(Path root) {
- try {
- resolverRegistry.bootstrap(root);
- } catch (RuntimeException e) {
- // ResolverRegistry already swallows per-resolver failures; this catch
- // is purely defensive in case the registry itself blows up. The
- // pipeline continues with NOOP resolvers (Optional.of(EmptyResolved)).
- log.warn("Resolver bootstrap failed for {}: {}", root, e.getMessage());
- }
- }
-
- /**
- * Resolve symbols for a single file, swallowing {@link ResolutionException}
- * so one resolver failure can't take down the whole file's detector pass.
- * Returns {@link EmptyResolved#INSTANCE} on any failure (or when the
- * resolver itself returns null, defensive).
- *
- * The orchestrator passes whatever it has: structured languages already
- * have a {@code parsedAst} (YAML/JSON/etc. parse tree); for languages the
- * top-level parser doesn't cover (Java, Python, …) we pass {@code content}
- * as a fallback so language-specific resolvers can lazy-parse the source.
- * Resolvers that don't understand the payload shape return EmptyResolved.
- */
- private Resolved resolveFor(DiscoveredFile file, Object parsedAst, String content) {
- Object payload = parsedAst != null ? parsedAst : content;
- SymbolResolver resolver = resolverRegistry.resolverFor(file.language());
- try {
- Resolved r = resolver.resolve(file, payload);
- return r != null ? r : EmptyResolved.INSTANCE;
- } catch (ResolutionException e) {
- log.debug("resolver {} failed for {}: {}",
- resolver.getClass().getSimpleName(), file.path(), e.getMessage());
- return EmptyResolved.INSTANCE;
- } catch (RuntimeException e) {
- log.debug("resolver {} threw unexpectedly for {}: {}",
- resolver.getClass().getSimpleName(), file.path(), e.toString());
- return EmptyResolved.INSTANCE;
- } catch (StackOverflowError e) {
- // Pathological generic / type-cycle inputs can blow JavaSymbolSolver's
- // recursion stack. Catching the Error keeps the virtual-thread
- // worker alive and the file's resolution simply degrades to lexical.
- // Other Errors (OOM, ThreadDeath) are not caught — they're fatal and
- // should propagate.
- log.warn("resolver {} stack-overflowed for {} — falling back to lexical",
- resolver.getClass().getSimpleName(), file.path());
- return EmptyResolved.INSTANCE;
- }
- }
-
- /**
- * Execute the analysis pipeline on the given repository path.
- *
- * @param repoPath root of the repository to analyze
- * @param onProgress optional callback for progress reporting (may be null)
- * @return the analysis result containing graph data and statistics
- */
- public AnalysisResult run(Path repoPath, Consumer onProgress) {
- return run(repoPath, null, onProgress);
- }
-
- /**
- * Execute the analysis pipeline with optional parallelism control.
- *
- * @param repoPath root of the repository to analyze
- * @param parallelism max parallel threads, or null for adaptive (virtual threads)
- * @param onProgress optional callback for progress reporting (may be null)
- * @return the analysis result containing graph data and statistics
- */
- public AnalysisResult run(Path repoPath, Integer parallelism, Consumer onProgress) {
- return run(repoPath, parallelism, true, onProgress);
- }
-
- /**
- * Execute the analysis pipeline with incremental analysis support.
- *
- * @param repoPath root of the repository to analyze
- * @param parallelism max parallel threads, or null for adaptive (virtual threads)
- * @param incremental if true, use file content hashing to skip unchanged files
- * @param onProgress optional callback for progress reporting (may be null)
- * @return the analysis result containing graph data and statistics
- */
- public AnalysisResult run(Path repoPath, Integer parallelism, boolean incremental,
- Consumer onProgress) {
- Instant start = Instant.now();
- Consumer report = onProgress != null ? onProgress : msg -> {};
-
- final Path root = repoPath.toAbsolutePath().normalize();
-
- bootstrapResolvers(root);
-
- // Open incremental cache if enabled
- AnalysisCache cache = null;
- if (incremental) {
- try {
- Path cachePath = root.resolve(config.getCacheDir()).resolve("analysis-cache.db");
- cache = new AnalysisCache(cachePath);
- report.accept("Incremental analysis enabled");
- } catch (Exception e) {
- log.debug("Could not open incremental cache, running full analysis", e);
- }
- }
-
- try {
- return runWithCache(root, parallelism, cache, report, start);
- } finally {
- if (cache != null) {
- cache.close();
- }
- }
- }
-
- private AnalysisResult runWithCache(Path root, Integer parallelism, AnalysisCache cache,
- Consumer report, Instant start) {
- // 0. Read pipeline filters from the injected unified config (single source of truth
- // resolved at startup by UnifiedConfigBeans — no per-call file I/O).
- PipelineFilters filters = pipelineFilters();
- DetectorRegistry effectiveRegistry = registry;
-
- // Apply detector category filter from unified config
- if (!filters.categories().isEmpty()) {
- effectiveRegistry = effectiveRegistry.filterByCategories(filters.categories());
- report.accept("Detector categories: " + filters.categories());
- }
-
- // Apply detector include filter from unified config
- if (!filters.include().isEmpty()) {
- effectiveRegistry = effectiveRegistry.filterByNames(filters.include());
- report.accept("Detector include: " + filters.include());
- }
-
- // Apply parallelism override from unified config (null = auto-detect)
- if (parallelism == null && filters.parallelism() != null) {
- parallelism = filters.parallelism();
- report.accept("Pipeline parallelism: " + parallelism + " (from config)");
- }
-
- // 1. Discover files
- report.accept("Discovering files...");
- List files = fileDiscovery.discover(root);
-
- // Apply language filter from unified config
- if (!filters.languages().isEmpty()) {
- Set allowedLanguages = new HashSet<>(filters.languages());
- files = files.stream()
- .filter(f -> allowedLanguages.contains(f.language()))
- .toList();
- report.accept("Language filter active: " + filters.languages());
- }
-
- // Apply exclude patterns from unified config
- if (!filters.exclude().isEmpty()) {
- List compiledExcludes = compileExcludePatterns(filters.exclude());
- files = files.stream()
- .filter(f -> !matchesAnyCompiledExclude(f.path().toString(), compiledExcludes))
- .toList();
- report.accept("Exclude patterns: " + filters.exclude());
- }
-
- int totalFiles = files.size();
- report.accept("Found " + totalFiles + " files");
-
- // 1b. Resolve repository identity
- RepositoryIdentity repoIdentity = RepositoryIdentity.resolve(root);
-
- // Compute language breakdown
- Map languageBreakdown = new TreeMap<>();
- for (DiscoveredFile f : files) {
- languageBreakdown.merge(f.language(), 1, Integer::sum);
- }
-
- // 2. Analyze files in parallel with virtual threads
- report.accept("Analyzing " + totalFiles + " files...");
- DetectorResult[] resultSlots = new DetectorResult[files.size()];
- var cacheHitsCounter = new java.util.concurrent.atomic.AtomicInteger(0);
-
- final DetectorRegistry detectorRegistry = effectiveRegistry;
- try (var executor = createExecutor(parallelism)) {
- List> futures = new ArrayList<>(files.size());
- for (int i = 0; i < files.size(); i++) {
- final int idx = i;
- final DiscoveredFile file = files.get(idx);
- final AnalysisCache cacheRef = cache;
- futures.add(executor.submit(() -> {
- // Check cache first
- if (cacheRef != null) {
- try {
- Path absPath = root.resolve(file.path());
- String hash = FileHasher.hash(absPath);
- if (cacheRef.isCached(hash)) {
- var cached = cacheRef.loadCachedResults(hash);
- if (cached != null) {
- resultSlots[idx] = DetectorResult.of(cached.nodes(), cached.edges());
- cacheHitsCounter.incrementAndGet();
- return null;
- }
- }
-
- // Run detectors and cache result
- DetectorResult result = analyzeFile(file, root, detectorRegistry);
- resultSlots[idx] = result;
- if (result != null && (!result.nodes().isEmpty() || !result.edges().isEmpty())) {
- FileClassifier.FileType ft = FileClassifier.classify(file.path(), file.language());
- String snippet = computeSnippetFromFile(root.resolve(file.path()), ft);
- cacheRef.storeResults(hash, file.path().toString(), file.language(),
- result.nodes(), result.edges(), "DETECTED", "antlr",
- ft.name().toLowerCase(), snippet);
- }
- } catch (IOException e) {
- log.debug("Could not hash file {}", file.path(), e);
- resultSlots[idx] = analyzeFile(file, root, detectorRegistry);
- }
- } else {
- resultSlots[idx] = analyzeFile(file, root, detectorRegistry);
- }
- return null;
- }));
- }
-
- // Collect in order -- deterministic regardless of thread completion order
- for (int i = 0; i < futures.size(); i++) {
- try {
- futures.get(i).get(30, java.util.concurrent.TimeUnit.SECONDS);
- } catch (java.util.concurrent.TimeoutException e) {
- futures.get(i).cancel(true);
- DiscoveredFile timedOutFile = files.get(i);
- log.warn("⏱️ ANTLR timed out for {} (30s), running regex fallback", timedOutFile.path());
- DetectorResult regexResult = analyzeFileRegexOnly(timedOutFile, root, detectorRegistry);
- resultSlots[i] = regexResult;
- // Store regex fallback result to cache with explicit detection_method
- if (cache != null && regexResult != null
- && (!regexResult.nodes().isEmpty() || !regexResult.edges().isEmpty())) {
- try {
- Path absPath = root.resolve(timedOutFile.path());
- String hash = FileHasher.hash(absPath);
- cache.storeResults(hash, timedOutFile.path().toString(),
- timedOutFile.language(), regexResult.nodes(), regexResult.edges(),
- "DETECTED", "regex_fallback");
- } catch (IOException ioe) {
- log.debug("Could not hash for regex fallback cache: {}", timedOutFile.path(), ioe);
- }
- }
- } catch (ExecutionException e) {
- log.warn("Analysis failed for {}", files.get(i).path(), e.getCause());
- } catch (InterruptedException e) {
- Thread.currentThread().interrupt();
- log.warn("Analysis interrupted for {}", files.get(i).path());
- }
- }
- }
-
- if (cache != null && cacheHitsCounter.get() > 0) {
- report.accept("Cache hits: " + cacheHitsCounter.get() + " / " + totalFiles + " files");
- }
-
- // 3. Build graph (batched)
- report.accept("Building graph...");
- var builder = new GraphBuilder(repoIdentity, VersionCommand.VERSION);
- int filesAnalyzed = 0;
- for (int i = 0; i < resultSlots.length; i++) {
- DetectorResult result = resultSlots[i];
- if (result != null && (!result.nodes().isEmpty() || !result.edges().isEmpty())) {
- builder.addResult(result);
- filesAnalyzed++;
- }
- }
-
- // 4. Run cross-file linkers
- report.accept("Linking cross-file relationships...");
- builder.runLinkers(linkers);
-
- // Flush buffered graph state and retry any deferred edges so the
- // side effects (provenance stamping, edge validation, dropped-edge
- // counters) still run even though we read the results straight off
- // the builder below.
- builder.flush();
- builder.flushDeferred();
-
- // 5. Classify layers
- report.accept("Classifying layers...");
- List allNodes = builder.getNodes();
- layerClassifier.classify(allNodes);
-
- // 5b. Detect service boundaries and create SERVICE nodes
- report.accept("Detecting service boundaries...");
- var serviceDetector = new ServiceDetector();
- String projectDirName = java.util.Objects.toString(root.getFileName(), PROP_ROOT);
- var serviceResult = serviceDetector.detect(allNodes, builder.getEdges(), projectDirName, root);
- if (!serviceResult.serviceNodes().isEmpty()) {
- serviceResult.serviceNodes().forEach(n -> n.setProvenance(builder.getProvenance()));
- builder.addNodes(serviceResult.serviceNodes());
- builder.addEdges(serviceResult.serviceEdges());
- allNodes = builder.getNodes(); // refresh reference after adding service nodes
- }
-
- // 5c. Tag nodes with service name if configured (multi-repo mode) -- overrides auto-detected
- String serviceName = config.getServiceName();
- if (serviceName != null && !serviceName.isBlank()) {
- for (CodeNode node : allNodes) {
- node.getProperties().put(PROP_SERVICE, serviceName);
- }
- }
-
- // 6. Attach edges to their source nodes for downstream consumers
- Map nodeById = new HashMap<>(allNodes.size());
- for (CodeNode node : allNodes) {
- nodeById.put(node.getId(), node);
- }
- for (var edge : builder.getEdges()) {
- CodeNode source = nodeById.get(edge.getSourceId());
- if (source != null) {
- source.getEdges().add(edge);
- }
- }
-
- // 7. Compute node breakdown
- Map nodeBreakdown = new TreeMap<>();
- for (CodeNode node : allNodes) {
- String kindValue = node.getKind().getValue();
- nodeBreakdown.merge(kindValue, 1, Integer::sum);
- }
-
- // 8. Compute edge breakdown
- Map edgeBreakdown = new TreeMap<>();
- for (var edge : builder.getEdges()) {
- String kindValue = edge.getKind().getValue();
- edgeBreakdown.merge(kindValue, 1, Integer::sum);
- }
-
- // 7b. Compute framework breakdown from node properties
- Map frameworkBreakdown = new TreeMap<>();
- for (CodeNode node : allNodes) {
- Object fw = node.getProperties().get(PROP_FRAMEWORK);
- if (fw != null && !fw.toString().isEmpty()) {
- frameworkBreakdown.merge(fw.toString(), 1, Integer::sum);
- }
- Object authType = node.getProperties().get("auth_type");
- if (authType != null && !authType.toString().isEmpty()) {
- frameworkBreakdown.merge("auth:" + authType, 1, Integer::sum);
- }
- }
-
- // 8. Record analysis run in cache
- if (cache != null) {
- String commitSha = getGitHead(root);
- cache.recordRun(commitSha, filesAnalyzed);
- }
-
- Duration elapsed = Duration.between(start, Instant.now());
- int nodeCount = builder.getNodeCount();
- int edgeCount = builder.getEdgeCount();
-
- report.accept("Analysis complete - " + nodeCount + " nodes, " + edgeCount + " edges");
- log.debug("Analysis complete: {} nodes, {} edges in {}ms",
- nodeCount, edgeCount, elapsed.toMillis());
-
- return new AnalysisResult(
- totalFiles,
- filesAnalyzed,
- nodeCount,
- edgeCount,
- languageBreakdown,
- nodeBreakdown,
- edgeBreakdown,
- frameworkBreakdown,
- elapsed,
- allNodes
- );
- }
-
- /**
- * Execute the indexing pipeline with batched streaming to H2.
- *
- * Unlike {@link #run}, this method does NOT hold all nodes/edges in memory.
- * It processes files in batches and flushes each batch to H2, then releases
- * the batch memory. No linkers, layer classification, or Neo4j are used.
- *
- * @param repoPath root of the repository to analyze
- * @param parallelism max parallel threads, or null for adaptive (virtual threads)
- * @param batchSize number of files per H2 flush batch
- * @param incremental if true, use file content hashing to skip unchanged files
- * @param onProgress optional callback for progress reporting (may be null)
- * @return the analysis result containing graph data and statistics
- */
- public AnalysisResult runBatchedIndex(Path repoPath, Integer parallelism, int batchSize,
- boolean incremental, Consumer onProgress) {
- Instant start = Instant.now();
- Consumer report = onProgress != null ? onProgress : msg -> {};
-
- final Path root = repoPath.toAbsolutePath().normalize();
-
- bootstrapResolvers(root);
-
- // Always use H2 cache as the primary store during indexing
- Path cachePath = root.resolve(config.getCacheDir()).resolve("analysis-cache.db");
- AnalysisCache cache;
- try {
- cache = new AnalysisCache(cachePath);
- } catch (Exception e) {
- log.error("Failed to open H2 store at {}", cachePath, e);
- return new AnalysisResult(0, 0, 0, 0,
- Map.of(), Map.of(), Map.of(), Map.of(), Duration.ZERO);
- }
-
- try {
- return runBatchedWithCache(root, parallelism, batchSize, incremental, cache, report, start);
- } finally {
- cache.close();
- }
- }
-
- private AnalysisResult runBatchedWithCache(Path root, Integer parallelism, int batchSize,
- boolean incremental, AnalysisCache cache,
- Consumer report, Instant start) {
- // 0. Read pipeline filters from the injected unified config.
- PipelineFilters filters = pipelineFilters();
- DetectorRegistry effectiveRegistry = registry;
-
- if (!filters.categories().isEmpty()) {
- effectiveRegistry = effectiveRegistry.filterByCategories(filters.categories());
- report.accept("Detector categories: " + filters.categories());
- }
- if (!filters.include().isEmpty()) {
- effectiveRegistry = effectiveRegistry.filterByNames(filters.include());
- report.accept("Detector include: " + filters.include());
- }
- if (parallelism == null && filters.parallelism() != null) {
- parallelism = filters.parallelism();
- report.accept("Pipeline parallelism: " + parallelism + " (from config)");
- }
-
- // 1. Discover files
- report.accept("Discovering files...");
- List files = fileDiscovery.discover(root);
-
- if (!filters.languages().isEmpty()) {
- Set allowedLanguages = new HashSet<>(filters.languages());
- files = files.stream()
- .filter(f -> allowedLanguages.contains(f.language()))
- .toList();
- report.accept("Language filter active: " + filters.languages());
- }
- if (!filters.exclude().isEmpty()) {
- List compiledExcludes = compileExcludePatterns(filters.exclude());
- files = files.stream()
- .filter(f -> !matchesAnyCompiledExclude(f.path().toString(), compiledExcludes))
- .toList();
- report.accept("Exclude patterns: " + filters.exclude());
- }
-
- int totalFiles = files.size();
- report.accept("Found " + totalFiles + " files");
-
- // Compute language breakdown
- Map languageBreakdown = new TreeMap<>();
- for (DiscoveredFile f : files) {
- languageBreakdown.merge(f.language(), 1, Integer::sum);
- }
-
- // 2. Process files in batches
- report.accept("Indexing " + totalFiles + " files in batches of " + batchSize + "...");
-
- final DetectorRegistry detectorRegistry = effectiveRegistry;
- int totalNodesWritten = 0;
- int totalEdgesWritten = 0;
- int filesAnalyzed = 0;
- int cacheHits = 0;
- int batchNumber = 0;
- Map nodeBreakdown = new TreeMap<>();
- Map edgeBreakdown = new TreeMap<>();
- Map frameworkBreakdown = new TreeMap<>();
-
- // Clear previous index data if not incremental
- if (!incremental) {
- cache.clear();
- }
-
- try (var batchExecutor = createExecutor(parallelism)) {
- List batch = new ArrayList<>(batchSize);
- for (int fileIdx = 0; fileIdx < files.size(); fileIdx++) {
- batch.add(files.get(fileIdx));
-
- if (batch.size() >= batchSize || fileIdx == files.size() - 1) {
- batchNumber++;
- report.accept("Processing batch " + batchNumber + " (" + batch.size() + " files)...");
- Instant batchStart = Instant.now();
-
- // Analyze batch in parallel
- DetectorResult[] resultSlots = new DetectorResult[batch.size()];
- var batchCacheHits = new java.util.concurrent.atomic.AtomicInteger(0);
-
- {
- List> futures = new ArrayList<>(batch.size());
- for (int i = 0; i < batch.size(); i++) {
- final int idx = i;
- final DiscoveredFile file = batch.get(idx);
- futures.add(batchExecutor.submit(() -> {
- if (incremental) {
- try {
- Path absPath = root.resolve(file.path());
- String hash = FileHasher.hash(absPath);
- if (cache.isCached(hash)) {
- var cached = cache.loadCachedResults(hash);
- if (cached != null) {
- resultSlots[idx] = DetectorResult.of(cached.nodes(), cached.edges());
- batchCacheHits.incrementAndGet();
- return null;
- }
- }
- DetectorResult result = analyzeFile(file, root, detectorRegistry);
- resultSlots[idx] = result;
- if (result != null && (!result.nodes().isEmpty() || !result.edges().isEmpty())) {
- FileClassifier.FileType ft = FileClassifier.classify(file.path(), file.language());
- String snippet = computeSnippetFromFile(root.resolve(file.path()), ft);
- cache.storeResults(hash, file.path().toString(), file.language(),
- result.nodes(), result.edges(), "DETECTED", "antlr",
- ft.name().toLowerCase(), snippet);
- }
- } catch (IOException e) {
- log.debug("Could not hash file {}", file.path(), e);
- resultSlots[idx] = analyzeFile(file, root, detectorRegistry);
- }
- } else {
- resultSlots[idx] = analyzeFile(file, root, detectorRegistry);
- }
- return null;
- }));
- }
-
- // Collect in order
- for (int i = 0; i < futures.size(); i++) {
- try {
- futures.get(i).get(30, java.util.concurrent.TimeUnit.SECONDS);
- } catch (java.util.concurrent.TimeoutException e) {
- futures.get(i).cancel(true);
- // Zero data loss: run regex-only fallback instead of skipping
- DiscoveredFile timedOutFile = batch.get(i);
- log.warn("⏱️ ANTLR timed out for {} (30s), running regex fallback", timedOutFile.path());
- DetectorResult regexResult = analyzeFileRegexOnly(timedOutFile, root, detectorRegistry);
- resultSlots[i] = regexResult;
- // Store regex fallback result to cache with explicit detection_method
- if (incremental && regexResult != null
- && (!regexResult.nodes().isEmpty() || !regexResult.edges().isEmpty())) {
- try {
- Path absPath = root.resolve(timedOutFile.path());
- String hash = FileHasher.hash(absPath);
- cache.storeResults(hash, timedOutFile.path().toString(),
- timedOutFile.language(), regexResult.nodes(), regexResult.edges(),
- "DETECTED", "regex_fallback");
- } catch (IOException ioe) {
- log.debug("Could not hash for regex fallback cache: {}", timedOutFile.path(), ioe);
- }
- }
- } catch (ExecutionException e) {
- log.warn("Analysis failed for {}", batch.get(i).path(), e.getCause());
- } catch (InterruptedException e) {
- Thread.currentThread().interrupt();
- log.warn("Analysis interrupted for {}", batch.get(i).path());
- }
- int done = i + 1;
- if (done % 100 == 0 || done == futures.size()) {
- report.accept(" " + done + "/" + futures.size() + " files...");
- }
- }
- }
-
- long batchMs = Duration.between(batchStart, Instant.now()).toMillis();
- long avgMs = batch.isEmpty() ? 0 : batchMs / batch.size();
- report.accept(" Batch " + batchNumber + " done: " + batchMs + "ms (" + avgMs + "ms/file avg)");
-
- cacheHits += batchCacheHits.get();
-
- // Collect batch results and flush non-cached to H2
- List batchNodes = new ArrayList<>();
- List batchEdges = new ArrayList<>();
- int batchFilesAnalyzed = 0;
-
- for (int i = 0; i < resultSlots.length; i++) {
- DetectorResult result = resultSlots[i];
- if (result != null && (!result.nodes().isEmpty() || !result.edges().isEmpty())) {
- batchFilesAnalyzed++;
- // Only store non-incremental results (incremental already stored above)
- if (!incremental) {
- batchNodes.addAll(result.nodes());
- batchEdges.addAll(result.edges());
- }
- // Tag nodes with service name if configured (multi-repo mode)
- String svcName = config.getServiceName();
- if (svcName != null && !svcName.isBlank()) {
- for (CodeNode node : result.nodes()) {
- node.getProperties().put(PROP_SERVICE, svcName);
- }
- }
- // Track breakdowns
- for (CodeNode node : result.nodes()) {
- nodeBreakdown.merge(node.getKind().getValue(), 1, Integer::sum);
- Object fw = node.getProperties().get(PROP_FRAMEWORK);
- if (fw != null && !fw.toString().isEmpty()) {
- frameworkBreakdown.merge(fw.toString(), 1, Integer::sum);
- }
- }
- for (var edge : result.edges()) {
- edgeBreakdown.merge(edge.getKind().getValue(), 1, Integer::sum);
- }
- totalNodesWritten += result.nodes().size();
- totalEdgesWritten += result.edges().size();
- }
- }
-
- filesAnalyzed += batchFilesAnalyzed;
-
- // For non-incremental mode, batch-flush to H2
- if (!incremental && (!batchNodes.isEmpty() || !batchEdges.isEmpty())) {
- String batchId = "batch:" + batchNumber + ":" + System.nanoTime();
- cache.storeBatchResults(batchId, "batch-" + batchNumber,
- "mixed", batchNodes, batchEdges);
- }
-
- // Release batch memory
- batch.clear();
- }
- }
- }
-
- if (cacheHits > 0) {
- report.accept("Cache hits: " + cacheHits + " / " + totalFiles + " files");
- }
-
- // Record run
- String commitSha = getGitHead(root);
- cache.recordRun(commitSha, filesAnalyzed);
-
- Duration elapsed = Duration.between(start, Instant.now());
- report.accept("Index complete - " + totalNodesWritten + " nodes, "
- + totalEdgesWritten + " edges written to H2");
-
- return new AnalysisResult(
- totalFiles,
- filesAnalyzed,
- totalNodesWritten,
- totalEdgesWritten,
- languageBreakdown,
- nodeBreakdown,
- edgeBreakdown,
- frameworkBreakdown,
- elapsed
- );
- }
-
- // =========================================================================
- // Smart Index Pipeline (config-first, module-partitioned)
- // =========================================================================
-
- /**
- * Config-first, module-partitioned indexing pipeline.
- *
- * Phase 1 scans config files to build an {@link InfrastructureRegistry}.
- * Phase 2 discovers source files, partitions them by module, pre-filters
- * each module with {@link ArchitectureKeywordFilter}, then runs detectors
- * in parallel and flushes results to H2 per batch.
- *
- * This method is additive — existing {@link #runBatchedIndex} is unchanged.
- *
- * @param repoPath root of the repository to analyze
- * @param parallelism max parallel threads, or null for virtual threads
- * @param batchSize files per H2 flush batch
- * @param incremental if true, use file content hashing to skip unchanged files
- * @param onProgress optional progress callback
- * @return analysis result with phase timing in the report
- */
- public AnalysisResult runSmartIndex(Path repoPath, Integer parallelism, int batchSize,
- boolean incremental, Consumer onProgress) {
- Instant start = Instant.now();
- Consumer report = onProgress != null ? onProgress : msg -> {};
- final Path root = repoPath.toAbsolutePath().normalize();
-
- bootstrapResolvers(root);
-
- Path cachePath = root.resolve(config.getCacheDir()).resolve("analysis-cache.db");
- AnalysisCache cache;
- try {
- cache = new AnalysisCache(cachePath);
- } catch (Exception e) {
- log.error("Failed to open H2 store at {}", cachePath, e);
- return new AnalysisResult(0, 0, 0, 0,
- Map.of(), Map.of(), Map.of(), Map.of(), Duration.ZERO);
- }
-
- try {
- return runSmartWithCache(root, parallelism, batchSize, incremental, cache, report, start);
- } finally {
- cache.close();
- }
- }
-
- private AnalysisResult runSmartWithCache(Path root, Integer parallelism, int batchSize,
- boolean incremental, AnalysisCache cache,
- Consumer report, Instant start) {
- // ── Phase 1: Config scanning ──────────────────────────────────────────
- Instant phase1Start = Instant.now();
- report.accept("Phase 1: Scanning config files...");
- InfrastructureRegistry infraRegistry = configScanner.scan(root);
- long phase1Ms = Duration.between(phase1Start, Instant.now()).toMillis();
- report.accept("Phase 1 complete - " + infraRegistry.size()
- + " infrastructure endpoint(s) in " + phase1Ms + "ms");
- if (infraRegistry.getServiceName() != null) {
- report.accept("Service: " + infraRegistry.getServiceName());
- // Propagate to config if not already set
- if (config.getServiceName() == null || config.getServiceName().isBlank()) {
- CliStartupConfigOverrides.applyServiceName(config, infraRegistry.getServiceName());
- }
- }
-
- // ── Phase 2: File discovery + project config ───────────────────────────
- Instant phase2Start = Instant.now();
- report.accept("Phase 2: Discovering files...");
-
- PipelineFilters filters = pipelineFilters();
- DetectorRegistry effectiveRegistry = registry;
-
- if (!filters.categories().isEmpty()) {
- effectiveRegistry = effectiveRegistry.filterByCategories(filters.categories());
- }
- if (!filters.include().isEmpty()) {
- effectiveRegistry = effectiveRegistry.filterByNames(filters.include());
- }
- if (parallelism == null && filters.parallelism() != null) {
- parallelism = filters.parallelism();
- }
-
- List allFiles = fileDiscovery.discover(root);
-
- if (!filters.languages().isEmpty()) {
- Set allowed = new HashSet<>(filters.languages());
- allFiles = allFiles.stream().filter(f -> allowed.contains(f.language())).toList();
- }
- if (!filters.exclude().isEmpty()) {
- List compiledExcludes =
- compileExcludePatterns(filters.exclude());
- allFiles = allFiles.stream()
- .filter(f -> !matchesAnyCompiledExclude(f.path().toString(), compiledExcludes))
- .toList();
- }
-
- int totalFiles = allFiles.size();
-
- // Compute language breakdown
- Map languageBreakdown = new TreeMap<>();
- for (DiscoveredFile f : allFiles) {
- languageBreakdown.merge(f.language(), 1, Integer::sum);
- }
-
- // ── Phase 3: Module partitioning ──────────────────────────────────────
- Map> modules = detectModules(root, allFiles);
- report.accept("Phase 2 complete - " + totalFiles + " files in "
- + modules.size() + " module(s) in "
- + Duration.between(phase2Start, Instant.now()).toMillis() + "ms");
-
- // ── Phase 4: Per-module analysis with keyword pre-filter ──────────────
- if (!incremental) {
- cache.clear();
- }
-
- final DetectorRegistry detectorRegistry = effectiveRegistry;
- int totalNodesWritten = 0;
- int totalEdgesWritten = 0;
- int filesAnalyzed = 0;
- int filesSkipped = 0;
- int cacheHits = 0;
- int batchNumber = 0;
- Map nodeBreakdown = new TreeMap<>();
- Map edgeBreakdown = new TreeMap<>();
- Map frameworkBreakdown = new TreeMap<>();
-
- // Process modules in sorted order for determinism
- List sortedModuleKeys = new ArrayList<>(modules.keySet());
- sortedModuleKeys.sort(String::compareTo);
-
- try (var executor = createExecutor(parallelism)) {
- List pendingBatch = new ArrayList<>(batchSize);
- List filteredNodes = new ArrayList<>();
- Map contentCache = new HashMap<>();
- int moduleIndex = 0;
-
- for (String moduleKey : sortedModuleKeys) {
- List moduleFiles = modules.get(moduleKey);
- moduleIndex++;
- report.accept("Processing module " + moduleIndex + "/" + sortedModuleKeys.size()
- + ": " + moduleKey + " (" + moduleFiles.size() + " files)");
-
- // Pre-filter source files with keyword filter; always pass structured files.
- // Cache decoded content from the keyword filter to avoid re-reading in analyzeFile.
- List filtered = new ArrayList<>(moduleFiles.size());
- for (DiscoveredFile file : moduleFiles) {
- if (STRUCTURED_LANGUAGES.contains(file.language())) {
- // Always include config/structured files
- filtered.add(file);
- } else {
- // Read and check for architecture keywords
- try {
- Path absPath = root.resolve(file.path());
- byte[] raw = Files.readAllBytes(absPath);
- if (keywordFilter.shouldAnalyze(raw, file.language())) {
- filtered.add(file);
- // Cache decoded content to avoid duplicate read in analyzeFile
- contentCache.put(file.path().toString(),
- DetectorUtils.decodeContent(raw));
- } else {
- filesSkipped++;
- // Zero data loss: create minimal inventory node for filtered files
- String filteredFileName = java.util.Objects.toString(file.path().getFileName(), "");
- CodeNode fileNode = new CodeNode(
- "file:" + file.path() + ":module:" + filteredFileName,
- NodeKind.MODULE,
- filteredFileName);
- fileNode.setFilePath(file.path().toString());
- fileNode.setModule(DetectorUtils.deriveModuleName(file.path().toString(), file.language()));
- fileNode.getProperties().put("status", "filtered");
- fileNode.getProperties().put("language", file.language());
- fileNode.getProperties().put("detection_method", "none");
- filteredNodes.add(fileNode);
- log.debug("⏭️ SKIP: {} ({}, {} bytes) — no architecture keywords",
- file.path(), file.language(), raw.length);
- }
- } catch (IOException e) {
- log.debug("Could not read for keyword filter {}", file.path(), e);
- filtered.add(file); // include on error
- }
- }
- }
-
- // Add filtered files to pending batch
- for (DiscoveredFile file : filtered) {
- pendingBatch.add(file);
- if (pendingBatch.size() >= batchSize) {
- batchNumber++;
- var batchResult = processSmartBatch(pendingBatch, root, executor.delegate(),
- detectorRegistry, infraRegistry, incremental, cache,
- nodeBreakdown, edgeBreakdown, frameworkBreakdown,
- batchNumber, report, contentCache, filteredNodes);
- totalNodesWritten += batchResult[0];
- totalEdgesWritten += batchResult[1];
- filesAnalyzed += batchResult[2];
- cacheHits += batchResult[3];
- pendingBatch.clear();
- filteredNodes.clear();
- }
- }
- }
-
- // Flush remaining files (including any accumulated filtered nodes)
- if (!pendingBatch.isEmpty() || !filteredNodes.isEmpty()) {
- batchNumber++;
- var batchResult = processSmartBatch(pendingBatch, root, executor.delegate(),
- detectorRegistry, infraRegistry, incremental, cache,
- nodeBreakdown, edgeBreakdown, frameworkBreakdown,
- batchNumber, report, contentCache, filteredNodes);
- totalNodesWritten += batchResult[0];
- totalEdgesWritten += batchResult[1];
- filesAnalyzed += batchResult[2];
- cacheHits += batchResult[3];
- pendingBatch.clear();
- filteredNodes.clear();
- }
- // Clear content cache after all batches in this module to free memory
- contentCache.clear();
- }
-
- if (filesSkipped > 0) {
- report.accept("Keyword filter: skipped " + filesSkipped + " / " + totalFiles
- + " files (" + (filesSkipped * 100 / Math.max(1, totalFiles)) + "%)");
- }
- if (cacheHits > 0) {
- report.accept("Cache hits: " + cacheHits + " / " + totalFiles + " files");
- }
-
- String commitSha = getGitHead(root);
- cache.recordRun(commitSha, filesAnalyzed);
-
- Duration elapsed = Duration.between(start, Instant.now());
- report.accept("Smart index complete - " + totalNodesWritten + " nodes, "
- + totalEdgesWritten + " edges written to H2");
-
- return new AnalysisResult(
- totalFiles,
- filesAnalyzed,
- totalNodesWritten,
- totalEdgesWritten,
- languageBreakdown,
- nodeBreakdown,
- edgeBreakdown,
- frameworkBreakdown,
- elapsed
- );
- }
-
- /** Analyze one batch, flush to H2, return [nodes, edges, filesAnalyzed, cacheHits]. */
- private int[] processSmartBatch(
- List batch, Path root,
- java.util.concurrent.ExecutorService executor,
- DetectorRegistry detectorRegistry, InfrastructureRegistry infraRegistry,
- boolean incremental, AnalysisCache cache,
- Map nodeBreakdown, Map edgeBreakdown,
- Map frameworkBreakdown,
- int batchNumber, Consumer report,
- Map contentCache,
- List filteredNodes) {
-
- report.accept("Processing batch " + batchNumber + " (" + batch.size() + " files)...");
- Instant batchStart = Instant.now();
-
- DetectorResult[] slots = new DetectorResult[batch.size()];
- var batchCacheHits = new java.util.concurrent.atomic.AtomicInteger(0);
-
- List> futures = new ArrayList<>(batch.size());
- for (int i = 0; i < batch.size(); i++) {
- final int idx = i;
- final DiscoveredFile file = batch.get(idx);
- final String cachedContent = contentCache.remove(file.path().toString());
- futures.add(executor.submit(() -> {
- if (incremental) {
- try {
- Path absPath = root.resolve(file.path());
- String hash = FileHasher.hash(absPath);
- if (cache.isCached(hash)) {
- var cached = cache.loadCachedResults(hash);
- if (cached != null) {
- slots[idx] = DetectorResult.of(cached.nodes(), cached.edges());
- batchCacheHits.incrementAndGet();
- return null;
- }
- }
- DetectorResult result = analyzeFileWithRegistry(file, root, detectorRegistry, infraRegistry, cachedContent);
- slots[idx] = result;
- if (result != null && (!result.nodes().isEmpty() || !result.edges().isEmpty())) {
- FileClassifier.FileType ft = FileClassifier.classify(file.path(), file.language());
- String snippet = cachedContent != null
- ? computeSnippet(cachedContent, ft)
- : computeSnippetFromFile(root.resolve(file.path()), ft);
- cache.storeResults(hash, file.path().toString(), file.language(),
- result.nodes(), result.edges(), "DETECTED", "antlr",
- ft.name().toLowerCase(), snippet);
- }
- } catch (IOException e) {
- log.debug("Could not hash {}", file.path(), e);
- slots[idx] = analyzeFileWithRegistry(file, root, detectorRegistry, infraRegistry, cachedContent);
- }
- } else {
- slots[idx] = analyzeFileWithRegistry(file, root, detectorRegistry, infraRegistry, cachedContent);
- }
- return null;
- }));
- }
-
- for (int i = 0; i < futures.size(); i++) {
- try {
- futures.get(i).get(30, java.util.concurrent.TimeUnit.SECONDS);
- } catch (java.util.concurrent.TimeoutException e) {
- futures.get(i).cancel(true);
- DiscoveredFile timedOutFile = batch.get(i);
- log.warn("⏱️ ANTLR timed out for {} (30s), running regex fallback", timedOutFile.path());
- DetectorResult regexResult = analyzeFileRegexOnly(timedOutFile, root, detectorRegistry);
- slots[i] = regexResult;
- // Store regex fallback result to cache with explicit detection_method
- if (incremental && regexResult != null
- && (!regexResult.nodes().isEmpty() || !regexResult.edges().isEmpty())) {
- try {
- Path absPath = root.resolve(timedOutFile.path());
- String hash = FileHasher.hash(absPath);
- cache.storeResults(hash, timedOutFile.path().toString(),
- timedOutFile.language(), regexResult.nodes(), regexResult.edges(),
- "DETECTED", "regex_fallback");
- } catch (IOException ioe) {
- log.debug("Could not hash for regex fallback cache: {}", timedOutFile.path(), ioe);
- }
- }
- } catch (ExecutionException e) {
- log.warn("Analysis failed for {}", batch.get(i).path(), e.getCause());
- } catch (InterruptedException e) {
- Thread.currentThread().interrupt();
- log.warn("Analysis interrupted for {}", batch.get(i).path());
- }
- int done = i + 1;
- if (done % 100 == 0 || done == futures.size()) {
- report.accept(" " + done + "/" + futures.size() + " files...");
- }
- }
-
- long batchMs = Duration.between(batchStart, Instant.now()).toMillis();
- long avgMs = batch.isEmpty() ? 0 : batchMs / batch.size();
- report.accept(" Batch " + batchNumber + " done: " + batchMs + "ms (" + avgMs + "ms/file avg)");
-
- int nodes = 0, edges = 0, analyzed = 0;
- List batchNodes = new ArrayList<>();
- List batchEdges = new ArrayList<>();
-
- for (DetectorResult result : slots) {
- if (result != null && (!result.nodes().isEmpty() || !result.edges().isEmpty())) {
- analyzed++;
- if (!incremental) {
- batchNodes.addAll(result.nodes());
- batchEdges.addAll(result.edges());
- }
- String svcName = config.getServiceName();
- if (svcName != null && !svcName.isBlank()) {
- for (CodeNode node : result.nodes()) {
- node.getProperties().put(PROP_SERVICE, svcName);
- }
- }
- for (CodeNode node : result.nodes()) {
- nodeBreakdown.merge(node.getKind().getValue(), 1, Integer::sum);
- Object fw = node.getProperties().get(PROP_FRAMEWORK);
- if (fw != null && !fw.toString().isEmpty()) {
- frameworkBreakdown.merge(fw.toString(), 1, Integer::sum);
- }
- }
- for (var edge : result.edges()) {
- edgeBreakdown.merge(edge.getKind().getValue(), 1, Integer::sum);
- }
- nodes += result.nodes().size();
- edges += result.edges().size();
- }
- }
-
- // Add filtered file inventory nodes to batch results
- if (filteredNodes != null && !filteredNodes.isEmpty()) {
- batchNodes.addAll(filteredNodes);
- for (CodeNode fn : filteredNodes) {
- nodeBreakdown.merge(fn.getKind().getValue(), 1, Integer::sum);
- }
- nodes += filteredNodes.size();
- }
-
- if (!incremental && (!batchNodes.isEmpty() || !batchEdges.isEmpty())) {
- String batchId = "batch:" + batchNumber + ":" + System.nanoTime();
- cache.storeBatchResults(batchId, "batch-" + batchNumber, "mixed", batchNodes, batchEdges);
- }
-
- return new int[]{nodes, edges, analyzed, batchCacheHits.get()};
- }
-
- /**
- * Partition discovered files into modules based on build-file boundary markers.
- *
- * Files are assigned to the deepest module directory that contains them.
- * Files with no matching module are placed in a {@code PROP_ROOT} partition.
- * The returned map is a {@link TreeMap} for deterministic iteration.
- *
- * @param root absolute repository root (used only for logging)
- * @param files all discovered files (paths relative to root)
- * @return module name → file list, sorted by module name
- */
- Map> detectModules(Path root, List files) {
- // Collect unique module directories from boundary marker files
- Set moduleDirs = new java.util.TreeSet<>();
- for (DiscoveredFile file : files) {
- if (MODULE_BOUNDARY_MARKERS.contains(java.util.Objects.toString(file.path().getFileName(), ""))) {
- Path parent = file.path().getParent();
- moduleDirs.add(parent != null ? parent.toString().replace('\\', '/') : "");
- }
- }
-
- // If no module boundaries found, treat everything as root
- if (moduleDirs.isEmpty()) {
- Map> single = new TreeMap<>();
- single.put(PROP_ROOT, new ArrayList<>(files));
- return single;
- }
-
- Map> result = new TreeMap<>();
-
- for (DiscoveredFile file : files) {
- String fileStr = file.path().toString().replace('\\', '/');
- String bestModule = null;
- int bestDepth = -1;
-
- for (String moduleDir : moduleDirs) {
- boolean matches = moduleDir.isEmpty()
- || fileStr.startsWith(moduleDir + "/")
- || fileStr.equals(moduleDir);
- if (matches) {
- int depth = moduleDir.isEmpty() ? 0 : moduleDir.split("/").length;
- if (depth > bestDepth) {
- bestDepth = depth;
- bestModule = moduleDir;
- }
- }
- }
-
- String key = bestModule != null ? bestModule : PROP_ROOT;
- result.computeIfAbsent(key, k -> new ArrayList<>()).add(file);
- }
-
- log.debug("detectModules: {} module(s) detected in {}", result.size(), root);
- return result;
- }
-
- /**
- * Analyze a single file using the given registries.
- * Identical to {@link #analyzeFile(DiscoveredFile, Path, DetectorRegistry)} but
- * also passes the {@link InfrastructureRegistry} into each {@link DetectorContext}.
- */
- DetectorResult analyzeFileWithRegistry(DiscoveredFile file, Path repoPath,
- DetectorRegistry detectorRegistry,
- InfrastructureRegistry infraRegistry) {
- return analyzeFileWithRegistry(file, repoPath, detectorRegistry, infraRegistry, null);
- }
-
- /**
- * Analyze a single file using the given registries, optionally with pre-read content.
- * When {@code preReadContent} is non-null, it is used directly instead of reading from disk,
- * avoiding a duplicate file read (the content was already read during keyword filtering).
- *
- * @param preReadContent decoded file content from the keyword filter, or null to read from disk
- */
- DetectorResult analyzeFileWithRegistry(DiscoveredFile file, Path repoPath,
- DetectorRegistry detectorRegistry,
- InfrastructureRegistry infraRegistry,
- String preReadContent) {
- Instant fileStart = Instant.now();
-
- // Classify file type before reading content
- FileClassifier.FileType fileType = FileClassifier.classify(file.path(), file.language());
-
- // Binary files: inventory-only node, no content read needed
- if (fileType == FileClassifier.FileType.BINARY) {
- return createInventoryNode(file, "binary");
- }
-
- // Generated files: inventory-only node, skip detectors
- if (fileType == FileClassifier.FileType.GENERATED) {
- return createInventoryNode(file, "generated");
- }
-
- String content;
- if (preReadContent != null) {
- content = preReadContent;
- } else {
- Path absPath = repoPath.resolve(file.path());
- try {
- byte[] raw = Files.readAllBytes(absPath);
- content = DetectorUtils.decodeContent(raw);
- } catch (IOException e) {
- log.debug("Could not read file: {}", absPath, e);
- return DetectorResult.empty();
- }
- }
-
- // Test files: inventory-only node with file_type=test
- if (fileType == FileClassifier.FileType.TEST) {
- return createInventoryNode(file, "test");
- }
-
- // Text files (unknown language): inventory-only node
- if (fileType == FileClassifier.FileType.TEXT) {
- return createInventoryNode(file, "text");
- }
-
- if (isMinified(file, content)) {
- log.debug("Skipping detectors for minified file: {}", file.path());
- String moduleName = DetectorUtils.deriveModuleName(file.path().toString(), file.language());
- String fileNameStr = java.util.Objects.toString(file.path().getFileName(), "");
- CodeNode node = new CodeNode(
- "file:" + file.path() + ":module:" + (moduleName != null ? moduleName : fileNameStr),
- NodeKind.MODULE,
- fileNameStr);
- node.setFilePath(file.path().toString());
- node.setModule(moduleName);
- node.setProperties(new java.util.LinkedHashMap<>(Map.of("minified", true, "file_type", "minified")));
- return DetectorResult.of(List.of(node), List.of());
- }
-
- // SOURCE and CONFIG files: run detectors
- String fileTypeStr = (fileType == FileClassifier.FileType.CONFIG) ? "config" : "source";
-
- Object parsedData = null;
- if (STRUCTURED_LANGUAGES.contains(file.language())) {
- parsedData = parser.parse(file.language(), content, file.path().toString());
- }
-
- String moduleName = DetectorUtils.deriveModuleName(file.path().toString(), file.language());
-
- var ctx = new DetectorContext(
- file.path().toString(),
- file.language(),
- content,
- parsedData,
- moduleName,
- infraRegistry
- ).withResolved(resolveFor(file, parsedData, content));
-
- List detectors = detectorRegistry.detectorsForLanguage(file.language());
- if (detectors.isEmpty()) {
- return DetectorResult.empty();
- }
-
- var allNodes = new ArrayList();
- var allEdges = new ArrayList();
-
- for (Detector detector : detectors) {
- if (Thread.interrupted()) {
- Thread.currentThread().interrupt();
- break;
- }
- try {
- DetectorResult result = detector.detect(ctx);
- // Stamp confidence + source defaults on every emission whose source
- // is null. Detectors that already explicitly stamp are left alone.
- DetectorEmissionDefaults.applyDefaults(result, detector);
- allNodes.addAll(result.nodes());
- allEdges.addAll(result.edges());
- } catch (Throwable e) {
- log.debug("Detector {} failed on {}: {}",
- detector.getName(), file.path(), e.getMessage());
- }
- }
-
- AntlrParserFactory.clearCache();
-
- long fileMs = Duration.between(fileStart, Instant.now()).toMillis();
- if (fileMs > 5000) {
- log.warn("🐢 SLOW: {} took {}ms", file.path(), fileMs);
- } else if (fileMs > 500) {
- log.info("🐢 SLOW: {} took {}ms", file.path(), fileMs);
- }
-
- // Set module and file_type on all nodes
- for (CodeNode node : allNodes) {
- if (moduleName != null && (node.getModule() == null || node.getModule().isEmpty())) {
- node.setModule(moduleName);
- }
- ensureMutableProperties(node).put("file_type", fileTypeStr);
- }
-
- return DetectorResult.of(allNodes, allEdges);
- }
-
- /**
- * Check whether a file is minified and large enough that running detectors
- * would be wasteful or cause ANTLR/regex hangs.
- *
- * Two-tier heuristic:
- *
- * - Filename match (*.min.js, *.bundle.js, etc.) + size > 10 KB + avg line > 500 chars
- * - Content-based: any JS/CSS/MJS file > 50 KB with avg line > 1000 chars (catches
- * minified files without .min suffix, e.g. webpack output named app.js or vendor.js)
- *
- */
- /**
- * Wrapper around ExecutorService that implements AutoCloseable with a bounded
- * shutdown — prevents the default close() from hanging up to 24 hours on stuck
- * ANTLR threads.
- *
- * Package-private so the close/lifecycle behaviour can be regression-tested
- * directly without spinning the full Analyzer pipeline.
- */
- record BoundedExecutor(java.util.concurrent.ExecutorService delegate) implements AutoCloseable {
- Future submit(java.util.concurrent.Callable task) { return delegate.submit(task); }
-
- @Override
- public void close() {
- delegate.shutdown();
- try {
- if (!delegate.awaitTermination(10, java.util.concurrent.TimeUnit.SECONDS)) {
- delegate.shutdownNow();
- if (!delegate.awaitTermination(5, java.util.concurrent.TimeUnit.SECONDS)) {
- log.warn("Executor did not terminate cleanly; stuck ANTLR threads will be reclaimed at JVM exit");
- }
- }
- } catch (InterruptedException e) {
- delegate.shutdownNow();
- Thread.currentThread().interrupt();
- }
- }
- }
-
- private BoundedExecutor createExecutor(Integer parallelism) {
- var exec = parallelism != null && parallelism > 0
- ? Executors.newFixedThreadPool(parallelism, Thread.ofPlatform().daemon(true).factory())
- : Executors.newVirtualThreadPerTaskExecutor();
- return new BoundedExecutor(exec);
- }
-
- private boolean isMinified(DiscoveredFile file, String content) {
- String name = java.util.Objects.toString(file.path().getFileName(), "");
- boolean nameHint = name.endsWith(".min.js") || name.endsWith(".bundle.js")
- || name.endsWith(".min.css") || name.endsWith(".min.mjs");
- boolean jsOrCss = name.endsWith(".js") || name.endsWith(".mjs") || name.endsWith(".cjs")
- || name.endsWith(".css") || name.endsWith(".jsx") || name.endsWith(".ts");
-
- // Small files are never treated as minified
- if (file.sizeBytes() <= 10_240) {
- return false;
- }
-
- // Average line length check
- int newlines = 0;
- for (int i = 0; i < content.length(); i++) {
- if (content.charAt(i) == '\n') newlines++;
- }
- if (newlines == 0) newlines = 1;
- long avgLineLen = content.length() / newlines;
-
- // Tier 1: known minified suffixes with relaxed threshold
- if (nameHint && avgLineLen > 500) {
- return true;
- }
-
- // Tier 2: any JS/CSS file > 50 KB with very long lines (minified without .min suffix)
- if (jsOrCss && file.sizeBytes() > 50_000 && avgLineLen > 1000) {
- return true;
- }
-
- return false;
- }
-
- /**
- * Analyze a single file using the default registry.
- */
- DetectorResult analyzeFile(DiscoveredFile file, Path repoPath) {
- return analyzeFile(file, repoPath, registry);
- }
-
- /**
- * Analyze a single file using the given (possibly filtered) registry.
- */
- DetectorResult analyzeFile(DiscoveredFile file, Path repoPath, DetectorRegistry detectorRegistry) {
- Instant fileStart = Instant.now();
- Path absPath = repoPath.resolve(file.path());
-
- // Classify file type before reading content
- FileClassifier.FileType fileType = FileClassifier.classify(file.path(), file.language());
-
- // Binary files: inventory-only node, no content read needed
- if (fileType == FileClassifier.FileType.BINARY) {
- return createInventoryNode(file, "binary");
- }
-
- // Generated files: inventory-only node, skip detectors
- if (fileType == FileClassifier.FileType.GENERATED) {
- return createInventoryNode(file, "generated");
- }
-
- // Read file content
- String content;
- try {
- byte[] raw = Files.readAllBytes(absPath);
- content = DetectorUtils.decodeContent(raw);
- } catch (IOException e) {
- log.debug("Could not read file: {}", absPath, e);
- return DetectorResult.empty();
- }
-
- // Test files: inventory-only node with file_type=test
- if (fileType == FileClassifier.FileType.TEST) {
- return createInventoryNode(file, "test");
- }
-
- // Text files (unknown language): inventory-only node
- if (fileType == FileClassifier.FileType.TEXT) {
- return createInventoryNode(file, "text");
- }
-
- // Minified file detection: create a node with minified=true but skip detectors
- if (isMinified(file, content)) {
- log.debug("Skipping detectors for minified file: {}", file.path());
- String moduleName = DetectorUtils.deriveModuleName(file.path().toString(), file.language());
- String fileNameStr = java.util.Objects.toString(file.path().getFileName(), "");
- CodeNode node = new CodeNode(
- "file:" + file.path() + ":module:" + (moduleName != null ? moduleName : fileNameStr),
- NodeKind.MODULE,
- fileNameStr);
- node.setFilePath(file.path().toString());
- node.setModule(moduleName);
- node.setProperties(new java.util.LinkedHashMap<>(Map.of("minified", true, "file_type", "minified")));
- return DetectorResult.of(List.of(node), List.of());
- }
-
- // SOURCE and CONFIG files: run detectors
- String fileTypeStr = (fileType == FileClassifier.FileType.CONFIG) ? "config" : "source";
-
- // Parse structured data if applicable
- Object parsedData = null;
- if (STRUCTURED_LANGUAGES.contains(file.language())) {
- parsedData = parser.parse(file.language(), content, file.path().toString());
- }
-
- // Derive module name
- String moduleName = DetectorUtils.deriveModuleName(file.path().toString(), file.language());
-
- // Create context
- var ctx = new DetectorContext(
- file.path().toString(),
- file.language(),
- content,
- parsedData,
- moduleName
- ).withResolved(resolveFor(file, parsedData, content));
-
- // Run matching detectors and merge results
- List detectors = detectorRegistry.detectorsForLanguage(file.language());
- if (detectors.isEmpty()) {
- return DetectorResult.empty();
- }
-
- var allNodes = new ArrayList();
- var allEdges = new ArrayList();
-
- for (Detector detector : detectors) {
- try {
- Instant detStart = Instant.now();
- DetectorResult result = detector.detect(ctx);
- // Stamp orchestrator-managed confidence + source defaults.
- DetectorEmissionDefaults.applyDefaults(result, detector);
- long detMs = Duration.between(detStart, Instant.now()).toMillis();
- if (detMs > 2000) {
- log.warn("🐢 SLOW DETECTOR: {} on {}: {}ms",
- detector.getName(), file.path(), detMs);
- } else if (detMs > 100) {
- log.debug("Slow detector {} on {} ({} bytes): {}ms",
- detector.getName(), file.path(), content.length(), detMs);
- }
- allNodes.addAll(result.nodes());
- allEdges.addAll(result.edges());
- } catch (Throwable e) {
- log.debug("Detector {} failed on {}: {}",
- detector.getName(), file.path(), e.getMessage());
- }
- }
-
- // Clear ANTLR parse cache after all detectors have run for this file
- AntlrParserFactory.clearCache();
-
- long fileMs = Duration.between(fileStart, Instant.now()).toMillis();
- if (fileMs > 5000) {
- log.warn("🐢 SLOW: {} took {}ms", file.path(), fileMs);
- } else if (fileMs > 500) {
- log.info("🐢 SLOW: {} took {}ms", file.path(), fileMs);
- }
-
- // Set module and file_type on all nodes
- for (CodeNode node : allNodes) {
- if (moduleName != null && (node.getModule() == null || node.getModule().isEmpty())) {
- node.setModule(moduleName);
- }
- ensureMutableProperties(node).put("file_type", fileTypeStr);
- }
-
- return DetectorResult.of(allNodes, allEdges);
- }
-
- /**
- * Create an inventory-only node for files that should not have detectors run.
- */
- private static DetectorResult createInventoryNode(DiscoveredFile file, String fileType) {
- String moduleName = DetectorUtils.deriveModuleName(file.path().toString(), file.language());
- String fileNameStr = java.util.Objects.toString(file.path().getFileName(), "");
- CodeNode node = new CodeNode(
- "file:" + file.path() + ":module:" + (moduleName != null ? moduleName : fileNameStr),
- NodeKind.MODULE,
- fileNameStr);
- node.setFilePath(file.path().toString());
- node.setModule(moduleName);
- node.setProperties(new java.util.LinkedHashMap<>(Map.of(
- "file_type", fileType,
- "language", file.language() != null ? file.language() : "")));
- return DetectorResult.of(List.of(node), List.of());
- }
-
- /**
- * Regex-only analysis fallback for files where ANTLR timed out.
- * Ensures zero data loss — every file produces nodes via regex detection.
- * Nodes are tagged with detection_method=regex_fallback.
- */
- private DetectorResult analyzeFileRegexOnly(DiscoveredFile file, Path repoPath,
- DetectorRegistry detectorRegistry) {
- Path absPath = repoPath.resolve(file.path());
- String content;
- try {
- byte[] raw = Files.readAllBytes(absPath);
- content = DetectorUtils.decodeContent(raw);
- } catch (IOException e) {
- log.debug("Could not read file for regex fallback: {}", absPath, e);
- return DetectorResult.empty();
- }
-
- String moduleName = DetectorUtils.deriveModuleName(file.path().toString(), file.language());
- var ctx = new DetectorContext(file.path().toString(), file.language(), content, null, moduleName)
- .withResolved(resolveFor(file, null, content));
-
- List detectors = detectorRegistry.detectorsForLanguage(file.language());
- var allNodes = new ArrayList();
- var allEdges = new ArrayList();
-
- for (Detector detector : detectors) {
- try {
- DetectorResult result;
- if (detector instanceof AbstractAntlrDetector antlrDet) {
- result = antlrDet.detectRegexOnly(ctx);
- } else {
- result = detector.detect(ctx);
- }
- // Stamp orchestrator-managed confidence + source defaults.
- DetectorEmissionDefaults.applyDefaults(result, detector);
- allNodes.addAll(result.nodes());
- allEdges.addAll(result.edges());
- } catch (Throwable e) {
- log.debug("Regex fallback detector {} failed on {}: {}",
- detector.getName(), file.path(), e.getMessage());
- }
- }
-
- // Tag all nodes with detection method so users know quality level
- for (CodeNode node : allNodes) {
- node.getProperties().put("detection_method", "regex_fallback");
- if (moduleName != null && (node.getModule() == null || node.getModule().isEmpty())) {
- node.setModule(moduleName);
- }
- }
-
- AntlrParserFactory.clearCache();
- return DetectorResult.of(allNodes, allEdges);
- }
-
- /**
- * Get the current git HEAD commit SHA, or null if not a git repo.
- */
- private String getGitHead(Path repoPath) {
- try {
- ProcessBuilder pb = new ProcessBuilder("git", "rev-parse", "HEAD")
- .directory(repoPath.toFile())
- .redirectErrorStream(true);
- Process proc = pb.start();
- String sha = new String(proc.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim();
- int exitCode = proc.waitFor();
- if (exitCode == 0 && sha.length() >= 7) {
- return sha;
- }
- } catch (InterruptedException e) {
- Thread.currentThread().interrupt();
- log.debug("Could not determine git HEAD", e);
- return null;
- } catch (Exception e) {
- log.debug("Could not determine git HEAD", e);
- }
- return null;
- }
-
- /**
- * Read file content and compute snippet. Returns null on error or for binary files.
- */
- /**
- * Ensure a node's properties map is mutable. Some nodes are created with
- * immutable Map.of() which throws UnsupportedOperationException on put().
- */
- private static Map ensureMutableProperties(CodeNode node) {
- Map props = node.getProperties();
- try {
- // Test mutability — HashMap/LinkedHashMap will not throw
- props.put("_test", null);
- props.remove("_test");
- return props;
- } catch (UnsupportedOperationException e) {
- var mutable = new java.util.LinkedHashMap<>(props);
- node.setProperties(mutable);
- return mutable;
- }
- }
-
- private static String computeSnippetFromFile(Path absPath, FileClassifier.FileType fileType) {
- if (fileType == FileClassifier.FileType.BINARY) return null;
- try {
- byte[] raw = Files.readAllBytes(absPath);
- String content = DetectorUtils.decodeContent(raw);
- return computeSnippet(content, fileType);
- } catch (IOException e) {
- return null;
- }
- }
-
- /**
- * Compute a snippet from file content for storage in H2.
- * Returns the first 200 lines, capped at 10KB, or null for binary files.
- */
- static String computeSnippet(String content, FileClassifier.FileType fileType) {
- if (fileType == FileClassifier.FileType.BINARY) return null;
- if (content == null || content.isEmpty()) return null;
- // First 200 lines, max 10KB
- String[] lines = content.split("\n", 201);
- StringBuilder sb = new StringBuilder();
- for (int i = 0; i < Math.min(lines.length, 200); i++) {
- if (sb.length() + lines[i].length() > 10_000) break;
- if (i > 0) sb.append('\n');
- sb.append(lines[i]);
- }
- return sb.toString();
- }
-
- /**
- * Pre-compile exclude glob patterns into regex Pattern objects.
- */
- private static List compileExcludePatterns(List excludePatterns) {
- if (excludePatterns == null) return List.of();
- return excludePatterns.stream()
- .map(p -> compileGlob(p.replace('\\', '/')))
- .toList();
- }
-
- /**
- * Check whether a file path matches any of the given pre-compiled patterns.
- */
- private static boolean matchesAnyCompiledExclude(String filePath, List compiledPatterns) {
- if (compiledPatterns == null || compiledPatterns.isEmpty()) return false;
- String normalized = filePath.replace('\\', '/');
- for (java.util.regex.Pattern pattern : compiledPatterns) {
- if (pattern.matcher(normalized).matches()) {
- return true;
- }
- }
- return false;
- }
-
- /**
- * Compile a glob pattern into a regex Pattern.
- * '*' matches any non-separator sequence, '**' matches everything (including separators).
- * All regex special characters are properly escaped.
- */
- private static java.util.regex.Pattern compileGlob(String pattern) {
- StringBuilder regex = new StringBuilder("^");
- int i = 0;
- while (i < pattern.length()) {
- char c = pattern.charAt(i);
- if (c == '*') {
- if (i + 1 < pattern.length() && pattern.charAt(i + 1) == '*') {
- regex.append(".*");
- i += 2;
- // skip trailing /
- if (i < pattern.length() && pattern.charAt(i) == '/') {
- i++;
- }
- } else {
- regex.append("[^/]*");
- i++;
- }
- } else if (c == '?') {
- regex.append("[^/]");
- i++;
- } else if (".+^${}()|[]\\".indexOf(c) >= 0) {
- // S5: Properly escape all regex special characters
- regex.append('\\').append(c);
- i++;
- } else {
- regex.append(c);
- i++;
- }
- }
- regex.append("$");
- return java.util.regex.Pattern.compile(regex.toString());
- }
-}
diff --git a/src/main/java/io/github/randomcodespace/iq/analyzer/ArchitectureKeywordFilter.java b/src/main/java/io/github/randomcodespace/iq/analyzer/ArchitectureKeywordFilter.java
deleted file mode 100644
index 6510eee0..00000000
--- a/src/main/java/io/github/randomcodespace/iq/analyzer/ArchitectureKeywordFilter.java
+++ /dev/null
@@ -1,129 +0,0 @@
-package io.github.randomcodespace.iq.analyzer;
-
-import org.springframework.stereotype.Component;
-
-import java.nio.charset.StandardCharsets;
-import java.util.Map;
-import java.util.Set;
-
-/**
- * Pre-scans file content for architecture-relevant keywords before full parsing.
- *
- * Uses fast substring checks (no regex) to determine whether a file is likely
- * to contain architectural patterns — endpoints, services, entities, config,
- * messaging, auth, or infrastructure. Files with no matching keywords are
- * skipped, targeting a ~60-70% skip rate for pure utilities, POJOs, and DTOs.
- */
-@Component
-public class ArchitectureKeywordFilter {
-
- private static final Set JAVA_KEYWORDS = Set.of(
- "@Controller", "@RestController", "@Service", "@Component",
- "@Repository", "@Entity", "@Table", "@Bean", "@Configuration",
- "@KafkaListener", "KafkaTemplate", "@RabbitListener", "JmsTemplate",
- "DataSource", "JdbcTemplate", "JpaRepository", "EntityManager",
- "RestTemplate", "WebClient", "FeignClient", "@Scheduled", "@Async",
- "@Cacheable", "@PreAuthorize", "@Secured", "@RolesAllowed",
- "HttpSecurity", "WebSecurityConfigurerAdapter", "@Transactional",
- "@Query", "@EventListener", "ApplicationEvent", "ConnectionFactory",
- "RedisTemplate", "GrpcService"
- );
-
- private static final Set PYTHON_KEYWORDS = Set.of(
- "FastAPI", "Django", "Flask", "app.route", "@app.get", "@app.post",
- "SQLAlchemy", "create_engine", "models.Model", "celery", "@task",
- "redis", "kafka", "boto3", "httpx", "requests", "APIRouter",
- "Depends", "BaseModel", "SessionLocal", "AsyncSession"
- );
-
- private static final Set TYPESCRIPT_KEYWORDS = Set.of(
- "@Controller", "@Get", "@Post", "@Injectable", "TypeORM", "Prisma",
- "Sequelize", "Mongoose", "express", "Router", "app.get", "app.post",
- "kafkajs", "amqplib", "ioredis", "bull", "HttpService", "fetch",
- "axios", "@Module", "@Guard", "@Middleware", "Schema", "model("
- );
-
- private static final Set GO_KEYWORDS = Set.of(
- "http.HandleFunc", "http.Handle", "gin.Default", "mux.Router",
- "echo.New", "sql.Open", "gorm", "sqlx", "sarama", "grpc.NewServer",
- "grpc.Dial", "redis.NewClient", "http.Client"
- );
-
- private static final Set CSHARP_KEYWORDS = Set.of(
- "[ApiController]", "[HttpGet]", "[HttpPost]", "DbContext",
- "IDbConnection", "[Authorize]", "MassTransit", "IMediator",
- "ILogger", "IHostedService", "BackgroundService"
- );
-
- private static final Set RUST_KEYWORDS = Set.of(
- "#[get]", "#[post]", "actix_web", "rocket", "axum", "sqlx",
- "diesel", "tokio-postgres", "rdkafka", "redis", "tonic", "reqwest"
- );
-
- private static final Set RUBY_KEYWORDS = Set.of(
- "ActiveRecord", "has_many", "belongs_to", "Sidekiq", "Redis.new",
- "Faraday", "HTTParty", "Devise", "Pundit"
- );
-
- private static final Set GENERIC_KEYWORDS = Set.of(
- "import", "require", "from", "endpoint", "route", "router",
- "middleware", "guard", "interceptor", "filter", "handler",
- "listener", "consumer", "producer", "subscriber", "publisher"
- );
-
- private static final Map> LANGUAGE_KEYWORDS = Map.ofEntries(
- Map.entry("java", JAVA_KEYWORDS),
- Map.entry("python", PYTHON_KEYWORDS),
- Map.entry("typescript", TYPESCRIPT_KEYWORDS),
- Map.entry("javascript", TYPESCRIPT_KEYWORDS),
- Map.entry("go", GO_KEYWORDS),
- Map.entry("csharp", CSHARP_KEYWORDS),
- Map.entry("rust", RUST_KEYWORDS),
- Map.entry("ruby", RUBY_KEYWORDS)
- );
-
- /**
- * Determines whether a file should be analyzed based on its content.
- *
- * @param content file content as a string
- * @param language language identifier (e.g. "java", "python", "typescript")
- * @return {@code true} if any architecture keyword is found
- */
- public boolean shouldAnalyze(String content, String language) {
- if (content == null || content.isBlank()) {
- return false;
- }
-
- Set languageSpecific = LANGUAGE_KEYWORDS.get(language != null ? language.toLowerCase() : "");
- if (languageSpecific != null) {
- for (String keyword : languageSpecific) {
- if (content.contains(keyword)) {
- return true;
- }
- }
- }
-
- for (String keyword : GENERIC_KEYWORDS) {
- if (content.contains(keyword)) {
- return true;
- }
- }
-
- return false;
- }
-
- /**
- * Determines whether a file should be analyzed based on its raw byte content.
- * Decodes bytes as UTF-8 and delegates to {@link #shouldAnalyze(String, String)}.
- *
- * @param rawContent raw file bytes
- * @param language language identifier
- * @return {@code true} if any architecture keyword is found
- */
- public boolean shouldAnalyze(byte[] rawContent, String language) {
- if (rawContent == null || rawContent.length == 0) {
- return false;
- }
- return shouldAnalyze(new String(rawContent, StandardCharsets.UTF_8), language);
- }
-}
diff --git a/src/main/java/io/github/randomcodespace/iq/analyzer/ConfigScanner.java b/src/main/java/io/github/randomcodespace/iq/analyzer/ConfigScanner.java
deleted file mode 100644
index bb725596..00000000
--- a/src/main/java/io/github/randomcodespace/iq/analyzer/ConfigScanner.java
+++ /dev/null
@@ -1,543 +0,0 @@
-package io.github.randomcodespace.iq.analyzer;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.springframework.stereotype.Component;
-import org.yaml.snakeyaml.Yaml;
-
-import java.io.IOException;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.List;
-import java.util.Map;
-import java.util.Properties;
-import java.util.Set;
-import java.util.TreeMap;
-import java.util.TreeSet;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * Scans well-known config files in a repository root and populates an
- * {@link InfrastructureRegistry} with discovered infrastructure endpoints.
- *
- * Supported sources:
- *
- * - Spring: {@code application.yml} / {@code application.properties}
- * - Docker Compose: {@code docker-compose.yml} / {@code compose.yml}
- * - Generic env files: {@code .env}, {@code .env.local}, etc.
- * - Maven build: {@code pom.xml} — dependency-based detection
- *
- *
- * Stateless Spring bean — safe for virtual threads and concurrent use.
- */
-@Component
-public class ConfigScanner {
- private static final String PROP_DEPENDENCY = "dependency";
- private static final String PROP_DETECTION = "detection";
- private static final String PROP_ELASTICSEARCH = "elasticsearch";
- private static final String PROP_H2 = "h2";
- private static final String PROP_KAFKA = "kafka";
- private static final String PROP_MARIADB = "mariadb";
- private static final String PROP_MONGO = "mongo";
- private static final String PROP_MONGODB = "mongodb";
- private static final String PROP_MYSQL = "mysql";
- private static final String PROP_POM_XML = "pom.xml";
- private static final String PROP_POSTGRESQL = "postgresql";
- private static final String PROP_RABBITMQ = "rabbitmq";
- private static final String PROP_REDIS = "redis";
- private static final String PROP_SOURCE = "source";
- private static final String PROP_SQL = "sql";
-
-
- private static final Logger log = LoggerFactory.getLogger(ConfigScanner.class);
-
- private static final Pattern POM_ARTIFACT_ID =
- Pattern.compile("([^<]+)");
-
- // -------------------------------------------------------------------------
- // Public API
- // -------------------------------------------------------------------------
-
- /**
- * Scan the given repository root and return a populated {@link InfrastructureRegistry}.
- * Never throws — errors are logged at DEBUG level and scanning continues.
- *
- * @param repoPath repository root directory
- * @return populated registry (may be empty if no config files found)
- */
- public InfrastructureRegistry scan(Path repoPath) {
- Path root = repoPath.toAbsolutePath().normalize();
- InfrastructureRegistry registry = new InfrastructureRegistry();
-
- scanSpringConfig(root, registry);
- scanDockerCompose(root, registry);
- scanEnvFiles(root, registry);
- scanBuildFiles(root, registry);
-
- log.debug("ConfigScanner found {} endpoints in {}", registry.size(), root);
- return registry;
- }
-
- // -------------------------------------------------------------------------
- // Spring application.yml / application.properties
- // -------------------------------------------------------------------------
-
- private void scanSpringConfig(Path root, InfrastructureRegistry registry) {
- List candidates = List.of(
- root.resolve("application.yml"),
- root.resolve("application.yaml"),
- root.resolve("application.properties"),
- root.resolve("src/main/resources/application.yml"),
- root.resolve("src/main/resources/application.yaml"),
- root.resolve("src/main/resources/application.properties")
- );
-
- for (Path candidate : candidates) {
- if (Files.isRegularFile(candidate)) {
- String name = java.util.Objects.toString(candidate.getFileName(), "");
- if (name.endsWith(".properties")) {
- parseSpringProperties(candidate, registry);
- } else {
- parseSpringYaml(candidate, registry);
- }
- }
- }
- }
-
- @SuppressWarnings("unchecked")
- private void parseSpringYaml(Path file, InfrastructureRegistry registry) {
- try {
- String content = Files.readString(file, StandardCharsets.UTF_8);
- Yaml yaml = new Yaml(new org.yaml.snakeyaml.constructor.SafeConstructor(new org.yaml.snakeyaml.LoaderOptions()));
- Object loaded = yaml.load(content);
- if (!(loaded instanceof Map, ?> raw)) return;
-
- Map flat = new TreeMap<>();
- flattenYaml("", (Map) raw, flat);
- processSpringFlatMap(flat, registry);
- } catch (Exception e) {
- log.debug("Failed to parse Spring YAML {}: {}", file, e.getMessage());
- }
- }
-
- private void parseSpringProperties(Path file, InfrastructureRegistry registry) {
- try {
- Properties props = new Properties();
- try (var reader = Files.newBufferedReader(file, StandardCharsets.UTF_8)) {
- props.load(reader);
- }
- Map flat = new TreeMap<>();
- for (String key : props.stringPropertyNames()) {
- flat.put(key, props.getProperty(key));
- }
- processSpringFlatMap(flat, registry);
- } catch (Exception e) {
- log.debug("Failed to parse Spring properties {}: {}", file, e.getMessage());
- }
- }
-
- @SuppressWarnings("unchecked")
- private void flattenYaml(String prefix, Map data, Map result) {
- for (var entry : data.entrySet()) {
- String key = prefix.isEmpty() ? entry.getKey() : prefix + "." + entry.getKey();
- Object value = entry.getValue();
- if (value instanceof Map, ?> nested) {
- flattenYaml(key, (Map) nested, result);
- } else if (value != null) {
- result.put(key, String.valueOf(value));
- }
- }
- }
-
- private void processSpringFlatMap(Map flat, InfrastructureRegistry registry) {
- // Service name
- String appName = flat.get("spring.application.name");
- if (appName != null && !appName.isBlank()) {
- registry.setServiceName(appName.trim());
- }
-
- // Datasource (JPA/JDBC)
- String dsUrl = flat.get("spring.datasource.url");
- if (dsUrl != null && !dsUrl.isBlank()) {
- String dbType = detectDatabaseTypeFromUrl(dsUrl);
- registry.register(new InfraEndpoint(
- "db:spring.datasource",
- InfraEndpoint.Kind.DATABASE,
- "spring.datasource",
- dbType,
- dsUrl,
- Map.of(PROP_SOURCE, "spring.datasource.url")));
- }
-
- // Kafka bootstrap-servers
- String kafkaServers = coalesce(
- flat.get("spring.kafka.bootstrap-servers"),
- flat.get("spring.kafka.bootstrap_servers"));
- if (kafkaServers != null) {
- registry.register(new InfraEndpoint(
- "topic:spring.kafka",
- InfraEndpoint.Kind.TOPIC,
- "spring.kafka",
- PROP_KAFKA,
- kafkaServers,
- Map.of(PROP_SOURCE, "spring.kafka.bootstrap-servers")));
- }
-
- // Redis (spring.data.redis or spring.redis)
- String redisHost = coalesce(
- flat.get("spring.data.redis.host"),
- flat.get("spring.redis.host"));
- if (redisHost != null) {
- String redisPort = coalesce(
- flat.get("spring.data.redis.port"),
- flat.get("spring.redis.port"),
- "6379");
- registry.register(new InfraEndpoint(
- "cache:spring.redis",
- InfraEndpoint.Kind.CACHE,
- "spring.redis",
- PROP_REDIS,
- "redis://" + redisHost + ":" + redisPort,
- Map.of(PROP_SOURCE, "spring.redis.host")));
- }
-
- // RabbitMQ
- String rabbitHost = flat.get("spring.rabbitmq.host");
- if (rabbitHost != null && !rabbitHost.isBlank()) {
- String rabbitPort = coalesce(flat.get("spring.rabbitmq.port"), "5672");
- registry.register(new InfraEndpoint(
- "queue:spring.rabbitmq",
- InfraEndpoint.Kind.QUEUE,
- "spring.rabbitmq",
- PROP_RABBITMQ,
- "amqp://" + rabbitHost + ":" + rabbitPort,
- Map.of(PROP_SOURCE, "spring.rabbitmq.host")));
- }
- }
-
- // -------------------------------------------------------------------------
- // Docker Compose
- // -------------------------------------------------------------------------
-
- private void scanDockerCompose(Path root, InfrastructureRegistry registry) {
- List candidates = List.of(
- root.resolve("docker-compose.yml"),
- root.resolve("docker-compose.yaml"),
- root.resolve("compose.yml"),
- root.resolve("compose.yaml")
- );
-
- for (Path candidate : candidates) {
- if (Files.isRegularFile(candidate)) {
- parseDockerCompose(candidate, registry);
- return; // Only process the first found
- }
- }
- }
-
- private void parseDockerCompose(Path file, InfrastructureRegistry registry) {
- try {
- String content = Files.readString(file, StandardCharsets.UTF_8);
- Yaml yaml = new Yaml(new org.yaml.snakeyaml.constructor.SafeConstructor(new org.yaml.snakeyaml.LoaderOptions()));
- Object loaded = yaml.load(content);
- if (!(loaded instanceof Map, ?> data)) return;
-
- Object servicesObj = data.get("services");
- if (!(servicesObj instanceof Map, ?> services)) return;
-
- for (var entry : services.entrySet()) {
- String svcName = String.valueOf(entry.getKey());
- if (!(entry.getValue() instanceof Map, ?> svcConfig)) continue;
-
- Object imageObj = svcConfig.get("image");
- if (imageObj == null) continue;
-
- String image = String.valueOf(imageObj);
- String ports = extractFirstPort(svcConfig);
- detectDockerInfra(svcName, image, ports, registry);
- }
- } catch (Exception e) {
- log.debug("Failed to parse Docker Compose {}: {}", file, e.getMessage());
- }
- }
-
- private String extractFirstPort(Map, ?> svcConfig) {
- Object portsObj = svcConfig.get("ports");
- if (!(portsObj instanceof List> portsList) || portsList.isEmpty()) return null;
- return String.valueOf(portsList.get(0));
- }
-
- private void detectDockerInfra(String svcName, String image, String ports,
- InfrastructureRegistry registry) {
- String imageLower = image.toLowerCase();
- // Strip tag (e.g. postgres:15 → postgres)
- String imageBase = imageLower.contains(":") ? imageLower.substring(0, imageLower.indexOf(':')) : imageLower;
- // Strip registry prefix (e.g. docker.io/library/postgres → postgres)
- if (imageBase.contains("/")) {
- imageBase = imageBase.substring(imageBase.lastIndexOf('/') + 1);
- }
-
- Map