Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
b4d03ea
checkpoint: pre-yolo 20260403-163239
aksOps Apr 3, 2026
c9b8d66
fix(security): add npm override for lodash >= 4.17.24 to fix HIGH CVEs
aksOps Apr 3, 2026
a912c6a
feat(intelligence): implement capability matrix and deterministic que…
aksOps Apr 3, 2026
b33baa9
feat(intelligence): Phase 1 provenance model, repository identity, fi…
aksOps Apr 3, 2026
59f2e4e
fix(intelligence): address PE architecture review — RepositoryIdentit…
aksOps Apr 3, 2026
624fdd3
fix(intelligence): address CTO review — TreeMap determinism + Neo4j p…
aksOps Apr 3, 2026
d3e462c
fix(intelligence): fix HashMap determinism in FileInventory + gitigno…
aksOps Apr 3, 2026
7b30a33
fix(intelligence): address PE review — cpp capability table + QueryPl…
aksOps Apr 3, 2026
d901b3b
fix(intelligence): fix Process resource leak in RepositoryIdentity.ru…
aksOps Apr 3, 2026
eef9dbc
fix(intelligence): close InputStream explicitly in runGit() — SonarQu…
aksOps Apr 3, 2026
cc08698
feat(intelligence): Phase 2 lexical intelligence — doc comment index …
aksOps Apr 3, 2026
c855019
test(intelligence): Phase 1-3 test execution — edge cases, cross-lang…
aksOps Apr 3, 2026
03eca64
fix(intelligence): explicit UTF-8 in RepositoryIdentity.runGit() — DM…
aksOps Apr 3, 2026
4f9eddd
feat(intelligence): Phase 5 language-specific enrichment — extractors…
aksOps Apr 3, 2026
eb049ce
fix(intelligence): Phase 5 JavaLanguageExtractor — false-positive CAL…
aksOps Apr 3, 2026
f7390b7
fix(intelligence): GoLanguageExtractor duplicate IMPORTS edges (RAN-170)
aksOps Apr 3, 2026
00b62f5
fix(intelligence): Phase 5 PE re-review fixes — determinism, TS dedup…
aksOps Apr 3, 2026
6003cb8
merge: sync feature/ran-162-language-extractors with main
aksOps Apr 3, 2026
66bd4fd
fix(intelligence): Phase 5 PE re-review round 2 — 4 extractor fixes (…
aksOps Apr 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import io.github.randomcodespace.iq.cache.AnalysisCache;
import io.github.randomcodespace.iq.config.CodeIqConfig;
import io.github.randomcodespace.iq.intelligence.RepositoryIdentity;
import io.github.randomcodespace.iq.intelligence.extractor.LanguageEnricher;
import io.github.randomcodespace.iq.intelligence.lexical.LexicalEnricher;
import io.github.randomcodespace.iq.model.CodeEdge;
import io.github.randomcodespace.iq.model.CodeNode;
Expand Down Expand Up @@ -61,13 +62,16 @@ public class EnrichCommand implements Callable<Integer> {
private final LayerClassifier layerClassifier;
private final List<Linker> linkers;
private final LexicalEnricher lexicalEnricher;
private final LanguageEnricher languageEnricher;

/**
 * Creates the command and stores the collaborators it uses while enriching.
 *
 * @param config            configuration object kept for later use by the command
 * @param layerClassifier   classifier kept for later use by the command
 * @param linkers           linkers kept for later use by the command
 * @param lexicalEnricher   enricher invoked for the lexical-metadata step
 * @param languageEnricher  enricher invoked for the language-specific step that
 *                          follows the lexical step
 */
public EnrichCommand(CodeIqConfig config, LayerClassifier layerClassifier,
                     List<Linker> linkers, LexicalEnricher lexicalEnricher,
                     LanguageEnricher languageEnricher) {
    this.config = config;
    this.layerClassifier = layerClassifier;
    this.linkers = linkers;
    this.lexicalEnricher = lexicalEnricher;
    this.languageEnricher = languageEnricher;
}

@Override
Expand Down Expand Up @@ -151,6 +155,10 @@ private int enrichFromCache(AnalysisCache cache, Path root, NumberFormat nf, Ins
CliOutput.step("\uD83D\uDD0D", "Enriching lexical metadata...");
lexicalEnricher.enrich(enrichedNodes, root);

// 3b2. Language-specific enrichment (call graph, type hints, import resolution)
CliOutput.step("\uD83D\uDD0D", "Running language-specific enrichment...");
languageEnricher.enrich(enrichedNodes, enrichedEdges, root);

// 3c. Detect services
CliOutput.step("\uD83C\uDFD7\uFE0F", "Detecting service boundaries...");
var serviceDetector = new io.github.randomcodespace.iq.analyzer.ServiceDetector();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
package io.github.randomcodespace.iq.intelligence.extractor;

import io.github.randomcodespace.iq.detector.DetectorContext;
import io.github.randomcodespace.iq.model.CodeEdge;
import io.github.randomcodespace.iq.model.CodeNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;

/**
* Runs all {@link LanguageExtractor} beans after {@link io.github.randomcodespace.iq.intelligence.lexical.LexicalEnricher}
* during the {@code enrich} command.
*
* <p>Builds a combined node registry (by id and fqn), groups nodes by source file,
* reads each file once, and dispatches to matching extractors. Results (edges, type hints)
* are written back into the in-memory node/edge lists before Neo4j bulk-load.
*
* <p>Extraction failures log a warning and are skipped — the pipeline never aborts.
*/
@Component
public class LanguageEnricher {

private static final Logger log = LoggerFactory.getLogger(LanguageEnricher.class);

/**
* Language alias map: normalises file-extension languages to extractor language keys.
* e.g. "javascript" nodes are handled by the "typescript" extractor.
*/
private static final Map<String, String> LANGUAGE_ALIASES = Map.of(
"javascript", "typescript"
);

private final List<LanguageExtractor> extractors;

public LanguageEnricher(List<LanguageExtractor> extractors) {
this.extractors = List.copyOf(extractors);
}

/**
* Enrich nodes with language-specific intelligence and add new edges.
*
* @param nodes All enriched nodes (post-linker, post-classifier, post-lexical).
* @param edges Mutable edge list — new edges are appended in place.
* @param rootPath Absolute root path of the analysed repository (for file reads).
*/
public void enrich(List<CodeNode> nodes, List<CodeEdge> edges, Path rootPath) {

Check failure on line 56 in src/main/java/io/github/randomcodespace/iq/intelligence/extractor/LanguageEnricher.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Refactor this method to reduce its Cognitive Complexity from 20 to the 15 allowed.

See more on https://sonarcloud.io/project/issues?id=RandomCodeSpace_code-iq&issues=AZ1U9H4Dv4af9OZ5POlf&open=AZ1U9H4Dv4af9OZ5POlf&pullRequest=29
if (extractors.isEmpty()) {
log.debug("No LanguageExtractor beans registered — skipping language enrichment");
return;
}

// Build combined node registry: id → node, fqn → node
Map<String, CodeNode> nodeRegistry = buildRegistry(nodes);

// Build extractor lookup: normalised language → extractor
Map<String, LanguageExtractor> extractorByLanguage = new HashMap<>();
for (LanguageExtractor extractor : extractors) {
extractorByLanguage.put(extractor.getLanguage(), extractor);
}

// Group nodes by file path (read each file only once).
// TreeMap guarantees deterministic iteration order (alphabetical by path).
Map<String, List<CodeNode>> nodesByFile = new TreeMap<>();
for (CodeNode node : nodes) {
if (node.getFilePath() != null) {
nodesByFile.computeIfAbsent(node.getFilePath(), k -> new ArrayList<>()).add(node);
}
}

int edgesAdded = 0;
int typeHintsAdded = 0;

for (Map.Entry<String, List<CodeNode>> entry : nodesByFile.entrySet()) {

Check warning on line 83 in src/main/java/io/github/randomcodespace/iq/intelligence/extractor/LanguageEnricher.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Reduce the total number of break and continue statements in this loop to use at most one.

See more on https://sonarcloud.io/project/issues?id=RandomCodeSpace_code-iq&issues=AZ1U9H4Dv4af9OZ5POle&open=AZ1U9H4Dv4af9OZ5POle&pullRequest=29
String filePath = entry.getKey();
List<CodeNode> fileNodes = entry.getValue();

String language = detectLanguage(filePath);
if (language == null) continue;

String resolvedLanguage = LANGUAGE_ALIASES.getOrDefault(language, language);
LanguageExtractor extractor = extractorByLanguage.get(resolvedLanguage);
if (extractor == null) continue;

String content = readFile(rootPath, filePath);
if (content == null) continue;

DetectorContext ctx = new DetectorContext(filePath, language, content, nodeRegistry, null);

for (CodeNode node : fileNodes) {
try {
LanguageExtractionResult result = extractor.extract(ctx, node);
edges.addAll(result.callEdges());
edges.addAll(result.symbolReferences());
edgesAdded += result.callEdges().size() + result.symbolReferences().size();
for (Map.Entry<String, String> hint : result.typeHints().entrySet()) {
node.getProperties().put(hint.getKey(), hint.getValue());
typeHintsAdded++;
}
} catch (Exception e) {
log.warn("LanguageExtractor {} failed on node {} in {}: {}",
extractor.getClass().getSimpleName(), node.getId(), filePath, e.getMessage());
}
}
}

log.info("Language enrichment: {} edges added, {} type hints added across {} extractors",
edgesAdded, typeHintsAdded, extractorByLanguage.size());
}

private Map<String, CodeNode> buildRegistry(List<CodeNode> nodes) {
Map<String, CodeNode> registry = new HashMap<>();
for (CodeNode node : nodes) {
if (node.getId() != null) {
registry.put(node.getId(), node);
}
if (node.getFqn() != null && !node.getFqn().isEmpty()) {
registry.put(node.getFqn(), node);
}
}
return registry;
}

private String readFile(Path rootPath, String filePath) {
try {
Path resolved = rootPath.resolve(filePath);
if (!Files.exists(resolved)) return null;
return Files.readString(resolved, StandardCharsets.UTF_8);
} catch (IOException e) {
log.debug("Could not read file {}: {}", filePath, e.getMessage());
return null;
}
}

/**
* Map file extension to language string (mirrors FileDiscovery conventions).
*/
static String detectLanguage(String filePath) {
if (filePath == null) return null;
int dot = filePath.lastIndexOf('.');
if (dot < 0) return null;
return switch (filePath.substring(dot + 1).toLowerCase()) {
case "java" -> "java";
case "ts", "tsx" -> "typescript";
case "js", "jsx", "mjs", "cjs" -> "javascript";
case "py", "pyw" -> "python";
case "go" -> "go";
default -> null;
};
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package io.github.randomcodespace.iq.intelligence.extractor;

import io.github.randomcodespace.iq.intelligence.CapabilityLevel;
import io.github.randomcodespace.iq.model.CodeEdge;

import java.util.List;
import java.util.Map;

/**
 * Immutable outcome of one {@link LanguageExtractor#extract} invocation.
 *
 * <p>All three collections are snapshotted on construction, so later changes to the
 * arguments are not reflected here and the accessors return unmodifiable views.
 *
 * @param callEdges        CALLS edges discovered for this node (method invocations, function calls).
 * @param symbolReferences IMPORTS / DEPENDS_ON edges from import/symbol resolution.
 * @param typeHints        Type annotation key-value pairs to store in node properties
 *                         (e.g. {@code "param_types" -> "int, str"}, {@code "return_type" -> "str"}).
 * @param confidence       Confidence level of this extraction result.
 */
public record LanguageExtractionResult(
        List<CodeEdge> callEdges,
        List<CodeEdge> symbolReferences,
        Map<String, String> typeHints,
        CapabilityLevel confidence
) {
    /**
     * Canonical constructor that stores unmodifiable copies of the collections.
     *
     * @throws NullPointerException if any collection is {@code null} or contains {@code null}
     */
    public LanguageExtractionResult(
            List<CodeEdge> callEdges,
            List<CodeEdge> symbolReferences,
            Map<String, String> typeHints,
            CapabilityLevel confidence) {
        this.callEdges = List.copyOf(callEdges);
        this.symbolReferences = List.copyOf(symbolReferences);
        this.typeHints = Map.copyOf(typeHints);
        this.confidence = confidence;
    }

    /** Returns a result with no edges, no type hints, and PARTIAL confidence. */
    public static LanguageExtractionResult empty() {
        List<CodeEdge> noEdges = List.of();
        Map<String, String> noHints = Map.of();
        return new LanguageExtractionResult(noEdges, noEdges, noHints, CapabilityLevel.PARTIAL);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package io.github.randomcodespace.iq.intelligence.extractor;

import io.github.randomcodespace.iq.detector.DetectorContext;
import io.github.randomcodespace.iq.model.CodeNode;

/**
 * Strategy interface for language-specific enrichment extractors.
 *
 * <p>Implementations are stateless Spring {@code @Component} beans auto-discovered
 * via classpath scan. Each extractor targets a single language and deepens the
 * capability matrix beyond what the general intelligence layer provides.
 *
 * <p>Extractors run during {@code enrich} (after {@link io.github.randomcodespace.iq.intelligence.lexical.LexicalEnricher})
 * and must never run during {@code index}.
 *
 * <p>{@link LanguageEnricher} is the caller: it invokes {@link #extract} once per node,
 * catches any exception the extractor throws, logs a warning, and moves on to the next node.
 */
public interface LanguageExtractor {

/**
 * The primary language this extractor targets (e.g. "java", "typescript", "python", "go").
 * Matches the language values produced by {@code FileDiscovery}.
 *
 * <p>{@link LanguageEnricher} uses this value as the lookup key. It derives a file's
 * language from its extension and applies its alias table first, so files detected as
 * "javascript" are routed to the extractor that returns "typescript" here.
 */
String getLanguage();

/**
 * Extract additional intelligence for the given node from its source file.
 *
 * <p>The {@code ctx} carries file content and a node registry via {@code parsedData}
 * (cast to {@code Map<String, CodeNode>} — a combined id + fqn index built by
 * {@link LanguageEnricher}). The context is created with the language detected from the
 * file extension before aliasing, so an extractor registered as "typescript" may receive
 * a context created for a "javascript" file.
 *
 * @param ctx Detector context for the node's source file; {@code parsedData} contains
 * the node registry as {@code Map<String, CodeNode>}.
 * @param node The specific node to enrich.
 * @return Extraction result; never {@code null}.
 */
LanguageExtractionResult extract(DetectorContext ctx, CodeNode node);
}
Loading
Loading