Skip to content

Commit 6431cca

Browse files
aksOps and claude committed
Add 16 structured data detectors for JSON, YAML, TOML, INI, Markdown, Proto, SQL, Batch
Generic detectors (6): json_structure, yaml_structure, toml_structure, ini_structure, markdown_structure, proto_structure — extract CONFIG_KEY and CONFIG_FILE nodes from any file of their format.

Specialized detectors (10): package_json, tsconfig_json, openapi, docker_compose, github_actions, kubernetes, pyproject_toml, sql_structure, batch_structure, properties_detector — triggered by filename patterns, extract domain-specific nodes (endpoints, entities, infra resources, dependencies).

Infrastructure: add TOML/INI/Markdown/Proto parsers to analyzer.py, register all 16 detectors in registry.py, add .env/.csv/.dockerfile to include_extensions in config.py.

Benchmark results across 189K files in 8 projects:
- +155K new nodes (+14.3%), +201K new edges (+10.6%)
- 100% deterministic across 2 runs on 7/8 projects
- <5% performance overhead

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5d2ac25 commit 6431cca

22 files changed

Lines changed: 2250 additions & 0 deletions

src/code_intelligence/analyzer.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,33 @@ def _parse_structured(language: str, content: bytes, file_path: str) -> Any:
7171
elif language == "sql":
7272
from code_intelligence.parsing.structured.sql_parser import SqlParser
7373
return SqlParser().parse(content, file_path)
74+
elif language == "toml":
75+
try:
76+
import tomllib
77+
except ModuleNotFoundError:
78+
import tomli as tomllib # type: ignore[no-redef]
79+
try:
80+
text = content.decode("utf-8", errors="replace")
81+
data = tomllib.loads(text)
82+
except Exception as exc:
83+
return {"error": "invalid_toml", "file": file_path, "detail": str(exc)}
84+
return {"type": "toml", "file": file_path, "data": data}
85+
elif language == "ini":
86+
import configparser
87+
try:
88+
text = content.decode("utf-8", errors="replace")
89+
parser = configparser.ConfigParser()
90+
parser.read_string(text)
91+
data = {section: dict(parser[section]) for section in parser.sections()}
92+
except Exception as exc:
93+
return {"error": "invalid_ini", "file": file_path, "detail": str(exc)}
94+
return {"type": "ini", "file": file_path, "data": data}
95+
elif language == "markdown":
96+
# Return raw text for regex-based detection
97+
return {"type": "markdown", "file": file_path, "data": content.decode("utf-8", errors="replace")}
98+
elif language == "proto":
99+
# Return raw text for regex-based detection
100+
return {"type": "proto", "file": file_path, "data": content.decode("utf-8", errors="replace")}
74101
return None
75102

76103

src/code_intelligence/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ class DiscoveryConfig(BaseModel):
2424
".rb", ".rs", ".kt", ".kts", ".scala", ".swift",
2525
".r", ".R", ".pl", ".pm", ".lua", ".dart",
2626
".toml", ".ini", ".cfg", ".conf",
27+
".env", ".csv", ".dockerfile",
2728
])
2829
exclude_patterns: list[str] = Field(default_factory=lambda: [
2930
"**/node_modules/**",

src/code_intelligence/detectors/config/__init__.py

Whitespace-only changes.
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
"""Batch script (.bat/.cmd) structure detector for labels, calls, and variables."""
2+
3+
from __future__ import annotations
4+
5+
import re
6+
7+
from code_intelligence.detectors.base import DetectorContext, DetectorResult
8+
from code_intelligence.models.graph import (
9+
EdgeKind,
10+
GraphEdge,
11+
GraphNode,
12+
NodeKind,
13+
SourceLocation,
14+
)
15+
16+
# Label definition: a line that begins with ":" followed by the label name.
_LABEL_RE = re.compile(r'^:(\w+)', re.MULTILINE)
# CALL command.  The optional leading ":" is kept inside the capture group so
# that label calls (CALL :name) can be told apart from external calls
# (CALL script.bat); \b stops words such as "RECALL" from matching.
_CALL_RE = re.compile(r'\bCALL\s+(:?\S+)', re.IGNORECASE)
# SET assignment, including the quoted form (SET "NAME=value") and the /A and
# /P switches; \b stops words such as "RESET" or "OFFSET" from matching.
_SET_RE = re.compile(r'\bSET\s+(?:/[AP]\s+)?"?(\w+)=', re.IGNORECASE)


class BatchStructureDetector:
    """Detects Batch script structures: labels, CALL commands, and SET variables.

    For every script a MODULE node is emitted.  Each label becomes a METHOD
    node linked to the module by a CONTAINS edge, each CALL becomes a CALLS
    edge from the module, and each SET assignment becomes a CONFIG_DEFINITION
    node.
    """

    name: str = "batch_structure"
    supported_languages: tuple[str, ...] = ("batch",)

    @staticmethod
    def _is_ignorable(stripped: str) -> bool:
        """Return True for blank lines, ``@ECHO OFF`` lines and comments.

        Comments are lines starting with ``::`` or the ``REM`` command,
        optionally prefixed with ``@``.
        """
        if not stripped or stripped.startswith("::"):
            return True
        upper = stripped.upper()
        if upper.startswith("@ECHO OFF"):
            return True
        # A leading "@" only suppresses command echoing, so "@REM" is still a
        # comment and must not be scanned for CALL/SET.
        command = upper.lstrip("@")
        return command == "REM" or command.startswith(("REM ", "REM\t"))

    @staticmethod
    def _call_target_id(filepath: str, raw_target: str) -> tuple[str, str]:
        """Resolve the operand of a CALL command.

        Args:
            filepath: Path of the script that contains the CALL.
            raw_target: First operand of CALL, including a leading ":" if the
                script wrote one.

        Returns:
            A ``(target_id, display_name)`` pair.  ``CALL :name`` resolves to
            the id of the label node in the same file; anything else is an
            external program or script and is referenced by its raw name.
        """
        if raw_target.startswith(":"):
            label_name = raw_target.lstrip(":")
            return f"bat:{filepath}:label:{label_name}", label_name
        return raw_target, raw_target

    def detect(self, ctx: DetectorContext) -> DetectorResult:
        """Scan ``ctx.content`` line by line and collect nodes and edges.

        Args:
            ctx: Detector context; ``content``, ``file_path`` and
                ``module_name`` are read.

        Returns:
            A ``DetectorResult`` holding the module node plus one entry per
            label, CALL and SET found.  An empty result is returned when the
            content cannot be decoded.
        """
        result = DetectorResult()

        try:
            text = ctx.content.decode("utf-8", errors="replace")
        except Exception:
            # Best effort: undecodable content contributes nothing.
            return result

        filepath = ctx.file_path
        module_id = f"bat:{filepath}"

        # MODULE node for the script
        result.nodes.append(GraphNode(
            id=module_id,
            kind=NodeKind.MODULE,
            label=filepath,
            fqn=filepath,
            module=ctx.module_name,
            location=SourceLocation(
                file_path=filepath,
                line_start=1,
            ),
        ))

        for line_num, line in enumerate(text.split("\n"), start=1):
            stripped = line.strip()
            if self._is_ignorable(stripped):
                continue

            # Labels
            m = _LABEL_RE.match(stripped)
            if m:
                label_name = m.group(1)
                label_id = f"bat:{filepath}:label:{label_name}"
                result.nodes.append(GraphNode(
                    id=label_id,
                    kind=NodeKind.METHOD,
                    label=f":{label_name}",
                    fqn=f"{filepath}:{label_name}",
                    module=ctx.module_name,
                    location=SourceLocation(
                        file_path=filepath,
                        line_start=line_num,
                    ),
                ))
                result.edges.append(GraphEdge(
                    source=module_id,
                    target=label_id,
                    kind=EdgeKind.CONTAINS,
                    label=f"{filepath} contains :{label_name}",
                ))
                continue

            # CALL commands: only "CALL :name" targets a label in this file;
            # a CALL without the colon runs an external program or script.
            m = _CALL_RE.search(stripped)
            if m:
                target_id, call_target = self._call_target_id(filepath, m.group(1))
                result.edges.append(GraphEdge(
                    source=module_id,
                    target=target_id,
                    kind=EdgeKind.CALLS,
                    label=f"{filepath} calls {call_target}",
                ))

            # SET variables (a line may contain both a CALL and a SET)
            m = _SET_RE.search(stripped)
            if m:
                var_name = m.group(1)
                result.nodes.append(GraphNode(
                    id=f"bat:{filepath}:set:{var_name}",
                    kind=NodeKind.CONFIG_DEFINITION,
                    label=f"SET {var_name}",
                    fqn=f"{filepath}:{var_name}",
                    module=ctx.module_name,
                    location=SourceLocation(
                        file_path=filepath,
                        line_start=line_num,
                    ),
                    properties={"variable": var_name},
                ))

        return result
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
"""Docker Compose detector for container orchestration definitions."""
2+
3+
from __future__ import annotations
4+
5+
import os
6+
import re
7+
from typing import Any
8+
9+
from code_intelligence.detectors.base import DetectorContext, DetectorResult
10+
from code_intelligence.models.graph import (
11+
EdgeKind,
12+
GraphEdge,
13+
GraphNode,
14+
NodeKind,
15+
SourceLocation,
16+
)
17+
18+
# Matches basenames that start with "docker-compose" or "compose" and end in
# ".yml" or ".yaml", ignoring case (e.g. "docker-compose.override.yml").
# NOTE(review): the ".*" also admits names such as "composer.yaml" — confirm
# that this is intended.
_COMPOSE_FILENAME_RE = re.compile(
    r"^(docker-compose|compose).*\.(yml|yaml)$", re.IGNORECASE
)
21+
22+
23+
def _is_compose_file(ctx: DetectorContext) -> bool:
    """Decide whether ``ctx`` refers to a Docker Compose file.

    The basename of ``ctx.file_path`` is tried against the compose filename
    pattern first.  Failing that, the file still qualifies when its parsed
    data is tagged ``"yaml"`` and the document is a mapping with a
    ``"services"`` key.
    """
    file_name = os.path.basename(ctx.file_path)
    if _COMPOSE_FILENAME_RE.match(file_name) is not None:
        return True

    # Fallback: inspect the parsed document for a compose-like structure.
    parsed = ctx.parsed_data
    if not parsed or parsed.get("type") != "yaml":
        return False

    document = parsed.get("data")
    return isinstance(document, dict) and "services" in document
34+
35+
36+
class DockerComposeDetector:
    """Detects services, ports, volumes, networks, and dependencies from Docker Compose files.

    Each service defined as a mapping becomes an INFRA_RESOURCE node.  Its
    ports, volumes and networks become CONFIG_KEY nodes, ``depends_on``
    entries become DEPENDS_ON edges and ``links`` entries become CONNECTS_TO
    edges.  Edges are only emitted towards services that themselves produce
    a node.
    """

    name: str = "docker_compose"
    supported_languages: tuple[str, ...] = ("yaml",)

    @staticmethod
    def _port_label(port_entry: Any) -> str:
        """Render a ``ports`` entry as text.

        Short-syntax entries (strings or numbers) are stringified as-is.
        Long-syntax mappings that carry a ``target`` are rendered in the
        short form ``[published:]target[/protocol]``; any other value falls
        back to ``str()``.
        """
        if not isinstance(port_entry, dict) or "target" not in port_entry:
            return str(port_entry)
        parts = []
        if port_entry.get("published") is not None:
            parts.append(str(port_entry["published"]))
        parts.append(str(port_entry["target"]))
        text = ":".join(parts)
        protocol = port_entry.get("protocol")
        if protocol:
            text = f"{text}/{protocol}"
        return text

    @staticmethod
    def _volume_label(vol_entry: Any) -> str:
        """Render a ``volumes`` entry as text.

        Long-syntax mappings are represented by their ``source`` value when
        present; everything else is stringified as-is.
        """
        if isinstance(vol_entry, dict):
            return str(vol_entry.get("source", vol_entry))
        return str(vol_entry)

    @staticmethod
    def _config_key_node(
        ctx: DetectorContext, svc_name: Any, category: str, value: str
    ) -> GraphNode:
        """Build the CONFIG_KEY node for one port/volume/network of a service."""
        fp = ctx.file_path
        return GraphNode(
            id=f"compose:{fp}:service:{svc_name}:{category}:{value}",
            kind=NodeKind.CONFIG_KEY,
            label=f"{svc_name} {category} {value}",
            module=ctx.module_name,
            location=SourceLocation(file_path=fp),
            properties={category: value},
        )

    def detect(self, ctx: DetectorContext) -> DetectorResult:
        """Extract compose services and their settings from ``ctx.parsed_data``.

        Args:
            ctx: Detector context; ``file_path``, ``module_name`` and
                ``parsed_data`` are read.

        Returns:
            A ``DetectorResult`` with the nodes and edges described on the
            class.  The result is empty when ``_is_compose_file`` rejects the
            file, when there is no parsed data, or when the document has no
            ``services`` mapping.
        """
        result = DetectorResult()

        if not _is_compose_file(ctx):
            return result

        if not ctx.parsed_data:
            return result

        data = ctx.parsed_data.get("data")
        if not isinstance(data, dict):
            return result

        services = data.get("services")
        if not isinstance(services, dict):
            return result

        fp = ctx.file_path

        # Known service IDs for edge resolution.  Only services defined as a
        # mapping get a node below, so only those may be edge targets;
        # otherwise an edge could point at a node that is never emitted.
        service_ids: dict[str, str] = {
            svc_name: f"compose:{fp}:service:{svc_name}"
            for svc_name, svc_def in services.items()
            if isinstance(svc_def, dict)
        }

        for svc_name, svc_def in services.items():
            if not isinstance(svc_def, dict):
                continue

            svc_id = service_ids[svc_name]

            # Properties for the service node
            props: dict[str, Any] = {}
            if "image" in svc_def:
                props["image"] = str(svc_def["image"])
            build = svc_def.get("build")
            if isinstance(build, str):
                props["build_context"] = build
            elif isinstance(build, dict) and "context" in build:
                props["build_context"] = str(build["context"])

            # INFRA_RESOURCE node for the service
            result.nodes.append(GraphNode(
                id=svc_id,
                kind=NodeKind.INFRA_RESOURCE,
                label=svc_name,
                fqn=f"compose:{svc_name}",
                module=ctx.module_name,
                location=SourceLocation(file_path=fp),
                properties=props,
            ))

            # Ports
            ports = svc_def.get("ports")
            if isinstance(ports, list):
                for port_entry in ports:
                    result.nodes.append(self._config_key_node(
                        ctx, svc_name, "port", self._port_label(port_entry),
                    ))

            # depends_on: either a list of names or a mapping keyed by name
            depends_on = svc_def.get("depends_on")
            if isinstance(depends_on, list):
                dep_names = [str(dep) for dep in depends_on]
            elif isinstance(depends_on, dict):
                dep_names = list(depends_on)
            else:
                dep_names = []
            for dep_name in dep_names:
                if dep_name in service_ids:
                    result.edges.append(GraphEdge(
                        source=svc_id,
                        target=service_ids[dep_name],
                        kind=EdgeKind.DEPENDS_ON,
                        label=f"{svc_name} depends on {dep_name}",
                    ))

            # links: "service" or "service:alias" — only the service part
            # identifies the target.
            links = svc_def.get("links")
            if isinstance(links, list):
                for link in links:
                    link_name = str(link).split(":")[0]
                    if link_name in service_ids:
                        result.edges.append(GraphEdge(
                            source=svc_id,
                            target=service_ids[link_name],
                            kind=EdgeKind.CONNECTS_TO,
                            label=f"{svc_name} links to {link_name}",
                        ))

            # Volumes
            volumes = svc_def.get("volumes")
            if isinstance(volumes, list):
                for vol_entry in volumes:
                    result.nodes.append(self._config_key_node(
                        ctx, svc_name, "volume", self._volume_label(vol_entry),
                    ))

            # Networks: a list of names or a mapping keyed by name; iterating
            # either one yields the network names.
            networks = svc_def.get("networks")
            if isinstance(networks, (list, dict)):
                for net in networks:
                    result.nodes.append(self._config_key_node(
                        ctx, svc_name, "network", str(net),
                    ))

        return result

0 commit comments

Comments
 (0)