add: builder skill#19
Conversation
There was a problem hiding this comment.
Code Review
This pull request adds the aimx-hydra-lightning-builder skill, enabling users to scaffold, audit, and migrate machine learning repositories using Hydra, PyTorch Lightning, and Aim. It includes a project template, architectural guides, and scripts for repository analysis. The review feedback identified several optimization opportunities in the audit_repo.py script, including improving memory efficiency for file reading, optimizing directory traversal by pruning ignored paths, and reducing redundant I/O through content caching.
| def read_text(path: Path, limit: int = 300_000) -> str: | ||
| try: | ||
| return path.read_text(encoding="utf-8", errors="ignore")[:limit] | ||
| except OSError: | ||
| return "" |
There was a problem hiding this comment.
The current implementation of read_text reads the entire file into memory before slicing it. This can lead to high memory usage or even OOM errors if the repository contains very large text files (e.g., logs or datasets). It is more efficient to read only up to the specified limit directly from the file stream.
| def read_text(path: Path, limit: int = 300_000) -> str: | |
| try: | |
| return path.read_text(encoding="utf-8", errors="ignore")[:limit] | |
| except OSError: | |
| return "" | |
| def read_text(path: Path, limit: int = 300_000) -> str: | |
| try: | |
| with path.open("r", encoding="utf-8", errors="ignore") as f: | |
| return f.read(limit) | |
| except OSError: | |
| return "" |
| def iter_text_files(repo: Path) -> list[Path]: | ||
| files: list[Path] = [] | ||
| for path in repo.rglob("*"): | ||
| if any(part in IGNORED_PARTS for part in path.parts): | ||
| continue | ||
| if path.is_file() and path.suffix in TEXT_EXTENSIONS: | ||
| files.append(path) | ||
| return files |
There was a problem hiding this comment.
Using repo.rglob("*") and then checking path.parts for every file is inefficient because it traverses into ignored directories (like .git or .venv) before filtering them out. For large repositories, this can significantly slow down the audit. Using os.walk allows you to prune these directories and avoid unnecessary traversal.
| def iter_text_files(repo: Path) -> list[Path]: | |
| files: list[Path] = [] | |
| for path in repo.rglob("*"): | |
| if any(part in IGNORED_PARTS for part in path.parts): | |
| continue | |
| if path.is_file() and path.suffix in TEXT_EXTENSIONS: | |
| files.append(path) | |
| return files | |
| def iter_text_files(repo: Path) -> list[Path]: | |
| import os | |
| files: list[Path] = [] | |
| for root, dirs, filenames in os.walk(repo): | |
| dirs[:] = [d for d in dirs if d not in IGNORED_PARTS] | |
| for filename in filenames: | |
| path = Path(root) / filename | |
| if path.suffix in TEXT_EXTENSIONS: | |
| files.append(path) | |
| return files |
| files = iter_text_files(repo) | ||
| rel_files = {rel(repo, path) for path in files} | ||
| configs_dir = repo / "configs" | ||
| config_groups = sorted(path.name for path in configs_dir.iterdir() if path.is_dir()) if configs_dir.exists() else [] |
There was a problem hiding this comment.
Using configs_dir.exists() is not sufficient because iterdir() will raise a NotADirectoryError if configs_dir exists but is a file. Using is_dir() is safer.
| config_groups = sorted(path.name for path in configs_dir.iterdir() if path.is_dir()) if configs_dir.exists() else [] | |
| config_groups = sorted(path.name for path in configs_dir.iterdir() if path.is_dir()) if configs_dir.is_dir() else [] |
| hydra_entrypoints = grep(py_files, r"@hydra\.main|hydra\.main\(") | ||
| lightning_modules = grep(py_files, r"LightningModule|lightning\.LightningModule|L\.LightningModule") | ||
| lightning_datamodules = grep(py_files, r"LightningDataModule|lightning\.LightningDataModule") | ||
| trainer_usage = grep(py_files, r"\bTrainer\b|trainer\.fit|trainer\.validate|trainer\.test") | ||
| aim_logger_configs = grep(yaml_files, r"aim\.pytorch_lightning\.AimLogger") | ||
| direct_aim_tracks = grep(py_files, r"experiment\.track|aim_run\.track|from aim import") | ||
| scalar_logs = grep(py_files, r"\bself\.log\(") | ||
| hparam_logs = grep(py_files, r"log_hyperparams|log_hyperparameters") |
There was a problem hiding this comment.
The audit function calls grep multiple times, and each grep call re-reads the files from disk. This results in redundant I/O operations. It is more efficient to read the file contents once and then perform the regex searches on the cached strings.
py_contents = {p: read_text(p) for p in py_files}
yaml_contents = {p: read_text(p) for p in yaml_files}
def find(contents: dict[Path, str], pattern: str) -> list[Path]:
regex = re.compile(pattern)
return [p for p, text in contents.items() if regex.search(text)]
hydra_entrypoints = find(py_contents, r"@hydra\.main|hydra\.main\(")
lightning_modules = find(py_contents, r"LightningModule|lightning\.LightningModule|L\.LightningModule")
lightning_datamodules = find(py_contents, r"LightningDataModule|lightning\.LightningDataModule")
trainer_usage = find(py_contents, r"\bTrainer\b|trainer\.fit|trainer\.validate|trainer\.test")
aim_logger_configs = find(yaml_contents, r"aim\.pytorch_lightning\.AimLogger")
direct_aim_tracks = find(py_contents, r"experiment\.track|aim_run\.track|from aim import")
scalar_logs = find(py_contents, r"\bself\.log\(")
hparam_logs = find(py_contents, r"log_hyperparams|log_hyperparameters")
No description provided.