Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 33 additions & 65 deletions .github/workflows/theseus-engine.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,14 @@ jobs:
- uses: actions/checkout@v4
- id: extract
run: |
REPOS=$(python -c '
import json
with open("theseus.config.json") as f:
config = json.load(f)
names = [r["name"] for r in config.get("repositories", [])]
print(json.dumps(names))
')
REPOS=$(python scripts/workflow.py discover-repos)
echo "repos=$REPOS" >> "$GITHUB_OUTPUT"

analyze:
needs: discover-repos
runs-on: ubuntu-latest
permissions:
contents: write
strategy:
fail-fast: false
matrix:
Expand Down Expand Up @@ -78,6 +74,7 @@ jobs:
> /tmp/data-save/status.json

git reset --hard HEAD 2>/dev/null || true
# Discard generated data but keep gitignored files (-fd, no -x)
git clean -fd 2>/dev/null || true

git fetch origin chore/monthly-data-update 2>/dev/null || true
Expand Down Expand Up @@ -143,6 +140,33 @@ jobs:
echo "has_branch=false" >> "$GITHUB_OUTPUT"
fi

- name: Fix shared branch ancestry
  if: steps.shared.outputs.has_branch == 'true'
  run: |
    # Purpose: if the checked-out shared data branch does not descend from
    # origin/main, rebuild it as a single data-only commit on top of
    # origin/main and force-push the result.
    #
    # `--is-ancestor` succeeds only when the tip of origin/main is reachable
    # from HEAD, so a branch that merely lags behind main is rebuilt as well.
    if git merge-base --is-ancestor origin/main HEAD 2>/dev/null; then
      echo "Shared branch has common history with main."
      exit 0
    fi
    echo "Shared branch has orphaned history. Rebasing onto main..."
    SAVE_DIR=$(mktemp -d)
    # Copy "data/." rather than "data/*": the `*` glob skips dot-entries, so
    # data/.status (the per-repo status markers) would otherwise be dropped
    # and lost once the working tree is wiped below.
    cp -r data/. "$SAVE_DIR"/ 2>/dev/null || true
    git checkout origin/main
    # Full reset: remove everything from index and all untracked/ignored files
    # NOTE(review): only data/ is re-added afterwards, so the commit created
    # below has a tree that holds nothing but data/ - confirm this is the
    # intended layout for the shared branch.
    git rm -rf --cached . >/dev/null 2>&1 || true
    git clean -fdx >/dev/null 2>&1 || true
    mkdir -p data/raw data/processed data/.status
    # Restore with the same "/." form so hidden entries come back too.
    cp -r "$SAVE_DIR"/. data/ 2>/dev/null || true
    rm -rf "$SAVE_DIR"
    git add data/
    git -c user.name="github-actions[bot]" \
      -c user.email="41898282+github-actions[bot]@users.noreply.github.com" \
      commit -m "fix: rebase shared data onto main"
    # Force-push is safe here: create-pr is the only job at this point,
    # and the orphaned branch must be replaced to share history with main
    git push origin HEAD:chore/monthly-data-update --force
    # Re-sync the local branch with what was just pushed so later steps
    # operate on the rebuilt history.
    git fetch origin chore/monthly-data-update
    git checkout -B chore/monthly-data-update origin/chore/monthly-data-update

- name: Check for status markers
id: check
if: steps.shared.outputs.has_branch == 'true'
Expand All @@ -155,67 +179,11 @@ jobs:

- name: Build PR body
if: steps.check.outputs.has_data == 'true'
run: |
python << 'PYEOF'
import json, os, glob

status_dir = "data/.status"
statuses = {}
for f in sorted(glob.glob(os.path.join(status_dir, "*.json"))):
with open(f) as fh:
s = json.load(fh)
statuses[s["repo"]] = s["status"]

total = len(statuses)
passed = sum(1 for v in statuses.values() if v == "success")

rows = "\n".join(
f"| {repo} | {'✅' if s == 'success' else '❌'} |"
for repo, s in sorted(statuses.items())
)

header = "## Automated Theseus Data Engine Run\n\n"
table = "| Repo | Status |\n|------|--------|\n"
total_row = f"| **Total** | **{passed}/{total} completed** |\n\n"
footer = ("This pull request contains the latest pre-computed "
"persistence data for the tracked repositories.\n\n"
"**Trigger:** Monthly Schedule / Workflow Dispatch")
body = header + table + rows + "\n" + total_row + footer
with open("pr-body.md", "w") as f:
f.write(body)
PYEOF
run: python scripts/workflow.py build-pr-body

- name: Validate graph files
if: steps.check.outputs.has_data == 'true'
run: |
python << 'PYEOF'
import json, glob, sys

files = sorted(glob.glob("data/processed/*.json"))
if not files:
print("No processed files found to validate.")
sys.exit(1)

errors = 0
for f in files:
try:
with open(f) as fh:
data = json.load(fh)
assert "snapshots" in data, f"Missing snapshots in {f}"
assert "fossils" in data, f"Missing fossils in {f}"
for snap in data["snapshots"]:
assert "snapshot_date" in snap, f"Missing snapshot_date in {f}"
assert "composition" in snap, f"Missing composition in {f}"
print(f" ✓ {f}")
except (json.JSONDecodeError, AssertionError, KeyError) as e:
print(f" ✗ {f}: {e}")
errors += 1

if errors:
print(f"Validation failed: {errors} error(s)")
sys.exit(1)
print("All graph files validated.")
PYEOF
run: python scripts/workflow.py validate-graph-files

- name: Create or update pull request
if: steps.check.outputs.has_data == 'true'
Expand Down
20 changes: 15 additions & 5 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,23 @@ jobs:
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@v4
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
with:
persist-credentials: false

- name: Setup Python and Poetry
uses: ./.github/actions/setup-python-poetry
- name: Set up Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
with:
poetry-install-args: --with dev
python-version: "3.12"
cache: pip

- name: Install Poetry
run: pipx install poetry
shell: bash

- name: Install dependencies
run: poetry install --with dev
shell: bash

- name: Run linter
run: poetry run pylint scripts/ --output-format=colorized
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ dev = [
"pylint>=4.0.5,<5.0.0"
]

[tool.pylint]
init-hook = "import sys; sys.path.insert(0, 'scripts')"

[tool.pylint.format]
max-line-length = 120

Expand Down
14 changes: 14 additions & 0 deletions scripts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""
Ensure the scripts directory is on sys.path for sibling imports.

When ``scripts`` is imported as a package (e.g. ``from scripts._blame import ...``
from tests), this adds the package directory to ``sys.path`` so that subsequent
``import _path_guard`` calls from sibling modules resolve correctly.
"""

import sys
from pathlib import Path

# Absolute, symlink-resolved path of the directory that contains this file.
_SCRIPTS_DIR = str(Path(__file__).resolve().parent)
# Membership check keeps the entry from being inserted more than once;
# index 0 puts this directory ahead of the other sys.path entries.
if _SCRIPTS_DIR not in sys.path:
    sys.path.insert(0, _SCRIPTS_DIR)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
9 changes: 2 additions & 7 deletions scripts/_blame.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,12 @@

import concurrent.futures
import logging
import os
import sys
import threading
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

# Ensure sibling imports work in all invocation contexts
_SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
if _SCRIPTS_DIR not in sys.path:
sys.path.insert(0, _SCRIPTS_DIR)
import _path_guard # noqa: F401 # pylint: disable=unused-import

from _utils import run_command

Expand Down Expand Up @@ -269,7 +264,7 @@ def blame_year_counts(self, files: list[str]) -> dict[str, int]:
logger.info(" Blaming %d files (%d workers)...", len(files), self.max_workers)
age_distribution: dict[str, int] = defaultdict(int)

def _accumulate(file_path: str, raw_output: str) -> None:
def _accumulate(_file_path: str, raw_output: str) -> None:
for year, count in parse_blame_year_counts(raw_output).items():
age_distribution[year] += count

Expand Down
24 changes: 11 additions & 13 deletions scripts/_data_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,12 @@

import json
import logging
import os
from pathlib import Path

logger = logging.getLogger(__name__)


# TODO: Move away from OS to Pathlib
def load_snapshot_data(file_path: str) -> dict:
def load_snapshot_data(file_path: str | Path) -> dict:
"""
Load snapshot data from a JSON file, normalising to ``{snapshots, fossils}``.

Expand All @@ -59,12 +58,12 @@ def load_snapshot_data(file_path: str) -> dict:
:param file_path: Path to the JSON data file.
:return: Dictionary with ``snapshots`` (list) and ``fossils`` (dict) keys.
"""
if not os.path.exists(file_path):
path = Path(file_path)
if not path.exists():
return {"snapshots": [], "fossils": {}}

try:
with open(file_path, "r", encoding="utf-8") as f:
data = json.load(f)
data = json.loads(path.read_text(encoding="utf-8"))
if isinstance(data, list):
return {"snapshots": data, "fossils": {}}
if isinstance(data, dict):
Expand All @@ -81,20 +80,19 @@ def load_snapshot_data(file_path: str) -> dict:
return {"snapshots": [], "fossils": {}}


# TODO: Move away from OS to Pathlib
def save_snapshot_data(file_path: str, snapshots: list[dict], fossils: dict) -> None:
def save_snapshot_data(file_path: str | Path, snapshots: list[dict], fossils: dict) -> None:
"""
Atomically write snapshot data to a minified JSON file.

Writes to a ``.tmp`` sibling first, then atomically replaces the target
via ``os.replace`` to prevent file corruption on crash.
to prevent file corruption on crash.

:param file_path: Destination path.
:param snapshots: List of snapshot objects.
:param fossils: Fossil dictionary (``genesis`` + ``survivor`` keys).
"""
tmp_path = file_path + ".tmp"
path = Path(file_path)
tmp_path = path.with_suffix(path.suffix + ".tmp")
data = {"snapshots": snapshots, "fossils": fossils}
with open(tmp_path, "w", encoding="utf-8") as f:
json.dump(data, f, separators=(",", ":"))
os.replace(tmp_path, file_path)
tmp_path.write_text(json.dumps(data, separators=(",", ":")), encoding="utf-8")
tmp_path.replace(path)
17 changes: 17 additions & 0 deletions scripts/_path_guard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""
Ensure the scripts directory is on sys.path for sibling imports.

Every script in this directory should ``import _path_guard`` (with a
``# noqa: F401`` comment if the linter complains) before importing any
sibling module. This is a no-op when the script's directory is already
on ``sys.path`` (the normal case for ``python scripts/foo.py``), but
guarantees correctness for ``python -m scripts.foo`` and test-runner
invocations.
"""

import sys
from pathlib import Path

# Absolute, symlink-resolved path of the directory that contains this file.
_SCRIPTS_DIR = str(Path(__file__).resolve().parent)
# Only touch sys.path when the directory is missing, so importing this
# module is a no-op when the entry already exists; index 0 places it
# ahead of the other entries.
if _SCRIPTS_DIR not in sys.path:
    sys.path.insert(0, _SCRIPTS_DIR)
6 changes: 4 additions & 2 deletions scripts/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ def remove_path(path: str) -> None:
["cmd", "/c", "rd", "/s", "/q", path],
capture_output=True,
timeout=30,
check=False,
)
if not os.path.exists(path):
return
Expand All @@ -226,14 +227,15 @@ def remove_path(path: str) -> None:
["rm", "-rf", path],
capture_output=True,
timeout=30,
check=False,
)
if not os.path.exists(path):
return
except (subprocess.SubprocessError, OSError):
pass

# Fallback: retry with shutil.rmtree, fixing permissions on each retry
def handle_remove_readonly(func, path, _exc_info):
def _handle_remove_readonly(func, path, _exc):
try:
current_mode = os.stat(path).st_mode
os.chmod(
Expand All @@ -248,7 +250,7 @@ def handle_remove_readonly(func, path, _exc_info):

for attempt in range(3):
try:
shutil.rmtree(path, onerror=handle_remove_readonly)
shutil.rmtree(path, onexc=_handle_remove_readonly)
break
except Exception: # noqa: BLE001
if attempt < 2:
Expand Down
Loading
Loading