From 71cc56be9aa02fda8fe2b00cbc18975067a25f64 Mon Sep 17 00:00:00 2001 From: Asif Sayyed Date: Sat, 6 Jun 2026 11:42:43 +0530 Subject: [PATCH 1/2] update the doc, script and workflow --- .github/workflows/theseus-engine.yml | 5 +---- docs/CONFIGURATION.md | 2 +- scripts/run_pipeline.py | 14 +++++++++----- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/theseus-engine.yml b/.github/workflows/theseus-engine.yml index b433db7..8421204 100644 --- a/.github/workflows/theseus-engine.yml +++ b/.github/workflows/theseus-engine.yml @@ -52,7 +52,7 @@ jobs: - name: Run pipeline for ${{ matrix.repo }} id: pipeline continue-on-error: true - run: poetry run python scripts/run_pipeline.py --repo "${{ matrix.repo }}" --update-survivor + run: poetry run python -m scripts.run_pipeline --repo "${{ matrix.repo }}" --update-survivor timeout-minutes: 120 - name: Push data to shared branch @@ -151,9 +151,6 @@ jobs: SAVE_DIR=$(mktemp -d) cp -r data/* "$SAVE_DIR"/ 2>/dev/null || true git checkout origin/main - # Full reset: remove everything from index and all untracked/ignored files - git rm -rf --cached . >/dev/null 2>&1 || true - git clean -fdx >/dev/null 2>&1 || true mkdir -p data/raw data/processed data/.status cp -r "$SAVE_DIR"/* data/ 2>/dev/null || true rm -rf "$SAVE_DIR" diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 09246a8..8e8665f 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -72,7 +72,7 @@ Paste this template into the `repositories` array in `theseus.config.json`: Then run the pipeline to generate the data: ```bash -python scripts/run_pipeline.py --repo REPO-NAME +python -m scripts.run_pipeline --repo REPO-NAME ``` This single command clones the repository, runs quarterly/monthly snapshot analysis, discovers both genesis and survivor fossils, and writes two files: diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py index 4924845..4b771a0 100644 --- a/scripts/run_pipeline.py +++ b/scripts/run_pipeline.py @@ -1,6 +1,10 @@ """ Unified orchestration script for the Theseus data pipeline. +Usage:: + + python -m scripts.run_pipeline [--repo NAME] [--reprocess YYYY-MM] [--update-survivor] + Runs all three stages in sequence on one or more repositories: 1. **Analyse** (snapshot generation via ``analyse_repository``) @@ -30,10 +34,10 @@ import sys import time -import _path_guard # noqa: F401 # pylint: disable=unused-import +import scripts._path_guard # noqa: F401 # pylint: disable=unused-import -from _utils import load_config -from cleanup_data import cleanup_data as run_cleanup +from scripts._utils import load_config +from scripts.cleanup_data import cleanup_data as run_cleanup logger = logging.getLogger(__name__) @@ -78,7 +82,7 @@ def run_pipeline( # ── Stage 1: Analyse ────────────────────────────────────────────── logger.info("═══ STAGE 1: Snapshot analysis ═══") - from analyse_repository import ( + from scripts.analyse_repository import ( process_repository, ) @@ -98,7 +102,7 @@ def run_pipeline( had_failures = True # ── Stage 2: Fossils ─────────────────────────────────────────────── - from add_fossils import backfill_fossils, update_survivor_fossils + from scripts.add_fossils import backfill_fossils, update_survivor_fossils repo_urls = { r["name"]: f"https://github.com/{r['repo']}.git" From 044f1b478d6493620345c5ce021fab5ff694b1c5 Mon Sep 17 00:00:00 2001 From: Asif Sayyed Date: Sat, 6 Jun 2026 12:40:57 +0530 Subject: [PATCH 2/2] fix: pass matrix.repo via env, preserve dotfiles in ancestry fix - Pass matrix.repo via REPO_NAME env var instead of inline template interpolation for defense-in-depth (no shell expansion risk) - Replace cp -r data/* with cp -a data/. to preserve dotfiles like data/.status during orphaned-branch ancestry fix --- .github/workflows/theseus-engine.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/theseus-engine.yml b/.github/workflows/theseus-engine.yml index 8421204..dc794f5 100644 --- a/.github/workflows/theseus-engine.yml +++ b/.github/workflows/theseus-engine.yml @@ -52,7 +52,9 @@ jobs: - name: Run pipeline for ${{ matrix.repo }} id: pipeline continue-on-error: true - run: poetry run python -m scripts.run_pipeline --repo "${{ matrix.repo }}" --update-survivor + env: + REPO_NAME: ${{ matrix.repo }} + run: poetry run python -m scripts.run_pipeline --repo "$REPO_NAME" --update-survivor timeout-minutes: 120 - name: Push data to shared branch @@ -149,10 +151,10 @@ jobs: fi echo "Shared branch has orphaned history. Rebasing onto main..." SAVE_DIR=$(mktemp -d) - cp -r data/* "$SAVE_DIR"/ 2>/dev/null || true + cp -a data/. "$SAVE_DIR"/ 2>/dev/null || true git checkout origin/main mkdir -p data/raw data/processed data/.status - cp -r "$SAVE_DIR"/* data/ 2>/dev/null || true + cp -a "$SAVE_DIR"/. data/ 2>/dev/null || true rm -rf "$SAVE_DIR" git add data/ git -c user.name="github-actions[bot]" \