TheWizardsCode · SorraTheOrc · Dec 5, 2025 · Dec 5, 2025 · Dec 5, 2025 · Dec 5, 2025
diff --git a/.github/workflows/balance-validation.yml b/.github/workflows/balance-validation.yml
@@ -0,0 +1,280 @@
+name: Balance Validation
+
+# Triggers: nightly schedule, manual dispatch, and pushes / pull requests
+# against main that touch simulation code, AI players, content or the
+# sweep/analysis scripts.
+on:
+  schedule:
+    # Run nightly at 3:00 AM UTC (offset from ai-tournament at 2:00 AM)
+    - cron: '0 3 * * *'
+  workflow_dispatch:
+    # Inputs are typed so the "Run workflow" form validates them; previously a
+    # free-text value such as "True" or "yes" silently disabled the comparison
+    # or the failure gate. `github.event.inputs.*` still exposes these values
+    # as the strings 'true' / 'false' / '5', which is what the steps compare
+    # against.
+    inputs:
+      compare_baseline:
+        description: 'Compare against baseline (true/false)'
+        type: boolean
+        required: false
+        default: true
+      fail_on_regression:
+        description: 'Fail workflow on regression detection (true/false)'
+        type: boolean
+        required: false
+        default: true
+      stability_threshold:
+        description: 'Stability delta threshold for regression (percentage)'
+        type: number
+        required: false
+        default: 5
+  push:
+    branches:
+      - main
+    paths:
+      # Trigger on changes to simulation code, content, or AI systems
+      - 'src/gengine/echoes/sim/**'
+      - 'src/gengine/echoes/core/**'
+      - 'src/gengine/ai_player/**'
+      - 'content/worlds/**'
+      - 'content/config/**'
+      - 'scripts/run_batch_sweeps.py'
+      - 'scripts/analyze_balance.py'
+  pull_request:
+    branches:
+      - main
+    paths:
+      # Same path filter as the push trigger above
+      - 'src/gengine/echoes/sim/**'
+      - 'src/gengine/echoes/core/**'
+      - 'src/gengine/ai_player/**'
+      - 'content/worlds/**'
+      - 'content/config/**'
+      - 'scripts/run_batch_sweeps.py'
+      - 'scripts/analyze_balance.py'
+
+permissions:  # workflow-level GITHUB_TOKEN scopes
+  contents: read  # read-only access to repository contents
+  pull-requests: write  # the PR summary step posts a comment via github.rest.issues.createComment
+
+jobs:
+  balance-validation:
+    # Runs the CI sweep grid, builds the report, and compares the result
+    # against the committed baseline.
+    runs-on: ubuntu-latest
+    outputs:
+      # Re-exported from the `compare` step so dependent jobs can read them
+      regression_detected: ${{ steps.compare.outputs.regression_detected }}
+      regression_summary: ${{ steps.compare.outputs.regression_summary }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      # Version is quoted so YAML keeps it a string rather than a float
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      # Editable install of the project together with its `dev` extra
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e '.[dev]'
+
+      # Later steps write their outputs under build/
+      - name: Create build directory
+        run: |
+          mkdir -p build
+
+      # Executes the sweep grid described by ci_balance_sweeps.yml and records
+      # a `sweep_completed` step output once the script has returned.
+      - name: Run CI balance sweeps
+        id: sweep
+        run: |
+          echo "Running reduced parameter grid balance sweeps..."
+          sweep_args=(
+            --config content/config/ci_balance_sweeps.yml
+            --output-dir build/ci_sweeps
+            --verbose
+          )
+          python scripts/run_batch_sweeps.py "${sweep_args[@]}"
+
+          echo "sweep_completed=true" >> "$GITHUB_OUTPUT"
+
+      # Ingests the sweep output directory into the results database.
+      # The folded scalar joins the lines below into one command line.
+      - name: Aggregate sweep results
+        run: >-
+          python scripts/aggregate_sweep_results.py
+          --database build/ci_sweep_results.db
+          ingest build/ci_sweeps
+          --verbose
+
+      # Runs the `report` subcommand against the same database.
+      - name: Generate balance report
+        run: >-
+          python scripts/analyze_balance.py
+          --database build/ci_sweep_results.db
+          report
+
+      # Compares the current sweep summary with the committed baseline and
+      # publishes two step outputs:
+      #   regression_detected - "true" or "false"
+      #   regression_summary  - single-line, human-readable summary
+      - name: Compare against baseline
+        id: compare
+        env:
+          COMPARE_BASELINE: ${{ github.event.inputs.compare_baseline || 'true' }}
+          STABILITY_THRESHOLD: ${{ github.event.inputs.stability_threshold || '5' }}
+        run: |
+          if [[ "$COMPARE_BASELINE" != "true" ]]; then
+            echo "Baseline comparison skipped by user input"
+            echo "regression_detected=false" >> "$GITHUB_OUTPUT"
+            echo "regression_summary=Baseline comparison skipped" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          # NOTE(review): if `compare` exits non-zero when it finds regressions,
+          # this step stops here and neither output is written - confirm the
+          # script's exit-code contract.
+          python scripts/manage_balance_baseline.py compare \
+            --current build/ci_sweeps/batch_sweep_summary.json \
+            --baseline content/baselines/balance_baseline.json \
+            --output build/regression_summary.json \
+            --stability-threshold "$STABILITY_THRESHOLD"
+
+          # Check if regressions were detected
+          if [[ -f build/regression_summary.json ]]; then
+            # jq reports length 0 when `.regressions` is absent (null)
+            REGRESSION_COUNT=$(jq '.regressions | length' build/regression_summary.json)
+            if [[ "$REGRESSION_COUNT" -gt 0 ]]; then
+              echo "regression_detected=true" >> "$GITHUB_OUTPUT"
+              SUMMARY=$(jq -r '.summary // "Regressions detected"' build/regression_summary.json)
+              # A `key=value` line in $GITHUB_OUTPUT must stay on one line, so
+              # drop carriage returns and turn newlines into spaces.
+              SUMMARY=${SUMMARY//$'\r'/}
+              SUMMARY=${SUMMARY//$'\n'/ }
+              echo "regression_summary=$SUMMARY" >> "$GITHUB_OUTPUT"
+            else
+              echo "regression_detected=false" >> "$GITHUB_OUTPUT"
+              echo "regression_summary=No regressions detected" >> "$GITHUB_OUTPUT"
+            fi
+          else
+            echo "regression_detected=false" >> "$GITHUB_OUTPUT"
+            echo "regression_summary=No baseline comparison performed" >> "$GITHUB_OUTPUT"
+          fi
+
+      # Best-effort chart rendering: runs even after earlier failures, and a
+      # failing chart command only prints a notice instead of failing the step.
+      - name: Generate comparison charts
+        if: always()
+        run: |
+          if ! python scripts/manage_balance_baseline.py chart \
+            --current build/ci_sweeps/batch_sweep_summary.json \
+            --baseline content/baselines/balance_baseline.json \
+            --output build/comparison_charts.png; then
+            echo "Chart generation skipped (missing baseline or data)"
+          fi
+
+      # Uploads whatever result files exist. Runs unless the run was cancelled,
+      # so sweep data and charts are still available for diagnosis when an
+      # earlier step failed (the chart step above already runs in that case).
+      - name: Archive balance validation results
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          # Run-scoped name; the update-baseline job downloads it by this name
+          name: balance-validation-${{ github.run_id }}
+          path: |
+            build/ci_sweeps/batch_sweep_summary.json
+            build/ci_sweep_results.db
+            build/balance_report.html
+            build/balance_report.json
+            build/regression_summary.json
+            build/comparison_charts.png
+          retention-days: 30
+
+      # Posts a markdown summary of the regression comparison on the pull
+      # request. Posting is best-effort: a failure to comment is reported as a
+      # warning so it cannot mask the validation result itself.
+      - name: Comment on PR with balance summary
+        if: github.event_name == 'pull_request'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+
+            // The report content is not rendered below; a readable, valid
+            // report file only gates whether a comment is posted at all.
+            try {
+              JSON.parse(fs.readFileSync('build/balance_report.json', 'utf8'));
+            } catch (e) {
+              console.log('Could not read balance report');
+              return;
+            }
+
+            // A missing or unreadable regression summary means no comparison ran.
+            let regressionSummary = {};
+            try {
+              regressionSummary = JSON.parse(fs.readFileSync('build/regression_summary.json', 'utf8'));
+            } catch (e) {
+              regressionSummary = { regressions: [], summary: 'No baseline comparison' };
+            }
+
+            const regressions = regressionSummary.regressions || [];
+            const hasRegressions = regressions.length > 0;
+
+            let body = `## 📊 Balance Validation Results\n\n`;
+
+            if (hasRegressions) {
+              body += `### ⚠️ Regressions Detected\n\n`;
+              for (const reg of regressions) {
+                const icon = reg.severity === 'failure' ? '❌' : '⚠️';
+                body += `- ${icon} **${reg.metric_name}**: ${reg.description}\n`;
+              }
+              body += `\n`;
+            } else {
+              body += `### ✅ No Regressions Detected\n\n`;
+            }
+
+            // Strategy stats
+            const strategyStats = regressionSummary.current_stats?.strategy_stats;
+            if (strategyStats) {
+              body += `### Strategy Win Rates\n\n`;
+              body += `| Strategy | Avg Stability | Win Rate |\n`;
+              body += `|----------|---------------|----------|\n`;
+              for (const [strategy, stats] of Object.entries(strategyStats)) {
+                // The status icon is derived from stability, so it is shown
+                // next to the stability value instead of in the Win Rate column.
+                const stabilityIcon = (stats.avg_stability || 0) >= 0.5 ? '✅' : '⚠️';
+                const avgStab = typeof stats.avg_stability === 'number'
+                  ? stats.avg_stability.toFixed(3)
+                  : 'N/A';
+                // NOTE(review): assumes each strategy entry carries a numeric
+                // `win_rate` as the baseline file does - confirm against the
+                // compare script's output. Printed unscaled.
+                const winRate = typeof stats.win_rate === 'number'
+                  ? stats.win_rate.toFixed(2)
+                  : 'N/A';
+                body += `| ${strategy} | ${avgStab} ${stabilityIcon} | ${winRate} |\n`;
+              }
+              body += `\n`;
+            }
+
+            body += `\n📁 [Download full report](${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID})\n`;
+
+            // Await the request so a rejection is handled here rather than
+            // surfacing as an unhandled promise rejection after the script ends.
+            try {
+              await github.rest.issues.createComment({
+                issue_number: context.issue.number,
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                body: body
+              });
+            } catch (e) {
+              core.warning(`Could not post balance summary comment: ${e.message}`);
+            }
+
+      # Emits an error annotation whenever the compare step flagged a
+      # regression, and fails the job unless fail_on_regression is "false".
+      - name: Fail on regression (if configured)
+        if: steps.compare.outputs.regression_detected == 'true'
+        env:
+          FAIL_ON_REGRESSION: ${{ github.event.inputs.fail_on_regression || 'true' }}
+          # Passed through the environment rather than interpolated into the
+          # script text, so quotes or `$(...)` in the summary cannot be
+          # executed by the shell.
+          REGRESSION_SUMMARY: ${{ steps.compare.outputs.regression_summary }}
+        run: |
+          echo "::error::Balance regression detected: $REGRESSION_SUMMARY"
+          if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then
+            echo "Failing workflow due to regression detection"
+            exit 1
+          else
+            echo "::warning::Regression detected but workflow configured to continue"
+          fi
+
+  # Regenerates content/baselines/balance_baseline.json from the sweep summary
+  # of this run and proposes it through an automated pull request.
+  update-baseline:
+    needs: balance-validation
+    runs-on: ubuntu-latest
+    # Only update baseline on successful main branch runs (scheduled or push)
+    if: |
+      github.ref == 'refs/heads/main' &&
+      (github.event_name == 'schedule' || github.event_name == 'push') &&
+      needs.balance-validation.outputs.regression_detected != 'true'
+    # The workflow-level token is read-only for contents; create-pull-request
+    # has to push a branch and open a PR, so this job widens its own token.
+    # NOTE(review): the repository setting "Allow GitHub Actions to create and
+    # approve pull requests" must also be enabled - confirm.
+    permissions:
+      contents: write
+      pull-requests: write
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev]"
+
+      # Fetches the artifact uploaded by the balance-validation job of this run
+      - name: Download sweep results
+        uses: actions/download-artifact@v4.1.8
+        with:
+          name: balance-validation-${{ github.run_id }}
+          path: build
+
+      - name: Update baseline
+        run: |
+          python scripts/manage_balance_baseline.py update \
+            --source build/ci_sweeps/batch_sweep_summary.json \
+            --output content/baselines/balance_baseline.json
+
+      # Commits the regenerated baseline to a fixed branch and opens a PR for
+      # human review instead of pushing to main directly.
+      - name: Create baseline update PR
+        uses: peter-evans/create-pull-request@v6
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          commit-message: "chore: update balance validation baseline"
+          branch: automated/update-balance-baseline
+          delete-branch: true
+          title: "[Automated] Update Balance Validation Baseline"
+          body: |
+            This PR updates the balance validation baseline from the latest successful CI run.
+
+            **Run ID:** ${{ github.run_id }}
+            **Commit:** ${{ github.sha }}
+
+            The baseline is updated automatically when:
+            - The balance validation workflow passes on main branch
+            - No regressions are detected against the previous baseline
+
+            Please review the changes before merging.
+          labels: |
+            automated
+            balance
diff --git a/content/baselines/balance_baseline.json b/content/baselines/balance_baseline.json
@@ -0,0 +1,67 @@
+{
+  "version": "1.0",
+  "created_at": "2025-01-01T00:00:00Z",
+  "updated_at": "2025-01-01T00:00:00Z",
+  "git_commit": null,
+  "description": "Initial balance baseline - placeholder for first CI run",
+  "strategy_stats": {
+    "balanced": {
+      "avg_stability": 0.7,
+      "min_stability": 0.5,
+      "max_stability": 0.9,
+      "win_rate": 0.8,
+      "avg_actions": 10,
+      "total_actions": 50,
+      "count": 5,
+      "completed": 5,
+      "failed": 0
+    },
+    "aggressive": {
+      "avg_stability": 0.5,
+      "min_stability": 0.3,
+      "max_stability": 0.7,
+      "win_rate": 0.5,
+      "avg_actions": 15,
+      "total_actions": 75,
+      "count": 5,
+      "completed": 5,
+      "failed": 0
+    },
+    "diplomatic": {
+      "avg_stability": 0.65,
+      "min_stability": 0.45,
+      "max_stability": 0.85,
+      "win_rate": 0.6,
+      "avg_actions": 8,
+      "total_actions": 40,
+      "count": 5,
+      "completed": 5,
+      "failed": 0
+    }
+  },
+  "difficulty_stats": {
+    "easy": {
+      "avg_stability": 0.75,
+      "count": 5
+    },
+    "normal": {
+      "avg_stability": 0.6,
+      "count": 5
+    },
+    "hard": {
+      "avg_stability": 0.5,
+      "count": 5
+    }
+  },
+  "total_sweeps": 45,
+  "completed_sweeps": 45,
+  "failed_sweeps": 0,
+  "total_duration_seconds": 0,
+  "thresholds": {
+    "stability_delta_warning": 5.0,
+    "stability_delta_failure": 10.0,
+    "win_rate_delta_warning": 5.0,
+    "win_rate_delta_failure": 10.0,
+    "unused_content_warning": true
+  }
+}