diff --git a/.github/workflows/balance-validation.yml b/.github/workflows/balance-validation.yml
new file mode 100644
index 00000000..78ddc954
--- /dev/null
+++ b/.github/workflows/balance-validation.yml
@@ -0,0 +1,280 @@
+name: Balance Validation
+
+# Triggers: nightly schedule, manual dispatch with tunable inputs, and pushes
+# or pull requests against main that touch anything this workflow depends on.
+on:
+  schedule:
+    # Run nightly at 3:00 AM UTC (offset from ai-tournament at 2:00 AM)
+    - cron: '0 3 * * *'
+  workflow_dispatch:
+    inputs:
+      compare_baseline:
+        description: 'Compare against baseline (true/false)'
+        required: false
+        default: 'true'
+      fail_on_regression:
+        description: 'Fail workflow on regression detection (true/false)'
+        required: false
+        default: 'true'
+      stability_threshold:
+        description: 'Stability delta threshold for regression (percentage)'
+        required: false
+        default: '5'
+  push:
+    branches:
+      - main
+    paths:
+      # Trigger on changes to simulation code, content, AI systems, or any
+      # script the jobs below execute (sweep, aggregation, report, comparison).
+      - 'src/gengine/echoes/sim/**'
+      - 'src/gengine/echoes/core/**'
+      - 'src/gengine/ai_player/**'
+      - 'content/worlds/**'
+      - 'content/config/**'
+      - 'scripts/run_batch_sweeps.py'
+      - 'scripts/aggregate_sweep_results.py'
+      - 'scripts/analyze_balance.py'
+      - 'scripts/manage_balance_baseline.py'
+  pull_request:
+    branches:
+      - main
+    paths:
+      # Keep this list identical to the push filter above.
+      - 'src/gengine/echoes/sim/**'
+      - 'src/gengine/echoes/core/**'
+      - 'src/gengine/ai_player/**'
+      - 'content/worlds/**'
+      - 'content/config/**'
+      - 'scripts/run_batch_sweeps.py'
+      - 'scripts/aggregate_sweep_results.py'
+      - 'scripts/analyze_balance.py'
+      - 'scripts/manage_balance_baseline.py'
+
+# Workflow-level token scopes: read-only repository access, plus write access
+# to pull requests, which the "Comment on PR with balance summary" step uses
+# to post its comment.
+permissions:
+  contents: read
+  pull-requests: write
+
+jobs:
+  # Runs the reduced sweep grid, aggregates and reports on the results, then
+  # (in the later steps of this job) compares them with the committed baseline.
+  balance-validation:
+    runs-on: ubuntu-latest
+    # Re-exported from the "compare" step so the update-baseline job can read
+    # them through needs.balance-validation.outputs.
+    outputs:
+      regression_detected: ${{ steps.compare.outputs.regression_detected }}
+      regression_summary: ${{ steps.compare.outputs.regression_summary }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      # Quoted so YAML keeps the version as the string "3.12".
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      # Editable install of the project together with its "dev" extras.
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev]"
+
+      # All generated files in this job live under build/.
+      - name: Create build directory
+        run: mkdir -p build
+
+      # Runs the sweep grid from content/config/ci_balance_sweeps.yml and
+      # writes results to build/ci_sweeps. The sweep_completed output is set
+      # here but is not read anywhere else in this workflow.
+      - name: Run CI balance sweeps
+        id: sweep
+        run: |
+          echo "Running reduced parameter grid balance sweeps..."
+          python scripts/run_batch_sweeps.py \
+            --config content/config/ci_balance_sweeps.yml \
+            --output-dir build/ci_sweeps \
+            --verbose
+          
+          echo "sweep_completed=true" >> $GITHUB_OUTPUT
+
+      # Ingests the sweep output directory into build/ci_sweep_results.db.
+      - name: Aggregate sweep results
+        run: |
+          python scripts/aggregate_sweep_results.py \
+            --database build/ci_sweep_results.db \
+            ingest build/ci_sweeps \
+            --verbose
+
+      # NOTE(review): no output path is passed here; later steps expect
+      # build/balance_report.html and build/balance_report.json, so this relies
+      # on the script's default output location - confirm.
+      - name: Generate balance report
+        run: |
+          python scripts/analyze_balance.py \
+            --database build/ci_sweep_results.db \
+            report
+
+      # Compares the sweep summary with the committed baseline and publishes
+      # two step outputs: regression_detected and regression_summary.
+      - name: Compare against baseline
+        id: compare
+        env:
+          COMPARE_BASELINE: ${{ github.event.inputs.compare_baseline || 'true' }}
+          STABILITY_THRESHOLD: ${{ github.event.inputs.stability_threshold || '5' }}
+        run: |
+          # Manual runs may opt out of the comparison entirely.
+          if [[ "$COMPARE_BASELINE" != "true" ]]; then
+            echo "Baseline comparison skipped by user input"
+            {
+              echo "regression_detected=false"
+              echo "regression_summary=Baseline comparison skipped"
+            } >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          python scripts/manage_balance_baseline.py compare \
+            --current build/ci_sweeps/batch_sweep_summary.json \
+            --baseline content/baselines/balance_baseline.json \
+            --output build/regression_summary.json \
+            --stability-threshold "$STABILITY_THRESHOLD"
+
+          # Without a summary file there is nothing to inspect.
+          if [[ ! -f build/regression_summary.json ]]; then
+            {
+              echo "regression_detected=false"
+              echo "regression_summary=No baseline comparison performed"
+            } >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          # Every entry in "regressions" counts, whatever its severity.
+          REGRESSION_COUNT=$(python -c "import json; d=json.load(open('build/regression_summary.json')); print(len(d.get('regressions', [])))")
+          if [[ "$REGRESSION_COUNT" -gt 0 ]]; then
+            echo "regression_detected=true" >> "$GITHUB_OUTPUT"
+            SUMMARY=$(python -c "import json; d=json.load(open('build/regression_summary.json')); print(d.get('summary', 'Regressions detected'))")
+            echo "regression_summary=$SUMMARY" >> "$GITHUB_OUTPUT"
+          else
+            {
+              echo "regression_detected=false"
+              echo "regression_summary=No regressions detected"
+            } >> "$GITHUB_OUTPUT"
+          fi
+
+      # Best-effort chart: always() runs it even after an earlier step failed,
+      # and the trailing "|| echo" turns a chart failure into a log line so
+      # this step itself never fails the job.
+      - name: Generate comparison charts
+        if: always()
+        run: |
+          python scripts/manage_balance_baseline.py chart \
+            --current build/ci_sweeps/batch_sweep_summary.json \
+            --baseline content/baselines/balance_baseline.json \
+            --output build/comparison_charts.png || echo "Chart generation skipped (missing baseline or data)"
+
+      # Uploads the generated files under an artifact name keyed by run id;
+      # the update-baseline job downloads the artifact by that same name.
+      - name: Archive balance validation results
+        uses: actions/upload-artifact@v4
+        with:
+          name: balance-validation-${{ github.run_id }}
+          path: |
+            build/ci_sweeps/batch_sweep_summary.json
+            build/ci_sweep_results.db
+            build/balance_report.html
+            build/balance_report.json
+            build/regression_summary.json
+            build/comparison_charts.png
+          retention-days: 30
+
+      # Posts a summary comment on the pull request. Commenting is best-effort:
+      # a failure to post (for example a read-only token on a fork PR) is
+      # logged as a warning instead of failing the validation job.
+      - name: Comment on PR with balance summary
+        if: github.event_name == 'pull_request'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+
+            // The balance report is not rendered below; it is only parsed to
+            // confirm the earlier steps produced readable output.
+            try {
+              JSON.parse(fs.readFileSync('build/balance_report.json', 'utf8'));
+            } catch (e) {
+              console.log('Could not read balance report');
+              return;
+            }
+
+            // A missing or unreadable comparison file is treated as "no comparison".
+            let regressionSummary = {};
+            try {
+              regressionSummary = JSON.parse(fs.readFileSync('build/regression_summary.json', 'utf8'));
+            } catch (e) {
+              regressionSummary = { regressions: [], summary: 'No baseline comparison' };
+            }
+
+            const regressions = regressionSummary.regressions || [];
+            const hasRegressions = regressions.length > 0;
+
+            let body = `## 📊 Balance Validation Results\n\n`;
+
+            if (hasRegressions) {
+              body += `### ⚠️ Regressions Detected\n\n`;
+              for (const reg of regressions) {
+                const icon = reg.severity === 'failure' ? '❌' : '⚠️';
+                body += `- ${icon} **${reg.metric_name}**: ${reg.description}\n`;
+              }
+              body += `\n`;
+            } else {
+              body += `### ✅ No Regressions Detected\n\n`;
+            }
+
+            // Strategy stats
+            if (regressionSummary.current_stats && regressionSummary.current_stats.strategy_stats) {
+              body += `### Strategy Win Rates\n\n`;
+              body += `| Strategy | Avg Stability | Win Rate |\n`;
+              body += `|----------|---------------|----------|\n`;
+              for (const [strategy, stats] of Object.entries(regressionSummary.current_stats.strategy_stats)) {
+                const avgStab = stats.avg_stability?.toFixed(3) || 'N/A';
+                // The last column is a pass/warn icon derived from avg_stability >= 0.5.
+                const winRate = ((stats.avg_stability || 0) >= 0.5 ? '✅' : '⚠️');
+                body += `| ${strategy} | ${avgStab} | ${winRate} |\n`;
+              }
+              body += `\n`;
+            }
+
+            body += `\n📁 [Download full report](${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID})\n`;
+
+            // Await the request so its outcome belongs to this step, and
+            // downgrade a rejected request to a warning.
+            try {
+              await github.rest.issues.createComment({
+                issue_number: context.issue.number,
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                body: body
+              });
+            } catch (error) {
+              core.warning(`Could not post balance summary comment: ${error.message}`);
+            }
+
+      # Runs last so artifacts and the PR comment are published before the job
+      # is failed. Only reached when the compare step reported a regression.
+      - name: Fail on regression (if configured)
+        if: steps.compare.outputs.regression_detected == 'true'
+        env:
+          FAIL_ON_REGRESSION: ${{ github.event.inputs.fail_on_regression || 'true' }}
+          # Passed through the environment instead of being interpolated into
+          # the script, so the summary text can never be parsed as shell syntax.
+          REGRESSION_SUMMARY: ${{ steps.compare.outputs.regression_summary }}
+        run: |
+          echo "::error::Balance regression detected: $REGRESSION_SUMMARY"
+          if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then
+            echo "Failing workflow due to regression detection"
+            exit 1
+          else
+            echo "::warning::Regression detected but workflow configured to continue"
+          fi
+
+  # Regenerates the baseline from the validated sweep results and proposes it
+  # as a pull request for human review.
+  update-baseline:
+    needs: balance-validation
+    runs-on: ubuntu-latest
+    # Only update baseline on successful main branch runs (scheduled or push)
+    if: |
+      github.ref == 'refs/heads/main' &&
+      (github.event_name == 'schedule' || github.event_name == 'push') &&
+      needs.balance-validation.outputs.regression_detected != 'true'
+    # create-pull-request pushes a branch before opening the PR, so this job
+    # needs write access to repository contents; the workflow-level default
+    # (contents: read) is not enough.
+    permissions:
+      contents: write
+      pull-requests: write
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev]"
+
+      # Restores the files archived by the balance-validation job into build/.
+      - name: Download sweep results
+        uses: actions/download-artifact@v4.1.8
+        with:
+          name: balance-validation-${{ github.run_id }}
+          path: build
+
+      - name: Update baseline
+        run: |
+          python scripts/manage_balance_baseline.py update \
+            --source build/ci_sweeps/batch_sweep_summary.json \
+            --output content/baselines/balance_baseline.json
+
+      - name: Create baseline update PR
+        uses: peter-evans/create-pull-request@v6
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          commit-message: "chore: update balance validation baseline"
+          branch: automated/update-balance-baseline
+          delete-branch: true
+          title: "[Automated] Update Balance Validation Baseline"
+          body: |
+            This PR updates the balance validation baseline from the latest successful CI run.
+
+            **Run ID:** ${{ github.run_id }}
+            **Commit:** ${{ github.sha }}
+
+            The baseline is updated automatically when:
+            - The balance validation workflow passes on main branch
+            - No regressions are detected against the previous baseline
+
+            Please review the changes before merging.
+          labels: |
+            automated
+            balance
diff --git a/content/baselines/balance_baseline.json b/content/baselines/balance_baseline.json
new file mode 100644
index 00000000..5b679663
--- /dev/null
+++ b/content/baselines/balance_baseline.json
@@ -0,0 +1,67 @@
+{
+  "version": "1.0",
+  "created_at": "2025-01-01T00:00:00Z",
+  "updated_at": "2025-01-01T00:00:00Z",
+  "git_commit": null,
+  "description": "Initial balance baseline - placeholder for first CI run",
+  "strategy_stats": {
+    "balanced": {
+      "avg_stability": 0.7,
+      "min_stability": 0.5,
+      "max_stability": 0.9,
+      "win_rate": 0.8,
+      "avg_actions": 10,
+      "total_actions": 50,
+      "count": 5,
+      "completed": 5,
+      "failed": 0
+    },
+    "aggressive": {
+      "avg_stability": 0.5,
+      "min_stability": 0.3,
+      "max_stability": 0.7,
+      "win_rate": 0.5,
+      "avg_actions": 15,
+      "total_actions": 75,
+      "count": 5,
+      "completed": 5,
+      "failed": 0
+    },
+    "diplomatic": {
+      "avg_stability": 0.65,
+      "min_stability": 0.45,
+      "max_stability": 0.85,
+      "win_rate": 0.6,
+      "avg_actions": 8,
+      "total_actions": 40,
+      "count": 5,
+      "completed": 5,
+      "failed": 0
+    }
+  },
+  "difficulty_stats": {
+    "easy": {
+      "avg_stability": 0.75,
+      "count": 5
+    },
+    "normal": {
+      "avg_stability": 0.6,
+      "count": 5
+    },
+    "hard": {
+      "avg_stability": 0.5,
+      "count": 5
+    }
+  },
+  "total_sweeps": 45,
+  "completed_sweeps": 45,
+  "failed_sweeps": 0,
+  "total_duration_seconds": 0,
+  "thresholds": {
+    "stability_delta_warning": 5.0,
+    "stability_delta_failure": 10.0,
+    "win_rate_delta_warning": 5.0,
+    "win_rate_delta_failure": 10.0,
+    "unused_content_warning": true
+  }
+}
diff --git a/content/config/ci_balance_sweeps.yml b/content/config/ci_balance_sweeps.yml
new file mode 100644
index 00000000..8c6d8818
--- /dev/null
+++ b/content/config/ci_balance_sweeps.yml
@@ -0,0 +1,64 @@
+# CI Balance Sweeps Configuration
+# Reduced parameter grid for fast CI validation of balance changes.
+# This configuration is optimized for speed while still providing meaningful coverage.
+# The balance-validation workflow passes this file to scripts/run_batch_sweeps.py
+# through its --config option.
+
+# Parameter Grid - reduced from full sweeps for CI speed
+parameters:
+  # Three representative strategies covering the spectrum
+  strategies:
+    - balanced
+    - aggressive
+    - diplomatic
+
+  # Three difficulty presets to validate scaling
+  difficulties:
+    - easy
+    - normal
+    - hard
+
+  # Five deterministic seeds for reproducibility with statistical significance
+  seeds:
+    - 42
+    - 123
+    - 456
+    - 789
+    - 1000
+
+  # Default world only for CI (additional worlds tested in nightly full sweeps)
+  worlds:
+    - default
+
+  # Reduced tick budget for faster execution (100 ticks captures most dynamics)
+  tick_budgets:
+    - 100
+
+# Parallel execution settings optimized for CI runners
+parallel:
+  # Use 2 workers to balance speed and resource usage on CI
+  max_workers: 2
+
+  # Shorter timeout for CI
+  # NOTE(review): presumably expressed in seconds - confirm against run_batch_sweeps.py.
+  timeout_per_sweep: 120
+
+# Output configuration
+output:
+  # Standard CI output directory
+  # (the workflow also passes this same path on the command line as --output-dir)
+  dir: build/ci_sweeps
+
+  # Include telemetry for debugging but keep it minimal
+  include_telemetry: true
+
+  # Include summary for report generation
+  include_summary: true
+
+# Sampling - use full grid since CI grid is already small (3*3*5*1*1 = 45 combinations)
+# NOTE(review): sample_count and sample_seed presumably only matter for a
+# sampled (non-full) mode - confirm against run_batch_sweeps.py.
+sampling:
+  mode: full
+  sample_count: 45
+  sample_seed: 42
+
+# Metadata for traceability
+metadata:
+  include_git_commit: true
+  include_timestamp: true
+  include_runtime_info: true
diff --git a/scripts/manage_balance_baseline.py b/scripts/manage_balance_baseline.py
new file mode 100644
index 00000000..fe9950ea
--- /dev/null
+++ b/scripts/manage_balance_baseline.py
@@ -0,0 +1,954 @@
+#!/usr/bin/env python3
+"""Manage balance validation baselines for CI integration.
+
+Provides tools for storing, updating, and comparing balance sweep results
+against baseline data for regression detection in CI workflows.
+
+Examples
+--------
+Compare current sweep against baseline::
+
+    python scripts/manage_balance_baseline.py compare \\
+        --current build/ci_sweeps/batch_sweep_summary.json \\
+        --baseline content/baselines/balance_baseline.json
+
+Update baseline from successful sweep::
+
+    python scripts/manage_balance_baseline.py update \\
+        --source build/ci_sweeps/batch_sweep_summary.json \\
+        --output content/baselines/balance_baseline.json
+
+Generate comparison chart::
+
+    python scripts/manage_balance_baseline.py chart \\
+        --current build/ci_sweeps/batch_sweep_summary.json \\
+        --baseline content/baselines/balance_baseline.json \\
+        --output build/comparison_chart.png
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Sequence
+
+# Optional imports for visualization
+try:
+    import matplotlib
+
+    matplotlib.use("Agg")
+    import matplotlib.pyplot as plt
+
+    HAS_MATPLOTLIB = True
+except ImportError:
+    HAS_MATPLOTLIB = False
+
+
+# ============================================================================
+# Data Classes
+# ============================================================================
+
+
+@dataclass
+class RegressionAlert:
+    """One metric that worsened relative to the baseline.
+
+    Instances are plain records; ``to_dict`` produces the JSON-ready form.
+    """
+
+    metric_name: str
+    strategy: str | None
+    baseline_value: float
+    current_value: float
+    delta_percent: float
+    severity: str  # "warning" or "failure"
+    description: str
+
+    def to_dict(self) -> dict[str, Any]:
+        """Serialise the alert, rounding values to 4 and the delta to 2 decimals."""
+        rounded_baseline = round(self.baseline_value, 4)
+        rounded_current = round(self.current_value, 4)
+        rounded_delta = round(self.delta_percent, 2)
+        return dict(
+            metric_name=self.metric_name,
+            strategy=self.strategy,
+            baseline_value=rounded_baseline,
+            current_value=rounded_current,
+            delta_percent=rounded_delta,
+            severity=self.severity,
+            description=self.description,
+        )
+
+
+@dataclass
+class ComparisonResult:
+    """Outcome of one baseline-versus-current comparison run."""
+
+    timestamp: str
+    baseline_path: str
+    current_path: str
+    regressions: list[RegressionAlert] = field(default_factory=list)
+    baseline_stats: dict[str, Any] = field(default_factory=dict)
+    current_stats: dict[str, Any] = field(default_factory=dict)
+    summary: str = ""
+    passed: bool = True
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return a JSON-ready dict; each regression is serialised via its own ``to_dict``."""
+        serialised_alerts = []
+        for alert in self.regressions:
+            serialised_alerts.append(alert.to_dict())
+
+        return dict(
+            timestamp=self.timestamp,
+            baseline_path=self.baseline_path,
+            current_path=self.current_path,
+            regressions=serialised_alerts,
+            baseline_stats=self.baseline_stats,
+            current_stats=self.current_stats,
+            summary=self.summary,
+            passed=self.passed,
+        )
+
+
+@dataclass
+class BaselineConfig:
+    """Regression thresholds; the four delta values are percentages."""
+
+    stability_delta_warning: float = 5.0  # percentage
+    stability_delta_failure: float = 10.0  # percentage
+    win_rate_delta_warning: float = 5.0  # percentage
+    win_rate_delta_failure: float = 10.0  # percentage
+    unused_content_warning: bool = True
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> BaselineConfig:
+        """Build a config from ``data``; missing keys use the defaults, extra keys are ignored."""
+        fallbacks: dict[str, Any] = {
+            "stability_delta_warning": 5.0,
+            "stability_delta_failure": 10.0,
+            "win_rate_delta_warning": 5.0,
+            "win_rate_delta_failure": 10.0,
+            "unused_content_warning": True,
+        }
+        chosen = {key: data.get(key, default) for key, default in fallbacks.items()}
+        return cls(**chosen)
+
+
+# ============================================================================
+# Baseline Management Functions
+# ============================================================================
+
+
+def load_baseline(path: Path) -> dict[str, Any] | None:
+    """Load baseline data from a JSON file.
+
+    Parameters
+    ----------
+    path
+        Path to baseline JSON file.
+
+    Returns
+    -------
+    dict[str, Any] | None
+        Parsed baseline data, or None if ``path`` does not exist.
+
+    Raises
+    ------
+    json.JSONDecodeError
+        If the file exists but does not contain valid JSON.
+    """
+    if not path.exists():
+        return None
+
+    # JSON is UTF-8 by specification; do not depend on the platform locale.
+    with open(path, encoding="utf-8") as f:
+        return json.load(f)
+
+
+def load_sweep_summary(path: Path) -> dict[str, Any] | None:
+    """Load a sweep summary (``batch_sweep_summary.json``) from disk.
+
+    Parameters
+    ----------
+    path
+        Path to sweep summary JSON file.
+
+    Returns
+    -------
+    dict[str, Any] | None
+        Parsed sweep summary, or None if ``path`` does not exist.
+
+    Raises
+    ------
+    json.JSONDecodeError
+        If the file exists but does not contain valid JSON.
+    """
+    if not path.exists():
+        return None
+
+    # JSON is UTF-8 by specification; do not depend on the platform locale.
+    with open(path, encoding="utf-8") as f:
+        return json.load(f)
+
+
+def extract_strategy_stats(sweep_data: dict[str, Any]) -> dict[str, dict[str, Any]]:
+    """Return the ``"strategy_stats"`` section of a sweep summary.
+
+    Parameters
+    ----------
+    sweep_data
+        Sweep summary data.
+
+    Returns
+    -------
+    dict[str, dict[str, Any]]
+        The stored section (keyed by strategy name), or an empty dict when
+        the key is absent.
+    """
+    if "strategy_stats" in sweep_data:
+        return sweep_data["strategy_stats"]
+    return {}
+
+
+def extract_difficulty_stats(sweep_data: dict[str, Any]) -> dict[str, dict[str, Any]]:
+    """Return the ``"difficulty_stats"`` section of a sweep summary.
+
+    Parameters
+    ----------
+    sweep_data
+        Sweep summary data.
+
+    Returns
+    -------
+    dict[str, dict[str, Any]]
+        The stored section (keyed by difficulty name), or an empty dict when
+        the key is absent.
+    """
+    if "difficulty_stats" in sweep_data:
+        return sweep_data["difficulty_stats"]
+    return {}
+
+
+def compute_win_rate(strategy_stats: dict[str, Any]) -> float:
+    """Approximate a win rate from a strategy's average stability.
+
+    No individual games are counted here. The estimate is ``avg_stability``
+    itself, limited to the range 0.0 to 1.0; 0.5 is the stability level the
+    module treats as the win threshold.
+
+    Parameters
+    ----------
+    strategy_stats
+        Statistics for a single strategy; ``avg_stability`` defaults to 0.0.
+
+    Returns
+    -------
+    float
+        Win rate as a fraction (0.0 to 1.0).
+    """
+    stability = strategy_stats.get("avg_stability", 0.0)
+
+    if not stability >= 0.5:
+        # Below the threshold: the stability is the estimate, floored at zero.
+        return max(0.0, stability)
+
+    # At or above the threshold: 0.5 plus the surplus, capped at one.
+    surplus = stability - 0.5
+    return min(1.0, 0.5 + surplus)
+
+
+def _drop_alert(
+    metric_name: str,
+    label: str,
+    value_spec: str,
+    strategy: str,
+    baseline_value: float,
+    current_value: float,
+    warning_pct: float,
+    failure_pct: float,
+) -> RegressionAlert | None:
+    """Return an alert if ``current_value`` fell past a threshold, else None."""
+    if not baseline_value > 0:
+        # A non-positive baseline gives no meaningful relative change.
+        return None
+
+    delta_percent = ((current_value - baseline_value) / baseline_value) * 100
+
+    # Only decreases are flagged; the failure threshold takes precedence.
+    if delta_percent < -failure_pct:
+        severity, verb = "failure", "dropped"
+    elif delta_percent < -warning_pct:
+        severity, verb = "warning", "decreased"
+    else:
+        return None
+
+    before = format(baseline_value, value_spec)
+    after = format(current_value, value_spec)
+    return RegressionAlert(
+        metric_name=metric_name,
+        strategy=strategy,
+        baseline_value=baseline_value,
+        current_value=current_value,
+        delta_percent=delta_percent,
+        severity=severity,
+        description=(
+            f"Strategy '{strategy}' {label} {verb} by "
+            f"{abs(delta_percent):.1f}% (from {before} to {after})"
+        ),
+    )
+
+
+def compare_strategy_stats(
+    baseline_stats: dict[str, dict[str, Any]],
+    current_stats: dict[str, dict[str, Any]],
+    config: BaselineConfig,
+) -> list[RegressionAlert]:
+    """Flag strategies whose stability or win rate fell relative to the baseline.
+
+    Strategies without a (non-empty) baseline entry are skipped, and only
+    strategies present in ``current_stats`` are examined. Improvements are
+    never reported.
+
+    Parameters
+    ----------
+    baseline_stats
+        Baseline strategy statistics.
+    current_stats
+        Current sweep strategy statistics.
+    config
+        Threshold configuration (percentages).
+
+    Returns
+    -------
+    list[RegressionAlert]
+        Alerts in iteration order; for each strategy the stability alert
+        (if any) precedes the win-rate alert (if any).
+    """
+    found: list[RegressionAlert] = []
+
+    for strategy, current in current_stats.items():
+        baseline = baseline_stats.get(strategy)
+        if not baseline:
+            continue
+
+        stability_alert = _drop_alert(
+            "avg_stability",
+            "stability",
+            ".3f",
+            strategy,
+            baseline.get("avg_stability", 0.0),
+            current.get("avg_stability", 0.0),
+            config.stability_delta_warning,
+            config.stability_delta_failure,
+        )
+        if stability_alert is not None:
+            found.append(stability_alert)
+
+        # Prefer an explicit win_rate; otherwise estimate it from stability.
+        baseline_wr = baseline.get("win_rate")
+        current_wr = current.get("win_rate")
+        if baseline_wr is None:
+            baseline_wr = compute_win_rate(baseline)
+        if current_wr is None:
+            current_wr = compute_win_rate(current)
+
+        win_rate_alert = _drop_alert(
+            "win_rate",
+            "win rate",
+            ".1%",
+            strategy,
+            baseline_wr,
+            current_wr,
+            config.win_rate_delta_warning,
+            config.win_rate_delta_failure,
+        )
+        if win_rate_alert is not None:
+            found.append(win_rate_alert)
+
+    return found
+
+
+def compare_against_baseline(
+    baseline_path: Path,
+    current_path: Path,
+    stability_threshold: float = 5.0,
+) -> ComparisonResult:
+    """Diff the current sweep summary against the stored baseline.
+
+    Parameters
+    ----------
+    baseline_path
+        Path to baseline JSON file.
+    current_path
+        Path to current sweep summary.
+    stability_threshold
+        Percentage drop in stability that raises a warning; twice this value
+        raises a failure. It always replaces the stability thresholds stored
+        in the baseline, while win-rate thresholds still come from the
+        baseline (or their defaults).
+
+    Returns
+    -------
+    ComparisonResult
+        A missing or empty baseline passes (there is nothing to compare
+        against); missing current data fails; otherwise the result fails only
+        when at least one alert has severity "failure".
+    """
+    baseline = load_baseline(baseline_path)
+    current = load_sweep_summary(current_path)
+
+    outcome = ComparisonResult(
+        timestamp=datetime.now(timezone.utc).isoformat(),
+        baseline_path=str(baseline_path),
+        current_path=str(current_path),
+    )
+
+    def stats_view(data: dict[str, Any]) -> dict[str, Any]:
+        # Baselines and sweep summaries keep their stats under the same keys.
+        return {
+            "strategy_stats": extract_strategy_stats(data),
+            "difficulty_stats": extract_difficulty_stats(data),
+        }
+
+    if not baseline:
+        outcome.summary = "No baseline found - establishing new baseline"
+        outcome.passed = True
+        if current:
+            outcome.current_stats = stats_view(current)
+        return outcome
+
+    if not current:
+        outcome.summary = "No current sweep data found"
+        outcome.passed = False
+        return outcome
+
+    outcome.baseline_stats = stats_view(baseline)
+    outcome.current_stats = stats_view(current)
+
+    # Start from the thresholds stored with the baseline, then apply the
+    # caller's stability threshold on top (failure is fixed at double).
+    config = BaselineConfig.from_dict(baseline.get("thresholds", {}))
+    config.stability_delta_warning = stability_threshold
+    config.stability_delta_failure = stability_threshold * 2
+
+    outcome.regressions = compare_strategy_stats(
+        outcome.baseline_stats["strategy_stats"],
+        outcome.current_stats["strategy_stats"],
+        config,
+    )
+
+    failure_count = sum(1 for a in outcome.regressions if a.severity == "failure")
+    warning_count = sum(1 for a in outcome.regressions if a.severity == "warning")
+
+    # Warnings alone never fail the comparison.
+    outcome.passed = failure_count == 0
+    if failure_count:
+        outcome.summary = (
+            f"FAILED: {failure_count} regression(s) detected, "
+            f"{warning_count} warning(s)"
+        )
+    elif warning_count:
+        outcome.summary = f"PASSED with {warning_count} warning(s)"
+    else:
+        outcome.summary = "PASSED: No regressions detected"
+
+    return outcome
+
+
+def create_baseline(
+    sweep_path: Path,
+    output_path: Path,
+    git_commit: str | None = None,
+) -> dict[str, Any]:
+    """Write a brand-new baseline file derived from a sweep summary.
+
+    Parameters
+    ----------
+    sweep_path
+        Path to sweep summary JSON file.
+    output_path
+        Path to write baseline JSON file; parent directories are created.
+    git_commit
+        Git commit hash to record in baseline. When omitted, the commit
+        recorded in the sweep's own metadata (if any) is used.
+
+    Returns
+    -------
+    dict[str, Any]
+        Created baseline data.
+
+    Raises
+    ------
+    ValueError
+        If the sweep summary is missing or empty.
+    """
+    sweep = load_sweep_summary(sweep_path)
+    if not sweep:
+        raise ValueError(f"Could not load sweep data from {sweep_path}")
+
+    stamp = datetime.now(timezone.utc).isoformat()
+    commit = git_commit or sweep.get("metadata", {}).get("git_commit")
+
+    default_thresholds = {
+        "stability_delta_warning": 5.0,
+        "stability_delta_failure": 10.0,
+        "win_rate_delta_warning": 5.0,
+        "win_rate_delta_failure": 10.0,
+        "unused_content_warning": True,
+    }
+
+    baseline: dict[str, Any] = {
+        "version": "1.0",
+        "created_at": stamp,
+        "updated_at": stamp,
+        "git_commit": commit,
+        "description": "Balance validation baseline",
+        "strategy_stats": extract_strategy_stats(sweep),
+        "difficulty_stats": extract_difficulty_stats(sweep),
+    }
+    # Run counters are copied from the sweep, defaulting to zero.
+    for counter in (
+        "total_sweeps",
+        "completed_sweeps",
+        "failed_sweeps",
+        "total_duration_seconds",
+    ):
+        baseline[counter] = sweep.get(counter, 0)
+    baseline["thresholds"] = default_thresholds
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w") as f:
+        json.dump(baseline, f, indent=2)
+
+    return baseline
+
+
+def update_baseline(
+    sweep_path: Path,
+    output_path: Path,
+    git_commit: str | None = None,
+) -> dict[str, Any]:
+    """Rewrite the baseline at ``output_path`` from new sweep results.
+
+    The original ``created_at`` stamp and the stored thresholds are carried
+    over from the existing baseline when there is one; everything else is
+    taken from the sweep.
+
+    Parameters
+    ----------
+    sweep_path
+        Path to sweep summary JSON file.
+    output_path
+        Path to write updated baseline JSON file; also where the existing
+        baseline is read from.
+    git_commit
+        Git commit hash to record. When omitted, the commit recorded in the
+        sweep's own metadata (if any) is used.
+
+    Returns
+    -------
+    dict[str, Any]
+        Updated baseline data.
+
+    Raises
+    ------
+    ValueError
+        If the sweep summary is missing or empty.
+    """
+    existing = load_baseline(output_path)
+    sweep = load_sweep_summary(sweep_path)
+
+    if not sweep:
+        raise ValueError(f"Could not load sweep data from {sweep_path}")
+
+    stamp = datetime.now(timezone.utc).isoformat()
+
+    if existing:
+        created_at = existing.get("created_at", stamp)
+        # An existing baseline without thresholds yields an empty mapping.
+        thresholds = existing.get("thresholds", {})
+    else:
+        created_at = stamp
+        thresholds = {
+            "stability_delta_warning": 5.0,
+            "stability_delta_failure": 10.0,
+            "win_rate_delta_warning": 5.0,
+            "win_rate_delta_failure": 10.0,
+            "unused_content_warning": True,
+        }
+
+    baseline: dict[str, Any] = {
+        "version": "1.0",
+        "created_at": created_at,
+        "updated_at": stamp,
+        "git_commit": git_commit or sweep.get("metadata", {}).get("git_commit"),
+        "description": "Balance validation baseline",
+        "strategy_stats": extract_strategy_stats(sweep),
+        "difficulty_stats": extract_difficulty_stats(sweep),
+    }
+    # Run counters are copied from the sweep, defaulting to zero.
+    for counter in (
+        "total_sweeps",
+        "completed_sweeps",
+        "failed_sweeps",
+        "total_duration_seconds",
+    ):
+        baseline[counter] = sweep.get(counter, 0)
+    baseline["thresholds"] = thresholds
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w") as f:
+        json.dump(baseline, f, indent=2)
+
+    return baseline
+
+
+# ============================================================================
+# Visualization Functions
+# ============================================================================
+
+
+def generate_comparison_chart(
+    baseline_path: Path,
+    current_path: Path,
+    output_path: Path,
+) -> bool:
+    """Render a two-panel chart of baseline vs current strategy stability.
+
+    The left panel shows average stability per strategy side by side; the
+    right panel shows the relative change in percent with guide lines at
+    -5 and -10. Only strategies present in both inputs are plotted.
+
+    Parameters
+    ----------
+    baseline_path
+        Path to baseline JSON file.
+    current_path
+        Path to current sweep summary.
+    output_path
+        Path to save chart image.
+
+    Returns
+    -------
+    bool
+        True if the chart was written, False if generation was skipped.
+    """
+    if not HAS_MATPLOTLIB:
+        sys.stderr.write("Warning: matplotlib not available for chart generation\n")
+        return False
+
+    baseline = load_baseline(baseline_path)
+    current = load_sweep_summary(current_path)
+    if not baseline or not current:
+        sys.stderr.write("Warning: Missing baseline or current data for chart\n")
+        return False
+
+    baseline_stats = baseline.get("strategy_stats", {})
+    current_stats = extract_strategy_stats(current)
+    strategies = sorted(set(baseline_stats.keys()) & set(current_stats.keys()))
+    if not strategies:
+        sys.stderr.write("Warning: No common strategies to compare\n")
+        return False
+
+    baseline_stability = [baseline_stats[s].get("avg_stability", 0) for s in strategies]
+    current_stability = [current_stats[s].get("avg_stability", 0) for s in strategies]
+    # Relative change in percent; a non-positive baseline is reported as 0.
+    deltas = [
+        ((c - b) / b * 100) if b > 0 else 0
+        for b, c in zip(baseline_stability, current_stability, strict=True)
+    ]
+    colors = ["green" if d >= 0 else "red" for d in deltas]
+
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
+    try:
+        # Left panel: grouped bars, baseline next to current.
+        x = range(len(strategies))
+        width = 0.35
+        ax1.bar([i - width / 2 for i in x], baseline_stability, width, label="Baseline")
+        ax1.bar([i + width / 2 for i in x], current_stability, width, label="Current")
+        ax1.set_xlabel("Strategy")
+        ax1.set_ylabel("Average Stability")
+        ax1.set_title("Strategy Stability: Baseline vs Current")
+        ax1.set_xticks(x)
+        ax1.set_xticklabels(strategies)
+        ax1.legend()
+        ax1.set_ylim(0, 1)
+
+        # Right panel: percentage delta with fixed warning/failure guides.
+        guide = {"linestyle": "--", "linewidth": 0.5}
+        ax2.bar(strategies, deltas, color=colors)
+        ax2.axhline(y=0, color="black", linestyle="-", linewidth=0.5)
+        ax2.axhline(y=-5, color="orange", label="Warning", **guide)
+        ax2.axhline(y=-10, color="red", label="Failure", **guide)
+        ax2.set_xlabel("Strategy")
+        ax2.set_ylabel("Stability Delta (%)")
+        ax2.set_title("Stability Change from Baseline")
+        ax2.legend()
+
+        plt.tight_layout()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        plt.savefig(output_path, dpi=100, bbox_inches="tight")
+    finally:
+        # Always release the figure, even when drawing or saving raises.
+        plt.close(fig)
+    return True
+
+
+# ============================================================================
+# CLI Commands
+# ============================================================================
+
+
+def cmd_compare(args: argparse.Namespace) -> int:
+    """Run the ``compare`` command; exit code is 0 only if the check passed."""
+    outcome = compare_against_baseline(
+        baseline_path=args.baseline,
+        current_path=args.current,
+        stability_threshold=args.stability_threshold,
+    )
+    report_path = args.output
+    verbose = not args.quiet
+
+    if report_path:
+        # Persist the full comparison result as JSON.
+        report_path.parent.mkdir(parents=True, exist_ok=True)
+        report_path.write_text(json.dumps(outcome.to_dict(), indent=2))
+        if verbose:
+            print(f"Comparison result written to {report_path}")
+
+    if verbose:
+        print(f"\n{outcome.summary}")
+        if outcome.regressions:
+            print("\nRegressions detected:")
+            for alert in outcome.regressions:
+                marker = "❌" if alert.severity == "failure" else "⚠️"
+                print(f"  {marker} [{alert.severity.upper()}] {alert.description}")
+    return 0 if outcome.passed else 1
+
+
+def cmd_update(args: argparse.Namespace) -> int:
+    """Handle the update command; return 0 on success, 1 on any error."""
+    try:
+        baseline = update_baseline(
+            sweep_path=args.source,
+            output_path=args.output,
+            git_commit=args.git_commit,
+        )
+
+        if not args.quiet:
+            print(f"Baseline updated: {args.output}")
+            print(f"  Strategies: {list(baseline.get('strategy_stats', {}).keys())}")
+            print(f"  Total sweeps: {baseline.get('total_sweeps', 0)}")
+            # The key is always set but may be None; show "N/A" in that case.
+            print(f"  Git commit: {baseline.get('git_commit') or 'N/A'}")
+        return 0
+    except Exception as e:
+        sys.stderr.write(f"Error updating baseline: {e}\n")
+        return 1
+
+
+def cmd_create(args: argparse.Namespace) -> int:
+    """Run the ``create`` command: 0 on success, 1 if anything raised."""
+    try:
+        created = create_baseline(
+            sweep_path=args.source,
+            output_path=args.output,
+            git_commit=args.git_commit,
+        )
+        if args.quiet:
+            return 0
+        print(f"Baseline created: {args.output}")
+        strategy_names = list(created.get("strategy_stats", {}).keys())
+        print(f"  Strategies: {strategy_names}")
+        print(f"  Total sweeps: {created.get('total_sweeps', 0)}")
+        return 0
+    except Exception as exc:
+        sys.stderr.write(f"Error creating baseline: {exc}\n")
+        return 1
+
+
+def cmd_chart(args: argparse.Namespace) -> int:
+    """Run the ``chart`` command: 0 if the image was written, otherwise 1."""
+    chart_written = generate_comparison_chart(
+        baseline_path=args.baseline,
+        current_path=args.current,
+        output_path=args.output,
+    )
+
+    # Guard clause: a failure is reported on stderr even with --quiet.
+    if not chart_written:
+        sys.stderr.write("Failed to generate chart\n")
+        return 1
+    if not args.quiet:
+        print(f"Chart generated: {args.output}")
+    return 0
+
+
+def cmd_show(args: argparse.Namespace) -> int:
+    """Handle the show command: print the baseline as JSON or as a summary."""
+    baseline = load_baseline(args.baseline)
+    if not baseline:
+        sys.stderr.write(f"Baseline not found: {args.baseline}\n")
+        return 1
+
+    if args.json:
+        print(json.dumps(baseline, indent=2))
+    else:
+        print(f"\nBaseline: {args.baseline}")
+        print("=" * 60)
+        print(f"Version: {baseline.get('version', 'N/A')}")
+        print(f"Created: {baseline.get('created_at', 'N/A')}")
+        print(f"Updated: {baseline.get('updated_at', 'N/A')}")
+        # The stored commit may be null; ``or`` makes the placeholder apply.
+        print(f"Git commit: {baseline.get('git_commit') or 'N/A'}")
+        print(f"\nTotal sweeps: {baseline.get('total_sweeps', 0)}")
+        print(f"Completed: {baseline.get('completed_sweeps', 0)}")
+        print(f"Failed: {baseline.get('failed_sweeps', 0)}")
+
+        print("\nStrategy Stats:")
+        for strategy, stats in baseline.get("strategy_stats", {}).items():
+            avg_stab = stats.get("avg_stability", 0)
+            win_rate = stats.get("win_rate", compute_win_rate(stats))
+            print(f"  {strategy}: stability={avg_stab:.3f}, win_rate={win_rate:.1%}")
+
+        print("\nThresholds:")
+        thresholds = baseline.get("thresholds", {})
+        print(f"  Stability warning: {thresholds.get('stability_delta_warning', 5)}%")
+        print(f"  Stability failure: {thresholds.get('stability_delta_failure', 10)}%")
+
+    return 0
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    """Parse ``argv``, run the selected sub-command and return its exit code."""
+    usage_examples = """
+Examples:
+  # Compare current sweep against baseline
+  python scripts/manage_balance_baseline.py compare \\
+      --current build/ci_sweeps/batch_sweep_summary.json \\
+      --baseline content/baselines/balance_baseline.json
+
+  # Update baseline from successful sweep
+  python scripts/manage_balance_baseline.py update \\
+      --source build/ci_sweeps/batch_sweep_summary.json \\
+      --output content/baselines/balance_baseline.json
+
+  # Generate comparison chart
+  python scripts/manage_balance_baseline.py chart \\
+      --current build/ci_sweeps/batch_sweep_summary.json \\
+      --baseline content/baselines/balance_baseline.json \\
+      --output build/comparison.png
+"""
+    parser = argparse.ArgumentParser(
+        description="Manage balance validation baselines for CI integration.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=usage_examples,
+    )
+    commands = parser.add_subparsers(dest="command", required=True)
+
+    # compare: current sweep vs stored baseline
+    compare_cmd = commands.add_parser(
+        "compare", help="Compare current sweep against baseline"
+    )
+    compare_cmd.add_argument(
+        "--current",
+        "-c",
+        required=True,
+        type=Path,
+        help="Path to current sweep summary JSON",
+    )
+    compare_cmd.add_argument(
+        "--baseline",
+        "-b",
+        required=True,
+        type=Path,
+        help="Path to baseline JSON file",
+    )
+    compare_cmd.add_argument(
+        "--output",
+        "-o",
+        help="Path to write comparison result JSON",
+        type=Path,
+    )
+    compare_cmd.add_argument(
+        "--stability-threshold",
+        default=5.0,
+        type=float,
+        help="Stability delta threshold percentage (default: 5.0)",
+    )
+    compare_cmd.add_argument(
+        "--quiet",
+        "-q",
+        help="Suppress non-essential output",
+        action="store_true",
+    )
+
+    # update: refresh an existing baseline from a sweep
+    update_cmd = commands.add_parser(
+        "update", help="Update baseline from sweep results"
+    )
+    update_cmd.add_argument(
+        "--source",
+        "-s",
+        required=True,
+        type=Path,
+        help="Path to sweep summary JSON",
+    )
+    update_cmd.add_argument(
+        "--output",
+        "-o",
+        required=True,
+        type=Path,
+        help="Path to baseline JSON file to update",
+    )
+    update_cmd.add_argument(
+        "--git-commit",
+        help="Git commit hash to record",
+        type=str,
+    )
+    update_cmd.add_argument(
+        "--quiet",
+        "-q",
+        help="Suppress non-essential output",
+        action="store_true",
+    )
+
+    # create: write a brand-new baseline from a sweep
+    create_cmd = commands.add_parser(
+        "create", help="Create new baseline from sweep results"
+    )
+    create_cmd.add_argument(
+        "--source",
+        "-s",
+        required=True,
+        type=Path,
+        help="Path to sweep summary JSON",
+    )
+    create_cmd.add_argument(
+        "--output",
+        "-o",
+        required=True,
+        type=Path,
+        help="Path to write new baseline JSON",
+    )
+    create_cmd.add_argument(
+        "--git-commit",
+        help="Git commit hash to record",
+        type=str,
+    )
+    create_cmd.add_argument(
+        "--quiet",
+        "-q",
+        help="Suppress non-essential output",
+        action="store_true",
+    )
+
+    # chart: render the baseline-vs-current image
+    chart_cmd = commands.add_parser(
+        "chart", help="Generate comparison chart"
+    )
+    chart_cmd.add_argument(
+        "--current",
+        "-c",
+        required=True,
+        type=Path,
+        help="Path to current sweep summary JSON",
+    )
+    chart_cmd.add_argument(
+        "--baseline",
+        "-b",
+        required=True,
+        type=Path,
+        help="Path to baseline JSON file",
+    )
+    chart_cmd.add_argument(
+        "--output",
+        "-o",
+        required=True,
+        type=Path,
+        help="Path to save chart image",
+    )
+    chart_cmd.add_argument(
+        "--quiet",
+        "-q",
+        help="Suppress non-essential output",
+        action="store_true",
+    )
+
+    # show: print a stored baseline
+    show_cmd = commands.add_parser(
+        "show", help="Display baseline information"
+    )
+    show_cmd.add_argument(
+        "--baseline",
+        "-b",
+        default=Path("content/baselines/balance_baseline.json"),
+        type=Path,
+        help="Path to baseline JSON file",
+    )
+    show_cmd.add_argument(
+        "--json",
+        help="Output as JSON",
+        action="store_true",
+    )
+
+    args = parser.parse_args(argv)
+    # Sub-command name -> handler; each handler returns the exit code.
+    dispatch = {
+        "compare": cmd_compare,
+        "update": cmd_update,
+        "create": cmd_create,
+        "chart": cmd_chart,
+        "show": cmd_show,
+    }
+
+    return dispatch[args.command](args)
+
+
+if __name__ == "__main__":  # pragma: no cover
+    sys.exit(main())
diff --git a/tests/scripts/test_balance_validation.py b/tests/scripts/test_balance_validation.py
new file mode 100644
index 00000000..5e9392dc
--- /dev/null
+++ b/tests/scripts/test_balance_validation.py
@@ -0,0 +1,806 @@
+"""Tests for balance validation CI integration.
+
+Covers baseline management, regression detection, threshold configuration,
+comparison logic, and workflow component integration.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from importlib import util
+from pathlib import Path
+
+import pytest
+
+_MODULE_PATH = Path(__file__).resolve().parents[2].joinpath(
+    "scripts", "manage_balance_baseline.py"
+)  # the script under test, located relative to this test file
+
+
+def _load_baseline_module():  # imports the script under test by file path
+    spec = util.spec_from_file_location("manage_balance_baseline", _MODULE_PATH)
+    assert spec and spec.loader  # checked before module_from_spec uses the spec
+    module = util.module_from_spec(spec)
+    sys.modules.setdefault("manage_balance_baseline", module)
+    spec.loader.exec_module(module)
+    return module
+
+
+_module = _load_baseline_module()
+# Result and configuration types used by the tests below.
+RegressionAlert, ComparisonResult = _module.RegressionAlert, _module.ComparisonResult
+BaselineConfig = _module.BaselineConfig
+# Loading and stat-extraction helpers.
+load_baseline, load_sweep_summary = _module.load_baseline, _module.load_sweep_summary
+extract_strategy_stats = _module.extract_strategy_stats
+extract_difficulty_stats = _module.extract_difficulty_stats
+compute_win_rate = _module.compute_win_rate
+# Comparison, baseline-writing and CLI entry points.
+compare_strategy_stats = _module.compare_strategy_stats
+compare_against_baseline = _module.compare_against_baseline
+create_baseline, update_baseline = _module.create_baseline, _module.update_baseline
+main = _module.main
+
+
+# ============================================================================
+# Test Fixtures
+# ============================================================================
+
+
+@pytest.fixture
+def sample_baseline() -> dict:
+    """Baseline with three strategies, three difficulties, default thresholds."""
+    # Every strategy entry shares the same run counters (5 runs, none failed).
+    run_counts = {"count": 5, "completed": 5, "failed": 0}
+    balanced = {
+        "avg_stability": 0.7,
+        "min_stability": 0.5,
+        "max_stability": 0.9,
+        "win_rate": 0.8,
+        "avg_actions": 10,
+        "total_actions": 50,
+        **run_counts,
+    }
+    aggressive = {
+        "avg_stability": 0.5,
+        "min_stability": 0.3,
+        "max_stability": 0.7,
+        "win_rate": 0.5,
+        "avg_actions": 15,
+        "total_actions": 75,
+        **run_counts,
+    }
+    diplomatic = {
+        "avg_stability": 0.65,
+        "min_stability": 0.45,
+        "max_stability": 0.85,
+        "win_rate": 0.6,
+        "avg_actions": 8,
+        "total_actions": 40,
+        **run_counts,
+    }
+
+    return {
+        "version": "1.0",
+        "created_at": "2025-01-01T00:00:00Z",
+        "updated_at": "2025-01-01T00:00:00Z",
+        "git_commit": "abc1234",
+        "description": "Test baseline",
+        "strategy_stats": {
+            "balanced": balanced,
+            "aggressive": aggressive,
+            "diplomatic": diplomatic,
+        },
+        "difficulty_stats": {
+            "easy": {"avg_stability": 0.75, "count": 5},
+            "normal": {"avg_stability": 0.6, "count": 5},
+            "hard": {"avg_stability": 0.5, "count": 5},
+        },
+        "total_sweeps": 45,
+        "completed_sweeps": 45,
+        "failed_sweeps": 0,
+        "thresholds": {
+            "stability_delta_warning": 5.0,
+            "stability_delta_failure": 10.0,
+            "win_rate_delta_warning": 5.0,
+            "win_rate_delta_failure": 10.0,
+            "unused_content_warning": True,
+        },
+    }
+
+
+@pytest.fixture
+def sample_sweep_summary() -> dict:
+    """Sweep summary for the same three strategies, with one failed sweep."""
+    # Per-strategy aggregates; ``aggressive`` carries the single failed run.
+    strategy_stats = {
+        "balanced": {
+            "avg_stability": 0.72,
+            "min_stability": 0.52,
+            "max_stability": 0.92,
+            "avg_actions": 11,
+            "total_actions": 55,
+            "count": 15,
+            "completed": 15,
+            "failed": 0,
+        },
+        "aggressive": {
+            "avg_stability": 0.48,
+            "min_stability": 0.28,
+            "max_stability": 0.68,
+            "avg_actions": 16,
+            "total_actions": 80,
+            "count": 15,
+            "completed": 14,
+            "failed": 1,
+        },
+        "diplomatic": {
+            "avg_stability": 0.67,
+            "min_stability": 0.47,
+            "max_stability": 0.87,
+            "avg_actions": 9,
+            "total_actions": 45,
+            "count": 15,
+            "completed": 15,
+            "failed": 0,
+        },
+    }
+
+    return {
+        "config": {
+            "strategies": ["balanced", "aggressive", "diplomatic"],
+            "difficulties": ["easy", "normal", "hard"],
+            "seeds": [42, 123, 456],
+            "worlds": ["default"],
+            "tick_budgets": [100],
+        },
+        "total_sweeps": 45,
+        "completed_sweeps": 44,
+        "failed_sweeps": 1,
+        "strategy_stats": strategy_stats,
+        "difficulty_stats": {
+            "easy": {"avg_stability": 0.77, "count": 15},
+            "normal": {"avg_stability": 0.62, "count": 15},
+            "hard": {"avg_stability": 0.48, "count": 15},
+        },
+        "total_duration_seconds": 120.5,
+        "metadata": {"timestamp": "2025-01-15T10:00:00Z", "git_commit": "def5678"},
+    }
+
+
+# ============================================================================
+# Test: Baseline Creation and Loading
+# ============================================================================
+
+
+class TestBaselineCreationAndLoading:
+    """On-disk behaviour of loading, creating and updating baselines."""
+
+    def test_load_baseline_from_file(
+        self, tmp_path: Path, sample_baseline: dict
+    ) -> None:
+        """A baseline dumped as JSON is read back with its content intact."""
+        stored = tmp_path / "baseline.json"
+        stored.write_text(json.dumps(sample_baseline))
+
+        result = load_baseline(stored)
+
+        assert result is not None
+        assert result["version"] == "1.0"
+        assert "strategy_stats" in result
+        assert "balanced" in result["strategy_stats"]
+
+    def test_load_baseline_missing_file(self, tmp_path: Path) -> None:
+        """A path that does not exist yields ``None``."""
+        # Nothing is written here, so the file is guaranteed to be absent.
+        missing = tmp_path / "nonexistent.json"
+
+        assert load_baseline(missing) is None
+
+    def test_create_baseline_from_sweep(
+        self, tmp_path: Path, sample_sweep_summary: dict
+    ) -> None:
+        """Creating a baseline writes the file and copies the sweep stats."""
+        summary_file = tmp_path / "sweep_summary.json"
+        summary_file.write_text(json.dumps(sample_sweep_summary))
+        target = tmp_path / "new_baseline.json"
+
+        created = create_baseline(summary_file, target)
+
+        assert created is not None
+        assert target.exists()
+        balanced = created["strategy_stats"]["balanced"]
+        assert balanced["avg_stability"] == 0.72
+        assert created["total_sweeps"] == 45
+        assert "created_at" in created
+        assert "thresholds" in created
+
+    def test_create_baseline_with_git_commit(
+        self, tmp_path: Path, sample_sweep_summary: dict
+    ) -> None:
+        """An explicit ``git_commit`` argument is recorded in the baseline."""
+        summary_file = tmp_path / "sweep_summary.json"
+        summary_file.write_text(json.dumps(sample_sweep_summary))
+
+        created = create_baseline(
+            summary_file, tmp_path / "baseline.json", git_commit="custom123"
+        )
+        assert created["git_commit"] == "custom123"
+
+    def test_update_baseline_preserves_created_at(
+        self, tmp_path: Path, sample_baseline: dict, sample_sweep_summary: dict
+    ) -> None:
+        """Updating keeps the stored ``created_at`` but refreshes ``updated_at``."""
+        baseline_file = tmp_path / "baseline.json"
+        summary_file = tmp_path / "sweep_summary.json"
+        baseline_file.write_text(json.dumps(sample_baseline))
+        summary_file.write_text(json.dumps(sample_sweep_summary))
+
+        refreshed = update_baseline(summary_file, baseline_file)
+
+        assert refreshed["created_at"] == "2025-01-01T00:00:00Z"
+        assert refreshed["updated_at"] != refreshed["created_at"]
+
+
+# ============================================================================
+# Test: Regression Detection Logic
+# ============================================================================
+
+
+class TestRegressionDetectionLogic:
+    """Alert severity produced by ``compare_strategy_stats`` for given deltas."""
+
+    def test_detect_stability_regression_failure(self) -> None:
+        """A stability drop beyond the failure threshold is a ``failure``."""
+        # avg_stability 0.8 -> 0.6 is a 25% relative drop (limit: 10%).
+        before = {"balanced": {"avg_stability": 0.8, "win_rate": 0.9}}
+        after = {"balanced": {"avg_stability": 0.6, "win_rate": 0.7}}
+        limits = BaselineConfig(
+            stability_delta_warning=5.0,
+            stability_delta_failure=10.0,
+        )
+
+        found = compare_strategy_stats(before, after, limits)
+
+        on_stability = [
+            alert for alert in found if alert.metric_name == "avg_stability"
+        ]
+        assert len(on_stability) == 1
+        (only,) = on_stability
+        assert only.severity == "failure"
+        assert only.delta_percent < -10
+
+    def test_detect_stability_regression_warning(self) -> None:
+        """A drop between the warning and failure thresholds is a ``warning``."""
+        # avg_stability 0.8 -> 0.74 is a 7.5% relative drop (5% < 7.5% < 10%).
+        before = {"balanced": {"avg_stability": 0.8, "win_rate": 0.9}}
+        after = {"balanced": {"avg_stability": 0.74, "win_rate": 0.85}}
+        limits = BaselineConfig(
+            stability_delta_warning=5.0,
+            stability_delta_failure=10.0,
+        )
+
+        found = compare_strategy_stats(before, after, limits)
+
+        on_stability = [
+            alert for alert in found if alert.metric_name == "avg_stability"
+        ]
+        assert len(on_stability) == 1
+        (only,) = on_stability
+        assert only.severity == "warning"
+
+    def test_no_regression_within_threshold(self) -> None:
+        """A drop smaller than the warning threshold raises no stability alert."""
+        # avg_stability 0.8 -> 0.78 is a 2.5% relative drop, which stays below
+        # the 5% warning threshold configured here.
+        before = {"balanced": {"avg_stability": 0.8, "win_rate": 0.9}}
+        after = {"balanced": {"avg_stability": 0.78, "win_rate": 0.88}}
+        limits = BaselineConfig(
+            stability_delta_warning=5.0,
+            stability_delta_failure=10.0,
+        )
+
+        found = compare_strategy_stats(before, after, limits)
+
+        on_stability = [
+            alert for alert in found if alert.metric_name == "avg_stability"
+        ]
+        assert len(on_stability) == 0
+
+    def test_stability_improvement_not_flagged(self) -> None:
+        """Higher stability than the baseline produces no stability alert."""
+        # avg_stability 0.6 -> 0.8 is a 33% improvement; the config uses its
+        # built-in default thresholds.
+        before = {"balanced": {"avg_stability": 0.6, "win_rate": 0.7}}
+        after = {"balanced": {"avg_stability": 0.8, "win_rate": 0.9}}
+        limits = BaselineConfig()
+
+        found = compare_strategy_stats(before, after, limits)
+
+        on_stability = [
+            alert for alert in found if alert.metric_name == "avg_stability"
+        ]
+        assert len(on_stability) == 0
+
+    def test_detect_win_rate_regression(self) -> None:
+        """A win-rate drop beyond the failure threshold is a ``failure``."""
+        # win_rate 0.8 -> 0.6 is a 25% relative drop; stability is unchanged.
+        before = {"aggressive": {"avg_stability": 0.5, "win_rate": 0.8}}
+        after = {"aggressive": {"avg_stability": 0.5, "win_rate": 0.6}}
+        limits = BaselineConfig(
+            win_rate_delta_warning=5.0,
+            win_rate_delta_failure=10.0,
+        )
+
+        found = compare_strategy_stats(before, after, limits)
+
+        on_win_rate = [
+            alert for alert in found if alert.metric_name == "win_rate"
+        ]
+        assert len(on_win_rate) == 1
+        (only,) = on_win_rate
+        assert only.severity == "failure"
+
+    def test_multiple_strategy_regressions(self) -> None:
+        """Every strategy with a large stability drop appears in the alerts."""
+        # strategy -> (baseline avg_stability, current avg_stability)
+        stability = {
+            "balanced": (0.8, 0.5),  # big drop
+            "aggressive": (0.6, 0.58),  # minor drop
+            "diplomatic": (0.7, 0.4),  # big drop
+        }
+
+        before = {name: {"avg_stability": old} for name, (old, _) in stability.items()}
+        after = {name: {"avg_stability": new} for name, (_, new) in stability.items()}
+        limits = BaselineConfig(
+            stability_delta_warning=5.0,
+            stability_delta_failure=10.0,
+        )
+
+        found = compare_strategy_stats(before, after, limits)
+
+        # Only the two big drops are asserted; the minor "aggressive" drop is
+        # neither required nor forbidden by this test.
+        flagged = {alert.strategy for alert in found}
+        assert "balanced" in flagged
+        assert "diplomatic" in flagged
+
+
+# ============================================================================
+# Test: Threshold Configuration
+# ============================================================================
+
+
+class TestThresholdConfiguration:
+    """``BaselineConfig`` construction and its effect on alerting."""
+
+    def test_baseline_config_from_dict(self) -> None:
+        """``from_dict`` takes each threshold value from the mapping."""
+        loaded = BaselineConfig.from_dict(
+            {
+                "stability_delta_warning": 3.0,
+                "stability_delta_failure": 8.0,
+                "win_rate_delta_warning": 4.0,
+                "win_rate_delta_failure": 12.0,
+                "unused_content_warning": False,
+            }
+        )
+
+        assert loaded.stability_delta_warning == 3.0
+        assert loaded.stability_delta_failure == 8.0
+        assert loaded.win_rate_delta_warning == 4.0
+        assert loaded.win_rate_delta_failure == 12.0
+        assert loaded.unused_content_warning is False
+
+    def test_baseline_config_defaults(self) -> None:
+        """An empty mapping falls back to the 5% / 10% stability defaults."""
+        defaults = BaselineConfig.from_dict({})
+
+        assert defaults.stability_delta_warning == 5.0
+        assert defaults.stability_delta_failure == 10.0
+
+    def test_custom_threshold_in_comparison(self) -> None:
+        """The same 7.5% drop alerts at a 5% threshold but not at 10%."""
+        # Both metrics fall from 0.8 to 0.74, i.e. by 7.5% of the baseline.
+        before = {
+            "balanced": {"avg_stability": 0.8, "win_rate": 0.8},
+        }
+        after = {
+            "balanced": {"avg_stability": 0.74, "win_rate": 0.74},
+        }
+
+        # Strict run: 5% warning threshold, so stability must be flagged.
+        strict = BaselineConfig(
+            stability_delta_warning=5.0,
+            stability_delta_failure=10.0,
+        )
+        strict_found = compare_strategy_stats(before, after, strict)
+
+        # win_rate may alert as well, so only stability alerts are counted.
+        strict_on_stability = [
+            alert for alert in strict_found if alert.metric_name == "avg_stability"
+        ]
+        assert len(strict_on_stability) == 1
+
+        # Relaxed run: every warning threshold is 10%, above the 7.5% drop,
+        # so no alert of any kind is expected.
+        relaxed = BaselineConfig(
+            stability_delta_warning=10.0,
+            stability_delta_failure=20.0,
+            win_rate_delta_warning=10.0,
+            win_rate_delta_failure=20.0,
+        )
+
+        relaxed_found = compare_strategy_stats(before, after, relaxed)
+        assert len(relaxed_found) == 0
+
+
+# ============================================================================
+# Test: Comparison of Sweep Results
+# ============================================================================
+
+
+class TestComparisonOfSweepResults:
+    """Tests for comparing sweep results against baseline."""
+
+    def test_compare_against_baseline_no_regression(
+        self, tmp_path: Path, sample_baseline: dict, sample_sweep_summary: dict
+    ) -> None:
+        """Comparison passes when current stats improve on the baseline."""
+        # NOTE: dict.copy() is shallow, so the nested writes below also
+        # change the dicts handed in by the fixtures.
+        lowered = sample_baseline.copy()
+        baseline_values = {"balanced": 0.6, "aggressive": 0.4, "diplomatic": 0.5}
+        for name, value in baseline_values.items():
+            lowered["strategy_stats"][name]["avg_stability"] = value
+            lowered["strategy_stats"][name]["win_rate"] = value
+        baseline_file = tmp_path / "baseline.json"
+        baseline_file.write_text(json.dumps(lowered))
+
+        # The sweep's stability values are set explicitly so that each one
+        # is higher than the lowered baseline value for the same strategy.
+        improved = sample_sweep_summary.copy()
+        sweep_values = {"balanced": 0.72, "aggressive": 0.48, "diplomatic": 0.67}
+        for name, value in sweep_values.items():
+            improved["strategy_stats"][name]["avg_stability"] = value
+        sweep_file = tmp_path / "sweep_summary.json"
+        sweep_file.write_text(json.dumps(improved))
+
+        outcome = compare_against_baseline(
+            baseline_file, sweep_file, stability_threshold=5.0
+        )
+
+        assert outcome.passed is True
+        assert "PASSED" in outcome.summary
+
+    def test_compare_against_baseline_with_regression(
+        self, tmp_path: Path, sample_baseline: dict, sample_sweep_summary: dict
+    ) -> None:
+        """Comparison fails when ``balanced`` stability drops from 0.7 to 0.4."""
+        baseline_file = tmp_path / "baseline.json"
+        sweep_file = tmp_path / "sweep_summary.json"
+        baseline_file.write_text(json.dumps(sample_baseline))
+
+        # Shallow copy: the nested write below also changes the fixture dict.
+        degraded = sample_sweep_summary.copy()
+        degraded["strategy_stats"]["balanced"]["avg_stability"] = 0.4
+        sweep_file.write_text(json.dumps(degraded))
+
+        outcome = compare_against_baseline(
+            baseline_file, sweep_file, stability_threshold=5.0
+        )
+
+        assert outcome.passed is False
+        assert len(outcome.regressions) > 0
+        assert "FAILED" in outcome.summary
+
+    def test_compare_missing_baseline_passes(
+        self, tmp_path: Path, sample_sweep_summary: dict
+    ) -> None:
+        """Without a baseline file the comparison passes and says so."""
+        sweep_file = tmp_path / "sweep_summary.json"
+        sweep_file.write_text(json.dumps(sample_sweep_summary))
+        absent_baseline = tmp_path / "nonexistent_baseline.json"  # never created
+
+        outcome = compare_against_baseline(
+            absent_baseline, sweep_file, stability_threshold=5.0
+        )
+
+        assert outcome.passed is True
+        assert "establishing new baseline" in outcome.summary.lower()
+
+    def test_compare_missing_current_fails(
+        self, tmp_path: Path, sample_baseline: dict
+    ) -> None:
+        """Comparison fails when current sweep data is missing."""
+        baseline_path = tmp_path / "baseline.json"
+        baseline_path.write_text(json.dumps(sample_baseline))
+        sweep_path = tmp_path / "nonexistent_sweep.json"
+
+        result = compare_against_baseline(
+            baseline_path, sweep_path, stability_threshold=5.0
+        )
+
+        assert result.passed is False
+        assert "no current sweep data" in result.summary.lower()
+
+    def test_comparison_result_serialization(
+        self, tmp_path: Path, sample_baseline: dict, sample_sweep_summary: dict
+    ) -> None:
+        """ComparisonResult can be serialized to JSON."""
+        baseline_path = tmp_path / "baseline.json"
+        baseline_path.write_text(json.dumps(sample_baseline))
+        sweep_path = tmp_path / "sweep_summary.json"
+        sweep_path.write_text(json.dumps(sample_sweep_summary))
+
+        result = compare_against_baseline(
+            baseline_path, sweep_path, stability_threshold=5.0
+        )
+
+        data = result.to_dict()
+
+        assert "timestamp" in data
+        assert "regressions" in data
+        assert "baseline_stats" in data
+        assert "current_stats" in data
+        assert "summary" in data
+        assert "passed" in data
+
+        # Should be JSON serializable
+        json_str = json.dumps(data)
+        assert len(json_str) > 0
+
+
+# ============================================================================
+# Test: Workflow Component Integration
+# ============================================================================
+
+
+class TestWorkflowComponentIntegration:
+    """Tests for CLI commands and workflow integration."""
+
+    def test_cli_compare_command(
+        self, tmp_path: Path, sample_baseline: dict, sample_sweep_summary: dict, capsys
+    ) -> None:
+        """CLI compare command works correctly."""
+        baseline_path = tmp_path / "baseline.json"
+        baseline_path.write_text(json.dumps(sample_baseline))
+        sweep_path = tmp_path / "sweep_summary.json"
+        sweep_path.write_text(json.dumps(sample_sweep_summary))
+        output_path = tmp_path / "result.json"
+
+        main([
+            "compare",
+            "--current", str(sweep_path),
+            "--baseline", str(baseline_path),
+            "--output", str(output_path),
+        ])
+
+        assert output_path.exists()
+        result = json.loads(output_path.read_text())
+        assert "regressions" in result
+        assert "summary" in result
+
+    def test_cli_update_command(
+        self, tmp_path: Path, sample_sweep_summary: dict, capsys
+    ) -> None:
+        """CLI update command creates/updates baseline."""
+        sweep_path = tmp_path / "sweep_summary.json"
+        sweep_path.write_text(json.dumps(sample_sweep_summary))
+        output_path = tmp_path / "baseline.json"
+
+        exit_code = main([
+            "update",
+            "--source", str(sweep_path),
+            "--output", str(output_path),
+        ])
+
+        assert exit_code == 0
+        assert output_path.exists()
+
+        baseline = json.loads(output_path.read_text())
+        assert baseline["strategy_stats"]["balanced"]["avg_stability"] == 0.72
+
+    def test_cli_create_command(
+        self, tmp_path: Path, sample_sweep_summary: dict, capsys
+    ) -> None:
+        """CLI create command creates new baseline."""
+        sweep_path = tmp_path / "sweep_summary.json"
+        sweep_path.write_text(json.dumps(sample_sweep_summary))
+        output_path = tmp_path / "new_baseline.json"
+
+        exit_code = main([
+            "create",
+            "--source", str(sweep_path),
+            "--output", str(output_path),
+            "--git-commit", "test123",
+        ])
+
+        assert exit_code == 0
+        baseline = json.loads(output_path.read_text())
+        assert baseline["git_commit"] == "test123"
+
+    def test_cli_show_command(
+        self, tmp_path: Path, sample_baseline: dict, capsys
+    ) -> None:
+        """CLI show command displays baseline info."""
+        baseline_path = tmp_path / "baseline.json"
+        baseline_path.write_text(json.dumps(sample_baseline))
+
+        exit_code = main([
+            "show",
+            "--baseline", str(baseline_path),
+        ])
+
+        assert exit_code == 0
+        captured = capsys.readouterr()
+        assert "balanced" in captured.out
+        assert "aggressive" in captured.out
+
+    def test_cli_show_json_output(
+        self, tmp_path: Path, sample_baseline: dict, capsys
+    ) -> None:
+        """CLI show command outputs JSON when requested."""
+        baseline_path = tmp_path / "baseline.json"
+        baseline_path.write_text(json.dumps(sample_baseline))
+
+        exit_code = main([
+            "show",
+            "--baseline", str(baseline_path),
+            "--json",
+        ])
+
+        assert exit_code == 0
+        captured = capsys.readouterr()
+        data = json.loads(captured.out)
+        assert data["version"] == "1.0"
+
+    def test_cli_compare_with_custom_threshold(
+        self, tmp_path: Path, sample_baseline: dict, sample_sweep_summary: dict, capsys
+    ) -> None:
+        """CLI compare respects custom stability threshold."""
+        baseline_path = tmp_path / "baseline.json"
+        baseline_path.write_text(json.dumps(sample_baseline))
+
+        # Create sweep with significant regression (50% drop from 0.7)
+        current_sweep = sample_sweep_summary.copy()
+        current_sweep["strategy_stats"]["balanced"]["avg_stability"] = 0.35
+        sweep_path = tmp_path / "sweep_summary.json"
+        sweep_path.write_text(json.dumps(current_sweep))
+
+        # With strict threshold (5%) - should fail
+        strict_result = tmp_path / "strict_result.json"
+        exit_code = main([
+            "compare",
+            "--current", str(sweep_path),
+            "--baseline", str(baseline_path),
+            "--output", str(strict_result),
+            "--stability-threshold", "5",
+            "--quiet",
+        ])
+        assert exit_code == 1  # Failed
+
+        # Create sweep with improvement (current > baseline) - should pass
+        improved_sweep = sample_sweep_summary.copy()
+        # Better than baseline: 0.7 -> 0.8, 0.5 -> 0.6, 0.65 -> 0.75
+        improved_sweep["strategy_stats"]["balanced"]["avg_stability"] = 0.8
+        improved_sweep["strategy_stats"]["aggressive"]["avg_stability"] = 0.6
+        improved_sweep["strategy_stats"]["diplomatic"]["avg_stability"] = 0.75
+        improved_sweep_path = tmp_path / "improved_sweep.json"
+        improved_sweep_path.write_text(json.dumps(improved_sweep))
+
+        relaxed_result = tmp_path / "relaxed_result.json"
+        exit_code = main([
+            "compare",
+            "--current", str(improved_sweep_path),
+            "--baseline", str(baseline_path),
+            "--output", str(relaxed_result),
+            "--stability-threshold", "5",
+            "--quiet",
+        ])
+        assert exit_code == 0  # Passed (improvements are not regressions)
+
+
+# ============================================================================
+# Test: Data Class Serialization
+# ============================================================================
+
+
+class TestDataClassSerialization:
+    """Tests for data class to_dict methods."""
+
+    def test_regression_alert_to_dict(self) -> None:
+        """RegressionAlert serializes correctly."""
+        alert = RegressionAlert(
+            metric_name="avg_stability",
+            strategy="balanced",
+            baseline_value=0.8,
+            current_value=0.6,
+            delta_percent=-25.0,
+            severity="failure",
+            description="Test alert",
+        )
+
+        data = alert.to_dict()
+
+        assert data["metric_name"] == "avg_stability"
+        assert data["strategy"] == "balanced"
+        assert data["baseline_value"] == 0.8
+        assert data["current_value"] == 0.6
+        assert data["delta_percent"] == -25.0
+        assert data["severity"] == "failure"
+
+    def test_comparison_result_to_dict(self) -> None:
+        """ComparisonResult serializes correctly."""
+        result = ComparisonResult(
+            timestamp="2025-01-15T10:00:00Z",
+            baseline_path="/test/baseline.json",
+            current_path="/test/current.json",
+            regressions=[
+                RegressionAlert(
+                    metric_name="avg_stability",
+                    strategy="balanced",
+                    baseline_value=0.8,
+                    current_value=0.6,
+                    delta_percent=-25.0,
+                    severity="failure",
+                    description="Test alert",
+                )
+            ],
+            summary="FAILED",
+            passed=False,
+        )
+
+        data = result.to_dict()
+
+        assert data["timestamp"] == "2025-01-15T10:00:00Z"
+        assert len(data["regressions"]) == 1
+        assert data["passed"] is False
+
+
+# ============================================================================
+# Test: Helper Functions
+# ============================================================================
+
+
+class TestHelperFunctions:
+    """Tests for utility/helper functions."""
+
+    def test_extract_strategy_stats(self, sample_sweep_summary: dict) -> None:
+        """Extract strategy statistics from sweep summary."""
+        stats = extract_strategy_stats(sample_sweep_summary)
+
+        assert "balanced" in stats
+        assert "aggressive" in stats
+        assert "diplomatic" in stats
+        assert stats["balanced"]["avg_stability"] == 0.72
+
+    def test_extract_difficulty_stats(self, sample_sweep_summary: dict) -> None:
+        """Extract difficulty statistics from sweep summary."""
+        stats = extract_difficulty_stats(sample_sweep_summary)
+
+        assert "easy" in stats
+        assert "normal" in stats
+        assert "hard" in stats
+
+    def test_compute_win_rate_above_threshold(self) -> None:
+        """Compute win rate for strategy above stability threshold."""
+        stats = {"avg_stability": 0.75}
+        win_rate = compute_win_rate(stats)
+        assert 0.5 < win_rate <= 1.0
+
+    def test_compute_win_rate_below_threshold(self) -> None:
+        """Compute win rate for strategy below stability threshold."""
+        stats = {"avg_stability": 0.3}
+        win_rate = compute_win_rate(stats)
+        assert 0.0 <= win_rate < 0.5
+
+    def test_compute_win_rate_explicit_value(self) -> None:
+        """compute_win_rate is expected not to return an explicit win_rate entry.
+
+        The stats carry ``win_rate`` 0.8 alongside ``avg_stability`` 0.4; the
+        asserted result is below 0.5, i.e. not the explicit 0.8.
+        """
+        stats = {"avg_stability": 0.4, "win_rate": 0.8}
+        # The compare function checks for explicit win_rate first
+        # but compute_win_rate approximates from avg_stability
+        win_rate = compute_win_rate(stats)
+        # This should return approximation based on avg_stability
+        assert win_rate < 0.5