# Balance Validation workflow (.github/workflows/balance-validation.yml)
#
# Runs a reduced balance sweep, compares it against the committed baseline,
# comments on pull requests, and (on main) proposes a refreshed baseline.
name: Balance Validation

on:
  schedule:
    # Run nightly at 3:00 AM UTC (offset from ai-tournament at 2:00 AM)
    - cron: '0 3 * * *'
  workflow_dispatch:
    inputs:
      compare_baseline:
        description: 'Compare against baseline (true/false)'
        required: false
        default: 'true'
      fail_on_regression:
        description: 'Fail workflow on regression detection (true/false)'
        required: false
        default: 'true'
      stability_threshold:
        description: 'Stability delta threshold for regression (percentage)'
        required: false
        default: '5'
  push:
    branches:
      - main
    paths:
      # Trigger on changes to simulation code, content, or AI systems
      - 'src/gengine/echoes/sim/**'
      - 'src/gengine/echoes/core/**'
      - 'src/gengine/ai_player/**'
      - 'content/worlds/**'
      - 'content/config/**'
      - 'scripts/run_batch_sweeps.py'
      - 'scripts/analyze_balance.py'
  pull_request:
    branches:
      - main
    paths:
      - 'src/gengine/echoes/sim/**'
      - 'src/gengine/echoes/core/**'
      - 'src/gengine/ai_player/**'
      - 'content/worlds/**'
      - 'content/config/**'
      - 'scripts/run_batch_sweeps.py'
      - 'scripts/analyze_balance.py'

# Default token scope; jobs that need more declare their own permissions.
permissions:
  contents: read
  pull-requests: write

jobs:
  balance-validation:
    runs-on: ubuntu-latest
    outputs:
      regression_detected: ${{ steps.compare.outputs.regression_detected }}
      regression_summary: ${{ steps.compare.outputs.regression_summary }}

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e ".[dev]"

      - name: Create build directory
        run: mkdir -p build

      - name: Run CI balance sweeps
        id: sweep
        run: |
          echo "Running reduced parameter grid balance sweeps..."
          python scripts/run_batch_sweeps.py \
            --config content/config/ci_balance_sweeps.yml \
            --output-dir build/ci_sweeps \
            --verbose

          echo "sweep_completed=true" >> "$GITHUB_OUTPUT"

      - name: Aggregate sweep results
        run: |
          python scripts/aggregate_sweep_results.py \
            --database build/ci_sweep_results.db \
            ingest build/ci_sweeps \
            --verbose

      - name: Generate balance report
        run: |
          python scripts/analyze_balance.py \
            --database build/ci_sweep_results.db \
            report

      - name: Compare against baseline
        id: compare
        env:
          COMPARE_BASELINE: ${{ github.event.inputs.compare_baseline || 'true' }}
          STABILITY_THRESHOLD: ${{ github.event.inputs.stability_threshold || '5' }}
        run: |
          if [[ "$COMPARE_BASELINE" != "true" ]]; then
            echo "Baseline comparison skipped by user input"
            echo "regression_detected=false" >> "$GITHUB_OUTPUT"
            echo "regression_summary=Baseline comparison skipped" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # The compare command exits non-zero when it finds failures. The
          # default shell is `bash -e`, so capture the status instead of
          # aborting; otherwise the step outputs below are never written and
          # the fail_on_regression switch cannot take effect.
          compare_status=0
          python scripts/manage_balance_baseline.py compare \
            --current build/ci_sweeps/batch_sweep_summary.json \
            --baseline content/baselines/balance_baseline.json \
            --output build/regression_summary.json \
            --stability-threshold "$STABILITY_THRESHOLD" || compare_status=$?

          if [[ ! -f build/regression_summary.json ]]; then
            echo "regression_detected=false" >> "$GITHUB_OUTPUT"
            echo "regression_summary=No baseline comparison performed" >> "$GITHUB_OUTPUT"
            # No summary plus a non-zero status means the tool itself failed.
            exit "$compare_status"
          fi

          # Check if regressions were detected
          REGRESSION_COUNT=$(python -c "import json; d=json.load(open('build/regression_summary.json')); print(len(d.get('regressions', [])))")
          if [[ "$REGRESSION_COUNT" -gt 0 ]]; then
            echo "regression_detected=true" >> "$GITHUB_OUTPUT"
            SUMMARY=$(python -c "import json; d=json.load(open('build/regression_summary.json')); print(d.get('summary', 'Regressions detected'))")
            echo "regression_summary=$SUMMARY" >> "$GITHUB_OUTPUT"
          else
            echo "regression_detected=false" >> "$GITHUB_OUTPUT"
            echo "regression_summary=No regressions detected" >> "$GITHUB_OUTPUT"
            # A failing status without recorded regressions (for example
            # missing sweep data) is a real error, not a balance regression.
            exit "$compare_status"
          fi

      - name: Generate comparison charts
        if: always()
        run: |
          python scripts/manage_balance_baseline.py chart \
            --current build/ci_sweeps/batch_sweep_summary.json \
            --baseline content/baselines/balance_baseline.json \
            --output build/comparison_charts.png || echo "Chart generation skipped (missing baseline or data)"

      - name: Archive balance validation results
        # Keep whatever was produced even when an earlier step failed.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: balance-validation-${{ github.run_id }}
          path: |
            build/ci_sweeps/batch_sweep_summary.json
            build/ci_sweep_results.db
            build/balance_report.html
            build/balance_report.json
            build/regression_summary.json
            build/comparison_charts.png
          retention-days: 30

      - name: Comment on PR with balance summary
        # Fork pull requests receive a read-only token, so commenting would
        # fail the job; only comment for branches of this repository.
        if: >-
          github.event_name == 'pull_request' &&
          github.event.pull_request.head.repo.full_name == github.repository
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');

            let report = {};
            try {
              report = JSON.parse(fs.readFileSync('build/balance_report.json', 'utf8'));
            } catch (e) {
              console.log('Could not read balance report');
              return;
            }

            let regressionSummary = {};
            try {
              regressionSummary = JSON.parse(fs.readFileSync('build/regression_summary.json', 'utf8'));
            } catch (e) {
              regressionSummary = { regressions: [], summary: 'No baseline comparison' };
            }

            const regressions = regressionSummary.regressions || [];
            const hasRegressions = regressions.length > 0;

            let body = `## 📊 Balance Validation Results\n\n`;

            if (hasRegressions) {
              body += `### ⚠️ Regressions Detected\n\n`;
              for (const reg of regressions) {
                const icon = reg.severity === 'failure' ? '❌' : '⚠️';
                body += `- ${icon} **${reg.metric_name}**: ${reg.description}\n`;
              }
              body += `\n`;
            } else {
              body += `### ✅ No Regressions Detected\n\n`;
            }

            // Strategy stats
            if (regressionSummary.current_stats && regressionSummary.current_stats.strategy_stats) {
              body += `### Strategy Win Rates\n\n`;
              body += `| Strategy | Avg Stability | Win Rate |\n`;
              body += `|----------|---------------|----------|\n`;
              for (const [strategy, stats] of Object.entries(regressionSummary.current_stats.strategy_stats)) {
                const avgStab = stats.avg_stability?.toFixed(3) || 'N/A';
                const winRate = ((stats.avg_stability || 0) >= 0.5 ? '✅' : '⚠️');
                body += `| ${strategy} | ${avgStab} | ${winRate} |\n`;
              }
              body += `\n`;
            }

            body += `\n📁 [Download full report](${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID})\n`;

            // Await the request so API errors surface in this step.
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: body
            });

      - name: Fail on regression (if configured)
        if: steps.compare.outputs.regression_detected == 'true'
        env:
          FAIL_ON_REGRESSION: ${{ github.event.inputs.fail_on_regression || 'true' }}
          # Passed through the environment rather than interpolated into the
          # script, so the summary text can never be executed as shell.
          REGRESSION_SUMMARY: ${{ steps.compare.outputs.regression_summary }}
        run: |
          echo "::error::Balance regression detected: $REGRESSION_SUMMARY"
          if [[ "$FAIL_ON_REGRESSION" == "true" ]]; then
            echo "Failing workflow due to regression detection"
            exit 1
          else
            echo "::warning::Regression detected but workflow configured to continue"
          fi

  update-baseline:
    needs: balance-validation
    runs-on: ubuntu-latest
    # Only update baseline on successful main branch runs (scheduled or push)
    if: |
      github.ref == 'refs/heads/main' &&
      (github.event_name == 'schedule' || github.event_name == 'push') &&
      needs.balance-validation.outputs.regression_detected != 'true'
    # create-pull-request pushes a branch, which needs write access to
    # repository contents in addition to pull-request write access.
    permissions:
      contents: write
      pull-requests: write

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e ".[dev]"

      - name: Download sweep results
        uses: actions/download-artifact@v4.1.8
        with:
          name: balance-validation-${{ github.run_id }}
          path: build

      - name: Update baseline
        run: |
          python scripts/manage_balance_baseline.py update \
            --source build/ci_sweeps/batch_sweep_summary.json \
            --output content/baselines/balance_baseline.json

      - name: Create baseline update PR
        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: "chore: update balance validation baseline"
          branch: automated/update-balance-baseline
          delete-branch: true
          title: "[Automated] Update Balance Validation Baseline"
          body: |
            This PR updates the balance validation baseline from the latest successful CI run.

            **Run ID:** ${{ github.run_id }}
            **Commit:** ${{ github.sha }}

            The baseline is updated automatically when:
            - The balance validation workflow passes on main branch
            - No regressions are detected against the previous baseline

            Please review the changes before merging.
          labels: |
            automated
            balance
+ labels: | + automated + balance diff --git a/content/baselines/balance_baseline.json b/content/baselines/balance_baseline.json new file mode 100644 index 00000000..5b679663 --- /dev/null +++ b/content/baselines/balance_baseline.json @@ -0,0 +1,67 @@ +{ + "version": "1.0", + "created_at": "2025-01-01T00:00:00Z", + "updated_at": "2025-01-01T00:00:00Z", + "git_commit": null, + "description": "Initial balance baseline - placeholder for first CI run", + "strategy_stats": { + "balanced": { + "avg_stability": 0.7, + "min_stability": 0.5, + "max_stability": 0.9, + "win_rate": 0.8, + "avg_actions": 10, + "total_actions": 50, + "count": 5, + "completed": 5, + "failed": 0 + }, + "aggressive": { + "avg_stability": 0.5, + "min_stability": 0.3, + "max_stability": 0.7, + "win_rate": 0.5, + "avg_actions": 15, + "total_actions": 75, + "count": 5, + "completed": 5, + "failed": 0 + }, + "diplomatic": { + "avg_stability": 0.65, + "min_stability": 0.45, + "max_stability": 0.85, + "win_rate": 0.6, + "avg_actions": 8, + "total_actions": 40, + "count": 5, + "completed": 5, + "failed": 0 + } + }, + "difficulty_stats": { + "easy": { + "avg_stability": 0.75, + "count": 5 + }, + "normal": { + "avg_stability": 0.6, + "count": 5 + }, + "hard": { + "avg_stability": 0.5, + "count": 5 + } + }, + "total_sweeps": 45, + "completed_sweeps": 45, + "failed_sweeps": 0, + "total_duration_seconds": 0, + "thresholds": { + "stability_delta_warning": 5.0, + "stability_delta_failure": 10.0, + "win_rate_delta_warning": 5.0, + "win_rate_delta_failure": 10.0, + "unused_content_warning": true + } +} diff --git a/content/config/ci_balance_sweeps.yml b/content/config/ci_balance_sweeps.yml new file mode 100644 index 00000000..8c6d8818 --- /dev/null +++ b/content/config/ci_balance_sweeps.yml @@ -0,0 +1,64 @@ +# CI Balance Sweeps Configuration +# Reduced parameter grid for fast CI validation of balance changes. +# This configuration is optimized for speed while still providing meaningful coverage. 
+ +# Parameter Grid - reduced from full sweeps for CI speed +parameters: + # Three representative strategies covering the spectrum + strategies: + - balanced + - aggressive + - diplomatic + + # Three difficulty presets to validate scaling + difficulties: + - easy + - normal + - hard + + # Five deterministic seeds for reproducibility with statistical significance + seeds: + - 42 + - 123 + - 456 + - 789 + - 1000 + + # Default world only for CI (additional worlds tested in nightly full sweeps) + worlds: + - default + + # Reduced tick budget for faster execution (100 ticks captures most dynamics) + tick_budgets: + - 100 + +# Parallel execution settings optimized for CI runners +parallel: + # Use 2 workers to balance speed and resource usage on CI + max_workers: 2 + + # Shorter timeout for CI + timeout_per_sweep: 120 + +# Output configuration +output: + # Standard CI output directory + dir: build/ci_sweeps + + # Include telemetry for debugging but keep it minimal + include_telemetry: true + + # Include summary for report generation + include_summary: true + +# Sampling - use full grid since CI grid is already small (3*3*5*1*1 = 45 combinations) +sampling: + mode: full + sample_count: 45 + sample_seed: 42 + +# Metadata for traceability +metadata: + include_git_commit: true + include_timestamp: true + include_runtime_info: true diff --git a/scripts/manage_balance_baseline.py b/scripts/manage_balance_baseline.py new file mode 100644 index 00000000..fe9950ea --- /dev/null +++ b/scripts/manage_balance_baseline.py @@ -0,0 +1,954 @@ +#!/usr/bin/env python3 +"""Manage balance validation baselines for CI integration. + +Provides tools for storing, updating, and comparing balance sweep results +against baseline data for regression detection in CI workflows. 
@dataclass
class RegressionAlert:
    """A single metric that got worse relative to the baseline.

    ``delta_percent`` is the relative change in percent, and ``severity`` is
    either ``"warning"`` or ``"failure"``.
    """

    metric_name: str
    strategy: str | None
    baseline_value: float
    current_value: float
    delta_percent: float
    severity: str  # "warning" or "failure"
    description: str

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serialisable mapping with the numbers rounded."""
        payload: dict[str, Any] = {
            "metric_name": self.metric_name,
            "strategy": self.strategy,
        }
        # Metric values keep four decimals; the percentage keeps two.
        payload["baseline_value"] = round(self.baseline_value, 4)
        payload["current_value"] = round(self.current_value, 4)
        payload["delta_percent"] = round(self.delta_percent, 2)
        payload["severity"] = self.severity
        payload["description"] = self.description
        return payload


@dataclass
class ComparisonResult:
    """Outcome of one baseline-versus-current comparison run."""

    timestamp: str
    baseline_path: str
    current_path: str
    regressions: list[RegressionAlert] = field(default_factory=list)
    baseline_stats: dict[str, Any] = field(default_factory=dict)
    current_stats: dict[str, Any] = field(default_factory=dict)
    summary: str = ""
    passed: bool = True

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serialisable mapping, expanding every alert."""
        serialized_alerts = []
        for alert in self.regressions:
            serialized_alerts.append(alert.to_dict())

        return dict(
            timestamp=self.timestamp,
            baseline_path=self.baseline_path,
            current_path=self.current_path,
            regressions=serialized_alerts,
            baseline_stats=self.baseline_stats,
            current_stats=self.current_stats,
            summary=self.summary,
            passed=self.passed,
        )


@dataclass
class BaselineConfig:
    """Regression thresholds; the delta values are percentages."""

    stability_delta_warning: float = 5.0  # percentage
    stability_delta_failure: float = 10.0  # percentage
    win_rate_delta_warning: float = 5.0  # percentage
    win_rate_delta_failure: float = 10.0  # percentage
    unused_content_warning: bool = True

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> BaselineConfig:
        """Build a config from ``data``, using defaults for absent keys."""
        fallbacks = (
            ("stability_delta_warning", 5.0),
            ("stability_delta_failure", 10.0),
            ("win_rate_delta_warning", 5.0),
            ("win_rate_delta_failure", 10.0),
            ("unused_content_warning", True),
        )
        chosen = {key: data.get(key, fallback) for key, fallback in fallbacks}
        return cls(**chosen)
def _read_json_if_present(path: Path) -> dict[str, Any] | None:
    """Parse ``path`` as UTF-8 JSON, or return ``None`` if it does not exist."""
    if not path.exists():
        return None

    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def load_baseline(path: Path) -> dict[str, Any] | None:
    """Load baseline data from JSON file.

    Parameters
    ----------
    path
        Path to baseline JSON file.

    Returns
    -------
    dict[str, Any] | None
        Baseline data or None if file doesn't exist.

    Raises
    ------
    json.JSONDecodeError
        If the file exists but is not valid JSON.
    """
    return _read_json_if_present(path)


def load_sweep_summary(path: Path) -> dict[str, Any] | None:
    """Load sweep summary from batch_sweep_summary.json.

    Parameters
    ----------
    path
        Path to sweep summary JSON file.

    Returns
    -------
    dict[str, Any] | None
        Sweep summary data or None if file doesn't exist.

    Raises
    ------
    json.JSONDecodeError
        If the file exists but is not valid JSON.
    """
    return _read_json_if_present(path)


def extract_strategy_stats(sweep_data: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Extract strategy statistics from sweep summary.

    Parameters
    ----------
    sweep_data
        Sweep summary data.

    Returns
    -------
    dict[str, dict[str, Any]]
        Strategy statistics keyed by strategy name; empty when the
        ``strategy_stats`` section is missing or null.
    """
    # ``or {}`` also covers an explicit JSON null, which .get() would pass on.
    return sweep_data.get("strategy_stats") or {}


def extract_difficulty_stats(sweep_data: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Extract difficulty statistics from sweep summary.

    Parameters
    ----------
    sweep_data
        Sweep summary data.

    Returns
    -------
    dict[str, dict[str, Any]]
        Difficulty statistics keyed by difficulty name; empty when the
        ``difficulty_stats`` section is missing or null.
    """
    return sweep_data.get("difficulty_stats") or {}
def compute_win_rate(strategy_stats: dict[str, Any]) -> float:
    """Approximate a strategy's win rate from its average stability.

    No per-game outcomes are available here, so the average stability,
    clamped to the range 0.0-1.0, is used as a stand-in for the win rate.

    Parameters
    ----------
    strategy_stats
        Statistics for a single strategy.

    Returns
    -------
    float
        Approximate win rate as a fraction (0.0 to 1.0); 0.0 when
        ``avg_stability`` is missing or null.
    """
    avg_stability = strategy_stats.get("avg_stability") or 0.0
    return min(1.0, max(0.0, avg_stability))


def _relative_drop_alert(
    *,
    metric_name: str,
    label: str,
    value_format: str,
    strategy: str,
    baseline_value: float,
    current_value: float,
    warning_percent: float,
    failure_percent: float,
) -> RegressionAlert | None:
    """Return an alert when ``current_value`` fell too far below the baseline.

    Only decreases are reported. A non-positive baseline gives no meaningful
    relative change, so it never produces an alert.
    """
    if baseline_value <= 0:
        return None

    delta_percent = ((current_value - baseline_value) / baseline_value) * 100

    if delta_percent < -failure_percent:
        severity, verb = "failure", "dropped"
    elif delta_percent < -warning_percent:
        severity, verb = "warning", "decreased"
    else:
        return None

    return RegressionAlert(
        metric_name=metric_name,
        strategy=strategy,
        baseline_value=baseline_value,
        current_value=current_value,
        delta_percent=delta_percent,
        severity=severity,
        description=(
            f"Strategy '{strategy}' {label} {verb} by "
            f"{abs(delta_percent):.1f}% (from {format(baseline_value, value_format)} "
            f"to {format(current_value, value_format)})"
        ),
    )


def compare_strategy_stats(
    baseline_stats: dict[str, dict[str, Any]],
    current_stats: dict[str, dict[str, Any]],
    config: BaselineConfig,
) -> list[RegressionAlert]:
    """Compare strategy statistics against baseline.

    Strategies that are absent from the baseline are skipped. For every other
    strategy the average stability and the win rate are checked for relative
    decreases beyond the configured warning and failure thresholds.

    Parameters
    ----------
    baseline_stats
        Baseline strategy statistics.
    current_stats
        Current sweep strategy statistics.
    config
        Threshold configuration.

    Returns
    -------
    list[RegressionAlert]
        List of detected regressions.
    """
    alerts: list[RegressionAlert] = []

    for strategy, current in current_stats.items():
        baseline = baseline_stats.get(strategy)
        if not baseline:
            continue

        # ``or 0.0`` tolerates explicit nulls in the JSON input.
        stability_alert = _relative_drop_alert(
            metric_name="avg_stability",
            label="stability",
            value_format=".3f",
            strategy=strategy,
            baseline_value=baseline.get("avg_stability") or 0.0,
            current_value=current.get("avg_stability") or 0.0,
            warning_percent=config.stability_delta_warning,
            failure_percent=config.stability_delta_failure,
        )
        if stability_alert is not None:
            alerts.append(stability_alert)

        # Use the explicit win rates only when both sides report one.
        # Comparing a measured rate on one side with the stability-based
        # approximation on the other would flag differences that come from
        # the method rather than from the game balance.
        baseline_wr = baseline.get("win_rate")
        current_wr = current.get("win_rate")
        if baseline_wr is None or current_wr is None:
            baseline_wr = compute_win_rate(baseline)
            current_wr = compute_win_rate(current)

        win_rate_alert = _relative_drop_alert(
            metric_name="win_rate",
            label="win rate",
            value_format=".1%",
            strategy=strategy,
            baseline_value=baseline_wr,
            current_value=current_wr,
            warning_percent=config.win_rate_delta_warning,
            failure_percent=config.win_rate_delta_failure,
        )
        if win_rate_alert is not None:
            alerts.append(win_rate_alert)

    return alerts
def compare_against_baseline(
    baseline_path: Path,
    current_path: Path,
    stability_threshold: float | None = 5.0,
) -> ComparisonResult:
    """Compare current sweep results against baseline.

    Parameters
    ----------
    baseline_path
        Path to baseline JSON file.
    current_path
        Path to current sweep summary.
    stability_threshold
        Warning threshold (percent) for stability regressions; the failure
        threshold is twice this value. Pass ``None`` to use the stability
        thresholds stored in the baseline file instead.

    Returns
    -------
    ComparisonResult
        Comparison result with any detected regressions. ``passed`` is False
        when the current sweep data is missing or any alert has severity
        ``"failure"``; a missing baseline counts as a pass.
    """
    baseline = load_baseline(baseline_path)
    current = load_sweep_summary(current_path)

    result = ComparisonResult(
        timestamp=datetime.now(timezone.utc).isoformat(),
        baseline_path=str(baseline_path),
        current_path=str(current_path),
    )

    if not baseline:
        # Nothing to compare against yet; report the current stats only.
        result.summary = "No baseline found - establishing new baseline"
        result.passed = True
        if current:
            result.current_stats = {
                "strategy_stats": extract_strategy_stats(current),
                "difficulty_stats": extract_difficulty_stats(current),
            }
        return result

    if not current:
        result.summary = "No current sweep data found"
        result.passed = False
        return result

    # Extract stats
    baseline_strategy_stats = baseline.get("strategy_stats", {})
    current_strategy_stats = extract_strategy_stats(current)

    result.baseline_stats = {
        "strategy_stats": baseline_strategy_stats,
        "difficulty_stats": baseline.get("difficulty_stats", {}),
    }
    result.current_stats = {
        "strategy_stats": current_strategy_stats,
        "difficulty_stats": extract_difficulty_stats(current),
    }

    # Get threshold config from baseline or use defaults
    config = BaselineConfig.from_dict(baseline.get("thresholds") or {})
    # Override with the caller's threshold only when one was provided, so
    # that None leaves the baseline's stored thresholds in effect.
    if stability_threshold is not None:
        config.stability_delta_warning = stability_threshold
        config.stability_delta_failure = stability_threshold * 2

    # Compare strategy stats
    alerts = compare_strategy_stats(
        baseline_strategy_stats, current_strategy_stats, config
    )
    result.regressions = alerts

    # Determine pass/fail
    failures = [a for a in alerts if a.severity == "failure"]
    warnings = [a for a in alerts if a.severity == "warning"]

    if failures:
        result.passed = False
        result.summary = (
            f"FAILED: {len(failures)} regression(s) detected, "
            f"{len(warnings)} warning(s)"
        )
    elif warnings:
        result.passed = True
        result.summary = f"PASSED with {len(warnings)} warning(s)"
    else:
        result.passed = True
        result.summary = "PASSED: No regressions detected"

    return result


def create_baseline(
    sweep_path: Path,
    output_path: Path,
    git_commit: str | None = None,
) -> dict[str, Any]:
    """Create a new baseline from sweep results.

    Parameters
    ----------
    sweep_path
        Path to sweep summary JSON file.
    output_path
        Path to write baseline JSON file.
    git_commit
        Git commit hash to record in baseline. When omitted, the commit
        recorded in the sweep's ``metadata`` section is used, if any.

    Returns
    -------
    dict[str, Any]
        Created baseline data.

    Raises
    ------
    ValueError
        If the sweep summary is missing or empty.
    """
    sweep = load_sweep_summary(sweep_path)
    if not sweep:
        raise ValueError(f"Could not load sweep data from {sweep_path}")

    now = datetime.now(timezone.utc).isoformat()

    baseline = {
        "version": "1.0",
        "created_at": now,
        "updated_at": now,
        "git_commit": git_commit or sweep.get("metadata", {}).get("git_commit"),
        "description": "Balance validation baseline",
        "strategy_stats": extract_strategy_stats(sweep),
        "difficulty_stats": extract_difficulty_stats(sweep),
        "total_sweeps": sweep.get("total_sweeps", 0),
        "completed_sweeps": sweep.get("completed_sweeps", 0),
        "failed_sweeps": sweep.get("failed_sweeps", 0),
        "total_duration_seconds": sweep.get("total_duration_seconds", 0),
        "thresholds": {
            "stability_delta_warning": 5.0,
            "stability_delta_failure": 10.0,
            "win_rate_delta_warning": 5.0,
            "win_rate_delta_failure": 10.0,
            "unused_content_warning": True,
        },
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Write UTF-8 explicitly and end with a newline; the file is committed.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(baseline, f, indent=2)
        f.write("\n")

    return baseline
def update_baseline(
    sweep_path: Path,
    output_path: Path,
    git_commit: str | None = None,
) -> dict[str, Any]:
    """Update existing baseline with new sweep results.

    The statistics are replaced with those of the sweep, while the original
    ``created_at`` timestamp and the configured thresholds of an existing
    baseline at ``output_path`` are carried over.

    Parameters
    ----------
    sweep_path
        Path to sweep summary JSON file.
    output_path
        Path to write updated baseline JSON file.
    git_commit
        Git commit hash to record. When omitted, the commit recorded in the
        sweep's ``metadata`` section is used, if any.

    Returns
    -------
    dict[str, Any]
        Updated baseline data.

    Raises
    ------
    ValueError
        If the sweep summary is missing or empty.
    """
    existing = load_baseline(output_path) or {}
    sweep = load_sweep_summary(sweep_path)

    if not sweep:
        raise ValueError(f"Could not load sweep data from {sweep_path}")

    now = datetime.now(timezone.utc).isoformat()

    default_thresholds = {
        "stability_delta_warning": 5.0,
        "stability_delta_failure": 10.0,
        "win_rate_delta_warning": 5.0,
        "win_rate_delta_failure": 10.0,
        "unused_content_warning": True,
    }

    baseline = {
        "version": "1.0",
        "created_at": existing.get("created_at", now),
        "updated_at": now,
        "git_commit": git_commit or sweep.get("metadata", {}).get("git_commit"),
        "description": "Balance validation baseline",
        "strategy_stats": extract_strategy_stats(sweep),
        "difficulty_stats": extract_difficulty_stats(sweep),
        "total_sweeps": sweep.get("total_sweeps", 0),
        "completed_sweeps": sweep.get("completed_sweeps", 0),
        "failed_sweeps": sweep.get("failed_sweeps", 0),
        "total_duration_seconds": sweep.get("total_duration_seconds", 0),
        # An existing baseline without thresholds previously produced an
        # empty mapping here; fall back to the defaults instead.
        "thresholds": existing.get("thresholds") or default_thresholds,
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Write UTF-8 explicitly and end with a newline; the file is committed.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(baseline, f, indent=2)
        f.write("\n")

    return baseline
+ """ + if not HAS_MATPLOTLIB: + sys.stderr.write("Warning: matplotlib not available for chart generation\n") + return False + + baseline = load_baseline(baseline_path) + current = load_sweep_summary(current_path) + + if not baseline or not current: + sys.stderr.write("Warning: Missing baseline or current data for chart\n") + return False + + baseline_stats = baseline.get("strategy_stats", {}) + current_stats = extract_strategy_stats(current) + + # Get strategies present in both + strategies = sorted(set(baseline_stats.keys()) & set(current_stats.keys())) + if not strategies: + sys.stderr.write("Warning: No common strategies to compare\n") + return False + + # Extract data for plotting + baseline_stability = [baseline_stats[s].get("avg_stability", 0) for s in strategies] + current_stability = [current_stats[s].get("avg_stability", 0) for s in strategies] + + # Create figure with subplots + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5)) + + # Bar chart comparing stability + x = range(len(strategies)) + width = 0.35 + + ax1.bar([i - width / 2 for i in x], baseline_stability, width, label="Baseline") + ax1.bar([i + width / 2 for i in x], current_stability, width, label="Current") + ax1.set_xlabel("Strategy") + ax1.set_ylabel("Average Stability") + ax1.set_title("Strategy Stability: Baseline vs Current") + ax1.set_xticks(x) + ax1.set_xticklabels(strategies) + ax1.legend() + ax1.set_ylim(0, 1) + + # Delta chart + deltas = [ + ((c - b) / b * 100) if b > 0 else 0 + for b, c in zip(baseline_stability, current_stability, strict=True) + ] + colors = ["green" if d >= 0 else "red" for d in deltas] + + ax2.bar(strategies, deltas, color=colors) + ax2.axhline(y=0, color="black", linestyle="-", linewidth=0.5) + ax2.axhline(y=-5, color="orange", linestyle="--", linewidth=0.5, label="Warning") + ax2.axhline(y=-10, color="red", linestyle="--", linewidth=0.5, label="Failure") + ax2.set_xlabel("Strategy") + ax2.set_ylabel("Stability Delta (%)") + ax2.set_title("Stability Change 
def cmd_compare(args: argparse.Namespace) -> int:
    """Handle the ``compare`` command.

    Compares the sweep summary at ``args.current`` with the baseline at
    ``args.baseline``, optionally writes the serialized result to
    ``args.output`` and, unless ``args.quiet`` is set, prints the summary
    followed by one line per detected regression.

    Args:
        args: Parsed CLI namespace providing ``baseline``, ``current``,
            ``stability_threshold``, ``output`` and ``quiet``.

    Returns:
        ``0`` when the comparison passed, ``1`` otherwise.
    """
    result = compare_against_baseline(
        baseline_path=args.baseline,
        current_path=args.current,
        stability_threshold=args.stability_threshold,
    )

    # Persist the machine-readable result when an output path was requested.
    if args.output:
        args.output.parent.mkdir(parents=True, exist_ok=True)
        args.output.write_text(json.dumps(result.to_dict(), indent=2))
        if not args.quiet:
            print(f"Comparison result written to {args.output}")

    # Human-readable summary.
    if not args.quiet:
        print(f"\n{result.summary}")
        if result.regressions:
            print("\nRegressions detected:")
            for reg in result.regressions:
                # Cross mark for failures, warning sign otherwise. Written as
                # Unicode escapes so the source stays ASCII-only and the icons
                # cannot be corrupted by a wrongly decoded file.
                icon = "\u274c" if reg.severity == "failure" else "\u26a0\ufe0f"
                print(f" {icon} [{reg.severity.upper()}] {reg.description}")

    return 0 if result.passed else 1
def cmd_show(args: argparse.Namespace) -> int:
    """Handle the ``show`` command.

    Loads the baseline at ``args.baseline`` and prints it either as indented
    JSON (``args.json``) or as a plain-text report covering metadata, sweep
    counts, per-strategy stats and thresholds.

    Returns:
        ``0`` on success, ``1`` when no baseline data could be loaded.
    """
    data = load_baseline(args.baseline)

    # Guard: nothing to show (missing or empty baseline).
    if not data:
        sys.stderr.write(f"Baseline not found: {args.baseline}\n")
        return 1

    # Guard: raw JSON requested, skip the formatted report entirely.
    if args.json:
        print(json.dumps(data, indent=2))
        return 0

    print(f"\nBaseline: {args.baseline}")
    print("=" * 60)
    for label, key in (
        ("Version", "version"),
        ("Created", "created_at"),
        ("Updated", "updated_at"),
        ("Git commit", "git_commit"),
    ):
        print(f"{label}: {data.get(key, 'N/A')}")

    print(f"\nTotal sweeps: {data.get('total_sweeps', 0)}")
    print(f"Completed: {data.get('completed_sweeps', 0)}")
    print(f"Failed: {data.get('failed_sweeps', 0)}")

    print("\nStrategy Stats:")
    per_strategy = data.get("strategy_stats", {})
    for name in per_strategy:
        entry = per_strategy[name]
        stability = entry.get("avg_stability", 0)
        # The fallback is computed unconditionally and only used when the
        # entry carries no explicit "win_rate".
        fallback_rate = compute_win_rate(entry)
        rate = entry.get("win_rate", fallback_rate)
        print(f" {name}: stability={stability:.3f}, win_rate={rate:.1%}")

    print("\nThresholds:")
    limits = data.get("thresholds", {})
    print(f" Stability warning: {limits.get('stability_delta_warning', 5)}%")
    print(f" Stability failure: {limits.get('stability_delta_failure', 10)}%")

    return 0
entry point for baseline management.""" + parser = argparse.ArgumentParser( + description="Manage balance validation baselines for CI integration.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Compare current sweep against baseline + python scripts/manage_balance_baseline.py compare \\ + --current build/ci_sweeps/batch_sweep_summary.json \\ + --baseline content/baselines/balance_baseline.json + + # Update baseline from successful sweep + python scripts/manage_balance_baseline.py update \\ + --source build/ci_sweeps/batch_sweep_summary.json \\ + --output content/baselines/balance_baseline.json + + # Generate comparison chart + python scripts/manage_balance_baseline.py chart \\ + --current build/ci_sweeps/batch_sweep_summary.json \\ + --baseline content/baselines/balance_baseline.json \\ + --output build/comparison.png +""", + ) + + subparsers = parser.add_subparsers(dest="command", required=True) + + # Compare command + compare_parser = subparsers.add_parser( + "compare", help="Compare current sweep against baseline" + ) + compare_parser.add_argument( + "--current", + "-c", + type=Path, + required=True, + help="Path to current sweep summary JSON", + ) + compare_parser.add_argument( + "--baseline", + "-b", + type=Path, + required=True, + help="Path to baseline JSON file", + ) + compare_parser.add_argument( + "--output", + "-o", + type=Path, + help="Path to write comparison result JSON", + ) + compare_parser.add_argument( + "--stability-threshold", + type=float, + default=5.0, + help="Stability delta threshold percentage (default: 5.0)", + ) + compare_parser.add_argument( + "--quiet", + "-q", + action="store_true", + help="Suppress non-essential output", + ) + + # Update command + update_parser = subparsers.add_parser( + "update", help="Update baseline from sweep results" + ) + update_parser.add_argument( + "--source", + "-s", + type=Path, + required=True, + help="Path to sweep summary JSON", + ) + update_parser.add_argument( + 
"--output", + "-o", + type=Path, + required=True, + help="Path to baseline JSON file to update", + ) + update_parser.add_argument( + "--git-commit", + type=str, + help="Git commit hash to record", + ) + update_parser.add_argument( + "--quiet", + "-q", + action="store_true", + help="Suppress non-essential output", + ) + + # Create command + create_parser = subparsers.add_parser( + "create", help="Create new baseline from sweep results" + ) + create_parser.add_argument( + "--source", + "-s", + type=Path, + required=True, + help="Path to sweep summary JSON", + ) + create_parser.add_argument( + "--output", + "-o", + type=Path, + required=True, + help="Path to write new baseline JSON", + ) + create_parser.add_argument( + "--git-commit", + type=str, + help="Git commit hash to record", + ) + create_parser.add_argument( + "--quiet", + "-q", + action="store_true", + help="Suppress non-essential output", + ) + + # Chart command + chart_parser = subparsers.add_parser( + "chart", help="Generate comparison chart" + ) + chart_parser.add_argument( + "--current", + "-c", + type=Path, + required=True, + help="Path to current sweep summary JSON", + ) + chart_parser.add_argument( + "--baseline", + "-b", + type=Path, + required=True, + help="Path to baseline JSON file", + ) + chart_parser.add_argument( + "--output", + "-o", + type=Path, + required=True, + help="Path to save chart image", + ) + chart_parser.add_argument( + "--quiet", + "-q", + action="store_true", + help="Suppress non-essential output", + ) + + # Show command + show_parser = subparsers.add_parser( + "show", help="Display baseline information" + ) + show_parser.add_argument( + "--baseline", + "-b", + type=Path, + default=Path("content/baselines/balance_baseline.json"), + help="Path to baseline JSON file", + ) + show_parser.add_argument( + "--json", + action="store_true", + help="Output as JSON", + ) + + args = parser.parse_args(argv) + + handlers = { + "compare": cmd_compare, + "update": cmd_update, + "create": 
def _load_baseline_module():
    """Import ``manage_balance_baseline.py`` from ``_MODULE_PATH`` by file path.

    The script is loaded through an explicit import spec rather than a normal
    ``import`` statement.

    Returns:
        The executed module object.

    Raises:
        AssertionError: If no import spec or loader can be built for
            ``_MODULE_PATH``.
    """
    spec = util.spec_from_file_location("manage_balance_baseline", _MODULE_PATH)
    # Validate *before* the spec is used: ``module_from_spec(None)`` would
    # otherwise fail with an opaque AttributeError ahead of this check.
    assert spec is not None and spec.loader is not None, (
        f"Cannot build an import spec for {_MODULE_PATH}"
    )
    module = util.module_from_spec(spec)
    # Register under its plain name before executing it; ``setdefault`` leaves
    # an already-registered module of the same name in place.
    sys.modules.setdefault("manage_balance_baseline", module)
    spec.loader.exec_module(module)
    return module
"""Create a sample baseline for testing.""" + return { + "version": "1.0", + "created_at": "2025-01-01T00:00:00Z", + "updated_at": "2025-01-01T00:00:00Z", + "git_commit": "abc1234", + "description": "Test baseline", + "strategy_stats": { + "balanced": { + "avg_stability": 0.7, + "min_stability": 0.5, + "max_stability": 0.9, + "win_rate": 0.8, + "avg_actions": 10, + "total_actions": 50, + "count": 5, + "completed": 5, + "failed": 0, + }, + "aggressive": { + "avg_stability": 0.5, + "min_stability": 0.3, + "max_stability": 0.7, + "win_rate": 0.5, + "avg_actions": 15, + "total_actions": 75, + "count": 5, + "completed": 5, + "failed": 0, + }, + "diplomatic": { + "avg_stability": 0.65, + "min_stability": 0.45, + "max_stability": 0.85, + "win_rate": 0.6, + "avg_actions": 8, + "total_actions": 40, + "count": 5, + "completed": 5, + "failed": 0, + }, + }, + "difficulty_stats": { + "easy": {"avg_stability": 0.75, "count": 5}, + "normal": {"avg_stability": 0.6, "count": 5}, + "hard": {"avg_stability": 0.5, "count": 5}, + }, + "total_sweeps": 45, + "completed_sweeps": 45, + "failed_sweeps": 0, + "thresholds": { + "stability_delta_warning": 5.0, + "stability_delta_failure": 10.0, + "win_rate_delta_warning": 5.0, + "win_rate_delta_failure": 10.0, + "unused_content_warning": True, + }, + } + + +@pytest.fixture +def sample_sweep_summary() -> dict: + """Create a sample sweep summary for testing.""" + return { + "config": { + "strategies": ["balanced", "aggressive", "diplomatic"], + "difficulties": ["easy", "normal", "hard"], + "seeds": [42, 123, 456], + "worlds": ["default"], + "tick_budgets": [100], + }, + "total_sweeps": 45, + "completed_sweeps": 44, + "failed_sweeps": 1, + "strategy_stats": { + "balanced": { + "avg_stability": 0.72, + "min_stability": 0.52, + "max_stability": 0.92, + "avg_actions": 11, + "total_actions": 55, + "count": 15, + "completed": 15, + "failed": 0, + }, + "aggressive": { + "avg_stability": 0.48, + "min_stability": 0.28, + "max_stability": 0.68, + 
"avg_actions": 16, + "total_actions": 80, + "count": 15, + "completed": 14, + "failed": 1, + }, + "diplomatic": { + "avg_stability": 0.67, + "min_stability": 0.47, + "max_stability": 0.87, + "avg_actions": 9, + "total_actions": 45, + "count": 15, + "completed": 15, + "failed": 0, + }, + }, + "difficulty_stats": { + "easy": {"avg_stability": 0.77, "count": 15}, + "normal": {"avg_stability": 0.62, "count": 15}, + "hard": {"avg_stability": 0.48, "count": 15}, + }, + "total_duration_seconds": 120.5, + "metadata": { + "timestamp": "2025-01-15T10:00:00Z", + "git_commit": "def5678", + }, + } + + +# ============================================================================ +# Test: Baseline Creation and Loading +# ============================================================================ + + +class TestBaselineCreationAndLoading: + """Tests for baseline file creation and loading.""" + + def test_load_baseline_from_file( + self, tmp_path: Path, sample_baseline: dict + ) -> None: + """Load baseline data from JSON file.""" + baseline_path = tmp_path / "baseline.json" + baseline_path.write_text(json.dumps(sample_baseline)) + + loaded = load_baseline(baseline_path) + + assert loaded is not None + assert loaded["version"] == "1.0" + assert "strategy_stats" in loaded + assert "balanced" in loaded["strategy_stats"] + + def test_load_baseline_missing_file(self, tmp_path: Path) -> None: + """Return None when baseline file doesn't exist.""" + baseline_path = tmp_path / "nonexistent.json" + + loaded = load_baseline(baseline_path) + + assert loaded is None + + def test_create_baseline_from_sweep( + self, tmp_path: Path, sample_sweep_summary: dict + ) -> None: + """Create new baseline from sweep summary.""" + sweep_path = tmp_path / "sweep_summary.json" + sweep_path.write_text(json.dumps(sample_sweep_summary)) + output_path = tmp_path / "new_baseline.json" + + baseline = create_baseline(sweep_path, output_path) + + assert baseline is not None + assert output_path.exists() + assert 
class TestRegressionDetectionLogic:
    """Regression detection over per-strategy statistics."""

    @staticmethod
    def _metric_alerts(metric, baseline_stats, current_stats, config):
        """Run the comparison and keep only the alerts raised for ``metric``."""
        raised = compare_strategy_stats(baseline_stats, current_stats, config)
        return [alert for alert in raised if alert.metric_name == metric]

    def test_detect_stability_regression_failure(self) -> None:
        """A 25% stability drop is reported with ``failure`` severity."""
        limits = BaselineConfig(
            stability_delta_warning=5.0,
            stability_delta_failure=10.0,
        )
        before = {"balanced": {"avg_stability": 0.8, "win_rate": 0.9}}
        after = {"balanced": {"avg_stability": 0.6, "win_rate": 0.7}}  # 25% drop

        flagged = self._metric_alerts("avg_stability", before, after, limits)

        assert len(flagged) == 1
        (only,) = flagged
        assert only.severity == "failure"
        assert only.delta_percent < -10

    def test_detect_stability_regression_warning(self) -> None:
        """A 7.5% stability drop is reported with ``warning`` severity."""
        limits = BaselineConfig(
            stability_delta_warning=5.0,
            stability_delta_failure=10.0,
        )
        before = {"balanced": {"avg_stability": 0.8, "win_rate": 0.9}}
        after = {"balanced": {"avg_stability": 0.74, "win_rate": 0.85}}  # 7.5% drop

        flagged = self._metric_alerts("avg_stability", before, after, limits)

        assert len(flagged) == 1
        assert flagged[0].severity == "warning"

    def test_no_regression_within_threshold(self) -> None:
        """A 2.5% stability drop stays below the warning threshold."""
        limits = BaselineConfig(
            stability_delta_warning=5.0,
            stability_delta_failure=10.0,
        )
        before = {"balanced": {"avg_stability": 0.8, "win_rate": 0.9}}
        after = {"balanced": {"avg_stability": 0.78, "win_rate": 0.88}}  # 2.5% drop

        flagged = self._metric_alerts("avg_stability", before, after, limits)

        assert len(flagged) == 0

    def test_stability_improvement_not_flagged(self) -> None:
        """A 33% stability improvement produces no stability alert."""
        limits = BaselineConfig()
        before = {"balanced": {"avg_stability": 0.6, "win_rate": 0.7}}
        after = {"balanced": {"avg_stability": 0.8, "win_rate": 0.9}}

        flagged = self._metric_alerts("avg_stability", before, after, limits)

        assert len(flagged) == 0

    def test_detect_win_rate_regression(self) -> None:
        """A 25% win-rate drop is reported with ``failure`` severity."""
        limits = BaselineConfig(
            win_rate_delta_warning=5.0,
            win_rate_delta_failure=10.0,
        )
        before = {"aggressive": {"avg_stability": 0.5, "win_rate": 0.8}}
        after = {"aggressive": {"avg_stability": 0.5, "win_rate": 0.6}}  # 25% drop

        flagged = self._metric_alerts("win_rate", before, after, limits)

        assert len(flagged) == 1
        assert flagged[0].severity == "failure"

    def test_multiple_strategy_regressions(self) -> None:
        """Large drops in several strategies are each reported."""
        limits = BaselineConfig(
            stability_delta_warning=5.0,
            stability_delta_failure=10.0,
        )
        before = {
            "balanced": {"avg_stability": 0.8},
            "aggressive": {"avg_stability": 0.6},
            "diplomatic": {"avg_stability": 0.7},
        }
        after = {
            "balanced": {"avg_stability": 0.5},  # big drop
            "aggressive": {"avg_stability": 0.58},  # minor drop
            "diplomatic": {"avg_stability": 0.4},  # big drop
        }

        raised = compare_strategy_stats(before, after, limits)

        # The two strategies with big drops must both appear among the alerts.
        hit = {alert.strategy for alert in raised}
        for expected in ("balanced", "diplomatic"):
            assert expected in hit
class TestComparisonOfSweepResults:
    """Tests for comparing sweep results against baseline."""

    def test_compare_against_baseline_no_regression(
        self, tmp_path: Path, sample_baseline: dict, sample_sweep_summary: dict
    ) -> None:
        """Comparison passes when no significant regressions."""
        import copy

        baseline_path = tmp_path / "baseline.json"
        # Deep copy: the edits below reach into nested dicts, which a shallow
        # ``dict.copy()`` would share with (and thereby mutate in) the fixture.
        # Lower the baseline values so that the current sweep is an improvement.
        modified_baseline = copy.deepcopy(sample_baseline)
        modified_baseline["strategy_stats"]["balanced"]["avg_stability"] = 0.6
        modified_baseline["strategy_stats"]["balanced"]["win_rate"] = 0.6
        modified_baseline["strategy_stats"]["aggressive"]["avg_stability"] = 0.4
        modified_baseline["strategy_stats"]["aggressive"]["win_rate"] = 0.4
        modified_baseline["strategy_stats"]["diplomatic"]["avg_stability"] = 0.5
        modified_baseline["strategy_stats"]["diplomatic"]["win_rate"] = 0.5
        baseline_path.write_text(json.dumps(modified_baseline))

        # Current sweep with better stats (improvement, not regression)
        current_sweep = copy.deepcopy(sample_sweep_summary)
        current_sweep["strategy_stats"]["balanced"]["avg_stability"] = 0.72
        current_sweep["strategy_stats"]["aggressive"]["avg_stability"] = 0.48
        current_sweep["strategy_stats"]["diplomatic"]["avg_stability"] = 0.67
        sweep_path = tmp_path / "sweep_summary.json"
        sweep_path.write_text(json.dumps(current_sweep))

        result = compare_against_baseline(
            baseline_path, sweep_path, stability_threshold=5.0
        )

        assert result.passed is True
        assert "PASSED" in result.summary

    def test_compare_against_baseline_with_regression(
        self, tmp_path: Path, sample_baseline: dict, sample_sweep_summary: dict
    ) -> None:
        """Comparison fails when significant regression detected."""
        import copy

        baseline_path = tmp_path / "baseline.json"
        baseline_path.write_text(json.dumps(sample_baseline))

        # Current sweep with much lower stability; deep-copied so the nested
        # edit does not leak back into the fixture.
        current_sweep = copy.deepcopy(sample_sweep_summary)
        current_sweep["strategy_stats"]["balanced"]["avg_stability"] = 0.4
        sweep_path = tmp_path / "sweep_summary.json"
        sweep_path.write_text(json.dumps(current_sweep))

        result = compare_against_baseline(
            baseline_path, sweep_path, stability_threshold=5.0
        )

        assert result.passed is False
        assert len(result.regressions) > 0
        assert "FAILED" in result.summary

    def test_compare_missing_baseline_passes(
        self, tmp_path: Path, sample_sweep_summary: dict
    ) -> None:
        """Comparison passes when no baseline exists (establishing new baseline)."""
        baseline_path = tmp_path / "nonexistent_baseline.json"
        sweep_path = tmp_path / "sweep_summary.json"
        sweep_path.write_text(json.dumps(sample_sweep_summary))

        result = compare_against_baseline(
            baseline_path, sweep_path, stability_threshold=5.0
        )

        assert result.passed is True
        assert "establishing new baseline" in result.summary.lower()

    def test_compare_missing_current_fails(
        self, tmp_path: Path, sample_baseline: dict
    ) -> None:
        """Comparison fails when current sweep data is missing."""
        baseline_path = tmp_path / "baseline.json"
        baseline_path.write_text(json.dumps(sample_baseline))
        sweep_path = tmp_path / "nonexistent_sweep.json"

        result = compare_against_baseline(
            baseline_path, sweep_path, stability_threshold=5.0
        )

        assert result.passed is False
        assert "no current sweep data" in result.summary.lower()

    def test_comparison_result_serialization(
        self, tmp_path: Path, sample_baseline: dict, sample_sweep_summary: dict
    ) -> None:
        """ComparisonResult can be serialized to JSON."""
        baseline_path = tmp_path / "baseline.json"
        baseline_path.write_text(json.dumps(sample_baseline))
        sweep_path = tmp_path / "sweep_summary.json"
        sweep_path.write_text(json.dumps(sample_sweep_summary))

        result = compare_against_baseline(
            baseline_path, sweep_path, stability_threshold=5.0
        )

        data = result.to_dict()

        assert "timestamp" in data
        assert "regressions" in data
        assert "baseline_stats" in data
        assert "current_stats" in data
        assert "summary" in data
        assert "passed" in data

        # Should be JSON serializable
        json_str = json.dumps(data)
        assert len(json_str) > 0
class TestWorkflowComponentIntegration:
    """Tests for CLI commands and workflow integration."""

    def test_cli_compare_command(
        self, tmp_path: Path, sample_baseline: dict, sample_sweep_summary: dict, capsys
    ) -> None:
        """CLI compare command writes a result file and returns an exit code."""
        baseline_path = tmp_path / "baseline.json"
        baseline_path.write_text(json.dumps(sample_baseline))
        sweep_path = tmp_path / "sweep_summary.json"
        sweep_path.write_text(json.dumps(sample_sweep_summary))
        output_path = tmp_path / "result.json"

        exit_code = main([
            "compare",
            "--current", str(sweep_path),
            "--baseline", str(baseline_path),
            "--output", str(output_path),
        ])

        # The compare handler reports pass/fail as 0/1; which one applies to
        # the sample data is not what this test pins down.
        assert exit_code in (0, 1)
        assert output_path.exists()
        result = json.loads(output_path.read_text())
        assert "regressions" in result
        assert "summary" in result

    def test_cli_update_command(
        self, tmp_path: Path, sample_sweep_summary: dict, capsys
    ) -> None:
        """CLI update command creates/updates baseline."""
        sweep_path = tmp_path / "sweep_summary.json"
        sweep_path.write_text(json.dumps(sample_sweep_summary))
        output_path = tmp_path / "baseline.json"

        exit_code = main([
            "update",
            "--source", str(sweep_path),
            "--output", str(output_path),
        ])

        assert exit_code == 0
        assert output_path.exists()

        baseline = json.loads(output_path.read_text())
        assert baseline["strategy_stats"]["balanced"]["avg_stability"] == 0.72

    def test_cli_create_command(
        self, tmp_path: Path, sample_sweep_summary: dict, capsys
    ) -> None:
        """CLI create command creates new baseline."""
        sweep_path = tmp_path / "sweep_summary.json"
        sweep_path.write_text(json.dumps(sample_sweep_summary))
        output_path = tmp_path / "new_baseline.json"

        exit_code = main([
            "create",
            "--source", str(sweep_path),
            "--output", str(output_path),
            "--git-commit", "test123",
        ])

        assert exit_code == 0
        baseline = json.loads(output_path.read_text())
        assert baseline["git_commit"] == "test123"

    def test_cli_show_command(
        self, tmp_path: Path, sample_baseline: dict, capsys
    ) -> None:
        """CLI show command displays baseline info."""
        baseline_path = tmp_path / "baseline.json"
        baseline_path.write_text(json.dumps(sample_baseline))

        exit_code = main([
            "show",
            "--baseline", str(baseline_path),
        ])

        assert exit_code == 0
        captured = capsys.readouterr()
        assert "balanced" in captured.out
        assert "aggressive" in captured.out

    def test_cli_show_json_output(
        self, tmp_path: Path, sample_baseline: dict, capsys
    ) -> None:
        """CLI show command outputs JSON when requested."""
        baseline_path = tmp_path / "baseline.json"
        baseline_path.write_text(json.dumps(sample_baseline))

        exit_code = main([
            "show",
            "--baseline", str(baseline_path),
            "--json",
        ])

        assert exit_code == 0
        captured = capsys.readouterr()
        data = json.loads(captured.out)
        assert data["version"] == "1.0"

    def test_cli_compare_with_custom_threshold(
        self, tmp_path: Path, sample_baseline: dict, sample_sweep_summary: dict, capsys
    ) -> None:
        """CLI compare fails on a large regression and passes on improvements.

        Both invocations pass ``--stability-threshold 5`` explicitly.
        """
        import copy

        baseline_path = tmp_path / "baseline.json"
        baseline_path.write_text(json.dumps(sample_baseline))

        # Create sweep with significant regression (50% drop from 0.7).
        # Deep copies keep the two scenarios independent: shallow
        # ``dict.copy()`` results would share their nested stats dicts.
        current_sweep = copy.deepcopy(sample_sweep_summary)
        current_sweep["strategy_stats"]["balanced"]["avg_stability"] = 0.35
        sweep_path = tmp_path / "sweep_summary.json"
        sweep_path.write_text(json.dumps(current_sweep))

        # With strict threshold (5%) - should fail
        strict_result = tmp_path / "strict_result.json"
        exit_code = main([
            "compare",
            "--current", str(sweep_path),
            "--baseline", str(baseline_path),
            "--output", str(strict_result),
            "--stability-threshold", "5",
            "--quiet",
        ])
        assert exit_code == 1  # Failed

        # Create sweep with improvement (current > baseline) - should pass
        improved_sweep = copy.deepcopy(sample_sweep_summary)
        # Better than baseline: 0.7 -> 0.8, 0.5 -> 0.6, 0.65 -> 0.75
        improved_sweep["strategy_stats"]["balanced"]["avg_stability"] = 0.8
        improved_sweep["strategy_stats"]["aggressive"]["avg_stability"] = 0.6
        improved_sweep["strategy_stats"]["diplomatic"]["avg_stability"] = 0.75
        improved_sweep_path = tmp_path / "improved_sweep.json"
        improved_sweep_path.write_text(json.dumps(improved_sweep))

        # Same output file name as before; only the local variable is renamed
        # because this run uses the same threshold, not a relaxed one.
        improved_result = tmp_path / "relaxed_result.json"
        exit_code = main([
            "compare",
            "--current", str(improved_sweep_path),
            "--baseline", str(baseline_path),
            "--output", str(improved_result),
            "--stability-threshold", "5",
            "--quiet",
        ])
        assert exit_code == 0  # Passed (improvements are not regressions)
class TestHelperFunctions:
    """Checks for the small extraction and win-rate helpers."""

    def test_extract_strategy_stats(self, sample_sweep_summary: dict) -> None:
        """All three strategies are extracted with their stability values."""
        extracted = extract_strategy_stats(sample_sweep_summary)

        for strategy in ("balanced", "aggressive", "diplomatic"):
            assert strategy in extracted
        assert extracted["balanced"]["avg_stability"] == 0.72

    def test_extract_difficulty_stats(self, sample_sweep_summary: dict) -> None:
        """All three difficulty levels are extracted."""
        extracted = extract_difficulty_stats(sample_sweep_summary)

        for level in ("easy", "normal", "hard"):
            assert level in extracted

    def test_compute_win_rate_above_threshold(self) -> None:
        """A stability of 0.75 yields a win rate in (0.5, 1.0]."""
        estimate = compute_win_rate({"avg_stability": 0.75})
        assert 0.5 < estimate <= 1.0

    def test_compute_win_rate_below_threshold(self) -> None:
        """A stability of 0.3 yields a win rate in [0.0, 0.5)."""
        estimate = compute_win_rate({"avg_stability": 0.3})
        assert 0.0 <= estimate < 0.5

    def test_compute_win_rate_explicit_value(self) -> None:
        """An explicit ``win_rate`` entry does not override the approximation."""
        # Even with win_rate=0.8 present, the helper is expected to derive its
        # answer from avg_stability (0.4), hence a result below 0.5.
        entry = {"avg_stability": 0.4, "win_rate": 0.8}
        estimate = compute_win_rate(entry)
        assert estimate < 0.5