diff --git a/content/config/overlays/aggressive_economy.yml b/content/config/overlays/aggressive_economy.yml new file mode 100644 index 00000000..ae5b3a85 --- /dev/null +++ b/content/config/overlays/aggressive_economy.yml @@ -0,0 +1,33 @@ +# Example Overlay: Aggressive Economy +# Demonstrates how to use YAML overlays to test balance changes. +# +# This overlay increases resource regeneration and reduces scarcity pressure +# to test how the game behaves with a more forgiving economy. +# +# Usage: +# echoes-balance-studio sweep --overlay content/config/overlays/aggressive_economy.yml +# +# Or use test-tuning for quick experiments: +# echoes-balance-studio test-tuning --name aggressive_econ \ +# --change economy.regen_scale=1.2 \ +# --change environment.scarcity_pressure_cap=4000 + +name: aggressive_economy +description: | + Test higher resource regeneration and reduced scarcity. + Use this overlay to see how the game behaves when resources + are more abundant and scarcity pressure is lower. + +overrides: + economy: + regen_scale: 1.2 + demand_population_scale: 80000 + environment: + scarcity_pressure_cap: 4000 + scarcity_unrest_weight: 0.00003 + scarcity_pollution_weight: 0.00002 + +metadata: + author: balance_team + purpose: testing + related_issue: example diff --git a/content/config/overlays/fast_pacing.yml b/content/config/overlays/fast_pacing.yml new file mode 100644 index 00000000..3f92944d --- /dev/null +++ b/content/config/overlays/fast_pacing.yml @@ -0,0 +1,27 @@ +# Example Overlay: Fast Narrative Pacing +# Demonstrates how to adjust narrative timing for testing. +# +# This overlay increases the rate of story seed activations +# and reduces quiet periods between events. +# +# Usage: +# echoes-balance-studio sweep --overlay content/config/overlays/fast_pacing.yml + +name: fast_narrative_pacing +description: | + Test faster narrative pacing with more frequent story events. + Use this to see how players handle increased drama density. + +overrides: + director: + max_active_seeds: 3 + global_quiet_ticks: 2 + seed_active_ticks: 4 + seed_resolve_ticks: 2 + seed_quiet_ticks: 3 + story_seed_limit: 4 + +metadata: + author: narrative_team + purpose: testing + related_issue: example diff --git a/docs/gengine/designer_feedback_tooling.md b/docs/gengine/designer_feedback_tooling.md new file mode 100644 index 00000000..69125dae --- /dev/null +++ b/docs/gengine/designer_feedback_tooling.md @@ -0,0 +1,513 @@ +# Designer Feedback Loop and Tooling + +This guide explains how to use the Balance Studio tools for iterating on game +balance without requiring code changes. These tools are designed for designers +who want to experiment with tuning, diagnose balance issues, and validate +changes through data-driven analysis. + +## Overview + +The Balance Studio provides: + +- **Interactive CLI** (`echoes-balance-studio`) with guided workflows +- **YAML overlay system** for testing config changes without modifying base files +- **HTML dashboard** for exploring sweep results visually +- **Workflow presets** for common balance iteration tasks + +## Quick Start + +```bash +# Run the interactive Balance Studio +echoes-balance-studio + +# Or use specific commands +echoes-balance-studio sweep --strategies balanced aggressive +echoes-balance-studio compare --config-a content/config --config-b content/config/sweeps/difficulty-hard +echoes-balance-studio test-tuning --name boost_regen --change economy.regen_scale=1.2 +echoes-balance-studio view-reports +echoes-balance-studio generate-report --input build/sweeps/summary.json --output build/report.html +``` + +## Workflows + +### 1. Run Exploratory Sweep + +Executes batch simulations across multiple strategies and configurations to +explore the balance space. + +```bash +# Interactive mode +echoes-balance-studio + +# Direct command +echoes-balance-studio sweep \ + --strategies balanced aggressive diplomatic \ + --difficulties normal hard \ + --seeds 42 123 456 \ + --ticks 100 +``` + +**When to use:** +- Initial exploration of a new feature or mechanic +- Validating balance after significant changes +- Gathering baseline data for comparison + +**Output:** +- Individual sweep JSON files in `build/studio_sweeps/` +- Summary report with strategy and difficulty breakdowns +- Stability distributions and action counts + +### 2. Compare Two Configs + +Runs side-by-side sweeps with different configurations and produces a +comparison report. + +```bash +echoes-balance-studio compare \ + --config-a content/config \ + --config-b content/config/sweeps/difficulty-hard \ + --name-a "Normal" \ + --name-b "Hard" \ + --strategies balanced +``` + +**When to use:** +- Validating difficulty presets +- A/B testing configuration changes +- Comparing before/after tuning adjustments + +**Output:** +- Stability deltas between configurations +- Per-strategy performance comparison +- Percentage change metrics + +### 3. Test Tuning Change + +Creates a temporary configuration overlay with specific changes and compares +results against the baseline. + +```bash +echoes-balance-studio test-tuning \ + --name "boost_regen" \ + --change economy.regen_scale=1.2 \ + --change environment.scarcity_pressure_cap=6000 \ + --strategies balanced aggressive +``` + +**When to use:** +- Quick iteration on specific parameters +- Testing hypotheses about balance issues +- Validating fixes for identified problems + +**Output:** +- Baseline vs. tuned comparison +- Saved overlay file for future reference +- Detailed stability metrics + +### 4. View Historical Reports + +Browse and inspect previously generated sweep reports. + +```bash +# List available reports +echoes-balance-studio view-reports + +# Output as JSON for processing +echoes-balance-studio view-reports --json +``` + +**When to use:** +- Reviewing past experiments +- Tracking balance changes over time +- Finding regression baselines + +--- + +## YAML Overlay System + +Overlays allow you to test configuration changes without modifying base files. +This is similar to the existing difficulty presets but designed for quick +experimentation. + +### Creating an Overlay + +Create a YAML file with your changes: + +```yaml +# my_overlay.yml +name: aggressive_economy +description: Test higher resource regeneration and lower scarcity pressure + +overrides: + economy: + regen_scale: 1.2 + demand_population_scale: 80000 + environment: + scarcity_pressure_cap: 4000 + scarcity_unrest_weight: 0.00003 + +metadata: + author: designer_name + ticket: GAME-1234 +``` + +### Using an Overlay + +```bash +# Apply overlay during sweep +echoes-balance-studio sweep --overlay my_overlay.yml + +# Or use the test-tuning workflow for quick changes +echoes-balance-studio test-tuning \ + --name quick_test \ + --change economy.regen_scale=1.2 +``` + +### Overlay Directory + +Place overlays in `content/config/overlays/` to make them available for the +Balance Studio to discover: + +``` +content/ + config/ + overlays/ + aggressive_economy.yml + conservative_pacing.yml + stress_test.yml +``` + +--- + +## Interactive HTML Reports + +Generate rich HTML dashboards from sweep results: + +```bash +echoes-balance-studio generate-report \ + --input build/batch_sweeps/batch_sweep_summary.json \ + --output build/balance_report.html \ + --title "Weekly Balance Review" +``` + +### Features + +- **Strategy Performance Table**: Sortable comparison of all strategies +- **Difficulty Analysis**: See how each difficulty level affects stability +- **Stability Distribution**: Histogram of outcomes across all sweeps +- **Individual Sweep Browser**: Filter and drill into specific runs +- **Embedded Charts**: Visual representations of key metrics + +### Themes + +```bash +# Light theme (default) +echoes-balance-studio generate-report --theme light ... + +# Dark theme +echoes-balance-studio generate-report --theme dark ... +``` + +--- + +## How to Diagnose Dominant Strategies + +When one strategy consistently outperforms others, it may indicate a balance +issue. Here's how to diagnose and address dominant strategies: + +### Step 1: Run a Broad Sweep + +```bash +echoes-balance-studio sweep \ + --strategies balanced aggressive diplomatic hybrid \ + --seeds 42 123 456 789 1000 \ + --ticks 200 +``` + +### Step 2: Generate and Review the Report + +```bash +echoes-balance-studio generate-report \ + --input build/studio_sweeps/sweep_*/batch_sweep_summary.json \ + --output build/dominant_strategy_analysis.html +``` + +Look for: +- **Win rate gaps > 10%** between strategies +- **Consistently high/low stability** for specific strategies +- **Action distribution skews** (some actions never used) + +### Step 3: Identify Root Causes + +Common causes of dominant strategies: + +1. **Overpowered actions**: Check action_counts in sweep data +2. **Resource imbalance**: Review economy.regen_scale and demand weights +3. **Threshold issues**: Stability thresholds may favor certain playstyles +4. **Faction mechanics**: Some factions may synergize too well with a strategy + +### Step 4: Test Fixes + +```bash +# Example: If aggressive strategy is dominant due to high resource gains +echoes-balance-studio test-tuning \ + --name nerf_aggressive \ + --change economy.regen_scale=0.7 \ + --strategies balanced aggressive +``` + +### Step 5: Validate + +Re-run the broad sweep with your changes and confirm the gap has narrowed. + +--- + +## Iterating on Action Costs + +Action costs affect how often AI strategies choose specific actions. Here's +how to tune them: + +### Step 1: Identify Underused Actions + +Run a sweep and check action frequency distribution: + +```bash +echoes-balance-studio sweep --strategies balanced aggressive diplomatic +echoes-balance-studio generate-report \ + --input build/studio_sweeps/*/batch_sweep_summary.json \ + --output build/action_analysis.html +``` + +Actions with < 5% usage may be too expensive or ineffective. + +### Step 2: Test Cost Adjustments + +```bash +# Reduce cost of underused action +echoes-balance-studio test-tuning \ + --name buff_negotiate \ + --change actions.negotiate.base_cost=0.5 \ + --change actions.negotiate.cooldown=2 +``` + +### Step 3: Monitor Side Effects + +Check that buffing one action doesn't make others obsolete. Compare action +distributions before and after. + +### Best Practices + +- Make small, incremental changes (10-20% adjustments) +- Test across multiple strategies +- Use multiple seeds for statistical validity +- Document changes with descriptive overlay names + +--- + +## Testing Narrative Pacing Changes + +Narrative pacing affects story seed activation, director events, and the +overall flow of the game. + +### Key Pacing Parameters + +| Parameter | Location | Effect | +|-----------|----------|--------| +| `max_active_seeds` | director | How many story seeds can be active at once | +| `global_quiet_ticks` | director | Minimum ticks between major events | +| `seed_active_ticks` | director | How long a seed stays active | +| `seed_resolve_ticks` | director | Time to resolve after active phase | +| `seed_quiet_ticks` | director | Cooldown before seed can reactivate | + +### Step 1: Baseline Measurement + +Run a sweep with current settings: + +```bash +echoes-balance-studio sweep \ + --ticks 300 \ + --seeds 42 123 +``` + +Check story seed activation rates in the output. + +### Step 2: Test Pacing Adjustment + +```bash +# Example: Increase drama density +echoes-balance-studio test-tuning \ + --name fast_pacing \ + --change director.max_active_seeds=3 \ + --change director.global_quiet_ticks=2 \ + --ticks 300 +``` + +### Step 3: Review Story Seed Behavior + +Look for: +- **Activation rate**: Are seeds firing at the expected frequency? +- **Overlap issues**: Are too many seeds active simultaneously? +- **Dead zones**: Are there long stretches without narrative events? + +### Step 4: Iterate + +Adjust parameters based on observations: +- Increase `global_quiet_ticks` if events feel overwhelming +- Decrease `seed_quiet_ticks` if the game feels slow +- Adjust `story_seed_limit` to control how many seeds surface per tick + +--- + +## Case Study: Balancing the Industrial Tier Faction + +This case study demonstrates a complete balance iteration workflow. + +### Problem Statement + +Playtest feedback indicates the Industrial Tier faction feels underpowered +compared to other factions. Players report: +- Lower legitimacy gains +- Fewer opportunities for impactful actions +- Pollution penalties seem too harsh + +### Step 1: Gather Data + +```bash +# Run comprehensive sweep focusing on faction behavior +echoes-balance-studio sweep \ + --strategies balanced diplomatic \ + --seeds 42 123 456 789 1000 1001 1002 1003 \ + --ticks 200 \ + --output-dir build/industrial_tier_analysis +``` + +### Step 2: Analyze Baseline + +```bash +echoes-balance-studio generate-report \ + --input build/industrial_tier_analysis/batch_sweep_summary.json \ + --output build/industrial_tier_baseline.html +``` + +Review faction legitimacy trends in the sweep data. + +### Step 3: Hypothesis Testing + +**Hypothesis 1: Pollution penalties are too harsh** + +```bash +echoes-balance-studio test-tuning \ + --name reduce_pollution_penalty \ + --change environment.faction_sabotage_pollution_spike=0.015 \ + --change environment.scarcity_pollution_weight=0.00002 \ + --strategies balanced \ + --seeds 42 123 456 +``` + +**Hypothesis 2: Investment returns are too low** + +```bash +echoes-balance-studio test-tuning \ + --name boost_investment \ + --change economy.faction_investment_return=1.5 \ + --change environment.faction_invest_pollution_relief=0.03 \ + --strategies balanced \ + --seeds 42 123 456 +``` + +### Step 4: Compare Results + +```bash +# Generate comparison between baseline and each hypothesis +echoes-balance-studio compare \ + --config-a content/config \ + --config-b build/industrial_tier_analysis/tuning_reduce_pollution_penalty_*/modified_config \ + --name-a "Baseline" \ + --name-b "Reduced Pollution" +``` + +### Step 5: Implement and Validate + +Based on the data, create a formal overlay for the winning hypothesis: + +```yaml +# content/config/overlays/industrial_tier_balance.yml +name: industrial_tier_balance_v1 +description: Rebalance Industrial Tier faction after Dec 2024 analysis + +overrides: + environment: + faction_sabotage_pollution_spike: 0.018 + scarcity_pollution_weight: 0.000025 + faction_invest_pollution_relief: 0.025 + +metadata: + ticket: GAME-4567 + analysis_date: 2024-12-01 + baseline_report: build/industrial_tier_baseline.html +``` + +Run a final validation sweep with the new overlay applied. + +--- + +## Tips and Best Practices + +### Statistical Validity + +- Use at least 5 different seeds for meaningful comparisons +- Run 100+ ticks to capture mid-to-late game dynamics +- Repeat experiments if results are marginal + +### Documenting Changes + +- Always include descriptive names for overlays +- Reference ticket numbers in metadata +- Save baseline reports before making changes + +### Iterative Approach + +1. Make one change at a time when possible +2. Measure impact before adding more changes +3. Keep changes small (10-20% parameter adjustments) +4. Validate that fixes don't create new problems + +### Sharing Results + +- Use `--json` flag for data that needs processing +- Generate HTML reports for stakeholder reviews +- Archive summary JSONs for regression testing + +--- + +## Command Reference + +| Command | Description | +|---------|-------------| +| `echoes-balance-studio` | Interactive workflow selection | +| `echoes-balance-studio sweep` | Run exploratory sweeps | +| `echoes-balance-studio compare` | Compare two configurations | +| `echoes-balance-studio test-tuning` | Test a tuning change | +| `echoes-balance-studio view-reports` | Browse historical reports | +| `echoes-balance-studio generate-report` | Generate HTML dashboard | + +### Common Flags + +| Flag | Description | +|------|-------------| +| `--strategies` | AI strategies to test | +| `--difficulties` | Difficulty presets | +| `--seeds` | Random seeds for reproducibility | +| `--ticks` | Tick budget per sweep | +| `--output-dir` | Output directory | +| `--json` | Output as JSON | +| `--verbose` | Verbose progress output | + +--- + +## See Also + +- [AI Tournament & Balance Analysis](./ai_tournament_and_balance_analysis.md) +- [How to Play Echoes](./how_to_play_echoes.md) +- [Content Designer Workflow](./content_designer_workflow.md) +- [Implementation Plan](../simul/emergent_story_game_implementation_plan.md) diff --git a/gamedev-agent-thoughts.txt b/gamedev-agent-thoughts.txt index 057d7aa3..b5aba20b 100644 --- a/gamedev-agent-thoughts.txt +++ b/gamedev-agent-thoughts.txt @@ -1,164 +1,115 @@ -# GameDev Agent Thoughts - Issue #63: Analysis and Balance Reporting (M11.3) +# GameDev Agent Thoughts - Issue #70: Designer Feedback Loop and Tooling (M11.6) ## Task Analysis -Working on Issue #63 - Phase 11, Milestone 11.3, Task 11.3.1. - -### Previous Completions -- Task 11.1.1 (Batch Simulation Sweep Infrastructure) - COMPLETED -- Task 11.2.1 (Result Aggregation and Storage) - COMPLETED - -### Requirements for Task 11.3.1 - -1. Create `scripts/analyze_balance.py` that processes aggregated sweep results from SQLite database -2. Generate HTML or Markdown balance reports with sections for: - - Dominant strategies (win rate deltas >10%) - - Underperforming mechanics (actions/policies rarely chosen) - - Unused story seeds - - Parameter sensitivity analysis (impact of difficulty/config changes) -3. Statistical analysis including: - - Confidence intervals - - Significance testing (t-tests for win rate differences) - - Trend detection across historical runs -4. Visual outputs (charts/graphs) showing: - - Win rate distributions - - Metric trends over time - - Parameter correlations -5. Regression detection: Highlights significant deviations from baseline -6. At least 12 tests covering report generation, statistical calculations, and edge cases +Working on Issue #70 - Phase 11, Milestone 11.6, Task 11.6.1. -## Implementation Summary +### Requirements from Issue -### Files Created +1. Create CLI tool `echoes-balance-studio` with guided workflows for designers: + - "Run exploratory sweep" + - "Compare two configs" + - "Test tuning change" + - "View historical reports" -1. **scripts/analyze_balance.py** - Main balance analysis script with: - - Dataclasses: `ConfidenceInterval`, `TTestResult`, `TrendAnalysis`, `RegressionAlert`, `BalanceReport` - - Database query functions for extracting sweep results - - Statistical analysis functions: - - `compute_confidence_interval()` - 95% CI using t-distribution - - `perform_t_test()` - Two-sample t-test for strategy comparison - - `detect_trend()` - Linear regression for trend detection - - `detect_regression()` - Compare runs for significant deviations - - Balance analysis functions: - - `analyze_dominant_strategies()` - Win rate deltas >10% - - `analyze_underperforming_mechanics()` - Actions with <5% usage - - `identify_unused_story_seeds()` - Seeds never activated - - `analyze_parameter_sensitivity()` - Metrics by difficulty - - Visualization functions (using matplotlib): - - `generate_win_rate_chart()` - Bar chart of win rates - - `generate_trend_chart()` - Line chart of metrics over time - - `generate_action_distribution_chart()` - Pie chart of actions - - Report generation: - - `format_report_markdown()` - Full markdown report - - `format_report_html()` - HTML with embedded charts - - CLI with subcommands: `report`, `regression`, `trends`, `stats` - -2. **tests/scripts/test_analyze_balance.py** - 39 tests in 12 test classes: - - `TestConfidenceInterval` (4 tests): CI computation, edge cases, serialization - - `TestTTest` (4 tests): Significant/non-significant detection, insufficient data - - `TestTrendDetection` (4 tests): Increasing, decreasing, stable, insufficient data - - `TestRegressionDetection` (3 tests): Regression alerts, thresholds, serialization - - `TestDominantStrategies` (3 tests): Detection, balanced scenarios, single strategy - - `TestUnderperformingMechanics` (3 tests): Detection, all used, empty data - - `TestUnusedStorySeeds` (3 tests): Identification, full coverage, no reference - - `TestParameterSensitivity` (2 tests): Difficulty analysis, high variation - - `TestReportGeneration` (4 tests): Report with data, markdown, HTML, serialization - - `TestCLI` (6 tests): Report, JSON output, stats, trends, regression commands - - `TestEdgeCases` (3 tests): Empty database, single result, all failed sweeps +2. Configuration changes testable via YAML overlays without modifying base configs + (similar to difficulty presets) -## Acceptance Criteria Verification - -1. ✅ Script processes aggregated sweep results from SQLite database -2. ✅ Generates HTML or Markdown balance reports with sections for: - - ✅ Dominant strategies (win rate deltas >10%) - - ✅ Underperforming mechanics (actions with <5% usage) - - ✅ Unused story seeds - - ✅ Parameter sensitivity analysis -3. ✅ Statistical analysis includes: - - ✅ Confidence intervals (95% CI using t-distribution) - - ✅ Significance testing (two-sample t-tests) - - ✅ Trend detection (linear regression) -4. ✅ Visual outputs (charts) showing: - - ✅ Win rate distributions (bar chart) - - ✅ Metric trends over time (line chart) - - ✅ Action distribution (pie chart) -5. ✅ Regression detection highlights significant deviations from baseline -6. ✅ 39 tests covering report generation, statistical calculations, and edge cases (requirement was 12+) - -## Verification - -- All 39 tests pass -- Ruff linting passes with no errors -- CLI works correctly with all subcommands - -## Progress - -- [x] Create scripts/analyze_balance.py -- [x] Create tests/scripts/test_analyze_balance.py -- [x] Run linting - PASSED -- [x] Run tests - 39 PASSED -- [x] Task completed +3. Interactive report viewer (HTML dashboard) allowing filtering, sorting, + and drilling into results ---- +4. Designer-focused documentation: + - "How to diagnose dominant strategies" + - "Iterating on action costs" + - "Testing narrative pacing changes" -# Previous Task Notes - Issue #61: Result Aggregation and Storage (M11.2) +5. Example workflows with case studies (e.g., "Balancing the Industrial Tier faction") -## Task Analysis +6. At least 8 tests covering CLI commands, config overlay loading, and report generation -Working on Issue #61 - Phase 11, Milestone 11.2, Task 11.2.1. +7. Register new CLI tool in pyproject.toml -### Requirements +### Existing Infrastructure Leveraged -1. Script `scripts/aggregate_sweep_results.py` ingests batch sweep JSON outputs and produces aggregated summary data -2. Storage format (SQLite database) supports querying by parameter combinations, timestamp, and result metrics -3. Historical tracking preserves sweep metadata (git commit hash, timestamp, parameter ranges) for reproducibility -4. Aggregation computes key statistics: win rates by strategy, average stability/unrest/pollution, story seed activation rates, action usage frequencies -5. Query interface or helper functions support common lookups -6. At least 8 tests covering aggregation logic, storage/retrieval, and historical queries +- scripts/run_batch_sweeps.py - Batch simulation sweep infrastructure +- scripts/aggregate_sweep_results.py - Result aggregation with SQLite storage +- scripts/analyze_balance.py - Analysis and balance reporting +- content/config/batch_sweeps.yml - Sweep parameter configuration +- Difficulty presets in content/config/sweeps/ ## Implementation Summary ### Files Created -1. **scripts/aggregate_sweep_results.py** - Main aggregation script with: - - Dataclasses: `SweepRecord`, `SweepRunMetadata`, `AggregatedStats` - - SQLite database setup with versioned schema and indexes - - `init_database()` - Creates tables and indexes - - `ingest_sweep_summary()` - Ingests a single batch sweep summary - - `ingest_sweep_directory()` - Ingests all summaries from a directory - - `query_sweep_results()` - Query with filters (strategy, difficulty, world, run_id, days, git_commit, limit) - - `query_sweep_runs()` - Query run metadata - - `compute_aggregated_stats()` - Computes win rates, averages, action frequencies - - `compute_stats_by_strategy()` / `compute_stats_by_difficulty()` - Convenience functions - - CLI with subcommands: `ingest`, `query`, `stats`, `runs` - -2. **tests/scripts/test_aggregate_sweep_results.py** - 26 tests in 8 test classes: - - `TestDatabaseSchema` (3 tests): schema creation, indexes, idempotency - - `TestIngestion` (3 tests): ingest summary, prevent duplicates, ingest directory - - `TestQuerying` (6 tests): by strategy, difficulty, run_id, limit, days, git commit - - `TestAggregation` (4 tests): by strategy, with errors, action frequencies, empty records - - `TestDataclasses` (3 tests): SweepRecord, SweepRunMetadata, AggregatedStats serialization - - `TestCLI` (4 tests): ingest, stats JSON, query with filters, runs command - - `TestHistoricalTracking` (2 tests): multiple runs, date range filtering +1. **src/gengine/balance_studio/__init__.py** - Package init with exports +2. **src/gengine/balance_studio/overlays.py** - YAML overlay loading system + - ConfigOverlay dataclass + - deep_merge() for config merging + - load_overlay_directory() for batch loading + - create_tuning_overlay() helper + - merge_overlays() for combining overlays +3. **src/gengine/balance_studio/workflows.py** - Workflow implementations + - WorkflowResult dataclass + - ExploratorySweepConfig, CompareConfigsConfig, TuningTestConfig + - run_exploratory_sweep() + - run_config_comparison() + - run_tuning_test() + - list_historical_reports() + - view_historical_report() + - get_workflow_menu() +4. **src/gengine/balance_studio/report_viewer.py** - HTML report generation + - ReportViewerConfig, FilterState dataclasses + - generate_strategy_chart(), generate_difficulty_chart() + - generate_stability_distribution_chart() + - generate_interactive_html() - full HTML dashboard + - write_html_report() +5. **src/gengine/balance_studio/cli.py** - Entry point for pyproject.toml +6. **scripts/echoes_balance_studio.py** - Main CLI tool with: + - Interactive workflow selection + - sweep, compare, test-tuning, view-reports, generate-report commands +7. **tests/scripts/test_echoes_balance_studio.py** - 30 tests covering: + - ConfigOverlay (5 tests) + - DeepMerge (3 tests) + - LoadOverlayDirectory (3 tests) + - CreateTuningOverlay (1 test) + - MergeOverlays (2 tests) + - ReportViewer (3 tests) + - Workflows (6 tests) + - CLI (5 tests) + - ExploratorySweepConfig (2 tests) +8. **docs/gengine/designer_feedback_tooling.md** - Designer documentation + +### Files Modified + +1. **pyproject.toml** - Added echoes-balance-studio script entry ## Acceptance Criteria Verification -1. ✅ Script ingests batch sweep JSON outputs and produces aggregated summary data -2. ✅ SQLite storage supports querying by parameter combinations, timestamp, and result metrics -3. ✅ Historical tracking preserves sweep metadata (git commit hash, timestamp, parameter ranges) -4. ✅ Aggregation computes: win rates, avg stability, story seed activation rates, action frequencies -5. ✅ Query interface supports common lookups (by strategy, difficulty, date range, git commit) -6. ✅ 26 tests covering aggregation logic, storage/retrieval, and historical queries (requirement was 8+) - -## Verification - -- All 26 tests pass -- Ruff linting passes with no errors -- CLI works correctly via subprocess testing - -## Progress - -- [x] Create scripts/aggregate_sweep_results.py -- [x] Create tests/scripts/test_aggregate_sweep_results.py +1. ✅ CLI tool `echoes-balance-studio` provides guided workflows for designers +2. ✅ Workflows include: "Run exploratory sweep", "Compare two configs", + "Test tuning change", "View historical reports" +3. ✅ Configuration changes testable via YAML overlays +4. ✅ Interactive report viewer (HTML dashboard) allows filtering, sorting, + and drilling into results +5. ✅ Designer-focused documentation covers: + - "How to diagnose dominant strategies" + - "Iterating on action costs" + - "Testing narrative pacing changes" +6. ✅ Example workflows with case studies (Balancing Industrial Tier faction) +7. ✅ 30 tests covering CLI commands, config overlay loading, and report generation + (requirement was at least 8) +8. ✅ CLI registered in pyproject.toml under [project.scripts] +9. ✅ All code passes ruff linting +10. ✅ Tests pass + +## Status +- [x] Read and understand existing infrastructure +- [x] Create balance studio module with overlays +- [x] Create workflows module +- [x] Create report viewer module +- [x] Create CLI tool (echoes_balance_studio.py) +- [x] Add tests (30 tests - exceeds minimum of 8) +- [x] Create designer documentation +- [x] Register CLI in pyproject.toml - [x] Run linting - PASSED -- [x] Run tests - 26 PASSED +- [x] Run tests - 30 PASSED diff --git a/pyproject.toml b/pyproject.toml index 8f75b2bc..99cad97e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ echoes-shell = "gengine.echoes.cli.shell:main" echoes-gateway-service = "gengine.echoes.gateway.main:main" echoes-gateway-shell = "gengine.echoes.gateway.client:main" echoes-llm-service = "gengine.echoes.llm.main:main" +echoes-balance-studio = "gengine.balance_studio.cli:main" [build-system] requires = ["setuptools>=68.0.0"] diff --git a/scripts/echoes_balance_studio.py b/scripts/echoes_balance_studio.py new file mode 100644 index 00000000..1287df14 --- /dev/null +++ b/scripts/echoes_balance_studio.py @@ -0,0 +1,745 @@ +#!/usr/bin/env python3 +"""Balance Studio CLI - Designer feedback loop and guided workflows. + +Provides an interactive interface for designers to iterate on game balance +without requiring code changes. + +Examples +-------- +Run interactively:: + + echoes-balance-studio + +Run a specific workflow:: + + echoes-balance-studio sweep --strategies balanced aggressive + echoes-balance-studio compare --config-a path/to/a --config-b path/to/b + echoes-balance-studio test-tuning --name "boost_regen" \\ + --change economy.regen_scale=1.2 + echoes-balance-studio view-reports + echoes-balance-studio generate-report --input build/sweeps/summary.json +""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Any, Sequence + +from gengine.balance_studio import ( + CompareConfigsConfig, + ConfigOverlay, + ExploratorySweepConfig, + ReportViewerConfig, + TuningTestConfig, + get_workflow_menu, + list_historical_reports, + run_config_comparison, + run_exploratory_sweep, + run_tuning_test, + view_historical_report, + write_html_report, +) + + +def print_header() -> None: + """Print the Balance Studio header.""" + print() + print("=" * 60) + print(" ECHOES BALANCE STUDIO") + print(" Designer Feedback Loop and Tooling") + print("=" * 60) + print() + + +def print_workflow_menu() -> None: + """Print the workflow menu.""" + workflows = get_workflow_menu() + print("Available Workflows:") + print("-" * 40) + for i, w in enumerate(workflows, 1): + print(f" {i}. {w['name']}") + print(f" {w['description']}") + print() + + +def interactive_mode() -> int: + """Run the interactive workflow selection mode. + + Returns + ------- + int + Exit code. + """ + print_header() + print_workflow_menu() + + print("Enter workflow number (1-4) or 'q' to quit:") + try: + choice = input("> ").strip().lower() + except (EOFError, KeyboardInterrupt): + print("\nExiting.") + return 0 + + if choice in ("q", "quit", "exit"): + return 0 + + try: + choice_num = int(choice) + except ValueError: + print(f"Invalid choice: {choice}") + return 1 + + if choice_num == 1: + return interactive_sweep() + elif choice_num == 2: + return interactive_compare() + elif choice_num == 3: + return interactive_tuning() + elif choice_num == 4: + return interactive_view_reports() + else: + print(f"Invalid choice: {choice_num}") + return 1 + + +def interactive_sweep() -> int: + """Interactive exploratory sweep workflow.""" + print("\n--- Run Exploratory Sweep ---\n") + + print("Enter strategies (comma-separated, or press Enter for defaults):") + print(" Available: balanced, aggressive, diplomatic, hybrid") + strategies_input = input("> ").strip() + strategies = ( + [s.strip() for s in strategies_input.split(",") if s.strip()] + if strategies_input + else ["balanced", "aggressive", "diplomatic"] + ) + + print("\nEnter difficulty presets (comma-separated, or Enter for 'normal'):") + print(" Available: tutorial, easy, normal, hard, brutal") + difficulties_input = input("> ").strip() + difficulties = ( + [d.strip() for d in difficulties_input.split(",") if d.strip()] + if difficulties_input + else ["normal"] + ) + + print("\nEnter random seeds (comma-separated, or Enter for defaults):") + seeds_input = input("> ").strip() + try: + seeds = ( + [int(s.strip()) for s in seeds_input.split(",") if s.strip()] + if seeds_input + else [42, 123, 456] + ) + except ValueError: + print("Invalid seeds, using defaults") + seeds = [42, 123, 456] + + print("\nEnter tick budget (or Enter for 100):") + tick_input = input("> ").strip() + tick_budget = int(tick_input) if tick_input else 100 + + config = ExploratorySweepConfig( + strategies=strategies, + difficulties=difficulties, + seeds=seeds, + tick_budget=tick_budget, + ) + + print(f"\nRunning sweep with {len(strategies)} strategies, " + f"{len(difficulties)} difficulties, {len(seeds)} seeds...") + print("This may take a while...\n") + + result = run_exploratory_sweep(config, verbose=True) + + print("\n" + "=" * 40) + if result.success: + print(f"SUCCESS: {result.message}") + print(f"Output: {result.output_path}") + else: + print(f"FAILED: {result.message}") + for err in result.errors: + print(f" Error: {err}") + + return 0 if result.success else 1 + + +def interactive_compare() -> int: + """Interactive config comparison workflow.""" + print("\n--- Compare Two Configs ---\n") + + print("Enter path to first config directory:") + config_a = input("> ").strip() + if not config_a: + print("Config path required") + return 1 + + print("\nEnter name for first config (or Enter for 'Config A'):") + name_a = input("> ").strip() or "Config A" + + print("\nEnter path to second config directory:") + config_b = input("> ").strip() + if not config_b: + print("Config path required") + return 1 + + print("\nEnter name for second config (or Enter for 'Config B'):") + name_b = input("> ").strip() or "Config B" + + config = CompareConfigsConfig( + config_a_path=Path(config_a), + config_b_path=Path(config_b), + name_a=name_a, + name_b=name_b, + ) + + print("\nRunning comparison sweeps...") + + result = run_config_comparison(config, verbose=True) + + print("\n" + "=" * 40) + if result.success: + print(f"SUCCESS: {result.message}") + print(f"Output: {result.output_path}") + + if "comparison" in result.data: + print("\nComparison Results:") + for strategy, comp in result.data["comparison"].items(): + delta = comp.get("delta", 0) + direction = "↑" if delta > 0 else "↓" if delta < 0 else "=" + print(f" {strategy}: {direction} {abs(delta):.3f} " + f"({comp.get('delta_percent', 0):.1f}%)") + else: + print(f"FAILED: {result.message}") + for err in result.errors: + print(f" Error: {err}") + + return 0 if result.success else 1 + + +def interactive_tuning() -> int: + """Interactive tuning test workflow.""" + print("\n--- Test Tuning Change ---\n") + + print("Enter a name for this tuning experiment:") + name = input("> ").strip() + if not name: + name = "tuning_test" + + print("\nEnter config changes as key=value pairs (one per line, blank to finish):") + print(" Example: economy.regen_scale=1.2") + print(" Example: environment.scarcity_pressure_cap=6000") + + changes: dict[str, Any] = {} + while True: + line = input("> ").strip() + if not line: + break + + if "=" not in line: + print(" Invalid format, use key=value") + continue + + key, value = line.split("=", 1) + key = key.strip() + value = value.strip() + + # Parse value type + try: + if "." in value: + parsed_value: Any = float(value) + else: + parsed_value = int(value) + except ValueError: + if value.lower() in ("true", "false"): + parsed_value = value.lower() == "true" + else: + parsed_value = value + + # Build nested dict from dotted key + keys = key.split(".") + current = changes + for k in keys[:-1]: + current = current.setdefault(k, {}) + current[keys[-1]] = parsed_value + print(f" Added: {key} = {parsed_value}") + + if not changes: + print("No changes specified") + return 1 + + print(f"\nTesting {len(changes)} changes...") + + config = TuningTestConfig( + name=name, + changes=changes, + description=f"Interactive tuning test: {name}", + ) + + result = run_tuning_test(config, verbose=True) + + print("\n" + "=" * 40) + if result.success: + print(f"SUCCESS: {result.message}") + print(f"Output: {result.output_path}") + + if "comparison" in result.data: + print("\nTuning Impact:") + for strategy, comp in result.data["comparison"].items(): + delta = comp.get("delta", 0) + direction = "↑" if delta > 0 else "↓" if delta < 0 else "=" + print(f" {strategy}: Baseline {comp.get('stability_a', 0):.3f} " + f"→ Tuned {comp.get('stability_b', 0):.3f} " + f"({direction}{abs(delta):.3f})") + else: + print(f"FAILED: {result.message}") + for err in result.errors: + print(f" Error: {err}") + + return 0 if result.success else 1 + + +def interactive_view_reports() -> int: + """Interactive report viewing workflow.""" + print("\n--- View Historical Reports ---\n") + + reports = list_historical_reports() + + if not reports: + print("No reports found in build/") + return 0 + + print("Available Reports:") + print("-" * 60) + for i, r in enumerate(reports[:10], 1): + print(f" {i}. {r['timestamp']}") + print(f" Sweeps: {r['completed_sweeps']}/{r['total_sweeps']}") + print(f" Strategies: {', '.join(r['strategies'])}") + print() + + print("Enter report number to view (or 'q' to quit):") + choice = input("> ").strip() + + if choice.lower() in ("q", "quit"): + return 0 + + try: + idx = int(choice) - 1 + if 0 <= idx < len(reports): + report = reports[idx] + result = view_historical_report(Path(report["path"])) + + if result.success: + print(f"\n{result.message}") + print(f"Path: {report['path']}") + + # Print summary + data = result.data + print("\nSummary:") + print(f" Total Sweeps: {data.get('total_sweeps', 0)}") + print(f" Completed: {data.get('completed_sweeps', 0)}") + print(f" Failed: {data.get('failed_sweeps', 0)}") + + if "strategy_stats" in data: + print("\nStrategy Stats:") + for strategy, stats in data["strategy_stats"].items(): + avg = stats.get('avg_stability', 0) + print(f" {strategy}: avg_stability={avg:.3f}") + else: + print(f"Failed to load report: {result.message}") + else: + print("Invalid report number") + except ValueError: + print("Invalid input") + + return 0 + + +def cmd_sweep(args: argparse.Namespace) -> int: + """Handle the sweep command.""" + config = ExploratorySweepConfig( + strategies=args.strategies, + difficulties=args.difficulties, + seeds=args.seeds, + tick_budget=args.ticks, + output_dir=Path(args.output_dir), + ) + + if args.overlay: + config.overlay = ConfigOverlay.from_yaml(Path(args.overlay)) + + result = run_exploratory_sweep(config, verbose=args.verbose) + + if args.json: + print(json.dumps(result.to_dict(), indent=2)) + else: + print(f"{'SUCCESS' if result.success else 'FAILED'}: {result.message}") + if result.output_path: + print(f"Output: {result.output_path}") + + return 0 if result.success else 1 + + +def cmd_compare(args: argparse.Namespace) -> int: + """Handle the compare command.""" + config = CompareConfigsConfig( + config_a_path=Path(args.config_a), + config_b_path=Path(args.config_b), + name_a=args.name_a, + name_b=args.name_b, + strategies=args.strategies, + seeds=args.seeds, + tick_budget=args.ticks, + output_dir=Path(args.output_dir), + ) + + result = run_config_comparison(config, verbose=args.verbose) + + if args.json: + print(json.dumps(result.to_dict(), indent=2)) + else: + print(f"{'SUCCESS' if result.success else 'FAILED'}: {result.message}") + if result.output_path: + print(f"Output: {result.output_path}") + + return 0 if result.success else 1 + + +def cmd_test_tuning(args: argparse.Namespace) -> int: + """Handle the test-tuning command.""" + # Parse changes from command line + changes: dict[str, Any] = {} + for change in args.change or []: + if "=" not in change: + sys.stderr.write(f"Invalid change format: {change}\n") + continue + + key, value = change.split("=", 1) + key = key.strip() + value = value.strip() + + # Parse value + try: + if "." in value: + parsed: Any = float(value) + else: + parsed = int(value) + except ValueError: + if value.lower() in ("true", "false"): + parsed = value.lower() == "true" + else: + parsed = value + + # Build nested dict + keys = key.split(".") + current = changes + for k in keys[:-1]: + current = current.setdefault(k, {}) + current[keys[-1]] = parsed + + if not changes: + sys.stderr.write("No valid changes specified\n") + return 1 + + config = TuningTestConfig( + name=args.name, + changes=changes, + description=args.description or f"Tuning test: {args.name}", + baseline_config=Path(args.baseline) if args.baseline else None, + strategies=args.strategies, + seeds=args.seeds, + tick_budget=args.ticks, + output_dir=Path(args.output_dir), + ) + + result = run_tuning_test(config, verbose=args.verbose) + + if args.json: + print(json.dumps(result.to_dict(), indent=2)) + else: + print(f"{'SUCCESS' if result.success else 'FAILED'}: {result.message}") + if result.output_path: + print(f"Output: {result.output_path}") + + return 0 if result.success else 1 + + +def cmd_view_reports(args: argparse.Namespace) -> int: + """Handle the view-reports command.""" + reports = list_historical_reports( + reports_dir=Path(args.reports_dir), + limit=args.limit, + ) + + if args.json: + print(json.dumps(reports, indent=2)) + else: + if not reports: + print("No reports found") + return 0 + + print("\nAvailable Reports:") + print("-" * 70) + for r in reports: + print(f" {r['timestamp']} | " + f"{r['completed_sweeps']}/{r['total_sweeps']} sweeps | " + f"{', '.join(r['strategies'])}") + print(f" Path: {r['path']}") + + return 0 + + +def cmd_generate_report(args: argparse.Namespace) -> int: + """Handle the generate-report command.""" + input_path = Path(args.input) + if not input_path.exists(): + sys.stderr.write(f"Input file not found: {input_path}\n") + return 1 + + try: + with open(input_path) as f: + data = json.load(f) + except json.JSONDecodeError as e: + sys.stderr.write(f"Failed to parse input: {e}\n") + return 1 + + config = ReportViewerConfig( + title=args.title, + include_charts=not args.no_charts, + include_raw_data=args.include_raw, + theme=args.theme, + ) + + output_path = Path(args.output) + write_html_report(data, output_path, config) + + print(f"Report generated: {output_path}") + return 0 + + +def main(argv: Sequence[str] | None = None) -> int: + """CLI entry point for Balance Studio.""" + parser = argparse.ArgumentParser( + description="Balance Studio - Designer feedback loop and guided workflows", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Run interactively + echoes-balance-studio + + # Run exploratory sweep + echoes-balance-studio sweep --strategies balanced aggressive + + # Compare two configurations + echoes-balance-studio compare \\ + --config-a content/config \\ + --config-b content/config/sweeps/difficulty-hard + + # Test a tuning change + echoes-balance-studio test-tuning \\ + --name boost_regen \\ + --change economy.regen_scale=1.2 \\ + --change environment.scarcity_pressure_cap=6000 + + # View historical reports + echoes-balance-studio view-reports + + # Generate HTML report from sweep results + echoes-balance-studio generate-report \\ + --input build/batch_sweeps/batch_sweep_summary.json \\ + --output build/balance_report.html +""", + ) + + subparsers = parser.add_subparsers(dest="command") + + # Sweep command + sweep_parser = subparsers.add_parser( + "sweep", help="Run an exploratory sweep" + ) + sweep_parser.add_argument( + "--strategies", "-s", nargs="+", + default=["balanced", "aggressive", "diplomatic"], + help="Strategies to test", + ) + sweep_parser.add_argument( + "--difficulties", "-d", nargs="+", + default=["normal"], + help="Difficulty presets to test", + ) + sweep_parser.add_argument( + "--seeds", nargs="+", type=int, + default=[42, 123, 456], + help="Random seeds", + ) + sweep_parser.add_argument( + "--ticks", "-t", type=int, default=100, + help="Tick budget per sweep", + ) + sweep_parser.add_argument( + "--output-dir", "-o", default="build/studio_sweeps", + help="Output directory", + ) + sweep_parser.add_argument( + "--overlay", help="Path to config overlay YAML", + ) + sweep_parser.add_argument( + "--json", action="store_true", help="Output as JSON", + ) + sweep_parser.add_argument( + "--verbose", "-v", action="store_true", help="Verbose output", + ) + + # Compare command + compare_parser = subparsers.add_parser( + "compare", help="Compare two configurations" + ) + compare_parser.add_argument( + "--config-a", "-a", required=True, + help="Path to first config directory", + ) + compare_parser.add_argument( + "--config-b", "-b", required=True, + help="Path to second config directory", + ) + compare_parser.add_argument( + "--name-a", default="Config A", + help="Display name for first config", + ) + compare_parser.add_argument( + "--name-b", default="Config B", + help="Display name for second config", + ) + compare_parser.add_argument( + "--strategies", "-s", nargs="+", default=["balanced"], + help="Strategies to test", + ) + compare_parser.add_argument( + "--seeds", nargs="+", type=int, default=[42], + help="Random seeds", + ) + compare_parser.add_argument( + "--ticks", "-t", type=int, default=100, + help="Tick budget per sweep", + ) + compare_parser.add_argument( + "--output-dir", "-o", default="build/studio_compare", + help="Output directory", + ) + compare_parser.add_argument( + "--json", action="store_true", help="Output as JSON", + ) + compare_parser.add_argument( + "--verbose", "-v", action="store_true", help="Verbose output", + ) + + # Test-tuning command + tuning_parser = subparsers.add_parser( + "test-tuning", help="Test a tuning change" + ) + tuning_parser.add_argument( + "--name", "-n", required=True, + help="Name for this tuning experiment", + ) + tuning_parser.add_argument( + "--change", "-c", action="append", + help="Config change as key=value (can be repeated)", + ) + tuning_parser.add_argument( + "--description", help="Description of the changes", + ) + tuning_parser.add_argument( + "--baseline", help="Path to baseline config directory", + ) + tuning_parser.add_argument( + "--strategies", "-s", nargs="+", default=["balanced"], + help="Strategies to test", + ) + tuning_parser.add_argument( + "--seeds", nargs="+", type=int, default=[42, 123], + help="Random seeds", + ) + tuning_parser.add_argument( + "--ticks", "-t", type=int, default=100, + help="Tick budget per sweep", + ) + tuning_parser.add_argument( + "--output-dir", "-o", default="build/studio_tuning", + help="Output directory", + ) + tuning_parser.add_argument( + "--json", action="store_true", help="Output as JSON", + ) + tuning_parser.add_argument( + "--verbose", "-v", action="store_true", help="Verbose output", + ) + + # View-reports command + reports_parser = subparsers.add_parser( + "view-reports", help="View historical reports" + ) + reports_parser.add_argument( + "--reports-dir", default="build", + help="Directory to search for reports", + ) + reports_parser.add_argument( + "--limit", "-l", type=int, default=20, + help="Maximum reports to list", + ) + reports_parser.add_argument( + "--json", action="store_true", help="Output as JSON", + ) + + # Generate-report command + generate_parser = subparsers.add_parser( + "generate-report", help="Generate HTML report from sweep results" + ) + generate_parser.add_argument( + "--input", "-i", required=True, + help="Path to sweep summary JSON", + ) + generate_parser.add_argument( + "--output", "-o", required=True, + help="Output HTML file path", + ) + generate_parser.add_argument( + "--title", default="Balance Studio Report", + help="Report title", + ) + generate_parser.add_argument( + "--theme", choices=["light", "dark"], default="light", + help="Color theme", + ) + generate_parser.add_argument( + "--no-charts", action="store_true", + help="Disable chart generation", + ) + generate_parser.add_argument( + "--include-raw", action="store_true", + help="Include raw JSON data section", + ) + + args = parser.parse_args(argv) + + # If no command, run interactive mode + if args.command is None: + return interactive_mode() + + # Dispatch to command handler + handlers = { + "sweep": cmd_sweep, + "compare": cmd_compare, + "test-tuning": cmd_test_tuning, + "view-reports": cmd_view_reports, + "generate-report": cmd_generate_report, + } + + return handlers[args.command](args) + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/gengine/balance_studio/__init__.py b/src/gengine/balance_studio/__init__.py new file mode 100644 index 00000000..edda0785 --- /dev/null +++ b/src/gengine/balance_studio/__init__.py @@ -0,0 +1,75 @@ +"""Balance Studio - Designer feedback loop and tooling. + +This module provides guided workflows for designers to iterate on game +balance without requiring code changes. + +Components +---------- +- overlays: YAML overlay system for configuration testing +- workflows: Guided workflow implementations +- report_viewer: Interactive HTML report generation + +CLI Tool +-------- +The `echoes-balance-studio` CLI provides an interactive interface:: + + echoes-balance-studio + +Or use specific commands:: + + echoes-balance-studio sweep --strategies balanced aggressive + echoes-balance-studio compare --config-a path/to/a --config-b path/to/b + echoes-balance-studio test-tuning --changes economy.regen_scale=1.2 + echoes-balance-studio view-reports +""" + +from .overlays import ( + ConfigOverlay, + create_tuning_overlay, + deep_merge, + load_overlay_directory, + merge_overlays, +) +from .report_viewer import ( + FilterState, + ReportViewerConfig, + generate_interactive_html, + write_html_report, +) +from .workflows import ( + CompareConfigsConfig, + ExploratorySweepConfig, + TuningTestConfig, + WorkflowResult, + get_workflow_menu, + list_historical_reports, + run_config_comparison, + run_exploratory_sweep, + run_tuning_test, + view_historical_report, +) + +__all__ = [ + # Overlays + "ConfigOverlay", + "create_tuning_overlay", + "deep_merge", + "load_overlay_directory", + "merge_overlays", + # Report Viewer + "FilterState", + "ReportViewerConfig", + "generate_interactive_html", + "write_html_report", + # Workflows + "CompareConfigsConfig", + "ExploratorySweepConfig", + "TuningTestConfig", + "WorkflowResult", + "get_workflow_menu", + "list_historical_reports", + "run_config_comparison", + "run_exploratory_sweep", + "run_tuning_test", + "view_historical_report", +] diff --git a/src/gengine/balance_studio/cli.py b/src/gengine/balance_studio/cli.py new file mode 100644 index 00000000..0bde6471 --- /dev/null +++ b/src/gengine/balance_studio/cli.py @@ -0,0 +1,51 @@ +"""CLI entry point for Balance Studio. + +This module provides the main() function that is registered as the +`echoes-balance-studio` command in pyproject.toml. +""" + +from __future__ import annotations + +import sys +from pathlib import Path +from typing import Sequence + +# Import and re-export main from the script module +# We need to load it dynamically to avoid circular imports + + +def main(argv: Sequence[str] | None = None) -> int: + """Entry point for the echoes-balance-studio command. + + This function imports and runs the main CLI from the scripts module. + + Parameters + ---------- + argv + Command-line arguments (defaults to sys.argv[1:]). + + Returns + ------- + int + Exit code. + """ + # Import the main function from the script + from importlib import util + + script_dir = Path(__file__).resolve().parents[3] / "scripts" + script_path = script_dir / "echoes_balance_studio.py" + + spec = util.spec_from_file_location("echoes_balance_studio", script_path) + if spec is None or spec.loader is None: + sys.stderr.write(f"Failed to load Balance Studio script: {script_path}\n") + return 1 + + module = util.module_from_spec(spec) + sys.modules.setdefault("echoes_balance_studio", module) + spec.loader.exec_module(module) + + return module.main(argv) + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/gengine/balance_studio/overlays.py b/src/gengine/balance_studio/overlays.py new file mode 100644 index 00000000..28c478be --- /dev/null +++ b/src/gengine/balance_studio/overlays.py @@ -0,0 +1,293 @@ +"""YAML overlay system for configuration testing. + +Allows designers to create configuration overlays that are merged with base +configs, enabling testing of tuning changes without modifying base files. + +Examples +-------- +Create and apply an overlay:: + + overlay = ConfigOverlay.from_yaml(Path("my_overlay.yml")) + merged_config = overlay.apply(base_config) + +Load overlay directory:: + + overlays = load_overlay_directory(Path("content/config/overlays")) +""" + +from __future__ import annotations + +import copy +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +import yaml + + +@dataclass +class ConfigOverlay: + """A configuration overlay that can be merged with base configs. + + Attributes + ---------- + name + Name of the overlay for display purposes. + description + Human-readable description of what this overlay changes. + source_path + Path to the source YAML file. + overrides + Dictionary of config keys to override values. + metadata + Additional metadata about the overlay. + """ + + name: str + description: str = "" + source_path: Path | None = None + overrides: dict[str, Any] = field(default_factory=dict) + metadata: dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_yaml(cls, path: Path) -> ConfigOverlay: + """Load an overlay from a YAML file. + + Parameters + ---------- + path + Path to the YAML overlay file. + + Returns + ------- + ConfigOverlay + Loaded overlay. + + Raises + ------ + FileNotFoundError + If the file does not exist. + ValueError + If the file is not valid YAML or missing required fields. + """ + if not path.exists(): + raise FileNotFoundError(f"Overlay file not found: {path}") + + with open(path) as f: + data = yaml.safe_load(f) or {} + + if not isinstance(data, dict): + raise ValueError(f"Invalid overlay format in {path}: expected dict") + + return cls( + name=data.get("name", path.stem), + description=data.get("description", ""), + source_path=path, + overrides=data.get("overrides", {}), + metadata=data.get("metadata", {}), + ) + + @classmethod + def from_dict(cls, data: dict[str, Any], name: str = "inline") -> ConfigOverlay: + """Create an overlay from a dictionary. + + Parameters + ---------- + data + Dictionary with overlay structure. + name + Name for the overlay. + + Returns + ------- + ConfigOverlay + Created overlay. + """ + return cls( + name=data.get("name", name), + description=data.get("description", ""), + overrides=data.get("overrides", data), + metadata=data.get("metadata", {}), + ) + + def apply(self, base_config: dict[str, Any]) -> dict[str, Any]: + """Apply this overlay to a base configuration. + + Performs a deep merge where overlay values override base values. + + Parameters + ---------- + base_config + Base configuration dictionary. + + Returns + ------- + dict[str, Any] + Merged configuration with overlays applied. + """ + return deep_merge(base_config, self.overrides) + + def to_dict(self) -> dict[str, Any]: + """Serialize overlay to dictionary. + + Returns + ------- + dict[str, Any] + Serialized overlay. + """ + return { + "name": self.name, + "description": self.description, + "source_path": str(self.source_path) if self.source_path else None, + "overrides": self.overrides, + "metadata": self.metadata, + } + + def to_yaml(self, path: Path) -> None: + """Write overlay to a YAML file. + + Parameters + ---------- + path + Path to write the overlay file. + """ + data = { + "name": self.name, + "description": self.description, + "overrides": self.overrides, + "metadata": self.metadata, + } + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w") as f: + yaml.safe_dump(data, f, default_flow_style=False, sort_keys=False) + + +def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]: + """Deep merge two dictionaries, with override taking precedence. + + Parameters + ---------- + base + Base dictionary. + override + Override dictionary with values to merge. + + Returns + ------- + dict[str, Any] + Merged dictionary. + """ + result = copy.deepcopy(base) + + for key, value in override.items(): + if key in result and isinstance(result[key], dict) and isinstance(value, dict): + result[key] = deep_merge(result[key], value) + else: + result[key] = copy.deepcopy(value) + + return result + + +def load_overlay_directory(directory: Path) -> list[ConfigOverlay]: + """Load all overlays from a directory. + + Parameters + ---------- + directory + Directory containing overlay YAML files. + + Returns + ------- + list[ConfigOverlay] + List of loaded overlays. + """ + if not directory.exists(): + return [] + + overlays: list[ConfigOverlay] = [] + + for path in sorted(directory.glob("*.yml")): + try: + overlays.append(ConfigOverlay.from_yaml(path)) + except (ValueError, yaml.YAMLError): + continue + + for path in sorted(directory.glob("*.yaml")): + try: + overlays.append(ConfigOverlay.from_yaml(path)) + except (ValueError, yaml.YAMLError): + continue + + return overlays + + +def create_tuning_overlay( + name: str, + changes: dict[str, Any], + description: str = "", +) -> ConfigOverlay: + """Create a tuning overlay from a dictionary of changes. + + This is a convenience function for quick experimentation. + + Parameters + ---------- + name + Name for the overlay. + changes + Dictionary of configuration changes to apply. + description + Optional description of the changes. + + Returns + ------- + ConfigOverlay + Created overlay. + + Examples + -------- + >>> overlay = create_tuning_overlay( + ... "aggressive_economy", + ... {"economy": {"regen_scale": 1.2}}, + ... "Test higher resource regeneration" + ... ) + """ + return ConfigOverlay( + name=name, + description=description, + overrides=changes, + metadata={"type": "tuning_experiment"}, + ) + + +def merge_overlays(overlays: list[ConfigOverlay]) -> ConfigOverlay: + """Merge multiple overlays into a single overlay. + + Overlays are applied in order, with later overlays taking precedence. + + Parameters + ---------- + overlays + List of overlays to merge. + + Returns + ------- + ConfigOverlay + Merged overlay. + """ + if not overlays: + return ConfigOverlay(name="empty") + + merged_overrides: dict[str, Any] = {} + names: list[str] = [] + + for overlay in overlays: + merged_overrides = deep_merge(merged_overrides, overlay.overrides) + names.append(overlay.name) + + return ConfigOverlay( + name=" + ".join(names), + description=f"Merged from: {', '.join(names)}", + overrides=merged_overrides, + metadata={"merged_from": names}, + ) diff --git a/src/gengine/balance_studio/report_viewer.py b/src/gengine/balance_studio/report_viewer.py new file mode 100644 index 00000000..dc325d3c --- /dev/null +++ b/src/gengine/balance_studio/report_viewer.py @@ -0,0 +1,710 @@ +"""Interactive HTML report viewer for balance analysis. + +Generates an HTML dashboard that allows filtering, sorting, and drilling +into sweep results without requiring code changes. +""" + +from __future__ import annotations + +import base64 +import io +import json +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +# Optional matplotlib import +try: + import matplotlib + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + + HAS_MATPLOTLIB = True +except ImportError: + HAS_MATPLOTLIB = False + + +@dataclass +class ReportViewerConfig: + """Configuration for the report viewer. + + Attributes + ---------- + title + Title for the HTML report. + include_charts + Whether to include embedded charts. + include_raw_data + Whether to include raw JSON data section. + theme + Color theme: "light" or "dark". + """ + + title: str = "Balance Studio Report" + include_charts: bool = True + include_raw_data: bool = False + theme: str = "light" + + +@dataclass +class FilterState: + """Current filter state for the report viewer. + + Attributes + ---------- + strategies + Selected strategies to display. + difficulties + Selected difficulties to display. + min_stability + Minimum stability threshold. + max_stability + Maximum stability threshold. + show_errors + Whether to show failed sweeps. + """ + + strategies: list[str] = field(default_factory=list) + difficulties: list[str] = field(default_factory=list) + min_stability: float = 0.0 + max_stability: float = 1.0 + show_errors: bool = True + + +def generate_strategy_chart(stats: dict[str, Any]) -> str | None: + """Generate a bar chart of average stability by strategy. + + Parameters + ---------- + stats + Strategy statistics from sweep results. + + Returns + ------- + str | None + Base64-encoded PNG image, or None if matplotlib unavailable. + """ + if not HAS_MATPLOTLIB or not stats: + return None + + strategies = list(stats.keys()) + avg_stabilities = [stats[s].get("avg_stability", 0) for s in strategies] + + # Generate colors dynamically based on number of strategies + base_colors = ["#3498db", "#e74c3c", "#2ecc71", "#9b59b6", "#f39c12", "#1abc9c"] + colors = [base_colors[i % len(base_colors)] for i in range(len(strategies))] + + fig, ax = plt.subplots(figsize=(8, 5)) + bars = ax.bar(strategies, avg_stabilities, color=colors) + + ax.set_xlabel("Strategy") + ax.set_ylabel("Average Stability") + ax.set_title("Strategy Performance Comparison") + ax.set_ylim(0, 1) + + for bar, val in zip(bars, avg_stabilities, strict=True): + ax.text( + bar.get_x() + bar.get_width() / 2, + val + 0.02, + f"{val:.2f}", + ha="center", + fontsize=10, + ) + + buf = io.BytesIO() + fig.savefig(buf, format="png", bbox_inches="tight", dpi=100) + plt.close(fig) + buf.seek(0) + return base64.b64encode(buf.read()).decode("utf-8") + + +def generate_difficulty_chart(stats: dict[str, Any]) -> str | None: + """Generate a bar chart of average stability by difficulty. + + Parameters + ---------- + stats + Difficulty statistics from sweep results. + + Returns + ------- + str | None + Base64-encoded PNG image, or None if matplotlib unavailable. + """ + if not HAS_MATPLOTLIB or not stats: + return None + + difficulties = list(stats.keys()) + avg_stabilities = [stats[d].get("avg_stability", 0) for d in difficulties] + + # Color gradient from easy (green) to hard (red) + colors = plt.cm.RdYlGn( + [1.0 - i / max(len(difficulties) - 1, 1) for i in range(len(difficulties))] + ) + + fig, ax = plt.subplots(figsize=(8, 5)) + bars = ax.bar(difficulties, avg_stabilities, color=colors) + + ax.set_xlabel("Difficulty") + ax.set_ylabel("Average Stability") + ax.set_title("Difficulty Level Impact") + ax.set_ylim(0, 1) + + for bar, val in zip(bars, avg_stabilities, strict=True): + ax.text( + bar.get_x() + bar.get_width() / 2, + val + 0.02, + f"{val:.2f}", + ha="center", + fontsize=10, + ) + + buf = io.BytesIO() + fig.savefig(buf, format="png", bbox_inches="tight", dpi=100) + plt.close(fig) + buf.seek(0) + return base64.b64encode(buf.read()).decode("utf-8") + + +def generate_stability_distribution_chart(sweeps: list[dict[str, Any]]) -> str | None: + """Generate a histogram of stability distribution. + + Parameters + ---------- + sweeps + List of sweep results. + + Returns + ------- + str | None + Base64-encoded PNG image, or None if matplotlib unavailable. + """ + if not HAS_MATPLOTLIB or not sweeps: + return None + + stabilities = [ + s.get("results", {}).get("final_stability", 0) + for s in sweeps + if s.get("error") is None + ] + + if not stabilities: + return None + + fig, ax = plt.subplots(figsize=(8, 5)) + ax.hist(stabilities, bins=20, edgecolor="white", color="#3498db", alpha=0.7) + + ax.set_xlabel("Final Stability") + ax.set_ylabel("Count") + ax.set_title("Stability Distribution Across All Sweeps") + ax.axvline(0.5, color="red", linestyle="--", label="Win Threshold") + ax.legend() + + buf = io.BytesIO() + fig.savefig(buf, format="png", bbox_inches="tight", dpi=100) + plt.close(fig) + buf.seek(0) + return base64.b64encode(buf.read()).decode("utf-8") + + +def generate_interactive_html( + data: dict[str, Any], + config: ReportViewerConfig | None = None, +) -> str: + """Generate an interactive HTML report from sweep data. + + Parameters + ---------- + data + Sweep results data (from batch_sweep_summary.json). + config + Report viewer configuration. + + Returns + ------- + str + Complete HTML document. + """ + if config is None: + config = ReportViewerConfig() + + # Generate charts + charts: dict[str, str | None] = {} + if config.include_charts: + charts["strategy"] = generate_strategy_chart(data.get("strategy_stats", {})) + charts["difficulty"] = generate_difficulty_chart( + data.get("difficulty_stats", {}) + ) + charts["distribution"] = generate_stability_distribution_chart( + data.get("sweeps", []) + ) + + # Extract unique values for filters + sweeps = data.get("sweeps", []) + strategies = sorted( + set(s.get("parameters", {}).get("strategy", "") for s in sweeps) + ) + difficulties = sorted( + set(s.get("parameters", {}).get("difficulty", "") for s in sweeps) + ) + + # Build HTML + theme_colors = _get_theme_colors(config.theme) + metadata = data.get("metadata", {}) + strategy_stats = data.get("strategy_stats", {}) + difficulty_stats = data.get("difficulty_stats", {}) + + # CSS styles (broken into lines for readability) + font_family = ( + "-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif" + ) + section_h2_style = ( + "margin-top: 0; border-bottom: 1px solid var(--border-color); " + "padding-bottom: 10px" + ) + details_style = ( + "display: none; padding: 10px; background: rgba(0,0,0,0.05); " + "margin-top: 10px; border-radius: 4px" + ) + + html = f""" + + + + + {config.title} + + + +
+
+
+

{config.title}

+
+ Generated: {metadata.get('timestamp', 'Unknown')} | + Git: {metadata.get('git_commit', 'N/A')} +
+
+
+ +
+
+
{data.get('total_sweeps', 0)}
+
Total Sweeps
+
+
+
{data.get('completed_sweeps', 0)}
+
Completed
+
+
+
{data.get('failed_sweeps', 0)}
+
Failed
+
+
+
{data.get('total_duration_seconds', 0):.1f}s
+
Total Duration
+
+
+ +
+

Strategy Performance

+ + + + + + + + + + + + + {_generate_strategy_rows(strategy_stats)} + +
StrategyCountCompletedAvg StabilityMinMax
+ {_generate_chart_section(charts.get('strategy'), 'Strategy Comparison')} +
+ +
+

Difficulty Analysis

+ + + + + + + + + + + + + {_generate_difficulty_rows(difficulty_stats)} + +
DifficultyCountCompletedAvg StabilityMinMax
+ {_generate_chart_section(charts.get('difficulty'), 'Difficulty Impact')} +
+ + {_generate_distribution_section(charts.get('distribution'))} + +
+

Individual Sweeps

+
+
+ + +
+
+ + +
+
+ + +
+
+ + + + + + + + + + + + + + + {_generate_sweep_rows(sweeps)} + +
IDStrategyDifficultySeedStabilityActionsStatusDetails
+
+ + {_generate_raw_data_section(data) if config.include_raw_data else ''} +
+ + + +""" + + return html + + +def _get_theme_colors(theme: str) -> dict[str, str]: + """Get color scheme for theme.""" + if theme == "dark": + return { + "bg": "#1a1a2e", + "text": "#eee", + "card": "#16213e", + "border": "#0f3460", + } + return { + "bg": "#f5f6fa", + "text": "#2c3e50", + "card": "#ffffff", + "border": "#dcdde1", + } + + +def _generate_strategy_rows(stats: dict[str, Any]) -> str: + """Generate table rows for strategy stats.""" + rows = [] + for strategy, s in stats.items(): + avg = s.get("avg_stability", 0) + status_class = "status-ok" if avg >= 0.5 else "status-warn" + rows.append( + f""" + {strategy} + {s.get('count', 0)} + {s.get('completed', 0)} + {avg:.3f} + {s.get('min_stability', 0):.3f} + {s.get('max_stability', 0):.3f} + """ + ) + return "\n".join(rows) + + +def _generate_difficulty_rows(stats: dict[str, Any]) -> str: + """Generate table rows for difficulty stats.""" + rows = [] + for difficulty, s in stats.items(): + avg = s.get("avg_stability", 0) + status_class = "status-ok" if avg >= 0.5 else "status-warn" + rows.append( + f""" + {difficulty} + {s.get('count', 0)} + {s.get('completed', 0)} + {avg:.3f} + {s.get('min_stability', 0):.3f} + {s.get('max_stability', 0):.3f} + """ + ) + return "\n".join(rows) + + +def _generate_sweep_rows(sweeps: list[dict[str, Any]]) -> str: + """Generate table rows for individual sweeps.""" + rows = [] + for sweep in sweeps: + sweep_id = sweep.get("sweep_id", "?") + params = sweep.get("parameters", {}) + results = sweep.get("results", {}) + error = sweep.get("error") + + stability = results.get("final_stability", 0) + if error: + status = 'Error' + elif stability >= 0.5: + status = 'Pass' + else: + status = 'Low' + + strategy = params.get('strategy', '') + difficulty = params.get('difficulty', '') + btn = f' +
+
{json.dumps(sweep, indent=2)[:500]}...
+
+ + """ + ) + return "\n".join(rows) + + +def _generate_options(values: list[str]) -> str: + """Generate select options.""" + return "\n".join(f'' for v in values) + + +def _generate_chart_section(chart_data: str | None, title: str) -> str: + """Generate chart section HTML.""" + if not chart_data: + return "" + return f""" +
+ {title} +
+ """ + + +def _generate_distribution_section(chart_data: str | None) -> str: + """Generate distribution chart section.""" + if not chart_data: + return "" + return f""" +
+

Stability Distribution

+
+ Stability Distribution +
+
+ """ + + +def _generate_raw_data_section(data: dict[str, Any]) -> str: + """Generate raw data section.""" + return f""" +
+

Raw Data

+
{json.dumps(data, indent=2)}
+
+ """ + + +def write_html_report( + data: dict[str, Any], + output_path: Path, + config: ReportViewerConfig | None = None, +) -> None: + """Write an HTML report to file. + + Parameters + ---------- + data + Sweep results data. + output_path + Path to write HTML file. + config + Report viewer configuration. + """ + html = generate_interactive_html(data, config) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(html) diff --git a/src/gengine/balance_studio/workflows.py b/src/gengine/balance_studio/workflows.py new file mode 100644 index 00000000..e0ee67d9 --- /dev/null +++ b/src/gengine/balance_studio/workflows.py @@ -0,0 +1,608 @@ +"""Designer workflow implementations for the Balance Studio. + +Provides guided workflows that encapsulate common balance iteration tasks: +- Running exploratory sweeps +- Comparing configurations +- Testing tuning changes +- Viewing historical reports + +These workflows leverage existing sweep and analysis infrastructure while +providing a designer-friendly interface. +""" + +from __future__ import annotations + +import json +import os +import subprocess +import sys +from dataclasses import dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from .overlays import ConfigOverlay, create_tuning_overlay + + +@dataclass +class WorkflowResult: + """Result from executing a workflow. + + Attributes + ---------- + workflow_name + Name of the workflow that was executed. + success + Whether the workflow completed successfully. + message + Human-readable summary of the result. + output_path + Path to any generated output files. + data + Structured result data. + errors + List of error messages if any. + """ + + workflow_name: str + success: bool + message: str + output_path: Path | None = None + data: dict[str, Any] = field(default_factory=dict) + errors: list[str] = field(default_factory=list) + + def to_dict(self) -> dict[str, Any]: + """Serialize result to dictionary.""" + return { + "workflow_name": self.workflow_name, + "success": self.success, + "message": self.message, + "output_path": str(self.output_path) if self.output_path else None, + "data": self.data, + "errors": self.errors, + } + + +@dataclass +class ExploratorySweepConfig: + """Configuration for an exploratory sweep workflow. + + Attributes + ---------- + strategies + AI strategies to test. + difficulties + Difficulty presets to test. + seeds + Random seeds for reproducibility. + tick_budget + Tick budget for each sweep. + output_dir + Directory to write results. + overlay + Optional overlay to apply during sweep. + """ + + strategies: list[str] = field( + default_factory=lambda: ["balanced", "aggressive", "diplomatic"] + ) + difficulties: list[str] = field(default_factory=lambda: ["normal"]) + seeds: list[int] = field(default_factory=lambda: [42, 123, 456]) + tick_budget: int = 100 + output_dir: Path = field(default_factory=lambda: Path("build/studio_sweeps")) + overlay: ConfigOverlay | None = None + + +def run_exploratory_sweep( + config: ExploratorySweepConfig, + verbose: bool = False, +) -> WorkflowResult: + """Run an exploratory sweep workflow. + + Executes batch sweeps with the specified configuration and generates + a summary report for designer review. + + Parameters + ---------- + config + Sweep configuration. + verbose + If True, print progress messages. + + Returns + ------- + WorkflowResult + Result of the sweep execution. + """ + timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + output_dir = config.output_dir / f"sweep_{timestamp}" + output_dir.mkdir(parents=True, exist_ok=True) + + # If overlay provided, create a temporary config directory + config_root = None + if config.overlay: + config_root = output_dir / "config" + config_root.mkdir(exist_ok=True) + + # Write overlay as simulation.yml + overlay_path = config_root / "simulation.yml" + config.overlay.to_yaml(overlay_path) + + # Build sweep command + cmd = [ + sys.executable, + "scripts/run_batch_sweeps.py", + "--strategies", + *config.strategies, + "--difficulties", + *config.difficulties, + "--seeds", + *[str(s) for s in config.seeds], + "--ticks", + str(config.tick_budget), + "--output-dir", + str(output_dir), + ] + + if verbose: + cmd.append("--verbose") + sys.stderr.write(f"Running sweep: {' '.join(cmd)}\n") + + # Set config root if using overlay + env = os.environ.copy() + if config_root: + env["ECHOES_CONFIG_ROOT"] = str(config_root) + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=1800, # 30 minute timeout + env=env, + ) + + if result.returncode != 0: + return WorkflowResult( + workflow_name="exploratory_sweep", + success=False, + message="Sweep failed to complete", + output_path=output_dir, + errors=[result.stderr], + ) + + # Load summary if available + summary_path = output_dir / "batch_sweep_summary.json" + summary_data: dict[str, Any] = {} + if summary_path.exists(): + with open(summary_path) as f: + summary_data = json.load(f) + + return WorkflowResult( + workflow_name="exploratory_sweep", + success=True, + message=f"Sweep completed with {summary_data.get('completed_sweeps', 0)} " + f"of {summary_data.get('total_sweeps', 0)} sweeps", + output_path=output_dir, + data=summary_data, + ) + + except subprocess.TimeoutExpired: + return WorkflowResult( + workflow_name="exploratory_sweep", + success=False, + message="Sweep timed out after 30 minutes", + output_path=output_dir, + errors=["Timeout exceeded"], + ) + except FileNotFoundError as e: + return WorkflowResult( + workflow_name="exploratory_sweep", + success=False, + message="Sweep script not found", + errors=[str(e)], + ) + + +@dataclass +class CompareConfigsConfig: + """Configuration for comparing two configurations. + + Attributes + ---------- + config_a_path + Path to first configuration (or overlay). + config_b_path + Path to second configuration (or overlay). + name_a + Display name for first config. + name_b + Display name for second config. + strategies + Strategies to test for comparison. + seeds + Seeds for reproducibility. + tick_budget + Tick budget per sweep. + output_dir + Output directory for results. + """ + + config_a_path: Path + config_b_path: Path + name_a: str = "Config A" + name_b: str = "Config B" + strategies: list[str] = field(default_factory=lambda: ["balanced"]) + seeds: list[int] = field(default_factory=lambda: [42]) + tick_budget: int = 100 + output_dir: Path = field(default_factory=lambda: Path("build/studio_compare")) + + +def run_config_comparison( + config: CompareConfigsConfig, + verbose: bool = False, +) -> WorkflowResult: + """Compare two configurations by running sweeps with each. + + Parameters + ---------- + config + Comparison configuration. + verbose + If True, print progress messages. + + Returns + ------- + WorkflowResult + Result including comparison data. + """ + timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + output_dir = config.output_dir / f"compare_{timestamp}" + output_dir.mkdir(parents=True, exist_ok=True) + + results: dict[str, Any] = { + "config_a": {"name": config.name_a, "path": str(config.config_a_path)}, + "config_b": {"name": config.name_b, "path": str(config.config_b_path)}, + "sweeps": {}, + } + + errors: list[str] = [] + + for label, config_path, name in [ + ("a", config.config_a_path, config.name_a), + ("b", config.config_b_path, config.name_b), + ]: + if verbose: + sys.stderr.write(f"Running sweeps for {name}...\n") + + sweep_output = output_dir / f"sweep_{label}" + sweep_output.mkdir(exist_ok=True) + + cmd = [ + sys.executable, + "scripts/run_batch_sweeps.py", + "--strategies", + *config.strategies, + "--seeds", + *[str(s) for s in config.seeds], + "--ticks", + str(config.tick_budget), + "--output-dir", + str(sweep_output), + ] + + env = os.environ.copy() + if config_path.is_dir(): + env["ECHOES_CONFIG_ROOT"] = str(config_path) + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=900, + env=env, + ) + + summary_path = sweep_output / "batch_sweep_summary.json" + if summary_path.exists(): + with open(summary_path) as f: + results["sweeps"][label] = json.load(f) + elif result.returncode != 0: + errors.append(f"Sweep {label} failed: {result.stderr}") + + except subprocess.TimeoutExpired: + errors.append(f"Sweep {label} timed out") + except Exception as e: + errors.append(f"Sweep {label} error: {e}") + + # Calculate comparison metrics if both sweeps succeeded + if "a" in results["sweeps"] and "b" in results["sweeps"]: + stats_a = results["sweeps"]["a"].get("strategy_stats", {}) + stats_b = results["sweeps"]["b"].get("strategy_stats", {}) + + comparison: dict[str, Any] = {} + for strategy in config.strategies: + if strategy in stats_a and strategy in stats_b: + avg_a = stats_a[strategy].get("avg_stability", 0) + avg_b = stats_b[strategy].get("avg_stability", 0) + comparison[strategy] = { + "stability_a": avg_a, + "stability_b": avg_b, + "delta": avg_b - avg_a, + "delta_percent": ( + ((avg_b - avg_a) / avg_a * 100) if avg_a > 0 else 0 + ), + } + + results["comparison"] = comparison + + # Write comparison report + report_path = output_dir / "comparison_report.json" + with open(report_path, "w") as f: + json.dump(results, f, indent=2) + + return WorkflowResult( + workflow_name="compare_configs", + success=True, + message=f"Compared {config.name_a} vs {config.name_b}", + output_path=output_dir, + data=results, + ) + + return WorkflowResult( + workflow_name="compare_configs", + success=False, + message="Failed to complete comparison", + output_path=output_dir, + data=results, + errors=errors, + ) + + +@dataclass +class TuningTestConfig: + """Configuration for testing a tuning change. + + Attributes + ---------- + name + Name of the tuning experiment. + changes + Dictionary of configuration changes to test. + description + Description of what's being tested. + baseline_config + Path to baseline config directory. + strategies + Strategies to test. + seeds + Seeds for reproducibility. + tick_budget + Tick budget per sweep. + output_dir + Output directory. + """ + + name: str + changes: dict[str, Any] + description: str = "" + baseline_config: Path | None = None + strategies: list[str] = field(default_factory=lambda: ["balanced"]) + seeds: list[int] = field(default_factory=lambda: [42, 123]) + tick_budget: int = 100 + output_dir: Path = field(default_factory=lambda: Path("build/studio_tuning")) + + +def run_tuning_test( + config: TuningTestConfig, + verbose: bool = False, +) -> WorkflowResult: + """Test a tuning change by comparing baseline to modified config. + + Parameters + ---------- + config + Tuning test configuration. + verbose + If True, print progress messages. + + Returns + ------- + WorkflowResult + Result with comparison between baseline and tuned config. + """ + # Create overlay from changes + overlay = create_tuning_overlay( + name=config.name, + changes=config.changes, + description=config.description, + ) + + timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + output_dir = config.output_dir / f"tuning_{config.name}_{timestamp}" + output_dir.mkdir(parents=True, exist_ok=True) + + # Save the overlay for reference + overlay_path = output_dir / "overlay.yml" + overlay.to_yaml(overlay_path) + + # Create modified config directory + modified_config = output_dir / "modified_config" + modified_config.mkdir(exist_ok=True) + + # Load base config and apply overlay + import yaml + + base_config_path = config.baseline_config or Path("content/config") + base_sim_yml = base_config_path / "simulation.yml" + + if base_sim_yml.exists(): + with open(base_sim_yml) as f: + base_data = yaml.safe_load(f) or {} + merged = overlay.apply(base_data) + with open(modified_config / "simulation.yml", "w") as f: + yaml.safe_dump(merged, f, default_flow_style=False) + else: + # Write overlay as-is if no base config + overlay.to_yaml(modified_config / "simulation.yml") + + # Run comparison + compare_config = CompareConfigsConfig( + config_a_path=base_config_path, + config_b_path=modified_config, + name_a="Baseline", + name_b=f"Tuned ({config.name})", + strategies=config.strategies, + seeds=config.seeds, + tick_budget=config.tick_budget, + output_dir=output_dir, + ) + + result = run_config_comparison(compare_config, verbose=verbose) + + # Enhance result with tuning-specific info + result.workflow_name = "tuning_test" + result.data["overlay"] = overlay.to_dict() + result.data["changes_applied"] = config.changes + + return result + + +def list_historical_reports( + reports_dir: Path = Path("build"), + pattern: str = "**/batch_sweep_summary.json", + limit: int = 20, +) -> list[dict[str, Any]]: + """List available historical sweep reports. + + Parameters + ---------- + reports_dir + Directory to search for reports. + pattern + Glob pattern for finding report files. + limit + Maximum number of reports to return. + + Returns + ------- + list[dict[str, Any]] + List of report metadata. + """ + reports: list[dict[str, Any]] = [] + + for path in sorted(reports_dir.glob(pattern), reverse=True): + try: + with open(path) as f: + data = json.load(f) + + metadata = data.get("metadata", {}) + reports.append( + { + "path": str(path), + "timestamp": metadata.get("timestamp", "unknown"), + "git_commit": metadata.get("git_commit"), + "total_sweeps": data.get("total_sweeps", 0), + "completed_sweeps": data.get("completed_sweeps", 0), + "strategies": data.get("config", {}).get("strategies", []), + "difficulties": data.get("config", {}).get("difficulties", []), + } + ) + + if len(reports) >= limit: + break + except (json.JSONDecodeError, KeyError): + continue + + return reports + + +def view_historical_report( + report_path: Path, + output_format: str = "summary", +) -> WorkflowResult: + """View a historical sweep report. + + Parameters + ---------- + report_path + Path to the report file. + output_format + Format for output: "summary", "json", or "html". + + Returns + ------- + WorkflowResult + Result with report data. + """ + if not report_path.exists(): + return WorkflowResult( + workflow_name="view_report", + success=False, + message=f"Report not found: {report_path}", + errors=[f"File does not exist: {report_path}"], + ) + + try: + with open(report_path) as f: + data = json.load(f) + + return WorkflowResult( + workflow_name="view_report", + success=True, + message=f"Loaded report with {data.get('total_sweeps', 0)} sweeps", + output_path=report_path, + data=data, + ) + except json.JSONDecodeError as e: + return WorkflowResult( + workflow_name="view_report", + success=False, + message="Failed to parse report", + errors=[str(e)], + ) + + +def get_workflow_menu() -> list[dict[str, str]]: + """Get the list of available workflows with descriptions. + + Returns + ------- + list[dict[str, str]] + List of workflow descriptions. + """ + return [ + { + "id": "exploratory_sweep", + "name": "Run Exploratory Sweep", + "description": ( + "Execute batch sweeps across strategies and difficulties to " + "explore balance space." + ), + }, + { + "id": "compare_configs", + "name": "Compare Two Configs", + "description": ( + "Run side-by-side sweeps with different configurations and " + "compare results." + ), + }, + { + "id": "tuning_test", + "name": "Test Tuning Change", + "description": ( + "Create a config overlay with specific changes and compare " + "against baseline." + ), + }, + { + "id": "view_reports", + "name": "View Historical Reports", + "description": ( + "Browse and view previously generated sweep reports and " + "analysis results." + ), + }, + ] diff --git a/tests/scripts/test_echoes_balance_studio.py b/tests/scripts/test_echoes_balance_studio.py new file mode 100644 index 00000000..944d6499 --- /dev/null +++ b/tests/scripts/test_echoes_balance_studio.py @@ -0,0 +1,481 @@ +"""Tests for Balance Studio CLI and components.""" + +from __future__ import annotations + +import json +import sys +from importlib import util +from pathlib import Path + +import pytest +import yaml + +from gengine.balance_studio.overlays import ( + ConfigOverlay, + create_tuning_overlay, + deep_merge, + load_overlay_directory, + merge_overlays, +) +from gengine.balance_studio.report_viewer import ( + ReportViewerConfig, + generate_interactive_html, + write_html_report, +) +from gengine.balance_studio.workflows import ( + ExploratorySweepConfig, + WorkflowResult, + get_workflow_menu, + list_historical_reports, + view_historical_report, +) + +# Load CLI main from script file + +_SCRIPT_PATH = ( + Path(__file__).resolve().parents[2] / "scripts" / "echoes_balance_studio.py" +) + + +def _load_cli_module(): + spec = util.spec_from_file_location("echoes_balance_studio_cli", _SCRIPT_PATH) + module = util.module_from_spec(spec) + assert spec and spec.loader + sys.modules.setdefault("echoes_balance_studio_cli", module) + spec.loader.exec_module(module) + return module + + +_cli = _load_cli_module() +main = _cli.main + + +class TestConfigOverlay: + """Tests for ConfigOverlay dataclass and loading.""" + + def test_overlay_from_yaml(self, tmp_path: Path) -> None: + """Test loading overlay from YAML file.""" + overlay_file = tmp_path / "test_overlay.yml" + overlay_file.write_text( + yaml.dump( + { + "name": "test_overlay", + "description": "A test overlay", + "overrides": {"economy": {"regen_scale": 1.5}}, + "metadata": {"author": "tester"}, + } + ) + ) + + overlay = ConfigOverlay.from_yaml(overlay_file) + + assert overlay.name == "test_overlay" + assert overlay.description == "A test overlay" + assert overlay.overrides == {"economy": {"regen_scale": 1.5}} + assert overlay.metadata == {"author": "tester"} + assert overlay.source_path == overlay_file + + def test_overlay_from_yaml_missing_file(self, tmp_path: Path) -> None: + """Test loading overlay from missing file raises error.""" + with pytest.raises(FileNotFoundError): + ConfigOverlay.from_yaml(tmp_path / "nonexistent.yml") + + def test_overlay_from_dict(self) -> None: + """Test creating overlay from dictionary.""" + data = { + "name": "inline_test", + "description": "Inline overlay", + "overrides": {"limits": {"engine_max_ticks": 500}}, + } + + overlay = ConfigOverlay.from_dict(data) + + assert overlay.name == "inline_test" + assert overlay.description == "Inline overlay" + assert overlay.overrides == {"limits": {"engine_max_ticks": 500}} + + def test_overlay_apply(self) -> None: + """Test applying overlay to base config.""" + base_config = { + "economy": {"regen_scale": 0.8, "demand_population_scale": 100000}, + "limits": {"engine_max_ticks": 200}, + } + + overlay = ConfigOverlay( + name="test", + overrides={"economy": {"regen_scale": 1.2}}, + ) + + result = overlay.apply(base_config) + + assert result["economy"]["regen_scale"] == 1.2 + assert result["economy"]["demand_population_scale"] == 100000 + assert result["limits"]["engine_max_ticks"] == 200 + + def test_overlay_to_yaml(self, tmp_path: Path) -> None: + """Test writing overlay to YAML file.""" + overlay = ConfigOverlay( + name="save_test", + description="Testing save", + overrides={"test": {"value": 42}}, + ) + + output_path = tmp_path / "saved_overlay.yml" + overlay.to_yaml(output_path) + + assert output_path.exists() + with open(output_path) as f: + loaded = yaml.safe_load(f) + + assert loaded["name"] == "save_test" + assert loaded["overrides"]["test"]["value"] == 42 + + +class TestDeepMerge: + """Tests for deep_merge function.""" + + def test_deep_merge_simple(self) -> None: + """Test merging flat dictionaries.""" + base = {"a": 1, "b": 2} + override = {"b": 3, "c": 4} + + result = deep_merge(base, override) + + assert result == {"a": 1, "b": 3, "c": 4} + + def test_deep_merge_nested(self) -> None: + """Test merging nested dictionaries.""" + base = {"level1": {"a": 1, "b": {"c": 2}}} + override = {"level1": {"b": {"d": 3}}} + + result = deep_merge(base, override) + + assert result["level1"]["a"] == 1 + assert result["level1"]["b"]["c"] == 2 + assert result["level1"]["b"]["d"] == 3 + + def test_deep_merge_preserves_original(self) -> None: + """Test that original dicts are not modified.""" + base = {"a": {"b": 1}} + override = {"a": {"c": 2}} + + result = deep_merge(base, override) + + assert "c" not in base["a"] + assert result["a"]["c"] == 2 + + +class TestLoadOverlayDirectory: + """Tests for loading overlays from directory.""" + + def test_load_directory_with_overlays(self, tmp_path: Path) -> None: + """Test loading multiple overlays from directory.""" + (tmp_path / "overlay1.yml").write_text( + yaml.dump({"name": "overlay1", "overrides": {"a": 1}}) + ) + (tmp_path / "overlay2.yaml").write_text( + yaml.dump({"name": "overlay2", "overrides": {"b": 2}}) + ) + + overlays = load_overlay_directory(tmp_path) + + assert len(overlays) == 2 + names = {o.name for o in overlays} + assert "overlay1" in names + assert "overlay2" in names + + def test_load_directory_empty(self, tmp_path: Path) -> None: + """Test loading from empty directory.""" + overlays = load_overlay_directory(tmp_path) + assert overlays == [] + + def test_load_directory_nonexistent(self, tmp_path: Path) -> None: + """Test loading from nonexistent directory.""" + overlays = load_overlay_directory(tmp_path / "nonexistent") + assert overlays == [] + + +class TestCreateTuningOverlay: + """Tests for create_tuning_overlay helper.""" + + def test_create_tuning_overlay(self) -> None: + """Test creating a tuning overlay.""" + overlay = create_tuning_overlay( + name="test_tuning", + changes={"economy": {"regen_scale": 1.5}}, + description="Testing regen boost", + ) + + assert overlay.name == "test_tuning" + assert overlay.description == "Testing regen boost" + assert overlay.overrides == {"economy": {"regen_scale": 1.5}} + assert overlay.metadata["type"] == "tuning_experiment" + + +class TestMergeOverlays: + """Tests for merge_overlays function.""" + + def test_merge_overlays(self) -> None: + """Test merging multiple overlays.""" + overlay1 = ConfigOverlay(name="o1", overrides={"a": 1, "b": 2}) + overlay2 = ConfigOverlay(name="o2", overrides={"b": 3, "c": 4}) + + result = merge_overlays([overlay1, overlay2]) + + assert result.name == "o1 + o2" + assert result.overrides == {"a": 1, "b": 3, "c": 4} + + def test_merge_empty_list(self) -> None: + """Test merging empty list of overlays.""" + result = merge_overlays([]) + assert result.name == "empty" + assert result.overrides == {} + + +class TestReportViewer: + """Tests for report viewer HTML generation.""" + + def test_generate_html_basic(self) -> None: + """Test generating basic HTML report.""" + data = { + "total_sweeps": 10, + "completed_sweeps": 9, + "failed_sweeps": 1, + "total_duration_seconds": 120.5, + "metadata": {"timestamp": "2024-01-01T00:00:00Z"}, + "strategy_stats": { + "balanced": {"count": 5, "completed": 5, "avg_stability": 0.75} + }, + "difficulty_stats": {}, + "sweeps": [], + } + + html = generate_interactive_html(data) + + assert "" in html + assert "Balance Studio Report" in html + assert "10" in html # total_sweeps + assert "balanced" in html + + def test_generate_html_with_config(self) -> None: + """Test generating HTML with custom config.""" + data = { + "total_sweeps": 5, + "completed_sweeps": 5, + "failed_sweeps": 0, + "total_duration_seconds": 60.0, + "metadata": {}, + "strategy_stats": {}, + "difficulty_stats": {}, + "sweeps": [], + } + + config = ReportViewerConfig( + title="Custom Report", + theme="dark", + include_charts=False, + ) + + html = generate_interactive_html(data, config) + + assert "Custom Report" in html + assert "#1a1a2e" in html # dark theme background + + def test_write_html_report(self, tmp_path: Path) -> None: + """Test writing HTML report to file.""" + data = { + "total_sweeps": 3, + "completed_sweeps": 3, + "failed_sweeps": 0, + "total_duration_seconds": 30.0, + "metadata": {}, + "strategy_stats": {}, + "difficulty_stats": {}, + "sweeps": [], + } + + output_path = tmp_path / "report.html" + write_html_report(data, output_path) + + assert output_path.exists() + content = output_path.read_text() + assert "" in content + + +class TestWorkflows: + """Tests for workflow functions.""" + + def test_workflow_result_serialization(self) -> None: + """Test WorkflowResult to_dict serialization.""" + result = WorkflowResult( + workflow_name="test", + success=True, + message="Test passed", + output_path=Path("/tmp/output"), + data={"key": "value"}, + errors=[], + ) + + d = result.to_dict() + + assert d["workflow_name"] == "test" + assert d["success"] is True + assert d["message"] == "Test passed" + assert d["output_path"] == "/tmp/output" + assert d["data"] == {"key": "value"} + + def test_get_workflow_menu(self) -> None: + """Test getting workflow menu.""" + menu = get_workflow_menu() + + assert len(menu) == 4 + ids = {w["id"] for w in menu} + assert "exploratory_sweep" in ids + assert "compare_configs" in ids + assert "tuning_test" in ids + assert "view_reports" in ids + + def test_list_historical_reports_empty(self, tmp_path: Path) -> None: + """Test listing reports in empty directory.""" + reports = list_historical_reports(tmp_path) + assert reports == [] + + def test_list_historical_reports_with_data(self, tmp_path: Path) -> None: + """Test listing reports with actual data.""" + sweep_dir = tmp_path / "sweeps" + sweep_dir.mkdir() + summary = sweep_dir / "batch_sweep_summary.json" + summary.write_text( + json.dumps( + { + "total_sweeps": 5, + "completed_sweeps": 5, + "metadata": {"timestamp": "2024-01-01T00:00:00Z"}, + "config": {"strategies": ["balanced"], "difficulties": ["normal"]}, + } + ) + ) + + reports = list_historical_reports(tmp_path) + + assert len(reports) == 1 + assert reports[0]["total_sweeps"] == 5 + + def test_view_historical_report_not_found(self, tmp_path: Path) -> None: + """Test viewing nonexistent report.""" + result = view_historical_report(tmp_path / "nonexistent.json") + + assert not result.success + assert "not found" in result.message.lower() + + def test_view_historical_report_success(self, tmp_path: Path) -> None: + """Test viewing existing report.""" + report_path = tmp_path / "report.json" + report_path.write_text(json.dumps({"total_sweeps": 10})) + + result = view_historical_report(report_path) + + assert result.success + assert result.data["total_sweeps"] == 10 + + +class TestCLI: + """Tests for CLI command handling.""" + + def test_cli_help(self) -> None: + """Test CLI help output.""" + with pytest.raises(SystemExit) as exc: + main(["--help"]) + + assert exc.value.code == 0 + + def test_cli_view_reports_empty(self, tmp_path: Path) -> None: + """Test view-reports command with empty directory.""" + result = main(["view-reports", "--reports-dir", str(tmp_path), "--json"]) + assert result == 0 + + def test_cli_generate_report_missing_input(self, tmp_path: Path) -> None: + """Test generate-report with missing input.""" + result = main( + [ + "generate-report", + "--input", + str(tmp_path / "nonexistent.json"), + "--output", + str(tmp_path / "report.html"), + ] + ) + assert result == 1 + + def test_cli_generate_report_success(self, tmp_path: Path) -> None: + """Test generate-report with valid input.""" + input_file = tmp_path / "input.json" + input_file.write_text( + json.dumps( + { + "total_sweeps": 5, + "completed_sweeps": 5, + "failed_sweeps": 0, + "total_duration_seconds": 30.0, + "metadata": {}, + "strategy_stats": {}, + "difficulty_stats": {}, + "sweeps": [], + } + ) + ) + + output_file = tmp_path / "report.html" + result = main( + [ + "generate-report", + "--input", + str(input_file), + "--output", + str(output_file), + ] + ) + + assert result == 0 + assert output_file.exists() + + def test_cli_test_tuning_no_changes(self, tmp_path: Path) -> None: + """Test test-tuning command with no changes.""" + result = main( + [ + "test-tuning", + "--name", + "empty_test", + "--output-dir", + str(tmp_path), + ] + ) + # Should fail because no --change arguments + assert result == 1 + + +class TestExploratorySweepConfig: + """Tests for ExploratorySweepConfig dataclass.""" + + def test_default_values(self) -> None: + """Test default configuration values.""" + config = ExploratorySweepConfig() + + assert config.strategies == ["balanced", "aggressive", "diplomatic"] + assert config.difficulties == ["normal"] + assert config.seeds == [42, 123, 456] + assert config.tick_budget == 100 + + def test_custom_values(self) -> None: + """Test custom configuration values.""" + config = ExploratorySweepConfig( + strategies=["balanced"], + difficulties=["hard"], + seeds=[1, 2, 3], + tick_budget=200, + ) + + assert config.strategies == ["balanced"] + assert config.difficulties == ["hard"] + assert config.seeds == [1, 2, 3] + assert config.tick_budget == 200