From 70c747b3d0bc7fa7e5d26960fb973e862a63d2cf Mon Sep 17 00:00:00 2001 From: Ross Gardler Date: Wed, 3 Dec 2025 23:37:45 -0800 Subject: [PATCH 1/4] Checkpoint from Copilot CLI for coding agent session --- .pm/tracker.md | 171 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 169 insertions(+), 2 deletions(-) diff --git a/.pm/tracker.md b/.pm/tracker.md index dc219d7b..941fbbc9 100644 --- a/.pm/tracker.md +++ b/.pm/tracker.md @@ -1,6 +1,6 @@ # Project Task Tracker -**Last Updated:** 2025-12-04T07:08:58Z +**Last Updated:** 2025-12-04T07:25:00Z ## Comprehensive Project Status Report @@ -22,6 +22,7 @@ - Open issues: 1 (Issue #56 - Content Pipeline CI, already completed but issue remains open) - Recent commits: 30+ commits in past month, steady delivery cadence - Repository hygiene: Excellent - clean issue backlog, well-documented +- **NEW:** Phase 11 (Automated Balance Workflow) - 6 tasks planned for advanced balance automation and strategy tuning ## Status Summary @@ -226,7 +227,8 @@ The project has closely followed the implementation plan with excellent tracking | 8 | Deployment (Docker/K8s) | 6 | 6 | ✅ 100% | | 9 | AI Testing & Validation | 4 | 4 | ✅ 100% | | 10 | Test Coverage Improvements | 8 | 8 | ✅ 100% | -| **TOTAL** | **All Phases** | **51** | **51** | **✅ 100%** | +| 11 | Automated Balance Workflow | 6 | 0 | ⚙️ 0% | +| **TOTAL** | **All Phases** | **57** | **51** | **⚙️ 89%** | **Optional Polish Tasks** (not included in phase counts): @@ -310,6 +312,21 @@ The project has closely followed the implementation plan with excellent tracking - 10.2.1: Difficulty sweep hardening (LOW priority, 2-3 day effort) - 10.2.2: AI player LLM robustness (LOW priority, future enhancement) +### Phase 11: Automated Balance Workflow ⚙️ IN PLANNING (0%) + +- **Status:** Phase defined, tasks planned but not yet started +- **Progress:** 0/6 tasks complete +- **Objective:** Build advanced automation for data-driven balance iteration and strategy tuning +- 
**Milestones:** + - ⬜ 11.1.1: Batch simulation sweep infrastructure + - ⬜ 11.2.1: Result aggregation and storage + - ⬜ 11.3.1: Analysis and balance reporting + - ⬜ 11.4.1: Strategy parameter optimization + - ⬜ 11.5.1: CI integration for continuous validation + - ⬜ 11.6.1: Designer feedback loop and tooling +- **Dependencies:** Phase 9 (AI tournaments and balance tooling already exists) +- **Outstanding:** All 6 tasks not yet started + ## Outstanding Work Analysis ### Issue #56: Content Pipeline CI Integration (M8.4) @@ -431,6 +448,12 @@ The project has closely followed the implementation plan with excellent tracking | 10.1.6 | Cross-system integration scenario tests | completed | Medium | Test Agent | 2025-12-03 | | 10.1.7 | Performance and tick-limit regression tests | completed | Low | Test Agent | 2025-12-03 | | 10.1.8 | AI/LLM mocking and coverage for gateways | completed | Medium | Test Agent | 2025-12-03 | +| 11.1.1 | Batch simulation sweep infrastructure (M11.1) | not-started | Medium | gamedev-agent | 2025-12-04 | +| 11.2.1 | Result aggregation and storage (M11.2) | not-started | Medium | gamedev-agent | 2025-12-04 | +| 11.3.1 | Analysis and balance reporting (M11.3) | not-started | High | gamedev-agent | 2025-12-04 | +| 11.4.1 | Strategy parameter optimization (M11.4) | not-started | Low | gamedev-agent | 2025-12-04 | +| 11.5.1 | CI integration for continuous validation (M11.5) | not-started | Medium | gamedev-agent | 2025-12-04 | +| 11.6.1 | Designer feedback loop and tooling (M11.6) | not-started | Low | gamedev-agent | 2025-12-04 | | 10.2.1 | Harden difficulty sweep runtime & monitoring | not-started | Low | Gamedev Agent | 2025-12-02 | | 10.2.2 | AI player LLM robustness & failure telemetry | not-started | Low | Gamedev Agent | 2025-12-02 | @@ -1268,3 +1291,147 @@ The project has closely followed the implementation plan with excellent tracking 2. Expand tests in `tests/echoes/test_llm_*` and gateway tests. 3. 
Ensure CI configuration does not require real API keys. - **Last Updated:** 2025-12-02 + +## Phase 11: Automated Balance Workflow + +### 11.1.1 — Batch Simulation Sweep Infrastructure (M11.1) + +- **GitHub Issue:** [#58](https://github.com/TheWizardsCode/GEngine/issues/58) +- **Description:** Build infrastructure to run large batches of simulation sweeps with configurable parameter ranges (difficulty presets, strategy mixes, world variations, random seeds) and parallel execution. This extends existing tournament and difficulty sweep tooling to support broader parameter space exploration for balance analysis. +- **Acceptance Criteria:** + - Script `scripts/run_batch_sweeps.py` supports multi-dimensional parameter grids (strategies, difficulties, seeds, worlds, tick budgets). + - Parallel execution using Python multiprocessing or similar to maximize throughput on multi-core hardware. + - JSON output per sweep run includes game results, telemetry, and parameter metadata. + - Configuration file (e.g., `content/config/batch_sweeps.yml`) defines sweep parameter ranges and defaults. + - Documentation describes sweep configuration format and execution workflow. + - At least 10 tests covering parameter grid generation, parallel execution, and output validation. +- **Priority:** Medium +- **Responsible:** gamedev-agent +- **Dependencies:** 9.4.1 (AI tournaments), 7.3.1 (difficulty sweeps), core simulation stability. +- **Risks & Mitigations:** + - Risk: Large parameter grids generate excessive data. Mitigation: Support sampling modes and configurable grid density. + - Risk: Parallel execution causes resource contention. Mitigation: Add worker pool size configuration and resource limits. +- **Next Steps:** + 1. Design parameter grid configuration schema. + 2. Implement batch sweep runner with parallel execution. + 3. Add output format and metadata tracking. + 4. Create test suite covering edge cases (empty grids, single parameter, error handling). 
+- **Last Updated:** 2025-12-04 + +### 11.2.1 — Result Aggregation and Storage (M11.2) + +- **Description:** Implement a result aggregation and storage layer that collects sweep outputs into a queryable database or structured file format. Support historical tracking of sweep runs to enable trend analysis and regression detection across balance iterations. +- **Acceptance Criteria:** + - Script `scripts/aggregate_sweep_results.py` ingests batch sweep JSON outputs and produces aggregated summary data. + - Storage format (SQLite database or structured JSON/Parquet files) supports querying by parameter combinations, timestamp, and result metrics. + - Historical tracking preserves sweep metadata (git commit hash, timestamp, parameter ranges) for reproducibility. + - Aggregation computes key statistics: win rates by strategy, average stability/unrest/pollution, story seed activation rates, action usage frequencies. + - Query interface or helper functions support common lookups (e.g., "show all sweeps for difficulty=hard from last 30 days"). + - At least 8 tests covering aggregation logic, storage/retrieval, and historical queries. +- **Priority:** Medium +- **Responsible:** gamedev-agent +- **Dependencies:** 11.1.1 (batch sweep infrastructure). +- **Risks & Mitigations:** + - Risk: Storage grows unbounded with sweep history. Mitigation: Implement retention policies and data archival. + - Risk: Schema changes break historical data. Mitigation: Use versioned schema with migration support. +- **Next Steps:** + 1. Choose storage format (recommend SQLite for queryability or Parquet for analytics). + 2. Design the aggregation schema and the statistics to compute. + 3. Implement aggregation script with historical tracking. + 4. Add test coverage for data integrity and query patterns. 
+- **Last Updated:** 2025-12-04 + +### 11.3.1 — Analysis and Balance Reporting (M11.3) + +- **Description:** Build analysis tooling that consumes aggregated sweep data and generates actionable balance reports identifying overpowered/underpowered mechanics, dominant strategies, unused content, and parameter sensitivity. Extend existing `analyze_ai_games.py` functionality with statistical rigor and trend detection. +- **Acceptance Criteria:** + - Script `scripts/analyze_balance.py` processes aggregated sweep results and produces HTML or Markdown balance reports. + - Reports include sections for: dominant strategies (win rate deltas >10%), underperforming mechanics (actions/policies rarely chosen), unused story seeds, parameter sensitivity analysis (impact of difficulty/config changes). + - Statistical analysis includes confidence intervals, significance testing (e.g., t-tests for win rate differences), and trend detection across historical runs. + - Visual outputs (charts/graphs) showing win rate distributions, metric trends over time, and parameter correlations. + - Report highlights regressions (new sweeps showing significant deviations from baseline). + - At least 12 tests covering report generation, statistical calculations, and edge cases (empty data, single run). +- **Priority:** High +- **Responsible:** gamedev-agent +- **Dependencies:** 11.2.1 (result aggregation and storage), 9.4.1 (analysis script foundation). +- **Risks & Mitigations:** + - Risk: Statistical tests produce false positives. Mitigation: Use appropriate significance thresholds and multiple comparison corrections. + - Risk: Reports become too verbose. Mitigation: Summary-first design with detailed breakdowns in appendices. +- **Next Steps:** + 1. Define report structure and key metrics to surface. + 2. Implement statistical analysis functions (win rate deltas, significance tests, trend detection). + 3. Add visualization generation (matplotlib/plotly for charts). + 4. 
Create test suite with synthetic sweep data. +- **Last Updated:** 2025-12-04 + +### 11.4.1 — Strategy Parameter Optimization (M11.4) + +- **Description:** Implement automated strategy parameter tuning using optimization algorithms (grid search, random search, or Bayesian optimization) to find well-balanced strategy configurations. Goal is to reduce dominant strategy win rate deltas and improve strategic diversity. +- **Acceptance Criteria:** + - Script `scripts/optimize_strategies.py` accepts strategy parameter ranges and optimization targets (e.g., minimize max win rate delta, maximize strategic diversity). + - Supports multiple optimization algorithms: grid search (exhaustive), random search (sampling), and optionally Bayesian optimization (e.g., using `scikit-optimize`). + - Optimization runs batches of sweep simulations with candidate parameter sets and evaluates fitness against targets. + - Output includes Pareto frontier of optimal configurations (trade-offs between competing objectives like balance vs. difficulty). + - Integration with result storage (11.2.1) to track optimization runs and outcomes. + - Documentation describes optimization workflow, tuning targets, and how to interpret results. + - At least 10 tests covering optimization algorithms, fitness evaluation, and parameter validation. +- **Priority:** Low +- **Responsible:** gamedev-agent +- **Dependencies:** 11.1.1 (batch sweeps), 11.2.1 (result storage), stable strategy parameter schema. +- **Risks & Mitigations:** + - Risk: Optimization converges to local optima or overfits to specific scenarios. Mitigation: Use multiple random seeds and validation sets. + - Risk: Computationally expensive for large parameter spaces. Mitigation: Start with coarse grid search, then refine with targeted searches. +- **Next Steps:** + 1. Define strategy parameter schema and tuning ranges. + 2. Implement fitness functions for balance objectives. + 3. Add optimization algorithms (start with grid/random search). + 4. 
Create test suite with small synthetic parameter spaces. +- **Last Updated:** 2025-12-04 + +### 11.5.1 — CI Integration for Continuous Validation (M11.5) + +- **Description:** Integrate balance sweep and analysis tooling into CI workflows to detect balance regressions automatically on every commit or on a nightly schedule. Failed balance checks should produce actionable reports and optionally block merges if regressions exceed thresholds. +- **Acceptance Criteria:** + - GitHub Actions workflow `.github/workflows/balance-validation.yml` runs on schedule (nightly) and optionally on relevant file changes (strategy configs, game rules). + - Workflow executes a representative subset of balance sweeps (smaller parameter grid than full exploratory sweeps for speed). + - Analysis step compares current sweep results against baseline (stored historical data from main branch). + - Regression detection identifies significant deviations (e.g., strategy win rate delta increased by >5%, unused content increased, metric variance spiked). + - Workflow produces artifacts: balance report, comparison charts, regression summary. + - Configurable thresholds control whether regressions are warnings vs. failures (blocking). + - Documentation describes CI workflow configuration, baseline management, and interpreting regression reports. + - At least 6 tests for workflow components (subset sweep execution, baseline comparison, threshold enforcement). +- **Priority:** Medium +- **Responsible:** gamedev-agent +- **Dependencies:** 11.1.1 (batch sweeps), 11.3.1 (analysis/reporting), CI infrastructure. +- **Risks & Mitigations:** + - Risk: CI sweeps are too slow and delay feedback. Mitigation: Use a reduced parameter grid for CI; run full sweeps on demand or nightly. + - Risk: Baseline drift makes regressions noisy. Mitigation: Refresh baseline periodically (e.g., after intentional balance changes are merged). +- **Next Steps:** + 1. 
Design CI sweep subset (e.g., 3 difficulty presets, 3 strategies, 5 seeds, 100 ticks). + 2. Implement baseline storage and comparison logic. + 3. Create workflow YAML with scheduled and manual triggers. + 4. Add regression threshold configuration and reporting. +- **Last Updated:** 2025-12-04 + +### 11.6.1 — Designer Feedback Loop and Tooling (M11.6) + +- **Description:** Build designer-facing tools and workflows that make balance iteration accessible to non-engineers. Provide intuitive interfaces for running sweeps, viewing reports, and experimenting with tuning changes without requiring code changes. +- **Acceptance Criteria:** + - Command-line tool `echoes-balance-studio` (or similar) provides guided workflows for designers: "Run exploratory sweep", "Compare two configs", "Test tuning change", "View historical reports". + - Configuration changes can be tested via YAML overlays (similar to difficulty presets) without modifying base config files. + - Interactive report viewer (HTML dashboard or Jupyter notebook) allows filtering, sorting, and drilling into sweep results. + - Documentation includes designer-focused guides: "How to diagnose dominant strategies", "Iterating on action costs", "Testing narrative pacing changes". + - Example workflows demonstrated with case studies (e.g., "Balancing the Industrial Tier faction"). + - At least 8 tests covering CLI commands, config overlay loading, and report generation. +- **Priority:** Low +- **Responsible:** gamedev-agent (with designer/PM feedback loop) +- **Dependencies:** 11.1.1 (batch sweeps), 11.3.1 (analysis/reporting), stable config system. +- **Risks & Mitigations:** + - Risk: Tools too complex for non-technical users. Mitigation: Focus on simple, opinionated workflows with sensible defaults. + - Risk: Designer changes break game systems. Mitigation: Include validation and safety checks in config overlays. +- **Next Steps:** + 1. Gather designer persona requirements and common use cases. + 2. 
Design CLI command structure and interactive workflows. + 3. Implement config overlay system for safe experimentation. + 4. Create designer documentation and tutorial walkthroughs. +- **Last Updated:** 2025-12-04 From d67ccc4595c5b84568dfb7638dd0145f219f8bfe Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Dec 2025 07:52:04 +0000 Subject: [PATCH 2/4] feat: Add batch simulation sweep infrastructure (Issue #58) - Add scripts/run_batch_sweeps.py with multi-dimensional parameter grid support - Add content/config/batch_sweeps.yml for sweep configuration - Add 29 tests in tests/scripts/test_batch_sweeps.py - Update docs/gengine/ai_tournament_and_balance_analysis.md with documentation Implements task 11.1.1 (M11.1) from Phase 11 Automated Balance Workflow Co-authored-by: SorraTheOrc <250240+SorraTheOrc@users.noreply.github.com> --- content/config/batch_sweeps.yml | 72 ++ .../ai_tournament_and_balance_analysis.md | 106 ++- gamedev-agent-thoughts.txt | 830 ++---------------- scripts/run_batch_sweeps.py | 727 +++++++++++++++ tests/scripts/test_batch_sweeps.py | 655 ++++++++++++++ 5 files changed, 1610 insertions(+), 780 deletions(-) create mode 100644 content/config/batch_sweeps.yml create mode 100644 scripts/run_batch_sweeps.py create mode 100644 tests/scripts/test_batch_sweeps.py diff --git a/content/config/batch_sweeps.yml b/content/config/batch_sweeps.yml new file mode 100644 index 00000000..e975ef8f --- /dev/null +++ b/content/config/batch_sweeps.yml @@ -0,0 +1,72 @@ +# Batch Simulation Sweep Configuration +# Defines parameter ranges for multi-dimensional simulation sweeps. +# Used by scripts/run_batch_sweeps.py for balance analysis and regression testing. 
+ +# Parameter Grid - each combination is tested (Cartesian product) +parameters: + # AI strategies to test + strategies: + - balanced + - aggressive + - diplomatic + + # Difficulty presets (maps to content/config/sweeps/difficulty-<name>/) + difficulties: + - normal + + # Random seeds for deterministic reproducibility + # Can be an explicit list or a range definition + seeds: + - 42 + - 123 + - 456 + + # World bundles to test (from content/worlds/) + worlds: + - default + + # Tick budgets for simulation length + tick_budgets: + - 100 + +# Parallel execution settings +parallel: + # Maximum worker processes (null = auto-detect based on CPU count) + max_workers: null + + # Timeout per individual sweep in seconds + timeout_per_sweep: 300 + +# Output configuration +output: + # Directory for sweep result JSON files + dir: build/batch_sweeps + + # Include full telemetry in output (increases file size) + include_telemetry: true + + # Include game state summary in output + include_summary: true + +# Sampling configuration for large parameter spaces +# When enabled, samples from the grid instead of full Cartesian product +sampling: + # Sampling mode: "full" (all combinations), "random", "latin_hypercube" + mode: full + + # Number of samples to take (only used when mode != "full") + sample_count: 100 + + # Random seed for sampling reproducibility + sample_seed: 42 + +# Metadata included in every sweep output +metadata: + # Include git commit hash if available + include_git_commit: true + + # Include timestamp + include_timestamp: true + + # Include runtime environment info + include_runtime_info: true diff --git a/docs/gengine/ai_tournament_and_balance_analysis.md b/docs/gengine/ai_tournament_and_balance_analysis.md index 209b77e1..c3ecd996 100644 --- a/docs/gengine/ai_tournament_and_balance_analysis.md +++ b/docs/gengine/ai_tournament_and_balance_analysis.md @@ -1,9 +1,9 @@ # Section 13: AI Tournament & Balance Analysis -**Last Updated:** 2025-12-03 +**Last Updated:** 2025-12-04 ## 
Overview -This section describes how to use the AI tournament and balance analysis tooling introduced in Phase 9. These tools help designers and developers run large batches of AI-driven games in parallel, compare strategy performance, and identify balance issues or underutilized content. +This section describes how to use the AI tournament, batch sweep, and balance analysis tooling introduced in Phases 9 and 11. These tools help designers and developers run large batches of AI-driven games in parallel, compare strategy performance, and identify balance issues or underutilized content. ## Running AI Tournaments @@ -17,6 +17,97 @@ uv run python scripts/run_ai_tournament.py --games 100 --output build/tournament - `--output`: Path to save the aggregated results - Additional flags allow you to specify strategies, seeds, and world configs. +## Running Batch Simulation Sweeps + +The batch sweep script (Phase 11, M11.1) enables multi-dimensional parameter space exploration for comprehensive balance analysis. It generates a Cartesian product of parameter combinations and executes them in parallel. 
+ +### Configuration + +Batch sweeps are configured via `content/config/batch_sweeps.yml`: + +```yaml +parameters: + strategies: + - balanced + - aggressive + - diplomatic + difficulties: + - normal + - hard + seeds: + - 42 + - 123 + - 456 + worlds: + - default + tick_budgets: + - 100 + - 200 + +parallel: + max_workers: null # Auto-detect CPU count + timeout_per_sweep: 300 + +output: + dir: build/batch_sweeps + include_telemetry: true + +sampling: + mode: full # Options: full, random, latin_hypercube + sample_count: 100 +``` + +### Running Batch Sweeps + +**Basic execution with default configuration:** +```bash +uv run python scripts/run_batch_sweeps.py --output-dir build/sweeps --verbose +``` + +**Override parameters via CLI:** +```bash +uv run python scripts/run_batch_sweeps.py \ + --strategies balanced aggressive \ + --difficulties normal hard \ + --seeds 42 123 456 \ + --ticks 100 200 \ + --output-dir build/custom_sweeps +``` + +**Use custom configuration file:** +```bash +uv run python scripts/run_batch_sweeps.py --config path/to/custom_sweeps.yml +``` + +### Output Format + +Each sweep produces a JSON file containing: +- `parameters`: Full parameter set (strategy, difficulty, seed, world, tick_budget) +- `results`: Game outcome data (final_stability, actions_taken, story_seeds_activated) +- `telemetry`: Metrics and profiling data (environment, faction_legitimacy, economy) +- `metadata`: Timestamp, git commit, runtime info + +A summary file `batch_sweep_summary.json` aggregates all results with: +- Strategy-level statistics (avg/min/max stability, win rates) +- Difficulty-level statistics +- Total sweep counts and failure rates + +### CLI Options + +| Flag | Description | +|------|-------------| +| `--config, -c` | Path to YAML configuration file | +| `--strategies, -s` | Override strategies to test | +| `--difficulties, -d` | Override difficulty presets | +| `--seeds` | Override random seeds | +| `--worlds, -w` | Override world bundles | +| `--ticks, -t` | 
Override tick budgets | +| `--workers` | Max parallel workers | +| `--output-dir, -o` | Output directory for results | +| `--json` | Output summary as JSON | +| `--verbose, -v` | Print progress during execution | +| `--no-write` | Skip writing individual sweep files | + ## Analyzing Tournament Results After running a tournament, use the analysis script to generate comparative reports. This tool surfaces win rate differences, balance anomalies, and unused story seeds. @@ -34,10 +125,11 @@ The report includes: ## Balance Iteration Workflow -1. Run a tournament with a large number of games and varied strategies. -2. Analyze the results to identify dominant strategies, underpowered/overpowered actions, and unused content. -3. Adjust simulation parameters or authored content as needed. -4. Repeat the process to validate improvements. +1. **Initial Exploration**: Run batch sweeps with diverse parameter combinations to establish baseline metrics. +2. **Tournament Validation**: Run focused tournaments on specific strategy combinations. +3. **Analysis**: Use the analysis script to identify dominant strategies, underpowered/overpowered actions, and unused content. +4. **Adjustment**: Modify simulation parameters or authored content based on findings. +5. **Regression Testing**: Re-run batch sweeps to validate improvements and ensure no regressions. ## CI Integration @@ -45,8 +137,10 @@ A nightly CI workflow automatically runs tournaments and archives results for on ## Usage Tips - Use different world configs and seeds to stress-test balance across scenarios. +- Start with `sampling.mode: random` for large parameter spaces before running full sweeps. - Review the analysis report regularly to guide design iteration. - Archived CI artifacts provide a historical record of balance changes. +- Use `--verbose` during development to monitor sweep progress. 
## See Also - [How to Play Echoes](./how_to_play_echoes.md) diff --git a/gamedev-agent-thoughts.txt b/gamedev-agent-thoughts.txt index 82efed67..fed0c5cc 100644 --- a/gamedev-agent-thoughts.txt +++ b/gamedev-agent-thoughts.txt @@ -1,774 +1,56 @@ -# Gamedev Agent Thoughts - Issue #24 M9.2 Rule-Based AI Action Layer - -## Implementation Plan - -Task: Implement M9.2 Rule-Based AI Action Layer as defined in the implementation plan. - -### Current Status -- Working from branch: copilot/add-rule-based-ai-layer -- Existing observer module: src/gengine/ai_player/observer.py (complete) -- Existing tests: tests/ai_player/test_observer.py (37 tests passing) - -### Files to Create -1. src/gengine/ai_player/strategies.py - Rule-based decision heuristics -2. src/gengine/ai_player/actor.py - Action selection and submission -3. tests/ai_player/test_strategies.py - Strategy tests -4. tests/ai_player/test_actor.py - Actor tests including 100-tick regression - -### Design Decisions - -1. **StrategyType Enum**: BALANCED, AGGRESSIVE, DIPLOMATIC -2. **BaseStrategy Abstract Class**: Common interface for all strategies -3. **Strategy Implementations**: - - BalancedStrategy: Moderate intervention, stabilize at 0.6, support factions at 0.4 - - AggressiveStrategy: Frequent intervention, higher thresholds, prefer direct actions - - DiplomaticStrategy: Prefer negotiation, build relationships, lower thresholds - -4. 
**AIActor Class**: - - Wraps Observer with strategy execution - - select_action(): Analyze state and choose action - - submit_intent(): Submit via SimEngine.apply_action - - act(): Full loop - observe, decide, act - - Decision logging for telemetry - -### Implementation Steps -- [ ] Create strategies.py with StrategyType enum and strategy classes -- [ ] Create actor.py with AIActor class -- [ ] Update __init__.py to export new classes -- [ ] Create test_strategies.py -- [ ] Create test_actor.py with 100-tick regression tests -- [ ] Run tests and ensure 100% coverage for critical paths -- [ ] Update documentation - -## Log - -## Implementation Complete - -### Files Created -1. src/gengine/ai_player/strategies.py - Rule-based decision heuristics - - StrategyType enum: BALANCED, AGGRESSIVE, DIPLOMATIC - - StrategyConfig dataclass with configurable thresholds - - StrategyDecision dataclass for tracking decisions - - BaseStrategy abstract class - - BalancedStrategy: Moderate intervention, stability 0.6, faction 0.4 - - AggressiveStrategy: Frequent actions, higher thresholds, larger deployments - - DiplomaticStrategy: Prefers negotiation, relationship building - -2. src/gengine/ai_player/actor.py - Action selection and submission - - ActorConfig dataclass - - ActionReceipt dataclass for tracking submitted actions - - ActorReport dataclass for session summaries - - AIActor class with run(), select_action(), submit_intent(), act() - - Decision logging for telemetry - - Factory functions: create_actor_from_engine, create_actor_from_service - -3. src/gengine/ai_player/__init__.py - Updated exports - -4. tests/ai_player/test_strategies.py - 41 tests for strategy module -5. 
tests/ai_player/test_actor.py - 34 tests including 100-tick regression - -### Test Results -- AI player tests: 112 passed -- Full test suite: 598 passed -- AI player coverage: 94% overall - - strategies.py: 92% - - actor.py: 96% - - observer.py: 94% - -### Telemetry Capture -- Command: uv run python scripts/run_headless_sim.py --world default --ticks 200 --lod balanced --seed 42 --output build/feature-m9-2-ai-action-layer.json -- Ticks executed: 200 -- Final stability: 1.0 -- Suppressed events: 343 -- Story seeds activated: spire-data-leak, hollow-supply-chain, energy-quota-crisis - -<<<<<<< HEAD -### Key Design Decisions -1. Strategies work with summary API (which returns counts not lists) - graceful fallback to default districts -2. _should_act/_should_inspect allow action on first tick -3. Remote client uses submit_actions() not action() method -4. Faction legitimacy accessed via state.factions dict, not state.factions list -======= -**Playtest Instructions:** -1. Start shell: `uv run echoes-shell --world default` -2. Run simulation: `run 20` -3. Check progression: `summary` (look for progression section) -4. Note skill levels and reputation changes -5. Exit: `exit` - - -## Session Start: 2025-11-30T21:59 UTC - Repository Review & Phase 7 Work -===================================================================== - -### Initial Assessment -- Working branch: copilot/linguistic-tahr -- All 453 tests pass -- Repository clean and synced - -### Task Status Review -1. **Task 7.1.1 (Progression Systems)** - Issue #11 - - ✅ VERIFIED COMPLETE: 48 progression tests pass - - ProgressionState, SkillDomain, AccessTier, ReputationState models implemented - - ProgressionSystem integrated with SimEngine - - Documentation updated in GDD, implementation plan, gameplay guide - - **Recommendation: Close GitHub Issue #11** - -2. 
**Task 7.4.1 (Campaign UX Flows)** - Issue #13 - - ❌ NOT STARTED according to tracker - - Dependencies: Snapshot persistence (✅), post-mortem generator (✅), CLI/gateway surfaces (✅) - - Scope: Autosaves, campaign picker, end-of-run summaries - -### Plan for Task 7.4.1 -1. Review existing persistence and post-mortem infrastructure -2. Design campaign data model and autosave mechanism -3. Implement campaign picker CLI commands (list/start/resume/end) -4. Add autosave functionality triggered at configurable intervals -5. Polish end-of-run flow with post-mortem integration -6. Write tests for campaign management -7. Update documentation - -### Next Steps -- Will create feature branch for 7.4.1 work -- Starting with campaign data model design - -### Task 7.4.1 Implementation - Campaign UX Flows -[2025-11-30 22:20 UTC] - -**Implementation Complete:** - -1. **Campaign Module** (`src/gengine/echoes/campaign/`): - - Created `Campaign` dataclass: id, name, world, timestamps, tick, ended status - - Created `CampaignManager` class: create, list, load, save, autosave, end - - Created `CampaignSettings` dataclass: configurable via simulation.yml - - Added `_json_default` helper for datetime serialization - - Autosave at configurable intervals with automatic cleanup of old saves - - Post-mortem generation and persistence on campaign end - -2. **Shell Integration** (`src/gengine/echoes/cli/shell.py`): - - Extended `ShellBackend` interface with campaign methods - - `LocalBackend` implements full campaign functionality - - `ServiceBackend` raises NotImplementedError (campaigns are local-only) - - Added campaign commands: list, new, resume, end, status - - `--campaign ` CLI flag for direct campaign resumption - - Autosave triggers automatically after `advance_ticks()` - -3. **Settings** (`src/gengine/echoes/settings.py`): - - Added `CampaignSettings` Pydantic model - - Added `campaign` field to `SimulationConfig` - -4. 
**Configuration** (`content/config/simulation.yml`): - - Added `campaign` section with: - - `campaigns_dir`: campaigns - - `autosave_interval`: 50 - - `max_autosaves`: 3 - - `generate_postmortem_on_end`: true - -5. **Tests** (`tests/echoes/test_campaign.py`): - - 23 tests covering: - - CampaignSettings (defaults, from_dict) - - Campaign model (to_dict, from_dict) - - CampaignManager (create, list, load, save, autosave, end, delete) - - Integration with LocalBackend - -**Test Results:** -- 476 tests pass (453 existing + 23 new) -- All campaign functionality verified - -**Telemetry Captured:** -- `build/feature-m7-4-campaign-ux.json` (200 ticks, seed 42, balanced LOD) - -**Documentation Updated:** -- `docs/gengine/how_to_play_echoes.md`: Added Section 12 (Campaign Management) -- `docs/simul/emergent_story_game_implementation_plan.md`: Marked M7.4 complete -- `docs/simul/emergent_story_game_gdd.md`: Updated progress log -- `README.md`: Added campaign commands to CLI section -- `.pm/tracker.md`: Updated task status to completed - -**Playtest Instructions:** -1. Start shell: `uv run echoes-shell --world default` -2. Create campaign: `camcd /home/rgardler/projects/GEngine-agents - -git status # confirm only resolved files are staged/unstaged - -git add .pm/tracker.md gamedev-agent-thoughts.txt - -git commit # this will finalize the merge commit on mainpaign new "Test Campaign"` -3. Run simulation: `run 50` (autosave should trigger) -4. Check status: `campaign status` -5. Exit and resume: `uv run echoes-shell --campaign ` -6. 
End campaign: `campaign end` - -**GitHub Issues:** -- Issue #11 (Task 7.1.1 - Progression): Ready to close -- Issue #13 (Task 7.4.1 - Campaign UX): Ready to close - - -## Task 9.1.1 - AI Observer Foundation Review & Polish (2025-11-30T23:13 UTC) -===================================================================== - -### Initial Assessment -- Working branch: copilot/vivid-chinchilla -- Task: Review and polish AI Observer implementation (Issue #19) - -### Acceptance Criteria Analysis: -1. ✅ Observer connects via both SimEngine and SimServiceClient - VERIFIED -2. ✅ Generates structured JSON and optional natural language commentary - VERIFIED -3. ⚠️ Integration tests validate trend detection - NEEDS ENHANCEMENT -4. ⚠️ README documents usage with examples - NEEDS ENHANCEMENT - -### Gaps Identified: -1. No integration test that uses SimServiceClient (only local SimEngine tests exist) -2. README could use more comprehensive service mode examples - -### Changes Planned: -1. Add integration test for Observer with SimServiceClient -2. Enhance README with service mode programmatic example -3. Update tracker.md to mark task 9.1.1 as completed - -### Implementation Log: -- All 33 existing AI observer tests pass -- Observer implementation verified to support both local and remote modes - - - -### Changes Made: -1. Fixed bug in Observer._get_state() - service mode now properly unwraps 'data' field from response -2. Added 4 new integration tests in TestObserverWithSimServiceClient class: - - test_observer_with_service_client_observes_ticks - - test_observer_with_service_client_detects_trends - - test_observer_with_service_client_generates_commentary - - test_observer_with_service_client_json_output -3. Enhanced README.md with comprehensive service mode programmatic example -4. 
Updated .pm/tracker.md to mark task 9.1.1 as completed - -### Test Results: -- All 37 AI observer tests pass -- All acceptance criteria verified - -### Files Changed: -- src/gengine/ai_player/observer.py (bug fix: unwrap service 'data' field) -- tests/ai_player/test_observer.py (added 4 SimServiceClient integration tests) -- README.md (added service mode programmatic example) -- .pm/tracker.md (marked 9.1.1 as completed) -- gamedev-agent-thoughts.txt (this log) - -### Task 9.1.1 Status: COMPLETED -All acceptance criteria met: -✅ Observer connects via both SimEngine and SimServiceClient -✅ Generates structured JSON and optional natural language commentary -✅ Integration tests validate trend detection -✅ README documents usage with examples - - -## Task 8.1.1: Containerization (Docker + Compose) (2025-11-30) -===================================================================== - -### Initial Assessment -- Working branch: copilot/scornful-wasp -- Repository synced and clean - -### Project Structure Analysis -Services to containerize: -1. **Simulation Service** (port 8000): `src/gengine/echoes/service/main.py` -2. **Gateway Service** (port 8100): `src/gengine/echoes/gateway/main.py` -3. 
**LLM Service** (port 8001): `src/gengine/echoes/llm/main.py` - -### Environment Variables Inventory -**Simulation:** -- ECHOES_SERVICE_HOST (default: "0.0.0.0") -- ECHOES_SERVICE_PORT (default: "8000") -- ECHOES_SERVICE_WORLD (default: "default") -- ECHOES_CONFIG_ROOT (optional) - -**Gateway:** -- ECHOES_GATEWAY_SERVICE_URL (default: "http://localhost:8000") -- ECHOES_GATEWAY_LLM_URL (optional) -- ECHOES_GATEWAY_HOST (default: "0.0.0.0") -- ECHOES_GATEWAY_PORT (default: "8100") - -**LLM:** -- ECHOES_LLM_PROVIDER (default: "stub") -- ECHOES_LLM_API_KEY (required for real providers) -- ECHOES_LLM_MODEL (required for real providers) -- ECHOES_LLM_TEMPERATURE (default: 0.7) -- ECHOES_LLM_MAX_TOKENS (default: 1000) -- ECHOES_LLM_TIMEOUT (default: 30) -- ECHOES_LLM_MAX_RETRIES (default: 2) - -### Implementation Plan -1. [x] Analyze project structure and dependencies -2. [x] Create Dockerfile (shared multi-stage build) -3. [x] Create docker-compose.yml for multi-service orchestration -4. [x] Create .env.sample with documented variables -5. [x] Update README.md with Docker section -6. [ ] Verify Docker setup - -### Implementation Complete - -**Files Created:** -1. `Dockerfile` - Multi-stage build supporting all three services - - Base stage: Python 3.12 slim + uv + dependencies - - Runtime stage: Application code + content + entrypoint - - Development stage: Dev dependencies for hot-reload - - SERVICE env var selects which service to run (simulation/gateway/llm) - -2. `docker-compose.yml` - Multi-service orchestration - - simulation service (port 8000): Core simulation API - - gateway service (port 8100): WebSocket gateway - - llm service (port 8001): Natural language processing - - echoes-network bridge for inter-service communication - - Health checks for all services - - Volume mounts for content directory - -3. 
`.env.sample` - Documented environment variables - - Port mappings - - World selection - - LLM provider configuration (stub/openai/anthropic) - -**README.md Updates:** -- Added comprehensive Docker section with: - - Prerequisites - - Quick start commands - - Service URLs table - - Configuration guidance - - Individual service commands - - Host connection instructions - - Development mode setup - - Health check documentation - - Network configuration - -**.gitignore Updates:** -- Added `!.env.sample` exception to track the sample env file - -## Task 7.1.3: Enable Per-Agent Success Modifiers by Default (Issue #25) -===================================================================== -2025-12-01T03:53:52 UTC - -### Initial Assessment -- Working branch: copilot/enable-per-agent-success-modifiers -- Repository synced and clean -- All 523 tests pass - -### Acceptance Criteria from Issue: -1. Run scenario tests with `enable_per_agent_modifiers: true` across all difficulty presets -2. Validate that per-agent bonuses/penalties don't destabilize difficulty balance -3. Update `content/config/simulation.yml` to set `enable_per_agent_modifiers: true` -4. Document any observed balance impacts in gameplay guide -5. All existing tests pass with modifiers enabled - -### Plan: -1. Run difficulty sweeps with modifiers enabled (temporarily modify config) -2. Analyze balance impact comparing before/after -3. If acceptable, make the change permanent -4. Document findings -5. Verify all tests pass - - -### Step 2: Difficulty Sweeps with Modifiers Enabled -Ran sweeps with enable_per_agent_modifiers: true across all 5 presets: -- tutorial: stb=1.00 unrest=0.20 poll=0.44 (unchanged) -- easy: stb=1.00 unrest=0.24 poll=0.46 (unchanged) -- normal: stb=1.00 unrest=0.20 poll=0.49 (unchanged) -- hard: stb=0.00 unrest=1.00 poll=0.99 (unchanged) -- brutal: stb=0.00 unrest=1.00 poll=1.00 (unchanged) - -### Step 3: Balance Analysis -Result: Metrics IDENTICAL before/after enabling modifiers. 
-The ±10% bonus/penalty envelope is intentionally small and does not affect -the macro-level stability, unrest, or pollution curves. - -### Step 4: Configuration Update -Updated content/config/simulation.yml line 101: - enable_per_agent_modifiers: false -> enable_per_agent_modifiers: true - -### Step 5: Documentation Update -Updated docs/gengine/how_to_play_echoes.md Section 11.4: -- Changed example to show enable_per_agent_modifiers: true -- Added note explaining scenario testing confirmed balance stability - -### Step 6: Test Verification -All 523 tests pass with modifiers enabled. - -### Task 7.1.3 Status: COMPLETED - - -## Issue #24 Verification - 2025-12-01T06:10 UTC -========================================================= - -### Issue #24 (Rule-Based AI Action Layer - Task 9.2.1) VERIFICATION - -**Acceptance Criteria Verification:** - -1. ✅ **Strategies (balanced/aggressive/diplomatic) implemented** - - File: src/gengine/ai_player/strategies.py - - BalancedStrategy: Moderate intervention, stability threshold 0.6, faction threshold 0.4 - - AggressiveStrategy: Frequent actions, higher thresholds, larger deployments - - DiplomaticStrategy: Prefers negotiation, relationship building - -2. ✅ **AI actor submits valid intents and handles responses** - - File: src/gengine/ai_player/actor.py - - AIActor.run(): Full observation-decision-action loop - - AIActor.select_action(): Evaluates strategy and returns decision - - AIActor.submit_intent(): Submits via SimEngine.apply_action API - - AIActor.act(): Single observe-decide-act cycle - - Handles both local (SimEngine) and remote (SimServiceClient) modes - -3. ✅ **Regression test shows stabilization behavior** - - 100-tick regression tests in tests/ai_player/test_actor.py: - - test_balanced_stabilizes_failing_city - - test_aggressive_100_tick_run - - test_diplomatic_100_tick_run - - test_deterministic_100_tick_outcome - - All tests verify AI can maintain/improve stability - -4. 
✅ **Telemetry captures decision rationale** - - ActorReport.telemetry includes: - - action_counts: counts by intent type - - priority_stats: avg/max/min priority - - strategy_type: which strategy was used - - final_state: stability, tick - - rationales: list of decision rationales (up to last 10) - - StrategyDecision.to_dict() includes state_snapshot for debugging - -**Test Results:** -- AI player tests: 112 passed -- Full test suite: 598 passed -- Coverage: 94% overall for AI player module - -**Tracker Status:** -- Task 9.2.1 marked as COMPLETED in .pm/tracker.md -- Documentation updated in README.md and implementation plan - -**Conclusion:** -All acceptance criteria for Issue #24 are met. The implementation is production-ready. - - -## Issue #34 - LLM-Enhanced AI Decisions (Task 9.3.1) - 2025-12-01T07:42 UTC -========================================================= - -### Initial Assessment -- Working branch: copilot/enhance-ai-decisions-issue-34 -- Repository synced and clean -- All 112 AI player tests pass - -### Acceptance Criteria from Issue: -1. Hybrid strategy routes routine actions to rules and complex choices to LLM -2. Budget enforcement prevents runaway costs -3. Scenario tests compare rule-only vs hybrid -4. Telemetry distinguishes rule vs LLM decisions -5. Docs cover prompts and trade-offs - -### Implementation Plan: - -1. **Extend StrategyType enum** with HYBRID value -2. **Create HybridStrategyConfig** with LLM budget controls: - - llm_call_budget: Maximum LLM calls per session - - complexity_thresholds: When to trigger LLM vs rules - - cost_tracking: calls_used, estimated_cost -3. **Create HybridStrategy class**: - - Wraps a fallback rule-based strategy - - Routes routine decisions to rules - - Routes complex decisions to LLM (via LLMDecisionLayer) - - Falls back to rules when budget exhausted -4. 
**Create LLMDecisionLayer** (llm_strategy.py): - - LLMDecisionRequest dataclass - - build_decision_prompt() for LLM context - - parse_llm_response() to create StrategyDecision - - Async provider calls with timeout/error handling -5. **Add decision_source telemetry**: - - Add decision_source field to StrategyDecision - - Track LLM call counts, latency, fallback rates -6. **Add complexity detection**: - - Multiple factions with low legitimacy - - Critical stability crisis - - Conflicting story seeds active -7. **Tests**: - - test_hybrid_strategy.py with stub provider - - Budget enforcement tests - - Decision routing tests - - Scenario comparisons -8. **Documentation**: - - Update README with hybrid strategy examples - - Document prompts and trade-offs - -### Files Created/Modified: -- src/gengine/ai_player/llm_strategy.py (NEW) -- src/gengine/ai_player/strategies.py (MODIFIED - added HYBRID type, HybridStrategy class, decision_source) -- src/gengine/ai_player/__init__.py (MODIFIED - new exports) -- tests/ai_player/test_hybrid_strategy.py (NEW) -- README.md (MODIFIED - documentation updates) -- docs/simul/emergent_story_game_implementation_plan.md (MODIFIED - marked M9.3 complete) - -### Test Results: -- AI player tests: 158 passed -- Full test suite: 644 passed -- AI player coverage: 91% - - llm_strategy.py: 76% - - strategies.py: 93% - - actor.py: 97% - - observer.py: 94% - - __init__.py: 100% - -### Telemetry Capture: -- Command: uv run python scripts/run_headless_sim.py --world default --ticks 200 --lod balanced --seed 42 --output build/feature-m9-3-llm-hybrid.json -- Ticks executed: 200 -- Suppressed events: 343 - -### Acceptance Criteria Status: -1. ✅ Hybrid strategy routes routine actions to rules and complex choices to LLM -2. ✅ Budget enforcement prevents runaway costs (LLMStrategyConfig.llm_call_budget) -3. ✅ Scenario tests compare rule-only vs hybrid (TestHybridStrategyScenarios class) -4. 
✅ Telemetry distinguishes rule vs LLM decisions (decision_source field) -5. ✅ Docs cover prompts and trade-offs (README.md updated) - -### Task 9.3.1 Status: COMPLETED -All acceptance criteria for Issue #34 are met. - - -## Issue #39 - Dedicated Metrics Endpoints for Gateway and LLM Services - 2025-12-02T19:00 UTC -========================================================= - -### Initial Assessment -- Working branch: copilot/add-dedicated-metrics-endpoints -- Current commit: fc3318b -- Repository synced and clean - -### Acceptance Criteria from Issue: -1. Gateway exposes `/metrics` endpoint with request counts, latencies, error breakdowns, provider-level stats -2. LLM service exposes `/metrics` endpoint with request counts, latencies, error breakdowns, provider-level stats -3. Prometheus annotations point to `/metrics` instead of `/healthz` -4. ServiceMonitor targets updated to metrics paths/ports -5. Documentation distinguishes health vs. metrics endpoints - -### Implementation Plan: -1. [ ] Add `/metrics` endpoint to gateway/app.py with tracking of: - - Request counts (total, by endpoint, by result type) - - Request latencies - - Error counts - - Active WebSocket connections - - LLM service integration stats -2. [ ] Add `/metrics` endpoint to llm/app.py with tracking of: - - Request counts (total, by endpoint) - - Request latencies - - Error counts - - Provider-level stats - - Token usage -3. [ ] Update k8s/base/gateway-deployment.yaml: prometheus.io/path → /metrics -4. [ ] Update k8s/base/llm-deployment.yaml: prometheus.io/path → /metrics -5. [ ] Update k8s/base/servicemonitor.yaml: gateway and LLM paths → /metrics -6. [ ] Update docs/gengine/Deploy_GEngine_To_Kubernetes.md -7. [ ] Run tests to verify no regressions - - -### Implementation Complete - -**Files Modified:** -1. 
`src/gengine/echoes/gateway/app.py` - Added GatewayMetrics class and /metrics endpoint - - GatewayMetrics tracks: request counts, latencies, errors, connections, LLM integration - - /metrics endpoint returns JSON with all metrics for Prometheus - - Metrics tracked during WebSocket handler execution - -2. `src/gengine/echoes/llm/app.py` - Added LLMMetrics class and /metrics endpoint - - LLMMetrics tracks: request counts, latencies, errors, provider stats, token usage - - /metrics endpoint returns JSON with all metrics for Prometheus - - Metrics tracked for both parse_intent and narrate endpoints - -3. `k8s/base/gateway-deployment.yaml` - Updated prometheus.io/path from /healthz to /metrics -4. `k8s/base/llm-deployment.yaml` - Updated prometheus.io/path from /healthz to /metrics -5. `k8s/base/servicemonitor.yaml` - Updated gateway and LLM paths from /healthz to /metrics - -6. `docs/gengine/Deploy_GEngine_To_Kubernetes.md` - Comprehensive documentation update - - Added Health Check Endpoints section explaining /healthz purpose - - Added Metrics Endpoints section explaining /metrics purpose - - Added Example Metrics Responses showing JSON structure for all 3 services - - Updated Prometheus Annotations section - - Updated verification commands - -7. `tests/echoes/test_gateway_service.py` - Added 9 new tests: - - test_gateway_metrics_endpoint - - test_gateway_metrics_track_websocket_connections - - test_gateway_metrics_track_commands - - TestGatewayMetrics class with 7 tests - -8. `tests/echoes/test_llm_app.py` - Added 11 new tests: - - test_metrics_endpoint - - test_metrics_track_parse_intent - - test_metrics_track_narrate - - TestLLMMetrics class with 8 tests - -**Test Results:** -- Gateway/LLM tests: 39 passed (19 original + 20 new) -- Coverage: gateway/app.py 89%, llm/app.py 91% - -**Acceptance Criteria Status:** -1. ✅ Gateway exposes /metrics endpoint with request counts, latencies, error breakdowns, connections, LLM integration stats -2. 
✅ LLM service exposes /metrics endpoint with request counts, latencies, error breakdowns, provider stats, token usage -3. ✅ Prometheus annotations point to /metrics (updated gateway-deployment.yaml, llm-deployment.yaml) -4. ✅ ServiceMonitor targets updated to /metrics paths (updated servicemonitor.yaml) -5. ✅ Documentation distinguishes health vs. metrics endpoints with example responses - -### Task Complete: Issue #39 - Dedicated Metrics Endpoints for Gateway and LLM Services - - -## Issue #51 - M9.1 Observer AI Foundation Verification - 2025-12-04T06:05 UTC -========================================================= - -### Task: Verify and complete Issue 51 - M9.1 Observer AI Foundation - -### Acceptance Criteria Analysis: - -1. ✅ **Observer connects to both local and service-mode simulations** - - `Observer(engine=SimEngine())` for local mode - - `Observer(client=SimServiceClient())` for service mode - - Factory helpers: `create_observer_from_engine()`, `create_observer_from_service()` - - Tests: `TestObserverWithSimServiceClient` (4 integration tests) - -2. ✅ **Generates tick-by-tick analysis capturing stability trends, faction legitimacy shifts, story seed activations** - - `ObservationReport` captures all three metrics - - `TrendAnalysis` tracks increasing/decreasing/stable trends - - `_check_story_seeds()` records activated story seeds - - `_check_alerts()` detects critical conditions - -3. ✅ **Outputs structured JSON plus optional natural language commentary** - - `ObservationReport.to_dict()` provides JSON output - - `_generate_commentary()` creates natural language summaries with labels - - `run_ai_observer.py` CLI outputs JSON with `--output` flag - - Verbose mode with `--verbose` flag - -4. 
✅ **Integration test asserts observer detects a scripted stability crash** (ENHANCED) - - Added new test: `test_observer_detects_scripted_stability_crash()` - - Test scripts stability to crash (0.9 → 0.3) - - Verifies alert is generated with "stability critical" message - - Verifies alert contains the critical stability value - -5. ✅ **README includes observer invocation examples and use cases** - - Section "AI Player Observer" with comprehensive documentation - - Local mode examples - - Service mode examples with programmatic usage - - Configuration options documented - -### Changes Made: - -**tests/ai_player/test_observer.py:** -- Added `test_observer_detects_scripted_stability_crash()` test (lines 256-308) -- Test scripts stability crash by: - 1. Starting with high stability (0.9) - 2. Observing a few ticks - 3. Forcing stability to drop to 0.3 (below 0.5 threshold) - 4. Verifying alerts are generated with correct messages - -### Test Results: -- All 38 observer tests pass -- Linting: All checks passed - -### Issue #51 Status: COMPLETED -All acceptance criteria met. The implementation was already substantially complete, with only the scripted stability crash test needing enhancement. - - -## Issue #49 - AI Tournaments & Balance Tooling (Task 9.4.1) - 2025-12-04T04:49 UTC -========================================================= - -### Initial Assessment -- Working branch: copilot/resulting-meadowlark -- Repository synced and clean -- All 822 tests pass -- Coverage: 92.12% - -### Acceptance Criteria from Issue: -1. Tournament script runs 100+ games in parallel with configurable strategies -2. Comparative reports surface win rate deltas and balance anomalies -3. Analysis identifies unused story seeds or overpowered actions -4. Documentation guides designers through balance iteration workflow -5. CI integration runs nightly tournaments and archives results - -### Implementation Plan: - -1. 
**scripts/run_ai_tournament.py** - Tournament runner - - Execute N parallel games using ProcessPoolExecutor - - Support configurable strategies: BALANCED, AGGRESSIVE, DIPLOMATIC, HYBRID - - Support different world configs and random seeds - - Capture per-game telemetry: final stability, story seed activations, actions taken - - Aggregate results into structured JSON reports - - CLI interface similar to run_difficulty_sweeps.py - -2. **scripts/analyze_ai_games.py** - Analysis script - - Read tournament results JSON - - Calculate win rates per strategy (based on final stability threshold) - - Identify story seeds that never triggered - - Flag dominant strategies and balance outliers - - Generate human-readable summary reports - -3. **tests/ai_player/test_tournament.py** - Tournament tests - - Test tournament runner with small game count - - Test result aggregation - - Test parallel execution - -4. **tests/ai_player/test_analysis.py** - Analysis tests - - Test win rate calculation - - Test unused seed detection - - Test outlier detection - -5. **.github/workflows/ai-tournament.yml** - CI workflow - - Nightly tournament runs - - Archive results as artifacts - -6. 
**Documentation Updates** - - docs/gengine/how_to_play_echoes.md - balance iteration workflow - - docs/simul/emergent_story_game_implementation_plan.md - mark M9.4 complete - -### Implementation Log: - -[2025-12-04T04:49 UTC] Started implementation - -**Step 1: Created scripts/run_ai_tournament.py** -- Tournament runner with ProcessPoolExecutor for parallel game execution -- GameResult, TournamentConfig, TournamentReport dataclasses -- run_single_game() function for isolated game execution -- run_tournament() with stats aggregation per strategy -- CLI interface with --games, --ticks, --strategies, --seed, --output, --verbose - -**Step 2: Created scripts/analyze_ai_games.py** -- Analysis module for tournament results -- BalanceAnomaly, AnalysisReport dataclasses -- analyze_win_rates(), analyze_actions(), analyze_story_seeds() functions -- detect_anomalies() for identifying balance issues -- generate_recommendations() for actionable guidance -- CLI interface with --input, --world, --output, --json - -**Step 3: Created tests/scripts/test_ai_tournament.py** -- 19 tests covering GameResult, TournamentConfig, TournamentReport -- run_single_game and run_tournament tests -- Tournament determinism tests -- CLI tests - -**Step 4: Created tests/scripts/test_ai_analysis.py** -- 29 tests covering BalanceAnomaly, AnalysisReport -- Win rate, action, and story seed analysis tests -- Anomaly detection and recommendation tests -- CLI tests - -**Step 5: Created .github/workflows/ai-tournament.yml** -- Nightly tournament workflow (2:00 AM UTC) -- Manual trigger with configurable games/ticks -- Archives results for 90 days - -**Step 6: Updated documentation** -- docs/gengine/how_to_play_echoes.md: Added Section 13 (AI Tournaments & Balance Tooling) -- docs/simul/emergent_story_game_implementation_plan.md: Marked M9.4 complete -- README.md: Added AI Tournaments section - -**Test Results:** -- All 870 tests pass (822 existing + 48 new) -- Coverage: 92.10% -- Linting: All checks passed - 
-### Task 9.4.1 Status: COMPLETED - -All acceptance criteria met: -✅ Tournament script runs 100+ games in parallel with configurable strategies -✅ Comparative reports surface win rate deltas and balance anomalies -✅ Analysis identifies unused story seeds or overpowered actions -✅ Documentation guides designers through balance iteration workflow -✅ CI integration runs nightly tournaments and archives results +# GameDev Agent Thoughts - Issue #58: Batch Simulation Sweep Infrastructure (M11.1) + +## Task Analysis - Complete + +Implemented batch simulation sweep infrastructure for Phase 11, M11.1 task. + +## Implementation Summary + +### Files Created/Modified + +1. **content/config/batch_sweeps.yml** - Configuration file defining: + - Parameter grid (strategies, difficulties, seeds, worlds, tick_budgets) + - Parallel execution settings (max_workers, timeout_per_sweep) + - Output configuration + - Sampling options for large parameter spaces + - Metadata settings + +2. **scripts/run_batch_sweeps.py** - Main batch sweep runner: + - Generates Cartesian product of parameter combinations + - Uses ProcessPoolExecutor for parallel execution + - Produces JSON output per sweep with parameters, results, telemetry, metadata + - Supports CLI overrides for quick testing + - Progress reporting to stderr + +3. **tests/scripts/test_batch_sweeps.py** - 29 tests covering: + - SweepParameters creation and serialization + - SweepResult creation and serialization with errors + - BatchSweepConfig defaults and YAML loading + - Parameter grid generation (single, Cartesian product, large, sampling, empty) + - Single sweep execution (multiple strategies, invalid world, telemetry) + - Batch sweep execution (small, multiple strategies, stats, metadata) + - BatchSweepReport serialization + - CLI tests (basic run, JSON output, config file, overrides) + - Determinism tests (same seed produces same result) + +4. 
**docs/gengine/ai_tournament_and_balance_analysis.md** - Updated documentation: + - Added batch sweep section with configuration format + - CLI options reference table + - Output format description + - Updated balance iteration workflow + +## Verification + +- All 29 batch sweep tests pass +- All 113 scripts tests pass +- Smoke test confirms end-to-end functionality +- Output format matches acceptance criteria + +## Acceptance Criteria Status + +- [x] Script `scripts/run_batch_sweeps.py` supports multi-dimensional parameter grids +- [x] Parallel execution using ProcessPoolExecutor +- [x] JSON output per sweep includes game results, telemetry, and parameter metadata +- [x] Configuration file `content/config/batch_sweeps.yml` defines sweep parameter ranges +- [x] Documentation describes sweep configuration format and execution workflow +- [x] At least 10 tests covering parameter grid generation, parallel execution, and output validation (29 tests implemented) diff --git a/scripts/run_batch_sweeps.py b/scripts/run_batch_sweeps.py new file mode 100644 index 00000000..e052cc99 --- /dev/null +++ b/scripts/run_batch_sweeps.py @@ -0,0 +1,727 @@ +#!/usr/bin/env python3 +"""Run batch simulation sweeps with configurable parameter grids. + +Executes multi-dimensional parameter sweeps across strategies, difficulties, +seeds, worlds, and tick budgets using parallel execution for balance analysis. 
# Set environment to avoid import issues in worker processes
os.environ.setdefault("ECHOES_CONFIG_ROOT", "content/config")

# Default configuration path
DEFAULT_CONFIG_PATH = Path("content/config/batch_sweeps.yml")

# Available strategies
AVAILABLE_STRATEGIES = ["balanced", "aggressive", "diplomatic", "hybrid"]

# Available difficulty presets
AVAILABLE_DIFFICULTIES = ["tutorial", "easy", "normal", "hard", "brutal"]

# Sampling modes accepted by generate_parameter_grid()
_SAMPLING_MODES = ("full", "random", "latin_hypercube")


@dataclass
class SweepParameters:
    """One point of the parameter grid: the inputs of a single sweep run."""

    strategy: str
    difficulty: str
    seed: int
    world: str
    tick_budget: int

    def to_dict(self) -> dict[str, Any]:
        """Return the parameters as a plain, JSON-serializable mapping."""
        return {
            "strategy": self.strategy,
            "difficulty": self.difficulty,
            "seed": self.seed,
            "world": self.world,
            "tick_budget": self.tick_budget,
        }


@dataclass
class SweepResult:
    """Outcome of a single sweep run.

    ``error`` is ``None`` for a successful run and a non-empty message for a
    failed one; failed runs carry zeroed result fields.
    """

    sweep_id: int
    parameters: SweepParameters
    final_stability: float
    actions_taken: int
    ticks_run: int
    story_seeds_activated: list[str] = field(default_factory=list)
    action_counts: dict[str, int] = field(default_factory=dict)
    telemetry: dict[str, Any] = field(default_factory=dict)
    duration_seconds: float = 0.0
    error: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the result in the per-sweep JSON layout.

        ``final_stability`` is rounded to 4 decimals and ``duration_seconds``
        to 3 decimals.
        """
        return {
            "sweep_id": self.sweep_id,
            "parameters": self.parameters.to_dict(),
            "results": {
                "final_stability": round(self.final_stability, 4),
                "actions_taken": self.actions_taken,
                "ticks_run": self.ticks_run,
                "story_seeds_activated": self.story_seeds_activated,
                "action_counts": self.action_counts,
            },
            "telemetry": self.telemetry,
            "duration_seconds": round(self.duration_seconds, 3),
            "error": self.error,
        }


@dataclass
class BatchSweepConfig:
    """Configuration for batch sweep execution."""

    strategies: list[str] = field(default_factory=lambda: ["balanced"])
    difficulties: list[str] = field(default_factory=lambda: ["normal"])
    seeds: list[int] = field(default_factory=lambda: [42])
    worlds: list[str] = field(default_factory=lambda: ["default"])
    tick_budgets: list[int] = field(default_factory=lambda: [100])
    max_workers: int | None = None
    timeout_per_sweep: int = 300
    output_dir: Path = field(default_factory=lambda: Path("build/batch_sweeps"))
    include_telemetry: bool = True
    include_summary: bool = True
    sampling_mode: str = "full"
    sample_count: int = 100
    sample_seed: int = 42

    @classmethod
    def from_dict(cls, data: dict[str, Any] | None) -> BatchSweepConfig:
        """Build a configuration from an already-parsed config mapping.

        Parameters
        ----------
        data
            Mapping with optional ``parameters``, ``parallel``, ``output`` and
            ``sampling`` sections. Missing or null sections and missing or
            null values fall back to the defaults. An explicitly empty list
            is kept as-is, so it yields an empty grid.

        Returns
        -------
        BatchSweepConfig
            The populated configuration.

        Raises
        ------
        ValueError
            If ``data`` or one of its sections is not a mapping.
        """
        root = data or {}
        if not isinstance(root, dict):
            raise ValueError(
                "Batch sweep config must be a mapping, "
                f"got {type(root).__name__}"
            )

        def section(name: str) -> dict[str, Any]:
            # A key written with nothing under it parses as None.
            value = root.get(name)
            if value is None:
                return {}
            if not isinstance(value, dict):
                raise ValueError(
                    f"Config section {name!r} must be a mapping, "
                    f"got {type(value).__name__}"
                )
            return value

        def pick(values: dict[str, Any], key: str, default: Any) -> Any:
            # Only None means "unset"; falsy values such as [] are deliberate.
            value = values.get(key)
            return default if value is None else value

        params = section("parameters")
        parallel = section("parallel")
        output = section("output")
        sampling = section("sampling")

        return cls(
            strategies=pick(params, "strategies", ["balanced"]),
            difficulties=pick(params, "difficulties", ["normal"]),
            seeds=pick(params, "seeds", [42]),
            worlds=pick(params, "worlds", ["default"]),
            tick_budgets=pick(params, "tick_budgets", [100]),
            max_workers=parallel.get("max_workers"),
            timeout_per_sweep=pick(parallel, "timeout_per_sweep", 300),
            output_dir=Path(pick(output, "dir", "build/batch_sweeps")),
            include_telemetry=pick(output, "include_telemetry", True),
            include_summary=pick(output, "include_summary", True),
            sampling_mode=pick(sampling, "mode", "full"),
            sample_count=pick(sampling, "sample_count", 100),
            sample_seed=pick(sampling, "sample_seed", 42),
        )

    @classmethod
    def from_yaml(cls, path: Path) -> BatchSweepConfig:
        """Load configuration from YAML file.

        A missing file is not an error: the default configuration is
        returned. An empty document also yields the defaults.
        """
        if not path.exists():
            return cls()

        # Explicit encoding so parsing does not depend on the platform locale.
        with open(path, encoding="utf-8") as f:
            data = yaml.safe_load(f)

        return cls.from_dict(data)


@dataclass
class BatchSweepReport:
    """Aggregated report from batch sweep execution."""

    config: dict[str, Any]
    total_sweeps: int
    completed_sweeps: int
    failed_sweeps: int
    results: list[SweepResult]
    strategy_stats: dict[str, dict[str, Any]]
    difficulty_stats: dict[str, dict[str, Any]]
    total_duration_seconds: float
    metadata: dict[str, Any]

    def to_dict(self) -> dict[str, Any]:
        """Return the report as a JSON-serializable mapping.

        Individual results are emitted under the ``sweeps`` key, and the
        total duration is rounded to 2 decimals.
        """
        return {
            "config": self.config,
            "total_sweeps": self.total_sweeps,
            "completed_sweeps": self.completed_sweeps,
            "failed_sweeps": self.failed_sweeps,
            "strategy_stats": self.strategy_stats,
            "difficulty_stats": self.difficulty_stats,
            "total_duration_seconds": round(self.total_duration_seconds, 2),
            "metadata": self.metadata,
            "sweeps": [r.to_dict() for r in self.results],
        }


def generate_parameter_grid(config: BatchSweepConfig) -> list[SweepParameters]:
    """Generate parameter combinations from configuration.

    Creates the Cartesian product of all parameter lists (strategy varies
    slowest, tick budget fastest). When the sampling mode is not ``"full"``
    and the grid is larger than ``config.sample_count``, a reproducible
    random subset of that size is drawn using ``config.sample_seed``.

    Parameters
    ----------
    config
        Batch sweep configuration with parameter lists.

    Returns
    -------
    list[SweepParameters]
        List of parameter combinations to test.

    Raises
    ------
    ValueError
        If ``config.sampling_mode`` is not one of ``"full"``, ``"random"``
        or ``"latin_hypercube"``.
    """
    # Reject typos up front: an unrecognized mode used to be ignored, which
    # silently ran the full grid.
    if config.sampling_mode not in _SAMPLING_MODES:
        raise ValueError(
            f"Unknown sampling mode {config.sampling_mode!r}; "
            f"expected one of {', '.join(_SAMPLING_MODES)}"
        )

    combinations = list(
        itertools.product(
            config.strategies,
            config.difficulties,
            config.seeds,
            config.worlds,
            config.tick_budgets,
        )
    )

    if config.sampling_mode != "full" and len(combinations) > config.sample_count:
        import random

        rng = random.Random(config.sample_seed)
        # "latin_hypercube" is currently an alias of "random": both draw a
        # uniform sample without replacement; no stratification is done.
        combinations = rng.sample(combinations, config.sample_count)

    return [
        SweepParameters(
            strategy=strategy,
            difficulty=difficulty,
            seed=seed,
            world=world,
            tick_budget=tick_budget,
        )
        for strategy, difficulty, seed, world, tick_budget in combinations
    ]


def _collect_story_seed_ids(final_state: dict[str, Any]) -> list[str]:
    """Return the ids of the story seeds listed in a summary view."""
    seed_data = final_state.get("story_seeds", [])
    if not isinstance(seed_data, list):
        return []
    return [
        seed_info.get("seed_id") or seed_info.get("id", "unknown")
        for seed_info in seed_data
        if isinstance(seed_info, dict)
    ]


def run_single_sweep(
    sweep_id: int,
    params: SweepParameters,
    include_telemetry: bool = True,
) -> SweepResult:
    """Run a single sweep with the given parameters.

    This function is designed to be called in a separate process. It never
    raises: any failure, including an unknown strategy name, is returned as
    a ``SweepResult`` whose ``error`` is set and whose result fields are
    zeroed.

    Parameters
    ----------
    sweep_id
        Unique identifier for this sweep.
    params
        Parameters for the sweep.
    include_telemetry
        Whether to include full telemetry in results.

    Returns
    -------
    SweepResult
        Results from the sweep execution.
    """
    start_time = perf_counter()
    try:
        # Validate before doing any work. Falling back to a default strategy
        # would label the result with a strategy that never ran.
        strategy_key = params.strategy.lower()
        if strategy_key not in AVAILABLE_STRATEGIES:
            raise ValueError(
                f"Unknown strategy {params.strategy!r}; "
                f"expected one of {', '.join(AVAILABLE_STRATEGIES)}"
            )

        # Import inside function for process isolation
        from gengine.ai_player import ActorConfig, AIActor
        from gengine.ai_player.strategies import StrategyType
        from gengine.echoes.settings import load_simulation_config
        from gengine.echoes.sim import SimEngine

        strategy_map = {
            "balanced": StrategyType.BALANCED,
            "aggressive": StrategyType.AGGRESSIVE,
            "diplomatic": StrategyType.DIPLOMATIC,
            "hybrid": StrategyType.HYBRID,
        }
        strategy_type = strategy_map[strategy_key]

        # Use the difficulty preset directory when it exists; otherwise the
        # default simulation config is loaded.
        config_root = Path("content/config/sweeps") / f"difficulty-{params.difficulty}"
        if config_root.exists():
            config = load_simulation_config(config_root=config_root)
        else:
            config = load_simulation_config()

        engine = SimEngine(config=config)
        engine.initialize_state(world=params.world)

        # Set seed by advancing one tick
        # NOTE(review): this tick presumably runs outside the actor's
        # tick_budget - confirm against SimEngine.advance_ticks.
        engine.advance_ticks(1, seed=params.seed)

        # Ensure analysis_interval doesn't exceed tick_budget
        analysis_interval = min(10, params.tick_budget)
        actor_config = ActorConfig(
            strategy_type=strategy_type,
            tick_budget=params.tick_budget,
            analysis_interval=analysis_interval,
            log_decisions=False,
        )
        actor = AIActor(engine=engine, config=actor_config)

        report = actor.run()

        final_state = engine.query_view("summary")
        story_seeds = _collect_story_seed_ids(final_state)

        telemetry: dict[str, Any] = {}
        if include_telemetry:
            telemetry = {
                "environment": final_state.get("environment", {}),
                "faction_legitimacy": final_state.get("faction_legitimacy", {}),
                "economy": final_state.get("economy", {}),
                "profiling": engine.state.metadata.get("profiling", {}),
            }

        duration = perf_counter() - start_time

        return SweepResult(
            sweep_id=sweep_id,
            parameters=params,
            final_stability=report.final_stability,
            actions_taken=report.actions_taken,
            ticks_run=report.ticks_run,
            story_seeds_activated=story_seeds,
            action_counts=report.telemetry.get("action_counts", {}),
            telemetry=telemetry,
            duration_seconds=duration,
        )

    except Exception as e:
        # Worker boundary: report the failure as data so one bad sweep does
        # not abort the whole batch.
        duration = perf_counter() - start_time
        return SweepResult(
            sweep_id=sweep_id,
            parameters=params,
            final_stability=0.0,
            actions_taken=0,
            ticks_run=0,
            duration_seconds=duration,
            # str(e) is empty for message-less exceptions; keep error truthy.
            error=str(e) or repr(e),
        )


def _get_git_commit() -> str | None:
    """Get current git commit hash if available.

    Returns ``None`` when git cannot be run, times out, or exits non-zero.
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (subprocess.SubprocessError, OSError):
        # OSError covers a missing binary as well as e.g. PermissionError.
        return None
    if result.returncode == 0:
        return result.stdout.strip()
    return None


def _build_metadata(config: BatchSweepConfig) -> dict[str, Any]:
    """Build metadata dictionary for sweep report.

    Contains a UTC ``timestamp``, ``runtime`` info (Python version and the
    configured ``max_workers``) and, when available, the ``git_commit``.
    """
    metadata: dict[str, Any] = {}

    git_commit = _get_git_commit()
    if git_commit:
        metadata["git_commit"] = git_commit

    metadata["timestamp"] = datetime.now(timezone.utc).isoformat()

    metadata["runtime"] = {
        "python_version": sys.version,
        "max_workers": config.max_workers,
    }

    return metadata
g.error is None] + stabilities = [g.final_stability for g in successful] + actions = [g.actions_taken for g in successful] + + stats[key] = { + "count": len(group), + "completed": len(successful), + "failed": len(group) - len(successful), + "avg_stability": ( + sum(stabilities) / len(stabilities) if stabilities else 0.0 + ), + "min_stability": min(stabilities) if stabilities else 0.0, + "max_stability": max(stabilities) if stabilities else 0.0, + "avg_actions": sum(actions) / len(actions) if actions else 0.0, + "total_actions": sum(actions), + } + + return stats + + +def run_batch_sweeps( + config: BatchSweepConfig, + verbose: bool = False, +) -> BatchSweepReport: + """Run batch sweeps with the given configuration. + + Parameters + ---------- + config + Batch sweep configuration. + verbose + If True, print progress to stderr. + + Returns + ------- + BatchSweepReport + Aggregated results from all sweeps. + """ + start_time = perf_counter() + + # Generate parameter grid + parameter_grid = generate_parameter_grid(config) + + if verbose: + sys.stderr.write( + f"Starting batch sweep: {len(parameter_grid)} combinations\n" + f" Strategies: {config.strategies}\n" + f" Difficulties: {config.difficulties}\n" + f" Seeds: {config.seeds}\n" + f" Worlds: {config.worlds}\n" + f" Tick budgets: {config.tick_budgets}\n" + ) + + # Run sweeps in parallel + results: list[SweepResult] = [] + completed = 0 + + max_workers = config.max_workers or min(4, os.cpu_count() or 1) + + with ProcessPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit( + run_single_sweep, i, params, config.include_telemetry + ): i + for i, params in enumerate(parameter_grid) + } + + for future in as_completed(futures): + result = future.result() + results.append(result) + completed += 1 + + if verbose and completed % 10 == 0: + sys.stderr.write( + f"Progress: {completed}/{len(parameter_grid)} sweeps completed\n" + ) + + # Sort results by sweep_id for consistent ordering + 
results.sort(key=lambda r: r.sweep_id) + + # Calculate statistics + strategy_stats = _calculate_stats(results, lambda r: r.parameters.strategy) + difficulty_stats = _calculate_stats(results, lambda r: r.parameters.difficulty) + + total_duration = perf_counter() - start_time + + if verbose: + sys.stderr.write( + f"\nBatch sweep complete: {len(results)} sweeps in {total_duration:.1f}s\n" + ) + + return BatchSweepReport( + config={ + "strategies": config.strategies, + "difficulties": config.difficulties, + "seeds": config.seeds, + "worlds": config.worlds, + "tick_budgets": config.tick_budgets, + "sampling_mode": config.sampling_mode, + }, + total_sweeps=len(parameter_grid), + completed_sweeps=sum(1 for r in results if r.error is None), + failed_sweeps=sum(1 for r in results if r.error is not None), + results=results, + strategy_stats=strategy_stats, + difficulty_stats=difficulty_stats, + total_duration_seconds=total_duration, + metadata=_build_metadata(config), + ) + + +def write_sweep_outputs( + report: BatchSweepReport, + output_dir: Path, + verbose: bool = False, +) -> None: + """Write individual sweep results and summary to output directory. + + Parameters + ---------- + report + Batch sweep report. + output_dir + Directory to write output files. + verbose + If True, print progress to stderr. 
+ """ + output_dir.mkdir(parents=True, exist_ok=True) + + # Write individual sweep results + for result in report.results: + params = result.parameters + filename = ( + f"sweep_{result.sweep_id:04d}_" + f"{params.strategy}_{params.difficulty}_" + f"seed{params.seed}_tick{params.tick_budget}.json" + ) + output_path = output_dir / filename + output_path.write_text(json.dumps(result.to_dict(), indent=2, sort_keys=True)) + + # Write summary report + summary_path = output_dir / "batch_sweep_summary.json" + summary_path.write_text(json.dumps(report.to_dict(), indent=2, sort_keys=True)) + + if verbose: + sys.stderr.write(f"Results written to {output_dir}\n") + sys.stderr.write(f" Individual sweeps: {len(report.results)} files\n") + sys.stderr.write(f" Summary: {summary_path}\n") + + +def print_summary_table(report: BatchSweepReport) -> None: + """Print a human-readable summary of batch sweep results.""" + print("\n" + "=" * 80) + print("BATCH SWEEP RESULTS") + print("=" * 80) + print( + f"\nSweeps: {report.completed_sweeps}/{report.total_sweeps} completed " + f"({report.failed_sweeps} failed)" + ) + print(f"Total duration: {report.total_duration_seconds:.1f}s") + + # Strategy breakdown + print("\n" + "-" * 80) + print("BY STRATEGY") + print("-" * 80) + print( + f"{'Strategy':<12} {'Count':>8} {'Completed':>10} {'Avg Stab':>10} " + f"{'Min Stab':>10} {'Max Stab':>10}" + ) + print("-" * 80) + + for strategy, stats in report.strategy_stats.items(): + print( + f"{strategy:<12} {stats['count']:>8} {stats['completed']:>10} " + f"{stats['avg_stability']:>10.3f} {stats['min_stability']:>10.3f} " + f"{stats['max_stability']:>10.3f}" + ) + + # Difficulty breakdown + print("\n" + "-" * 80) + print("BY DIFFICULTY") + print("-" * 80) + print( + f"{'Difficulty':<12} {'Count':>8} {'Completed':>10} {'Avg Stab':>10} " + f"{'Min Stab':>10} {'Max Stab':>10}" + ) + print("-" * 80) + + for difficulty, stats in report.difficulty_stats.items(): + print( + f"{difficulty:<12} {stats['count']:>8} 
{stats['completed']:>10} " + f"{stats['avg_stability']:>10.3f} {stats['min_stability']:>10.3f} " + f"{stats['max_stability']:>10.3f}" + ) + + print("=" * 80) + + +def main(argv: Sequence[str] | None = None) -> int: + """CLI entry point for running batch sweeps.""" + parser = argparse.ArgumentParser( + description="Run batch simulation sweeps with configurable parameter grids.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Run with default configuration + uv run python scripts/run_batch_sweeps.py + + # Run with specific strategies and seeds + uv run python scripts/run_batch_sweeps.py --strategies balanced aggressive --seeds 42 123 + + # Use custom configuration file + uv run python scripts/run_batch_sweeps.py --config content/config/batch_sweeps.yml + + # Save results to specific directory + uv run python scripts/run_batch_sweeps.py --output-dir build/my_sweeps +""", + ) + parser.add_argument( + "--config", + "-c", + type=Path, + default=DEFAULT_CONFIG_PATH, + help=f"Configuration file path (default: {DEFAULT_CONFIG_PATH})", + ) + parser.add_argument( + "--strategies", + "-s", + nargs="+", + choices=AVAILABLE_STRATEGIES, + default=None, + help="Override strategies to test", + ) + parser.add_argument( + "--difficulties", + "-d", + nargs="+", + choices=AVAILABLE_DIFFICULTIES, + default=None, + help="Override difficulty presets to test", + ) + parser.add_argument( + "--seeds", + nargs="+", + type=int, + default=None, + help="Override random seeds to test", + ) + parser.add_argument( + "--worlds", + "-w", + nargs="+", + default=None, + help="Override world bundles to test", + ) + parser.add_argument( + "--ticks", + "-t", + nargs="+", + type=int, + default=None, + help="Override tick budgets to test", + ) + parser.add_argument( + "--workers", + type=int, + default=None, + help="Max parallel workers (default: auto)", + ) + parser.add_argument( + "--output-dir", + "-o", + type=Path, + default=None, + help="Output directory for sweep 
results", + ) + parser.add_argument( + "--json", + action="store_true", + help="Output as JSON instead of table", + ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Print progress during execution", + ) + parser.add_argument( + "--no-write", + action="store_true", + help="Skip writing individual sweep files", + ) + + args = parser.parse_args(argv) + + # Load configuration from file + config = BatchSweepConfig.from_yaml(args.config) + + # Apply CLI overrides + if args.strategies: + config.strategies = args.strategies + if args.difficulties: + config.difficulties = args.difficulties + if args.seeds: + config.seeds = args.seeds + if args.worlds: + config.worlds = args.worlds + if args.ticks: + config.tick_budgets = args.ticks + if args.workers: + config.max_workers = args.workers + if args.output_dir: + config.output_dir = args.output_dir + + # Run batch sweeps + report = run_batch_sweeps(config, verbose=args.verbose) + + # Write outputs + if not args.no_write: + write_sweep_outputs(report, config.output_dir, verbose=args.verbose) + + # Print results + if args.json: + print(json.dumps(report.to_dict(), indent=2, sort_keys=True)) + else: + print_summary_table(report) + + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/tests/scripts/test_batch_sweeps.py b/tests/scripts/test_batch_sweeps.py new file mode 100644 index 00000000..33ee1cf9 --- /dev/null +++ b/tests/scripts/test_batch_sweeps.py @@ -0,0 +1,655 @@ +"""Tests for batch simulation sweep infrastructure.""" + +from __future__ import annotations + +import json +import sys +from importlib import util +from pathlib import Path + +import pytest + +_MODULE_PATH = ( + Path(__file__).resolve().parents[2] / "scripts" / "run_batch_sweeps.py" +) + + +def _load_batch_sweep_module(): + spec = util.spec_from_file_location("batch_sweep_driver", _MODULE_PATH) + module = util.module_from_spec(spec) + assert spec and spec.loader + 
sys.modules.setdefault("batch_sweep_driver", module) + spec.loader.exec_module(module) + return module + + +_driver = _load_batch_sweep_module() +SweepParameters = _driver.SweepParameters +SweepResult = _driver.SweepResult +BatchSweepConfig = _driver.BatchSweepConfig +BatchSweepReport = _driver.BatchSweepReport +generate_parameter_grid = _driver.generate_parameter_grid +run_single_sweep = _driver.run_single_sweep +run_batch_sweeps = _driver.run_batch_sweeps +main = _driver.main + + +class TestSweepParameters: + """Tests for the SweepParameters dataclass.""" + + def test_sweep_parameters_creation(self) -> None: + params = SweepParameters( + strategy="balanced", + difficulty="normal", + seed=42, + world="default", + tick_budget=100, + ) + assert params.strategy == "balanced" + assert params.difficulty == "normal" + assert params.seed == 42 + assert params.world == "default" + assert params.tick_budget == 100 + + def test_sweep_parameters_to_dict(self) -> None: + params = SweepParameters( + strategy="aggressive", + difficulty="hard", + seed=123, + world="default", + tick_budget=200, + ) + data = params.to_dict() + + assert data["strategy"] == "aggressive" + assert data["difficulty"] == "hard" + assert data["seed"] == 123 + assert data["world"] == "default" + assert data["tick_budget"] == 200 + + +class TestSweepResult: + """Tests for the SweepResult dataclass.""" + + def test_sweep_result_default_values(self) -> None: + params = SweepParameters( + strategy="balanced", + difficulty="normal", + seed=42, + world="default", + tick_budget=100, + ) + result = SweepResult( + sweep_id=1, + parameters=params, + final_stability=0.75, + actions_taken=10, + ticks_run=100, + ) + assert result.story_seeds_activated == [] + assert result.action_counts == {} + assert result.telemetry == {} + assert result.duration_seconds == 0.0 + assert result.error is None + + def test_sweep_result_to_dict(self) -> None: + params = SweepParameters( + strategy="balanced", + difficulty="normal", + 
seed=42, + world="default", + tick_budget=100, + ) + result = SweepResult( + sweep_id=1, + parameters=params, + final_stability=0.7567, + actions_taken=10, + ticks_run=100, + story_seeds_activated=["seed-1", "seed-2"], + action_counts={"INSPECT": 5, "NEGOTIATE": 5}, + telemetry={"stability_trend": "stable"}, + duration_seconds=1.234, + ) + + data = result.to_dict() + + assert data["sweep_id"] == 1 + assert data["parameters"]["strategy"] == "balanced" + assert data["results"]["final_stability"] == 0.7567 + assert data["results"]["actions_taken"] == 10 + assert data["results"]["ticks_run"] == 100 + assert data["results"]["story_seeds_activated"] == ["seed-1", "seed-2"] + assert data["results"]["action_counts"] == {"INSPECT": 5, "NEGOTIATE": 5} + assert data["telemetry"] == {"stability_trend": "stable"} + assert data["duration_seconds"] == 1.234 + assert data["error"] is None + + def test_sweep_result_with_error(self) -> None: + params = SweepParameters( + strategy="balanced", + difficulty="normal", + seed=42, + world="default", + tick_budget=100, + ) + result = SweepResult( + sweep_id=1, + parameters=params, + final_stability=0.0, + actions_taken=0, + ticks_run=0, + error="Connection failed", + ) + + data = result.to_dict() + assert data["error"] == "Connection failed" + + +class TestBatchSweepConfig: + """Tests for the BatchSweepConfig dataclass.""" + + def test_default_config(self) -> None: + config = BatchSweepConfig() + assert config.strategies == ["balanced"] + assert config.difficulties == ["normal"] + assert config.seeds == [42] + assert config.worlds == ["default"] + assert config.tick_budgets == [100] + assert config.max_workers is None + assert config.timeout_per_sweep == 300 + assert config.sampling_mode == "full" + + def test_custom_config(self) -> None: + config = BatchSweepConfig( + strategies=["balanced", "aggressive"], + difficulties=["easy", "normal", "hard"], + seeds=[42, 123, 456], + worlds=["default"], + tick_budgets=[50, 100, 200], + 
max_workers=4, + timeout_per_sweep=600, + ) + assert config.strategies == ["balanced", "aggressive"] + assert config.difficulties == ["easy", "normal", "hard"] + assert len(config.seeds) == 3 + assert len(config.tick_budgets) == 3 + assert config.max_workers == 4 + assert config.timeout_per_sweep == 600 + + def test_config_from_yaml(self, tmp_path: Path) -> None: + yaml_content = """ +parameters: + strategies: + - balanced + - diplomatic + difficulties: + - normal + - hard + seeds: + - 42 + - 123 + worlds: + - default + tick_budgets: + - 50 + - 100 +parallel: + max_workers: 2 + timeout_per_sweep: 120 +output: + dir: build/test_sweeps + include_telemetry: false +sampling: + mode: random + sample_count: 50 +""" + config_file = tmp_path / "batch_sweeps.yml" + config_file.write_text(yaml_content) + + config = BatchSweepConfig.from_yaml(config_file) + + assert config.strategies == ["balanced", "diplomatic"] + assert config.difficulties == ["normal", "hard"] + assert config.seeds == [42, 123] + assert config.tick_budgets == [50, 100] + assert config.max_workers == 2 + assert config.timeout_per_sweep == 120 + assert config.include_telemetry is False + assert config.sampling_mode == "random" + assert config.sample_count == 50 + + def test_config_from_missing_yaml(self, tmp_path: Path) -> None: + """Test that missing YAML returns default config.""" + config = BatchSweepConfig.from_yaml(tmp_path / "nonexistent.yml") + assert config.strategies == ["balanced"] + assert config.difficulties == ["normal"] + + +class TestGenerateParameterGrid: + """Tests for parameter grid generation.""" + + def test_single_parameter_grid(self) -> None: + """Test grid with single value for each parameter.""" + config = BatchSweepConfig( + strategies=["balanced"], + difficulties=["normal"], + seeds=[42], + worlds=["default"], + tick_budgets=[100], + ) + grid = generate_parameter_grid(config) + + assert len(grid) == 1 + assert grid[0].strategy == "balanced" + assert grid[0].difficulty == "normal" + 
assert grid[0].seed == 42 + assert grid[0].world == "default" + assert grid[0].tick_budget == 100 + + def test_cartesian_product_grid(self) -> None: + """Test that grid generates Cartesian product.""" + config = BatchSweepConfig( + strategies=["balanced", "aggressive"], + difficulties=["easy", "hard"], + seeds=[42], + worlds=["default"], + tick_budgets=[100], + ) + grid = generate_parameter_grid(config) + + # 2 strategies × 2 difficulties × 1 seed × 1 world × 1 tick = 4 + assert len(grid) == 4 + + # Verify all combinations exist + combinations = {(p.strategy, p.difficulty) for p in grid} + expected = { + ("balanced", "easy"), + ("balanced", "hard"), + ("aggressive", "easy"), + ("aggressive", "hard"), + } + assert combinations == expected + + def test_large_parameter_grid(self) -> None: + """Test grid with multiple values for all parameters.""" + config = BatchSweepConfig( + strategies=["balanced", "aggressive", "diplomatic"], + difficulties=["easy", "normal", "hard"], + seeds=[42, 123], + worlds=["default"], + tick_budgets=[50, 100], + ) + grid = generate_parameter_grid(config) + + # 3 × 3 × 2 × 1 × 2 = 36 + assert len(grid) == 36 + + def test_sampling_reduces_grid_size(self) -> None: + """Test that sampling reduces grid when enabled.""" + config = BatchSweepConfig( + strategies=["balanced", "aggressive", "diplomatic"], + difficulties=["easy", "normal", "hard"], + seeds=[42, 123, 456], + worlds=["default"], + tick_budgets=[50, 100], + sampling_mode="random", + sample_count=10, + ) + grid = generate_parameter_grid(config) + + # Full grid would be 3 × 3 × 3 × 1 × 2 = 54, but sampled to 10 + assert len(grid) == 10 + + def test_empty_parameter_returns_empty_grid(self) -> None: + """Test that empty parameter list returns empty grid.""" + config = BatchSweepConfig( + strategies=[], + difficulties=["normal"], + seeds=[42], + worlds=["default"], + tick_budgets=[100], + ) + grid = generate_parameter_grid(config) + assert len(grid) == 0 + + +class TestRunSingleSweep: + 
"""Tests for single sweep execution.""" + + def test_run_single_sweep_balanced(self) -> None: + """Test running a single sweep with balanced strategy.""" + params = SweepParameters( + strategy="balanced", + difficulty="normal", + seed=42, + world="default", + tick_budget=10, + ) + result = run_single_sweep(sweep_id=1, params=params, include_telemetry=True) + + assert result.sweep_id == 1 + assert result.parameters.strategy == "balanced" + assert result.error is None + assert 0.0 <= result.final_stability <= 1.0 + assert result.duration_seconds > 0 + assert result.ticks_run == 10 + + def test_run_single_sweep_aggressive(self) -> None: + """Test running a single sweep with aggressive strategy.""" + params = SweepParameters( + strategy="aggressive", + difficulty="normal", + seed=43, + world="default", + tick_budget=10, + ) + result = run_single_sweep(sweep_id=2, params=params, include_telemetry=False) + + assert result.sweep_id == 2 + assert result.parameters.strategy == "aggressive" + assert result.error is None + + def test_run_single_sweep_invalid_world(self) -> None: + """Test that invalid world produces error.""" + params = SweepParameters( + strategy="balanced", + difficulty="normal", + seed=42, + world="nonexistent_world", + tick_budget=10, + ) + result = run_single_sweep(sweep_id=1, params=params, include_telemetry=False) + + assert result.error is not None + assert result.ticks_run == 0 + + def test_run_single_sweep_includes_telemetry(self) -> None: + """Test that telemetry is included when requested.""" + params = SweepParameters( + strategy="balanced", + difficulty="normal", + seed=42, + world="default", + tick_budget=10, + ) + result = run_single_sweep(sweep_id=1, params=params, include_telemetry=True) + + assert result.error is None + assert "environment" in result.telemetry or result.telemetry == {} + + +class TestRunBatchSweeps: + """Tests for batch sweep execution.""" + + def test_run_batch_sweeps_small(self) -> None: + """Run a small batch sweep to 
verify basic functionality.""" + config = BatchSweepConfig( + strategies=["balanced"], + difficulties=["normal"], + seeds=[42], + worlds=["default"], + tick_budgets=[5], + max_workers=1, + ) + + report = run_batch_sweeps(config, verbose=False) + + assert report.total_sweeps == 1 + assert report.completed_sweeps == 1 + assert report.failed_sweeps == 0 + assert len(report.results) == 1 + + def test_run_batch_sweeps_multiple_strategies(self) -> None: + """Test batch sweep with multiple strategies.""" + config = BatchSweepConfig( + strategies=["balanced", "aggressive"], + difficulties=["normal"], + seeds=[42], + worlds=["default"], + tick_budgets=[5], + max_workers=2, + ) + + report = run_batch_sweeps(config, verbose=False) + + assert report.total_sweeps == 2 + assert "balanced" in report.strategy_stats + assert "aggressive" in report.strategy_stats + + def test_run_batch_sweeps_calculates_stats(self) -> None: + """Verify that batch sweep calculates strategy statistics.""" + config = BatchSweepConfig( + strategies=["balanced"], + difficulties=["normal"], + seeds=[42, 43], + worlds=["default"], + tick_budgets=[5], + max_workers=1, + ) + + report = run_batch_sweeps(config, verbose=False) + + assert "balanced" in report.strategy_stats + stats = report.strategy_stats["balanced"] + assert "count" in stats + assert "completed" in stats + assert "avg_stability" in stats + assert "min_stability" in stats + assert "max_stability" in stats + + def test_run_batch_sweeps_includes_metadata(self) -> None: + """Test that batch sweep includes metadata.""" + config = BatchSweepConfig( + strategies=["balanced"], + difficulties=["normal"], + seeds=[42], + worlds=["default"], + tick_budgets=[5], + max_workers=1, + ) + + report = run_batch_sweeps(config, verbose=False) + + assert "timestamp" in report.metadata + assert "runtime" in report.metadata + + +class TestBatchSweepReport: + """Tests for BatchSweepReport.""" + + def test_report_to_dict(self) -> None: + """Test report serialization 
to dictionary.""" + params = SweepParameters( + strategy="balanced", + difficulty="normal", + seed=42, + world="default", + tick_budget=100, + ) + result = SweepResult( + sweep_id=1, + parameters=params, + final_stability=0.8, + actions_taken=5, + ticks_run=100, + ) + + report = BatchSweepReport( + config={"strategies": ["balanced"], "difficulties": ["normal"]}, + total_sweeps=1, + completed_sweeps=1, + failed_sweeps=0, + results=[result], + strategy_stats={"balanced": {"count": 1, "avg_stability": 0.8}}, + difficulty_stats={"normal": {"count": 1, "avg_stability": 0.8}}, + total_duration_seconds=5.5, + metadata={"timestamp": "2025-01-01T00:00:00Z"}, + ) + + data = report.to_dict() + + assert data["total_sweeps"] == 1 + assert data["completed_sweeps"] == 1 + assert data["failed_sweeps"] == 0 + assert len(data["sweeps"]) == 1 + assert "balanced" in data["strategy_stats"] + assert "normal" in data["difficulty_stats"] + assert data["total_duration_seconds"] == 5.5 + + +class TestBatchSweepCLI: + """Tests for the batch sweep CLI.""" + + def test_cli_basic_run( + self, tmp_path: Path, capsys: pytest.CaptureFixture + ) -> None: + """Test CLI with minimal arguments.""" + output_dir = tmp_path / "output" + + exit_code = main([ + "--strategies", "balanced", + "--difficulties", "normal", + "--seeds", "42", + "--ticks", "5", + "--output-dir", str(output_dir), + "--workers", "1", + ]) + + assert exit_code == 0 + assert (output_dir / "batch_sweep_summary.json").exists() + + captured = capsys.readouterr() + assert "BATCH SWEEP RESULTS" in captured.out + + def test_cli_json_output( + self, tmp_path: Path, capsys: pytest.CaptureFixture + ) -> None: + """Test CLI with JSON output format.""" + exit_code = main([ + "--strategies", "balanced", + "--difficulties", "normal", + "--seeds", "42", + "--ticks", "5", + "--workers", "1", + "--json", + "--no-write", + ]) + + assert exit_code == 0 + captured = capsys.readouterr() + data = json.loads(captured.out) + assert "total_sweeps" in data + 
assert "strategy_stats" in data + + def test_cli_with_config_file( + self, tmp_path: Path, capsys: pytest.CaptureFixture + ) -> None: + """Test CLI with configuration file.""" + yaml_content = """ +parameters: + strategies: + - balanced + difficulties: + - normal + seeds: + - 42 + worlds: + - default + tick_budgets: + - 5 +parallel: + max_workers: 1 +""" + config_file = tmp_path / "config.yml" + config_file.write_text(yaml_content) + output_dir = tmp_path / "output" + + exit_code = main([ + "--config", str(config_file), + "--output-dir", str(output_dir), + ]) + + assert exit_code == 0 + assert (output_dir / "batch_sweep_summary.json").exists() + + def test_cli_override_config_values( + self, tmp_path: Path, capsys: pytest.CaptureFixture + ) -> None: + """Test that CLI arguments override config file values.""" + yaml_content = """ +parameters: + strategies: + - balanced + - aggressive + seeds: + - 42 + - 123 + tick_budgets: + - 100 +""" + config_file = tmp_path / "config.yml" + config_file.write_text(yaml_content) + + exit_code = main([ + "--config", str(config_file), + "--strategies", "diplomatic", # Override from config + "--seeds", "456", # Override from config + "--ticks", "5", # Override from config + "--workers", "1", + "--json", + "--no-write", + ]) + + assert exit_code == 0 + captured = capsys.readouterr() + data = json.loads(captured.out) + + # Should only have diplomatic strategy (override) + assert "diplomatic" in data["strategy_stats"] + assert "balanced" not in data["strategy_stats"] + + +class TestSweepDeterminism: + """Tests for sweep determinism with fixed seeds.""" + + def test_same_seed_produces_same_result(self) -> None: + """Running the same sweep twice with same seed should produce same result.""" + params = SweepParameters( + strategy="balanced", + difficulty="normal", + seed=42, + world="default", + tick_budget=10, + ) + + result1 = run_single_sweep(sweep_id=1, params=params, include_telemetry=False) + result2 = run_single_sweep(sweep_id=2, 
params=params, include_telemetry=False) + + assert result1.final_stability == result2.final_stability + assert result1.actions_taken == result2.actions_taken + assert result1.ticks_run == result2.ticks_run + + def test_different_seeds_may_differ(self) -> None: + """Different seeds may produce different outcomes.""" + params1 = SweepParameters( + strategy="balanced", + difficulty="normal", + seed=42, + world="default", + tick_budget=10, + ) + params2 = SweepParameters( + strategy="balanced", + difficulty="normal", + seed=12345, + world="default", + tick_budget=10, + ) + + result1 = run_single_sweep(sweep_id=1, params=params1, include_telemetry=False) + result2 = run_single_sweep(sweep_id=2, params=params2, include_telemetry=False) + + # Results may or may not differ, but both should complete successfully + assert result1.error is None + assert result2.error is None From 6325b0bbbec36099e6977def6a2623aa57ac9b10 Mon Sep 17 00:00:00 2001 From: Ross Gardler Date: Thu, 4 Dec 2025 01:27:48 -0800 Subject: [PATCH 3/4] test: Improve batch sweep test coverage for helpers and file output --- tests/scripts/test_batch_sweeps.py | 122 +++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/tests/scripts/test_batch_sweeps.py b/tests/scripts/test_batch_sweeps.py index 33ee1cf9..c15555ac 100644 --- a/tests/scripts/test_batch_sweeps.py +++ b/tests/scripts/test_batch_sweeps.py @@ -31,6 +31,9 @@ def _load_batch_sweep_module(): generate_parameter_grid = _driver.generate_parameter_grid run_single_sweep = _driver.run_single_sweep run_batch_sweeps = _driver.run_batch_sweeps +write_sweep_outputs = _driver.write_sweep_outputs +_calculate_stats = _driver._calculate_stats +_build_metadata = _driver._build_metadata main = _driver.main @@ -455,6 +458,125 @@ def test_run_batch_sweeps_includes_metadata(self) -> None: assert "runtime" in report.metadata +class TestInternalHelpers: + """Tests for internal helper functions used by batch sweeps.""" + + def 
test_calculate_stats_mixed_success_and_failure(self) -> None: + """_calculate_stats correctly aggregates successful results only.""" + params = SweepParameters( + strategy="balanced", + difficulty="normal", + seed=42, + world="default", + tick_budget=10, + ) + success1 = SweepResult( + sweep_id=1, + parameters=params, + final_stability=0.2, + actions_taken=5, + ticks_run=10, + ) + success2 = SweepResult( + sweep_id=2, + parameters=params, + final_stability=0.8, + actions_taken=15, + ticks_run=10, + ) + failure = SweepResult( + sweep_id=3, + parameters=params, + final_stability=0.0, + actions_taken=0, + ticks_run=0, + error="failed", + ) + + stats = _calculate_stats([success1, success2, failure], lambda r: r.parameters.strategy) + + assert "balanced" in stats + s = stats["balanced"] + assert s["count"] == 3 + assert s["completed"] == 2 + assert s["failed"] == 1 + assert pytest.approx(s["avg_stability"]) == 0.5 + assert s["min_stability"] == 0.2 + assert s["max_stability"] == 0.8 + assert pytest.approx(s["avg_actions"]) == 10.0 + assert s["total_actions"] == 20 + + def test_build_metadata_includes_git_and_runtime(self, monkeypatch: pytest.MonkeyPatch) -> None: + """_build_metadata includes git commit hash and runtime details.""" + + class DummyCompletedProcess: + def __init__(self) -> None: + self.returncode = 0 + self.stdout = "abc123\n" + + def fake_run(*_args, **_kwargs): # type: ignore[no-untyped-def] + return DummyCompletedProcess() + + monkeypatch.setattr(_driver.subprocess, "run", fake_run) + + config = BatchSweepConfig(max_workers=4) + metadata = _build_metadata(config) + + assert metadata["git_commit"] == "abc123" + assert "timestamp" in metadata + assert "runtime" in metadata + assert metadata["runtime"]["max_workers"] == 4 + + +class TestWriteSweepOutputs: + """Tests for writing sweep output files.""" + + def test_write_sweep_outputs_creates_expected_files(self, tmp_path: Path) -> None: + params = SweepParameters( + strategy="balanced", + 
difficulty="normal", + seed=42, + world="default", + tick_budget=10, + ) + result = SweepResult( + sweep_id=1, + parameters=params, + final_stability=0.75, + actions_taken=10, + ticks_run=10, + ) + + report = BatchSweepReport( + config={"strategies": ["balanced"]}, + total_sweeps=1, + completed_sweeps=1, + failed_sweeps=0, + results=[result], + strategy_stats={"balanced": {"count": 1}}, + difficulty_stats={"normal": {"count": 1}}, + total_duration_seconds=1.0, + metadata={"timestamp": "2025-01-01T00:00:00Z"}, + ) + + output_dir = tmp_path / "outputs" + write_sweep_outputs(report, output_dir, verbose=False) + + sweep_file = output_dir / "sweep_0001_balanced_normal_seed42_tick10.json" + summary_file = output_dir / "batch_sweep_summary.json" + + assert sweep_file.exists() + assert summary_file.exists() + + sweep_data = json.loads(sweep_file.read_text()) + assert sweep_data["sweep_id"] == 1 + assert sweep_data["parameters"]["strategy"] == "balanced" + + summary_data = json.loads(summary_file.read_text()) + assert summary_data["total_sweeps"] == 1 + assert len(summary_data["sweeps"]) == 1 + + class TestBatchSweepReport: """Tests for BatchSweepReport.""" From b7b3415971be3e6f779f11580d03b4c52d4ad77c Mon Sep 17 00:00:00 2001 From: Ross Gardler Date: Thu, 4 Dec 2025 01:35:12 -0800 Subject: [PATCH 4/4] test: Improve coverage for gateway client session handling --- tests/echoes/test_gateway_client.py | 39 ++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/tests/echoes/test_gateway_client.py b/tests/echoes/test_gateway_client.py index e4db7eec..260498ad 100644 --- a/tests/echoes/test_gateway_client.py +++ b/tests/echoes/test_gateway_client.py @@ -3,9 +3,46 @@ from __future__ import annotations import json +import unittest from unittest.mock import AsyncMock, patch -from gengine.echoes.gateway.client import _render_response, main +from gengine.echoes.gateway.client import _render_response, _run_session, main + + +class 
TestGatewayClientSession(unittest.IsolatedAsyncioTestCase): + """Tests for the async session handling.""" + + async def test_run_session_executes_script(self) -> None: + """Verify that _run_session connects, sends commands, and handles responses.""" + url = "ws://test" + script = ["cmd1", "cmd2"] + + with patch("gengine.echoes.gateway.client.websockets.connect") as mock_connect: + mock_ws = AsyncMock() + # Mock the context manager + mock_connect.return_value.__aenter__.return_value = mock_ws + + # Sequence of messages received: + # 1. Welcome message + # 2. Response to cmd1 + # 3. Response to cmd2 (with should_exit=True to stop the loop) + mock_ws.recv.side_effect = [ + json.dumps({"output": "Welcome"}), + json.dumps({"output": "Result1"}), + json.dumps({"output": "Result2", "should_exit": True}), + ] + + await _run_session(url, script) + + # Verify connect called with url + mock_connect.assert_called_with(url) + + # Verify sends + # 1. cmd1 + # 2. cmd2 + assert mock_ws.send.call_count == 2 + mock_ws.send.assert_any_call(json.dumps({"command": "cmd1"})) + mock_ws.send.assert_any_call(json.dumps({"command": "cmd2"})) def test_render_response_with_valid_json() -> None: