diff --git a/infrastructure/step_function.json b/infrastructure/step_function.json index c4b1441..d29ffcb 100644 --- a/infrastructure/step_function.json +++ b/infrastructure/step_function.json @@ -2252,36 +2252,17 @@ } ], "ResultPath": "$.substrate_check_poll", - "Next": "CheckShellRunSkipDirector" - }, - "CheckShellRunSkipDirector": { - "Type": "Choice", - "Comment": "Shell-run / preflight guard for the advisory tail (ROADMAP L4504). The Friday-PM Preflight Pipeline (shell_run=true) dry-executes the Saturday SF to exercise bootstrap paths — but ReportCard + Director have NO dry path (their payload is only {date}; the Director Lambda gates solely on DIRECTOR_ENABLED). Left ungated, a Friday preflight would run ReportCard for real over backtest/{Fri-date}/* that the dry workload never wrote (→ a degenerate, mostly-N/A card) and, once DIRECTOR_ENABLED is flipped on, fire a REAL Opus Director call that merges that degenerate plan into the SHARED, non-date-scoped carry-over ledger (director/carryover_ledger.json) — polluting the state the real Saturday run reads. So on shell_run=true we hard-skip BOTH advisory states straight to the shell-run-aware notify; the preflight's purpose is bootstrap exercise and both Lambdas already have their own canaries. shell_run absent/false → Default → ReportCard, BYTE-IDENTICAL to the pre-guard real Saturday run.", - "Choices": [ - { - "And": [ - { - "Variable": "$.shell_run", - "IsPresent": true - }, - { - "Variable": "$.shell_run", - "BooleanEquals": true - } - ], - "Next": "CheckShellRunNotify" - } - ], - "Default": "ReportCard" + "Next": "ReportCard" }, "ReportCard": { "Type": "Task", - "Comment": "Evaluator Report Card v2 (Layer B, Option B) — builds s3://alpha-engine-research/evaluator/{date}/report_card.json from the persisted per-module artifacts (the 7-tile MetricRecord substrate the Director will consume). Runs here, after the Evaluator + substrate health checks, so it reads fresh grades. NON-FATAL: its own Catch routes to CheckShellRunNotify so an advisory grading failure never breaks the run that produced the real trading artifacts.", + "Comment": "Evaluator Report Card v2 (Layer B, Option B) — builds s3://alpha-engine-research/evaluator/{date}/report_card.json from the persisted per-module artifacts (the 7-tile MetricRecord substrate the Director will consume). Runs here, after the Evaluator + substrate health checks, so it reads fresh grades. NON-FATAL: its own Catch routes to CheckShellRunNotify so an advisory grading failure never breaks the run that produced the real trading artifacts. PREFLIGHT-AWARE (ROADMAP L4504): dry_run.$=$.research_dry (the canonical shell-run-dry signal — false on the real Saturday run, true on the Friday-PM Preflight Pipeline). On the preflight the handler still exercises the full read+compute path (container boot, lib/numpy/pandas imports, S3-read IAM/transport, the tile compute) but does NOT persist the degenerate, mostly-N/A preflight card — mirrors how the other advisory Lambdas (eval-judge / rationale-clustering / replay-concordance / counterfactual) run dry rather than skip, per the shell-run keystone's 'dry-execute, don't skip' principle.", "Resource": "arn:aws:states:::lambda:invoke", "Parameters": { "FunctionName": "alpha-engine-evaluator:live", "Payload": { - "date.$": "$.run_date" + "date.$": "$.run_date", + "dry_run.$": "$.research_dry" } }, "TimeoutSeconds": 300, @@ -2311,12 +2292,13 @@ }, "Director": { "Type": "Task", - "Comment": "Director (Layer C, Part II) — the FINAL Saturday-SF task. A single structured Opus call over the fresh Report Card v2 (evaluator/{date}/report_card.json) produces an advisory DirectorWeeklyActionPlan at director/{date}/action_plan.json + updates the carry-over ledger. Runs only after a SUCCESSFUL ReportCard (a failed ReportCard skips straight to notify — no card to weigh). FLAG-GATED: the alpha-engine-evaluator-director Lambda is a no-op (status=disabled) until DIRECTOR_ENABLED is flipped on. NON-FATAL: its own Catch routes to CheckShellRunNotify so an advisory failure never breaks the run that produced the real trading artifacts.", + "Comment": "Director (Layer C, Part II) — the FINAL Saturday-SF task. A single structured Opus call over the fresh Report Card v2 (evaluator/{date}/report_card.json) produces an advisory DirectorWeeklyActionPlan at director/{date}/action_plan.json + updates the carry-over ledger. Runs only after a SUCCESSFUL ReportCard (a failed ReportCard skips straight to notify — no card to weigh). FLAG-GATED: the alpha-engine-evaluator-director Lambda is a no-op (status=disabled) until DIRECTOR_ENABLED is flipped on. NON-FATAL: its own Catch routes to CheckShellRunNotify so an advisory failure never breaks the run that produced the real trading artifacts. PREFLIGHT-AWARE (ROADMAP L4504): dry_run.$=$.research_dry. On the Friday-PM Preflight Pipeline (research_dry=true) the handler runs a no-Opus / no-write PROBE — it constructs the real Opus client (exercising the langchain-anthropic import + the SSM ANTHROPIC_API_KEY fetch / ReadAnthropicSecret IAM grant) and reads the carry-over ledger, then STOPS short of the paid .invoke() and writes NOTHING (no action_plan, no ledger mutation). This is load-bearing: the carry-over ledger is shared + non-date-scoped (director/carryover_ledger.json), so a preflight write would pollute the real Saturday run. A broken import / revoked key grant surfaces as a caught preflight failure ~18h before the real Saturday Director would hit it.", "Resource": "arn:aws:states:::lambda:invoke", "Parameters": { "FunctionName": "alpha-engine-evaluator-director:live", "Payload": { - "date.$": "$.run_date" + "date.$": "$.run_date", + "dry_run.$": "$.research_dry" } }, "TimeoutSeconds": 300, diff --git a/tests/test_sf_friday_shell_run_wiring.py b/tests/test_sf_friday_shell_run_wiring.py index e0d4b4d..0a15d72 100644 --- a/tests/test_sf_friday_shell_run_wiring.py +++ b/tests/test_sf_friday_shell_run_wiring.py @@ -674,19 +674,17 @@ def test_dry_lambda_payload_references_control_var( class TestConsolidatedNotify: def test_substrate_check_routes_to_notify_gate(self, states): - # The substrate check now flows through the shell-run guard - # (CheckShellRunSkipDirector, ROADMAP L4504) and then two non-fatal - # advisory states (evaluator Report Card v2, then the Director) before - # the notify gate. On a real Saturday run the guard's Default routes to - # ReportCard; ReportCard's SUCCESS Next feeds the Director; its Catch - # skips straight to CheckShellRunNotify. The Director's own Next AND - # Catch both land on CheckShellRunNotify, so the path to the notify gate - # is preserved whether grading/advisory succeed or fail. + # The substrate check flows into two non-fatal advisory states (evaluator + # Report Card v2, then the Director) before the notify gate. ReportCard's + # SUCCESS Next feeds the Director; its Catch skips straight to + # CheckShellRunNotify. The Director's own Next AND Catch both land on + # CheckShellRunNotify, so the path to the notify gate is preserved whether + # grading/advisory succeed or fail. On the Friday preflight the states + # still RUN (dry, see test_advisory_tail_runs_dry_on_preflight) — they are + # not skipped — so the success edge is identical on real + preflight runs. assert ( - states["WaitForWeeklySubstrateHealthCheck"]["Next"] - == "CheckShellRunSkipDirector" + states["WaitForWeeklySubstrateHealthCheck"]["Next"] == "ReportCard" ) - assert states["CheckShellRunSkipDirector"]["Default"] == "ReportCard" report_card = states["ReportCard"] assert report_card["Next"] == "Director" assert all(c["Next"] == "CheckShellRunNotify" for c in report_card["Catch"]) @@ -694,30 +692,33 @@ def test_substrate_check_routes_to_notify_gate(self, states): assert director["Next"] == "CheckShellRunNotify" assert all(c["Next"] == "CheckShellRunNotify" for c in director["Catch"]) - def test_shell_run_skips_advisory_tail(self, states): - """ROADMAP L4504: on a Friday-PM Preflight Pipeline (shell_run=true) the - advisory tail (ReportCard + Director) MUST be hard-skipped straight to - the notify gate. Left ungated, the Director would run a real Opus call - over a degenerate preflight card and pollute the shared, non-date-scoped - carry-over ledger (director/carryover_ledger.json) that the real - Saturday run reads. The guard sits between the substrate health check - and ReportCard so BOTH advisory states are bypassed on the preflight. + def test_advisory_tail_runs_dry_on_preflight(self, states): + """ROADMAP L4504: ReportCard + Director were added after the shell-run + keystone and were given NO dry path — their payloads only carried {date}, + and the Director Lambda gates solely on DIRECTOR_ENABLED. Left ungated, + the Friday-PM Preflight Pipeline would run ReportCard for real (writing a + degenerate, mostly-N/A card) and, once DIRECTOR_ENABLED is flipped on, + fire a REAL Opus Director call that merges that plan into the SHARED, + non-date-scoped carry-over ledger (director/carryover_ledger.json), + polluting the state the real Saturday run reads. + + Fix (keystone-consistent: dry-execute, don't skip): both payloads thread + dry_run.$=$.research_dry — the canonical shell-run-dry signal, false on the + real Saturday run / true on the preflight. The handlers then run a no-write + (ReportCard) / no-Opus-no-write probe (Director) on the preflight, still + exercising container boot / imports / IAM / S3-read. Mirrors the other + advisory Lambdas (eval-judge / rationale-clustering / replay-concordance / + counterfactual) which all run dry via $.research_dry rather than skipping. """ - gate = states["CheckShellRunSkipDirector"] - assert gate["Type"] == "Choice" - # shell_run absent/false → Default → ReportCard (byte-identical Saturday). - assert gate["Default"] == "ReportCard" - # shell_run present AND true → skip the whole advisory tail to notify. - choices = gate["Choices"] - assert len(choices) == 1 - assert choices[0]["Next"] == "CheckShellRunNotify" - conds = choices[0]["And"] - assert {c["Variable"] for c in conds} == {"$.shell_run"} - assert any(c.get("IsPresent") is True for c in conds) - assert any(c.get("BooleanEquals") is True for c in conds) - # The skip target is the notify gate, NOT ReportCard/Director. - assert "ReportCard" not in {choices[0]["Next"]} - assert "Director" not in {choices[0]["Next"]} + for state_name in ("ReportCard", "Director"): + payload = states[state_name]["Parameters"]["Payload"] + assert payload.get("dry_run.$") == "$.research_dry", ( + f"{state_name}.Payload must thread dry_run.$=$.research_dry so the " + f"Friday preflight runs it dry (no write / no Opus call); got " + f"{payload.get('dry_run.$')!r}" + ) + # date must still flow so the dry run keys off the same RUN_DATE. + assert payload.get("date.$") == "$.run_date" def test_shell_run_notify_reuses_sns_substrate(self, states): """NotifyShellRunComplete surfaces the user-facing 'Saturday diff --git a/tests/test_sf_payload_uniqueness.py b/tests/test_sf_payload_uniqueness.py index aca708a..2bff028 100644 --- a/tests/test_sf_payload_uniqueness.py +++ b/tests/test_sf_payload_uniqueness.py @@ -105,11 +105,13 @@ def _flatten_states(sf_doc: dict) -> dict: "AggregateCosts": frozenset({"date.$", "dry_run_llm.$"}), # Evaluator Report Card v2 (Layer B) — alpha-engine-evaluator:live. Builds # evaluator/{date}/report_card.json; non-fatal (own Catch → notify gate). - "ReportCard": frozenset({"date.$"}), + # dry_run.$=$.research_dry → no-write on the Friday preflight (ROADMAP L4504). + "ReportCard": frozenset({"date.$", "dry_run.$"}), # Director (Layer C, Part II) — alpha-engine-evaluator-director:live. Final # advisory task; reads the fresh report card, writes director/{date}/ # action_plan.json; flag-gated (DIRECTOR_ENABLED) + non-fatal (own Catch). - "Director": frozenset({"date.$"}), + # dry_run.$=$.research_dry → no-Opus / no-write probe on the preflight (L4504). + "Director": frozenset({"date.$", "dry_run.$"}), } # Weekday SF — alpha-engine-predictor Lambdas diff --git a/tests/test_sf_substrate_check_wiring.py b/tests/test_sf_substrate_check_wiring.py index d8df8ba..e319ce0 100644 --- a/tests/test_sf_substrate_check_wiring.py +++ b/tests/test_sf_substrate_check_wiring.py @@ -77,25 +77,16 @@ def test_wait_for_substrate_routes_to_notify_complete(self, states): # unchanged NotifyComplete, so the REAL Saturday run (no shell_run # input) still ends at NotifyComplete — strict superset preserved. # - # The shell-run guard (CheckShellRunSkipDirector, ROADMAP L4504) now - # sits first: on a Friday-PM preflight (shell_run=true) it skips the - # whole advisory tail straight to CheckShellRunNotify; its Default routes - # the REAL Saturday run (no shell_run) to ReportCard, unchanged. - # # Two non-fatal advisory states (evaluator Report Card v2, then the - # Director) sit between the guard and the notify gate. ReportCard's - # SUCCESS edge feeds the Director (which weighs the fresh card); - # ReportCard's Catch skips the Director straight to notify (no card to - # weigh). The Director's own Next AND Catch both land on - # CheckShellRunNotify, so every path still preserves the success edge. - assert ( - states["WaitForWeeklySubstrateHealthCheck"]["Next"] - == "CheckShellRunSkipDirector" - ) - assert states["CheckShellRunSkipDirector"]["Default"] == "ReportCard" + # Director) sit between the substrate poll and the notify gate. ReportCard's + # SUCCESS edge feeds the Director (which weighs the fresh card); ReportCard's + # Catch skips the Director straight to notify (no card to weigh). The + # Director's own Next AND Catch both land on CheckShellRunNotify, so every + # path still preserves the success edge. On the Friday preflight both states + # RUN (dry, via dry_run.$=$.research_dry — ROADMAP L4504), they are not + # skipped, so the wiring is identical on real + preflight runs. assert ( - states["CheckShellRunSkipDirector"]["Choices"][0]["Next"] - == "CheckShellRunNotify" + states["WaitForWeeklySubstrateHealthCheck"]["Next"] == "ReportCard" ) assert states["ReportCard"]["Next"] == "Director" assert all(c["Next"] == "CheckShellRunNotify" for c in states["ReportCard"]["Catch"])