diff --git a/pyproject.toml b/pyproject.toml
index e3c2322781..20452cb640 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -82,6 +82,10 @@ sql_sqlite = [
     "aiosqlite>=0.21.0",
 ]
 redis = ["redis[hiredis] >= 7.0.0"]
+sii-gateway = [
+    "langgraph>=1.1.0",
+    "structlog>=25.0.0",
+]
 
 [project.scripts]
 crawlee = "crawlee._cli:cli"
diff --git a/src/state.py b/src/state.py
new file mode 100644
index 0000000000..346bc3d654
--- /dev/null
+++ b/src/state.py
@@ -0,0 +1,226 @@
+"""SII Recruitment Gateway — LangGraph state schema and graph topology.
+
+Defines :class:`GraphState`, all ISO 27001:2022 control constants, and the
+cyclic Supervisor-Worker graph for the SII technical-screening pipeline.
+Node *logic* is **intentionally absent** from this module; every node is
+implemented in its own dedicated file under ``src/nodes/`` (P2-P5).
+
+Classification: CONFIDENTIAL
+Applicable regulations: Ley 21.180, Ley 21.663, ISO 27001:2022
+
+ISO 27001:2022 controls anchored at this layer:
+    * A.8.2 — Asset classification (state schema carries classification labels)
+    * A.5.34 — Privacy and PII protection (PII-bearing fields are redaction-ready)
+"""
+
+from __future__ import annotations
+
+import operator
+from typing import Annotated, Any, TypedDict
+
+from langgraph.graph import END, START, StateGraph
+
+# ── ISO 27001:2022 control identifiers ────────────────────────────────────────
+#
+# Each value corresponds to the primary ISO 27001:2022 Annex A control that
+# governs the respective pipeline node. Nodes are required to append their
+# control ID to ``GraphState.iso_controls_applied`` and write a structured
+# entry to ``GraphState.audit_trace`` on every invocation.
+#
+# NOTE(review): in the 2022 edition of Annex A, "Classification of information"
+# is control A.5.12 and A.8.2 is "Privileged access rights"; A.8.2 as a
+# classification control is the ISO 27001:2013 numbering. The values below are
+# kept as given in the P1 specification — confirm the IDs with compliance.
+
+ISO_CONTROL_INGESTOR: str = 'A.8.2'  # Asset Classification
+ISO_CONTROL_RAG_ROUTER: str = 'A.8.24'  # Use of Cryptography (local-only RAG)
+ISO_CONTROL_EVALUATORS: str = 'A.8.10'  # Information Deletion (ephemeral eval)
+ISO_CONTROL_REFLECTION: str = 'A.5.34'  # Privacy and PII Protection
+ISO_CONTROL_REPORT: str = 'A.8.2'  # Asset Classification (CONFIDENTIAL output)
+
+# ── Graph routing constants ────────────────────────────────────────────────────
+
+CONSENSUS_THRESHOLD: float = 0.7
+"""Minimum weighted consensus score required to bypass re-evaluation."""
+
+MAX_REFLECTION_LOOPS: int = 2
+"""Hard cap on reflection → evaluators re-route iterations (prevents unbounded cycles)."""
+
+# ── Routing destination literals ──────────────────────────────────────────────
+#
+# Node names returned by ``route_reflection``; ``build_graph`` registers the
+# nodes under these same names so the router and the topology cannot drift.
+
+_DEST_EVALUATORS: str = 'evaluators'
+_DEST_REPORT: str = 'report_generator'
+
+
+# ── Shared pipeline state ─────────────────────────────────────────────────────
+
+
+class GraphState(TypedDict, total=False):
+    """Shared state schema threaded through every LangGraph node.
+
+    Classification: CONFIDENTIAL (Ley 21.663 / ISO 27001:2022 A.8.2)
+
+    Declared as a ``TypedDict`` so the keys are statically checked.
+    ``total=False`` because nodes return partial updates and callers may
+    start from a partial state; at runtime ``GraphState(...)`` is a plain
+    ``dict``.
+
+    Attributes:
+        transcript: Raw or transcribed candidate speech text. Never logged
+            without redaction (ISO 27001:2022 A.5.34).
+        code_payload: Candidate-submitted source code treated as **inert data**;
+            never executed outside an ephemeral sandbox (ISO 27001:2022 A.8.10).
+        security_score: Aggregated OWASP security evaluation score in [0.0, 1.0].
+        logic_rubric: Algorithmic-complexity and architecture sub-scores keyed by
+            evaluator dimension name.
+        audit_trace: Append-only log of node transitions. Each entry is a
+            ``dict`` containing at minimum ``node``, ``iso_control``,
+            ``timestamp``, and ``prev_hash`` (SHA-256 chain-of-trust).
+            Reduced with ``operator.add``: nodes return only their new entries.
+        iso_controls_applied: Ordered list of ISO 27001:2022 control IDs
+            applied during the current pipeline run. Every node appends its
+            primary control on entry. Reduced with ``operator.add``.
+        reflection_loops: Counter of reflection → evaluators re-route iterations
+            performed so far. Bounded by :data:`MAX_REFLECTION_LOOPS`.
+        consensus_score: Final weighted consensus computed by the reflection node
+            (security 40 %, logic 35 %, architecture 25 %).
+        candidate_id: Opaque pseudonymous identifier. Must **never** contain
+            raw PII such as a RUT, name, or e-mail address.
+    """
+
+    transcript: str
+    code_payload: str
+    security_score: float
+    logic_rubric: dict[str, Any]
+    audit_trace: Annotated[list[dict[str, Any]], operator.add]
+    iso_controls_applied: Annotated[list[str], operator.add]
+    reflection_loops: int
+    consensus_score: float
+    candidate_id: str
+
+
+# ── Conditional router ────────────────────────────────────────────────────────
+
+
+def route_reflection(state: GraphState) -> str:
+    """Choose the next node after the reflection step.
+
+    Routes back to *evaluators* when the pipeline has not yet reached
+    consensus and the re-evaluation loop limit has not been exhausted.
+    Otherwise advances to *report_generator*.
+
+    A missing key and an explicit ``None`` are both read as ``0.0`` / ``0``,
+    so an unscored state is re-evaluated (while loops remain) instead of
+    raising ``TypeError`` inside the router.
+
+    Args:
+        state: Current graph state after the reflection node has executed.
+
+    Returns:
+        ``"evaluators"`` if ``consensus_score < CONSENSUS_THRESHOLD`` and
+        ``reflection_loops < MAX_REFLECTION_LOOPS``; otherwise
+        ``"report_generator"``.
+    """
+    consensus = float(state.get('consensus_score') or 0.0)
+    loops_done = int(state.get('reflection_loops') or 0)
+    if consensus < CONSENSUS_THRESHOLD and loops_done < MAX_REFLECTION_LOOPS:
+        return _DEST_EVALUATORS
+    return _DEST_REPORT
+
+
+# ── Placeholder node ──────────────────────────────────────────────────────────
+
+
+def _placeholder_node(_state: GraphState) -> dict[str, Any]:
+    """Pass-through stub used during graph topology validation only.
+
+    **Replace** each occurrence with the real node function imported from the
+    corresponding ``src/nodes/`` module once P2-P5 are implemented.
+
+    Args:
+        _state: Current pipeline state (unused in the stub).
+
+    Returns:
+        An empty dict (no state mutation) — LangGraph merges this with the
+        existing state automatically.
+    """
+    return {}
+
+
+# ── Graph factory ─────────────────────────────────────────────────────────────
+
+
+def build_graph() -> StateGraph:
+    """Construct the SII Recruitment Gateway LangGraph cyclic graph.
+
+    Wires the Supervisor-Worker topology described in the P1 architecture
+    specification. No node *logic* is embedded here; each node is a stub
+    that must be replaced when importing the real implementations from
+    ``src/nodes/*``.
+
+    Topology::
+
+        START
+          │
+        ingestor          (ISO A.8.2 — asset classification)
+          │
+        rag_router        (ISO A.8.24 — local-only retrieval, zero network egress)
+          │
+        evaluators ◄───┐  (ISO A.8.10 — ephemeral container destruction post-eval)
+          │            │
+        reflection ────┘  (ISO A.5.34 — PII redaction before scoring)
+          │
+        report_generator  (ISO A.8.2 — CONFIDENTIAL output labelling)
+          │
+        END
+
+    Conditional edge:
+        ``reflection`` → ``evaluators`` when
+        ``consensus_score < CONSENSUS_THRESHOLD`` **and**
+        ``reflection_loops < MAX_REFLECTION_LOOPS``.
+        Otherwise ``reflection`` → ``report_generator``.
+
+    Returns:
+        A :class:`~langgraph.graph.StateGraph` instance ready to be compiled
+        by calling ``.compile()`` after substituting the real node functions.
+
+    Note:
+        IMPORTANT: This module creates only the graph structure. Do **not**
+        merge node logic into this file. The modular structure (one file per
+        node under ``src/nodes/``) is required for the node-by-node
+        development workflow in P2-P5.
+    """
+    graph: StateGraph = StateGraph(GraphState)
+
+    # ── Register nodes ────────────────────────────────────────────────────────
+    # Each entry maps node name → implementation function.
+    # Swap _placeholder_node for the real function as each phase (P2-P5) lands.
+    graph.add_node('ingestor', _placeholder_node)  # P2
+    graph.add_node('rag_router', _placeholder_node)  # P3
+    graph.add_node(_DEST_EVALUATORS, _placeholder_node)  # P4
+    graph.add_node('reflection', _placeholder_node)  # P5
+    graph.add_node(_DEST_REPORT, _placeholder_node)  # P5
+
+    # ── Deterministic edges ───────────────────────────────────────────────────
+    graph.add_edge(START, 'ingestor')
+    graph.add_edge('ingestor', 'rag_router')
+    graph.add_edge('rag_router', _DEST_EVALUATORS)
+    graph.add_edge(_DEST_EVALUATORS, 'reflection')
+
+    # ── Conditional edge: reflection → evaluators OR report_generator ─────────
+    # Keys are the values ``route_reflection`` returns; values are target nodes.
+    graph.add_conditional_edges(
+        'reflection',
+        route_reflection,
+        {
+            _DEST_EVALUATORS: _DEST_EVALUATORS,
+            _DEST_REPORT: _DEST_REPORT,
+        },
+    )
+
+    graph.add_edge(_DEST_REPORT, END)
+
+    return graph
diff --git a/tests/test_state.py b/tests/test_state.py
new file mode 100644
index 0000000000..c4214f344d
--- /dev/null
+++ b/tests/test_state.py
@@ -0,0 +1,198 @@
+"""Tests for src/state.py — SII Recruitment Gateway LangGraph topology."""
+
+from __future__ import annotations
+
+import pytest
+from langgraph.graph import StateGraph
+
+from src.state import (
+    CONSENSUS_THRESHOLD,
+    ISO_CONTROL_EVALUATORS,
+    ISO_CONTROL_INGESTOR,
+    ISO_CONTROL_RAG_ROUTER,
+    ISO_CONTROL_REFLECTION,
+    ISO_CONTROL_REPORT,
+    MAX_REFLECTION_LOOPS,
+    GraphState,
+    build_graph,
+    route_reflection,
+)
+
+# ── GraphState ────────────────────────────────────────────────────────────────
+
+
+def test_graph_state_required_keys_present() -> None:
+    """GraphState must declare all keys required by the P1 specification."""
+    required_keys = {
+        'transcript',
+        'code_payload',
+        'security_score',
+        'logic_rubric',
+        'audit_trace',
+        'iso_controls_applied',
+        'reflection_loops',
+        'consensus_score',
+        'candidate_id',
+    }
+    # GraphState is a TypedDict; verify its annotations carry the required names.
+    annotations = GraphState.__annotations__
+    missing = required_keys - set(annotations)
+    assert not missing, f'Missing GraphState keys: {missing}'
+
+
+def test_graph_state_iso_controls_applied_key_exists() -> None:
+    """GraphState must include the iso_controls_applied field (P1 ISO update)."""
+    assert 'iso_controls_applied' in GraphState.__annotations__
+
+
+def test_graph_state_audit_trace_key_exists() -> None:
+    """GraphState must include the audit_trace append-only log field."""
+    assert 'audit_trace' in GraphState.__annotations__
+
+
+# ── ISO control constants ─────────────────────────────────────────────────────
+
+
+def test_iso_control_constants_are_strings() -> None:
+    """All ISO control identifiers must be non-empty strings."""
+    controls = [
+        ISO_CONTROL_INGESTOR,
+        ISO_CONTROL_RAG_ROUTER,
+        ISO_CONTROL_EVALUATORS,
+        ISO_CONTROL_REFLECTION,
+        ISO_CONTROL_REPORT,
+    ]
+    for ctrl in controls:
+        assert isinstance(ctrl, str), f'ISO control must be a string, got {type(ctrl)!r}'
+        assert ctrl, f'ISO control must be non-empty, got {ctrl!r}'
+
+
+def test_iso_control_ingestor_value() -> None:
+    """Ingestor node must be governed by ISO 27001:2022 A.8.2 (Asset Classification)."""
+    assert ISO_CONTROL_INGESTOR == 'A.8.2'
+
+
+def test_iso_control_rag_router_value() -> None:
+    """RAG-router node must be governed by ISO 27001:2022 A.8.24 (local-only crypto/RAG)."""
+    assert ISO_CONTROL_RAG_ROUTER == 'A.8.24'
+
+
+def test_iso_control_evaluators_value() -> None:
+    """Evaluators node must be governed by ISO 27001:2022 A.8.10 (Information Deletion)."""
+    assert ISO_CONTROL_EVALUATORS == 'A.8.10'
+
+
+def test_iso_control_reflection_value() -> None:
+    """Reflection node must be governed by ISO 27001:2022 A.5.34 (Privacy & PII)."""
+    assert ISO_CONTROL_REFLECTION == 'A.5.34'
+
+
+def test_iso_control_report_value() -> None:
+    """Report-generator node must be governed by ISO 27001:2022 A.8.2 (CONFIDENTIAL)."""
+    assert ISO_CONTROL_REPORT == 'A.8.2'
+
+
+# ── Routing constants ─────────────────────────────────────────────────────────
+
+
+def test_consensus_threshold_value() -> None:
+    """Consensus threshold must equal 0.7 as specified in P1."""
+    assert pytest.approx(0.7) == CONSENSUS_THRESHOLD
+
+
+def test_max_reflection_loops_value() -> None:
+    """Max reflection-loop counter must equal 2 to prevent infinite cycles."""
+    assert MAX_REFLECTION_LOOPS == 2
+
+
+# ── route_reflection ──────────────────────────────────────────────────────────
+
+
+def test_route_reflection_below_threshold_first_loop_routes_to_evaluators() -> None:
+    """Routes to evaluators when consensus < 0.7 and no loops consumed yet."""
+    state = GraphState({'consensus_score': 0.5, 'reflection_loops': 0})
+    assert route_reflection(state) == 'evaluators'
+
+
+def test_route_reflection_below_threshold_one_loop_routes_to_evaluators() -> None:
+    """Routes to evaluators when consensus < 0.7 and one loop consumed."""
+    state = GraphState({'consensus_score': 0.4, 'reflection_loops': 1})
+    assert route_reflection(state) == 'evaluators'
+
+
+def test_route_reflection_below_threshold_max_loops_routes_to_report() -> None:
+    """Routes to report_generator when consensus < 0.7 but loop limit reached."""
+    state = GraphState({'consensus_score': 0.5, 'reflection_loops': MAX_REFLECTION_LOOPS})
+    assert route_reflection(state) == 'report_generator'
+
+
+def test_route_reflection_above_threshold_routes_to_report() -> None:
+    """Routes to report_generator when consensus >= 0.7 regardless of loops."""
+    state = GraphState({'consensus_score': 0.9, 'reflection_loops': 0})
+    assert route_reflection(state) == 'report_generator'
+
+
+def test_route_reflection_exactly_at_threshold_routes_to_report() -> None:
+    """Consensus equal to threshold (0.7) is NOT below threshold — routes forward."""
+    state = GraphState({'consensus_score': CONSENSUS_THRESHOLD, 'reflection_loops': 0})
+    assert route_reflection(state) == 'report_generator'
+
+
+def test_route_reflection_missing_keys_defaults_to_evaluators() -> None:
+    """Missing state keys default to a 0.0 score and 0 loops, routing back to evaluators."""
+    state = GraphState({})
+    assert route_reflection(state) == 'evaluators'
+
+
+def test_route_reflection_none_values_route_to_evaluators() -> None:
+    """Explicit ``None`` values are read like missing keys instead of raising."""
+    state = GraphState({'consensus_score': None, 'reflection_loops': None})  # type: ignore[typeddict-item]
+    assert route_reflection(state) == 'evaluators'
+
+
+def test_route_reflection_return_type_is_string() -> None:
+    """route_reflection must always return a string node name."""
+    state = GraphState({'consensus_score': 0.3, 'reflection_loops': 0})
+    result = route_reflection(state)
+    assert isinstance(result, str)
+
+
+# ── build_graph ───────────────────────────────────────────────────────────────
+
+
+def test_build_graph_returns_state_graph_instance() -> None:
+    """build_graph must return a LangGraph StateGraph."""
+    graph = build_graph()
+    assert isinstance(graph, StateGraph)
+
+
+def test_build_graph_contains_all_required_nodes() -> None:
+    """Graph must contain all five pipeline nodes specified in P1."""
+    required_nodes = {'ingestor', 'rag_router', 'evaluators', 'reflection', 'report_generator'}
+    graph = build_graph()
+    registered = set(graph.nodes)
+    # LangGraph injects __start__ / __end__ virtual nodes; ignore those.
+    pipeline_nodes = {n for n in registered if not n.startswith('__')}
+    assert required_nodes <= pipeline_nodes, f'Missing nodes: {required_nodes - pipeline_nodes}'
+
+
+def test_build_graph_compiles_without_error() -> None:
+    """Graph must compile successfully before any node logic is attached."""
+    graph = build_graph()
+    compiled = graph.compile()
+    assert compiled is not None
+
+
+def test_build_graph_produces_independent_instances() -> None:
+    """Each call to build_graph must return a fresh, independent StateGraph."""
+    graph_a = build_graph()
+    graph_b = build_graph()
+    assert graph_a is not graph_b
+
+
+def test_build_graph_conditional_edge_registered() -> None:
+    """A conditional edge from reflection must be wired in the graph."""
+    graph = build_graph()
+    # LangGraph stores branch data keyed by source node; verify reflection has one.
+    branches = graph.branches.get('reflection', {})
+    assert branches, 'reflection node must have at least one conditional branch registered'