From 7287ea681ab67513de2dc498d9848f644ae927b3 Mon Sep 17 00:00:00 2001 From: keola808hunt-dot Date: Thu, 4 Jun 2026 15:32:56 -0700 Subject: [PATCH] fix(quantmind): canonicalise LLM ids + repair magic resolver structured-output schemas Two openai-agents SDK structured-output failures stopped paper_flow and the magic NL resolver from running end-to-end. Both stem from QuantMind's rich Pydantic models meeting the SDK's strict-schema requirements. paper_flow (the crash): With strict_json_schema=False the model emits human-readable node-id slugs ("root", "intro") into UUID-typed Paper/TreeNode fields -> 94 uuid_parsing errors. The domain UUID is load-bearing (tested node-id uniqueness, UUID-keyed JSON round-trips, stable dedup identity), so rather than weaken it, new PaperExtraction(Paper) carries a mode="before" validator (canonicalize_tree_ids) that maps each distinct slug to one UUID and rewrites every id slot (nodes keys, node_id, parent_id, children_ids, root_node_id, citation anchors), passing values that are already UUIDs through unchanged. paper_flow defaults output_type to it; the domain model is untouched. magic resolver (latent, hidden by mocked-Runner tests): ResolvedFlowConfig embeds BaseFlowCfg.model_settings: ModelSettings, whose callable fields cannot be JSON-schema'd (PydanticInvalidForJsonSchema), and the discriminated-union/knowledge schemas trip strict mode's additionalProperties guard. Both layers fixed: SkipJsonSchema on model_settings (an execution knob, never LLM-set) + AgentOutputSchema(..., strict_json_schema=False) on the resolver output type (same pattern as paper_flow). Local-file provenance now uses Path.as_posix() for portable, cross-platform paths (fixes a Windows-only test). Tests: new test_extraction.py (slug->uuid + uuid passthrough); new ResolverOutputSchemaTests (real non-mocked schema build, both layers); repaired test_output_type_override_propagated + test_basemodel_renders_json_schema to the post-fix contracts; fallback + wrap guards added. Full suite 238/238; ruff + basedpyright clean. Verified by live-fire: paper_flow on arXiv 2606.05138 (the exact paper that 94-errored) and the magic resolver via real gpt-4o-mini. Co-Authored-By: Claude Opus 4.8 --- quantmind/configs/base.py | 8 +- quantmind/flows/paper.py | 17 ++-- quantmind/knowledge/__init__.py | 7 +- quantmind/knowledge/_extraction.py | 106 +++++++++++++++++++++++ quantmind/knowledge/paper.py | 22 ++++- quantmind/magic.py | 12 ++- tests/flows/test_paper.py | 30 ++++++- tests/knowledge/test_extraction.py | 134 +++++++++++++++++++++++++++++ tests/test_magic.py | 47 +++++++++- 9 files changed, 366 insertions(+), 17 deletions(-) create mode 100644 quantmind/knowledge/_extraction.py create mode 100644 tests/knowledge/test_extraction.py diff --git a/quantmind/configs/base.py b/quantmind/configs/base.py index 05b5ef5..c5dfce0 100644 --- a/quantmind/configs/base.py +++ b/quantmind/configs/base.py @@ -8,6 +8,7 @@ from agents import ModelSettings from pydantic import BaseModel, ConfigDict +from pydantic.json_schema import SkipJsonSchema class BaseFlowCfg(BaseModel): @@ -17,7 +18,12 @@ class BaseFlowCfg(BaseModel): # Model & execution model: str = "gpt-4o" - model_settings: ModelSettings | None = None + # ``ModelSettings`` carries callable fields that cannot be rendered to JSON + # schema. It is an execution knob (set programmatically), never something the + # magic NL resolver should populate, so skip it during schema generation — + # otherwise the resolver's structured-output schema build raises + # PydanticInvalidForJsonSchema. The field still validates and round-trips. + model_settings: SkipJsonSchema[ModelSettings | None] = None max_turns: int = 10 timeout_seconds: float = 300.0 diff --git a/quantmind/flows/paper.py b/quantmind/flows/paper.py index 5b8373b..68d1500 100644 --- a/quantmind/flows/paper.py +++ b/quantmind/flows/paper.py @@ -13,7 +13,7 @@ from typing import Any, TypeVar -from agents import Agent, RunHooks, Tool +from agents import Agent, AgentOutputSchema, RunHooks, Tool from quantmind.configs import PaperFlowCfg from quantmind.configs.paper import ( @@ -25,7 +25,7 @@ RawText, ) from quantmind.flows._runner import run_with_observability -from quantmind.knowledge import Paper +from quantmind.knowledge import Paper, PaperExtraction from quantmind.preprocess.fetch import ( Fetched, fetch_arxiv, @@ -85,7 +85,10 @@ async def paper_flow( unpaywall fallback is its own follow-up issue). """ cfg = cfg or PaperFlowCfg() - out_type: type[Paper] = output_type or Paper # type: ignore[assignment] + # Default to the slug-tolerant extraction model so the LLM's human-readable + # node ids are canonicalised to UUIDs (see knowledge._extraction). Callers + # passing their own output_type own that concern themselves. + out_type: type[Paper] = output_type or PaperExtraction # type: ignore[assignment] raw_md, source_meta = await _fetch_and_format(input) @@ -98,7 +101,11 @@ async def paper_flow( ), "model": cfg.model, "tools": list(extra_tools or []), - "output_type": out_type, + # QuantMind's knowledge models (Paper, etc.) emit `additionalProperties` + # in their JSON schema, which the openai-agents SDK's strict-schema mode + # rejects. Wrap with strict_json_schema=False to use non-strict structured + # output (the SDK-recommended fix). Fixes the agent-setup crash on paper_flow. + "output_type": AgentOutputSchema(out_type, strict_json_schema=False), "input_guardrails": list(extra_input_guardrails or []), "output_guardrails": list(extra_output_guardrails or []), } @@ -140,7 +147,7 @@ async def _fetch_and_format( md = await _format_by_content_type(raw) return md, { "source": "local", - "path": str(input.path), + "path": input.path.as_posix(), "content_type": raw.content_type, } if isinstance(input, RawText): diff --git a/quantmind/knowledge/__init__.py b/quantmind/knowledge/__init__.py index 972a535..7ec3a54 100644 --- a/quantmind/knowledge/__init__.py +++ b/quantmind/knowledge/__init__.py @@ -24,7 +24,11 @@ from quantmind.knowledge.earnings import Earnings from quantmind.knowledge.factor import Factor from quantmind.knowledge.news import News -from quantmind.knowledge.paper import Paper, PaperKnowledgeCard +from quantmind.knowledge.paper import ( + Paper, + PaperExtraction, + PaperKnowledgeCard, +) from quantmind.knowledge.thesis import Thesis __all__ = [ @@ -43,6 +47,7 @@ "Factor", "News", "Paper", + "PaperExtraction", "PaperKnowledgeCard", "Thesis", ] diff --git a/quantmind/knowledge/_extraction.py b/quantmind/knowledge/_extraction.py new file mode 100644 index 0000000..0b4e605 --- /dev/null +++ b/quantmind/knowledge/_extraction.py @@ -0,0 +1,106 @@ +"""Slug -> UUID canonicalisation for LLM-extracted TreeKnowledge. + +With ``strict_json_schema=False`` the openai-agents SDK lets the model fill +``UUID`` id fields with human-readable slugs ("root", "intro"). Slugs are the +right shape for one-shot tree generation: the model references nodes it is +creating in the same response, and self-consistent slugs beat invented UUIDs. +The domain model still stores ``UUID``: a deliberate, tested invariant (node-id +uniqueness + UUID-typed JSON round-trips in ``tests/knowledge``) and the basis +for stable dedup/identity across re-runs. This module bridges the +two: it maps each distinct slug to a fresh ``UUID`` and rewrites every id slot, +leaving values that are already valid UUIDs untouched. +""" + +from collections.abc import Callable +from typing import Any +from uuid import UUID, uuid4 + +_ResolveFn = Callable[[Any], Any] + + +def _looks_like_uuid(value: Any) -> bool: + """True if ``value`` is a UUID or a string already in UUID form.""" + if isinstance(value, UUID): + return True + if not isinstance(value, str): + return False + try: + UUID(value) + except ValueError: + return False + return True + + +def canonicalize_tree_ids(data: Any) -> Any: + """Rewrite slug ids in a TreeKnowledge-shaped mapping to UUIDs. + + Pure and copy-on-write: the input is never mutated. A no-op for anything + that is not a ``dict`` carrying a ``nodes`` map, so non-tree payloads and + already-canonical trees pass straight through. Every distinct slug maps to + one UUID, so cross-references (``parent_id``, ``children_ids``, citation + anchors, and ``nodes`` keys) stay internally consistent. + """ + if not isinstance(data, dict): + return data + nodes = data.get("nodes") + if not isinstance(nodes, dict): + return data + + mapping: dict[str, str] = {} + + def resolve(raw: Any) -> Any: + """Map one id slot: slug -> uuid; UUID / None / non-str pass through.""" + if raw is None or not isinstance(raw, str) or _looks_like_uuid(raw): + return raw + if raw not in mapping: + mapping[raw] = str(uuid4()) + return mapping[raw] + + # Seed from the authoritative identity slots (the node keys) first so a + # reference resolves to the same UUID no matter where it is first seen. + for slug in nodes: + resolve(slug) + + out = dict(data) + if "id" in out: + out["id"] = resolve(out["id"]) + if "root_node_id" in out: + out["root_node_id"] = resolve(out["root_node_id"]) + out["citations"] = [ + _rewrite_citation(c, resolve) for c in out.get("citations", []) + ] + out["nodes"] = { + resolve(key): _rewrite_node(node, resolve) + for key, node in nodes.items() + } + return out + + +def _rewrite_node(node: Any, resolve: _ResolveFn) -> Any: + """Return a copy of one node dict with its id slots resolved.""" + if not isinstance(node, dict): + return node + out = dict(node) + if "node_id" in out: + out["node_id"] = resolve(out["node_id"]) + if "parent_id" in out: + out["parent_id"] = resolve(out["parent_id"]) + children = out.get("children_ids") + if isinstance(children, list): + out["children_ids"] = [resolve(c) for c in children] + citations = out.get("citations") + if isinstance(citations, list): + out["citations"] = [_rewrite_citation(c, resolve) for c in citations] + return out + + +def _rewrite_citation(cit: Any, resolve: _ResolveFn) -> Any: + """Return a copy of one citation dict with its anchor ids resolved.""" + if not isinstance(cit, dict): + return cit + out = dict(cit) + if "tree_id" in out: + out["tree_id"] = resolve(out["tree_id"]) + if "node_id" in out: + out["node_id"] = resolve(out["node_id"]) + return out diff --git a/quantmind/knowledge/paper.py b/quantmind/knowledge/paper.py index 5345bdf..ac63868 100644 --- a/quantmind/knowledge/paper.py +++ b/quantmind/knowledge/paper.py @@ -8,11 +8,12 @@ summarisation), then a `PaperKnowledgeCard` derived from the root summary. """ -from typing import Literal +from typing import Any, Literal from uuid import UUID -from pydantic import Field +from pydantic import Field, model_validator +from quantmind.knowledge._extraction import canonicalize_tree_ids from quantmind.knowledge._flatten import FlattenKnowledge from quantmind.knowledge._tree import TreeKnowledge @@ -31,6 +32,23 @@ class Paper(TreeKnowledge): asset_classes: list[str] = Field(default_factory=list) +class PaperExtraction(Paper): + """``Paper`` variant for the LLM extraction boundary. + + Same shape as ``Paper`` but tolerant of the slug ids the model emits under + ``strict_json_schema=False``. A ``mode="before"`` validator rewrites every + slug id slot to a ``UUID`` before the frozen ``Paper`` validation runs, so + the result is a fully-valid ``Paper`` (``PaperExtraction`` is a subclass). + Use this — not ``Paper`` — as ``paper_flow``'s ``output_type``. + """ + + @model_validator(mode="before") + @classmethod + def _canonicalize_ids(cls, data: Any) -> Any: + """Rewrite slug ids to UUIDs before frozen ``Paper`` validation.""" + return canonicalize_tree_ids(data) + + class PaperKnowledgeCard(FlattenKnowledge): """Distilled summary card of a `Paper`. diff --git a/quantmind/magic.py b/quantmind/magic.py index 71e522a..8aa831b 100644 --- a/quantmind/magic.py +++ b/quantmind/magic.py @@ -19,7 +19,7 @@ from collections.abc import Awaitable, Callable from typing import Any, Generic, TypeVar, Union, get_args, get_origin -from agents import Agent, Runner +from agents import Agent, AgentOutputSchema, Runner from pydantic import BaseModel from quantmind.configs.base import BaseFlowCfg @@ -91,7 +91,15 @@ async def resolve_magic_input( name=f"magic_resolver_{target_flow.__name__}", instructions=instructions, model=resolver_model, - output_type=ResolvedFlowConfig[input_type, cfg_type], # type: ignore[valid-type] + output_type=AgentOutputSchema( + # Two layers, both required: model_settings is SkipJsonSchema'd in + # BaseFlowCfg so the schema builds at all (ModelSettings has callable + # fields), and strict_json_schema=False accepts the + # additionalProperties the discriminated-union + knowledge models + # emit (same rationale as flows/paper.py). + ResolvedFlowConfig[input_type, cfg_type], # type: ignore[valid-type] + strict_json_schema=False, + ), ) result = await Runner.run(resolver, natural_language) out = result.final_output diff --git a/tests/flows/test_paper.py b/tests/flows/test_paper.py index 2a143c7..46a089a 100644 --- a/tests/flows/test_paper.py +++ b/tests/flows/test_paper.py @@ -7,7 +7,7 @@ from unittest.mock import AsyncMock, MagicMock, patch from uuid import uuid4 -from agents import RunHooks +from agents import AgentOutputSchema, RunHooks from quantmind.configs import PaperFlowCfg from quantmind.configs.paper import ( @@ -25,7 +25,7 @@ _format_input, paper_flow, ) -from quantmind.knowledge import Paper, SourceRef, TreeNode +from quantmind.knowledge import Paper, PaperExtraction, SourceRef, TreeNode from quantmind.preprocess.fetch import Fetched, RawPaper @@ -273,7 +273,31 @@ def _capture_agent(*_a: Any, **kwargs: Any) -> Any: _patch_runner(_stub_paper()), ): await paper_flow(RawText(text="x"), output_type=MyPaper) - self.assertIs(seen["output_type"], MyPaper) + # output_type is wrapped in AgentOutputSchema(strict_json_schema=False) + # so QuantMind's non-strict knowledge schema is accepted; the override + # type is preserved inside the wrapper. + self.assertIsInstance(seen["output_type"], AgentOutputSchema) + self.assertIs(seen["output_type"].output_type, MyPaper) + self.assertFalse(seen["output_type"].is_strict_json_schema()) + + async def test_default_output_type_is_slug_tolerant_extraction( + self, + ) -> None: + seen: dict[str, Any] = {} + + def _capture_agent(*_a: Any, **kwargs: Any) -> Any: + seen.update(kwargs) + return MagicMock() + + with ( + patch("quantmind.flows.paper.Agent", side_effect=_capture_agent), + _patch_runner(_stub_paper()), + ): + await paper_flow(RawText(text="x")) + # No override -> the flow must default to the slug-tolerant extraction + # model so the LLM's slug ids get canonicalised to UUIDs. + self.assertIsInstance(seen["output_type"], AgentOutputSchema) + self.assertIs(seen["output_type"].output_type, PaperExtraction) async def test_extra_tools_and_guardrails_forwarded(self) -> None: seen: dict[str, Any] = {} diff --git a/tests/knowledge/test_extraction.py b/tests/knowledge/test_extraction.py new file mode 100644 index 0000000..15adbf6 --- /dev/null +++ b/tests/knowledge/test_extraction.py @@ -0,0 +1,134 @@ +"""Tests for slug -> UUID canonicalisation at the LLM extraction boundary. + +The openai-agents SDK, with ``strict_json_schema=False``, lets the model emit +human-readable slug ids ("root", "intro", "methodology") wherever the +``Paper`` / ``TreeKnowledge`` schema declares a ``UUID``. The domain model +keeps ``UUID`` (load-bearing: ``tests/knowledge`` assert node-id uniqueness and +UUID-typed JSON round-trips, and the standard relies on stable unique identity +for dedup across re-runs). The +extraction model ``PaperExtraction`` bridges the two: it accepts the slug tree +the LLM produces and canonicalises every id slot to a real ``UUID`` before the +frozen domain validation runs, preserving the tree's structure. +""" + +import unittest +from uuid import UUID, uuid4 + +from quantmind.knowledge import Paper +from quantmind.knowledge.paper import PaperExtraction + + +def _slug_raw() -> dict: + """A slug-keyed Paper payload mirroring real gpt-5.4-mini output.""" + return { + "id": "paper-2606.05138", + "item_type": "paper", + "as_of": "2026-05-01T00:00:00Z", + "source": {"kind": "arxiv", "uri": "arxiv:2606.05138"}, + "root_node_id": "root", + "citations": [ + { + "source_id": "paper:2606.05138", + "tree_id": "root", + "node_id": "root", + } + ], + "nodes": { + "root": { + "node_id": "root", + "parent_id": None, + "title": "Paper", + "summary": "Top-level summary.", + "children_ids": ["intro", "methodology"], + "citations": [ + {"source_id": "s1", "tree_id": "root", "node_id": "root"} + ], + }, + "intro": { + "node_id": "intro", + "parent_id": "root", + "title": "Introduction", + "summary": "Intro summary.", + "children_ids": [], + "citations": [], + }, + "methodology": { + "node_id": "methodology", + "parent_id": "root", + "title": "Methodology", + "summary": "Method summary.", + "children_ids": [], + "citations": [], + }, + }, + } + + +class PaperExtractionCanonicalisesSlugIds(unittest.TestCase): + def test_slug_tree_becomes_uuid_paper_with_structure_preserved( + self, + ) -> None: + paper = PaperExtraction.model_validate(_slug_raw()) + + # It is a real Paper with UUID identity. + self.assertIsInstance(paper, Paper) + self.assertIsInstance(paper.id, UUID) + + # Root resolves and is a genuine UUID present in the node map. + self.assertIsInstance(paper.root_node_id, UUID) + self.assertIn(paper.root_node_id, paper.nodes) + root = paper.nodes[paper.root_node_id] + self.assertIsNone(root.parent_id) + + # Children are UUIDs, resolve in the map, and point back to root. + self.assertEqual(len(root.children_ids), 2) + for child_id in root.children_ids: + self.assertIsInstance(child_id, UUID) + self.assertIn(child_id, paper.nodes) + self.assertEqual( + paper.nodes[child_id].parent_id, paper.root_node_id + ) + + # A slug used in multiple positions maps to ONE uuid (internal + # consistency): the "intro" child id equals the intro node's own id. + intro_node = next( + n for n in paper.nodes.values() if n.title == "Introduction" + ) + self.assertIn(intro_node.node_id, root.children_ids) + + # Citation anchors are canonicalised to the same node uuids. + self.assertEqual(root.citations[0].node_id, paper.root_node_id) + self.assertEqual(root.citations[0].tree_id, paper.root_node_id) + + def test_existing_uuid_ids_pass_through_unchanged(self) -> None: + root_id, child_id = uuid4(), uuid4() + raw = { + "item_type": "paper", + "as_of": "2026-05-01T00:00:00Z", + "source": {"kind": "arxiv", "uri": "arxiv:x"}, + "root_node_id": str(root_id), + "nodes": { + str(root_id): { + "node_id": str(root_id), + "parent_id": None, + "title": "Root", + "summary": "s", + "children_ids": [str(child_id)], + }, + str(child_id): { + "node_id": str(child_id), + "parent_id": str(root_id), + "title": "Child", + "summary": "s", + "children_ids": [], + }, + }, + } + paper = PaperExtraction.model_validate(raw) + self.assertEqual(paper.root_node_id, root_id) + self.assertIn(child_id, paper.nodes) + self.assertEqual(paper.nodes[child_id].parent_id, root_id) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_magic.py b/tests/test_magic.py index 426f6cd..680d8c4 100644 --- a/tests/test_magic.py +++ b/tests/test_magic.py @@ -7,6 +7,7 @@ from typing import Optional, Union from unittest.mock import AsyncMock, MagicMock, patch +from agents import AgentOutputSchema from pydantic import BaseModel from quantmind.configs import PaperFlowCfg @@ -68,12 +69,30 @@ async def bad(input: ArxivIdentifier, *, cfg: int = 0) -> None: class PydanticSchemaStrTests(unittest.TestCase): def test_basemodel_renders_json_schema(self) -> None: - # PaperFlowCfg embeds ModelSettings which has callable fields; - # the renderer falls back to a fields summary in that case. + # PaperFlowCfg embeds ModelSettings, but model_settings is + # SkipJsonSchema'd, so the cfg now renders a FULL JSON schema (no + # fields-summary fallback) with model_settings omitted. out = _pydantic_schema_str(PaperFlowCfg) parsed = json.loads(out) self.assertEqual(parsed.get("title"), "PaperFlowCfg") - self.assertIn("model", parsed["fields"]) + self.assertIn("model", parsed["properties"]) + self.assertNotIn("model_settings", parsed["properties"]) + + def test_unschemable_field_falls_back_to_fields_summary(self) -> None: + # Defensive fallback in _pydantic_schema_str: a model with a genuinely + # un-schemable field still yields a usable name+fields summary. + from collections.abc import Callable + + from pydantic import ConfigDict + + class _Unschemable(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + fn: Callable[[], int] + + out = _pydantic_schema_str(_Unschemable) + parsed = json.loads(out) + self.assertEqual(parsed["title"], "_Unschemable") + self.assertIn("fn", parsed["fields"]) def test_basemodel_with_clean_schema(self) -> None: class Clean(BaseModel): @@ -135,6 +154,10 @@ def _capture_agent(*_a: object, **kwargs: object) -> object: # Resolver agent was given a name derived from the flow. self.assertEqual(captured["name"], "magic_resolver_paper_flow") self.assertEqual(captured["model"], "gpt-4o-mini") + # Output type is wrapped non-strict so the union/knowledge schema is + # accepted (guards the AgentOutputSchema wrap in resolve_magic_input). + self.assertIsInstance(captured["output_type"], AgentOutputSchema) + self.assertFalse(captured["output_type"].is_strict_json_schema()) async def test_custom_resolver_instructions(self) -> None: captured: dict[str, object] = {} @@ -220,3 +243,21 @@ async def test_prints_and_returns_tuple(self) -> None: self.assertIn("input_obj:", out) self.assertIn("cfg_obj:", out) self.assertIn("2604.12345", out) + + +class ResolverOutputSchemaTests(unittest.TestCase): + def test_resolver_output_type_builds_despite_model_settings(self) -> None: + # The real resolver output-type construction (see resolve_magic_input). + # Two layers had to be fixed and both are exercised here: SkipJsonSchema + # on model_settings lets the schema build at all (no CallableSchema), and + # strict_json_schema=False accepts the additionalProperties the + # discriminated-union + knowledge models emit. Either layer alone + # crashed — caught by the live-fire, not the mocked-Runner tests above. + schema_obj = AgentOutputSchema( + ResolvedFlowConfig[PaperInput, PaperFlowCfg], + strict_json_schema=False, + ) + js = schema_obj.json_schema() + self.assertIsInstance(js, dict) + # model_settings is intentionally absent from the LLM-facing schema. + self.assertNotIn("model_settings", json.dumps(js))