LLMQuant · keola808hunt-dot · Jun 4, 2026
diff --git a/quantmind/configs/base.py b/quantmind/configs/base.py
@@ -8,6 +8,7 @@
 
 from agents import ModelSettings
 from pydantic import BaseModel, ConfigDict
+from pydantic.json_schema import SkipJsonSchema
 
 
 class BaseFlowCfg(BaseModel):
@@ -17,7 +18,12 @@ class BaseFlowCfg(BaseModel):
 
     # Model & execution
     model: str = "gpt-4o"
-    model_settings: ModelSettings | None = None
+    # ``ModelSettings`` carries callable fields that cannot be rendered to JSON
+    # schema. It is an execution knob (set programmatically), never something the
+    # magic NL resolver should populate, so skip it during schema generation —
+    # otherwise the resolver's structured-output schema build raises
+    # PydanticInvalidForJsonSchema. The field still validates and round-trips.
+    model_settings: SkipJsonSchema[ModelSettings | None] = None
     max_turns: int = 10
     timeout_seconds: float = 300.0
 

diff --git a/quantmind/flows/paper.py b/quantmind/flows/paper.py
@@ -13,7 +13,7 @@
 
 from typing import Any, TypeVar
 
-from agents import Agent, RunHooks, Tool
+from agents import Agent, AgentOutputSchema, RunHooks, Tool
 
 from quantmind.configs import PaperFlowCfg
 from quantmind.configs.paper import (
@@ -25,7 +25,7 @@
     RawText,
 )
 from quantmind.flows._runner import run_with_observability
-from quantmind.knowledge import Paper
+from quantmind.knowledge import Paper, PaperExtraction
 from quantmind.preprocess.fetch import (
     Fetched,
     fetch_arxiv,
@@ -85,7 +85,10 @@ async def paper_flow(
             unpaywall fallback is its own follow-up issue).
     """
     cfg = cfg or PaperFlowCfg()
-    out_type: type[Paper] = output_type or Paper  # type: ignore[assignment]
+    # Default to the slug-tolerant extraction model so the LLM's human-readable
+    # node ids are canonicalised to UUIDs (see knowledge._extraction). Callers
+    # passing their own output_type own that concern themselves.
+    out_type: type[Paper] = output_type or PaperExtraction  # type: ignore[assignment]
 
     raw_md, source_meta = await _fetch_and_format(input)
 
@@ -98,7 +101,11 @@ async def paper_flow(
         ),
         "model": cfg.model,
         "tools": list(extra_tools or []),
-        "output_type": out_type,
+        # QuantMind's knowledge models (Paper, etc.) emit `additionalProperties`
+        # in their JSON schema, which the openai-agents SDK's strict-schema mode
+        # rejects. Wrap with strict_json_schema=False to use non-strict structured
+        # output (the SDK-recommended fix). Fixes the agent-setup crash on paper_flow.
+        "output_type": AgentOutputSchema(out_type, strict_json_schema=False),
         "input_guardrails": list(extra_input_guardrails or []),
         "output_guardrails": list(extra_output_guardrails or []),
     }
@@ -140,7 +147,7 @@ async def _fetch_and_format(
         md = await _format_by_content_type(raw)
         return md, {
             "source": "local",
-            "path": str(input.path),
+            "path": input.path.as_posix(),
             "content_type": raw.content_type,
         }
     if isinstance(input, RawText):

diff --git a/quantmind/knowledge/__init__.py b/quantmind/knowledge/__init__.py
@@ -24,7 +24,11 @@
 from quantmind.knowledge.earnings import Earnings
 from quantmind.knowledge.factor import Factor
 from quantmind.knowledge.news import News
-from quantmind.knowledge.paper import Paper, PaperKnowledgeCard
+from quantmind.knowledge.paper import (
+    Paper,
+    PaperExtraction,
+    PaperKnowledgeCard,
+)
 from quantmind.knowledge.thesis import Thesis
 
 __all__ = [
@@ -43,6 +47,7 @@
     "Factor",
     "News",
     "Paper",
+    "PaperExtraction",
     "PaperKnowledgeCard",
     "Thesis",
 ]
diff --git a/quantmind/knowledge/_extraction.py b/quantmind/knowledge/_extraction.py
@@ -0,0 +1,106 @@
+"""Slug -> UUID canonicalisation for LLM-extracted TreeKnowledge.
+
+With ``strict_json_schema=False`` the openai-agents SDK lets the model fill
+``UUID`` id fields with human-readable slugs ("root", "intro"). Slugs are the
+right shape for one-shot tree generation: the model references nodes it is
+creating in the same response, and self-consistent slugs beat invented UUIDs.
+The domain model still stores ``UUID``: a deliberate, tested invariant (node-id
+uniqueness + UUID-typed JSON round-trips in ``tests/knowledge``) and the basis
+for stable dedup/identity across re-runs. This module bridges the two: it maps
+each distinct slug to a fresh random ``UUID`` (``uuid4``, so consistent within
+one payload only) and rewrites every id slot, leaving valid UUIDs untouched.
+"""
+
+from collections.abc import Callable
+from typing import Any
+from uuid import UUID, uuid4
+
+_ResolveFn = Callable[[Any], Any]
+
+
+def _looks_like_uuid(value: Any) -> bool:
+    """Report whether ``value`` is a ``UUID`` or a string ``UUID()`` parses."""
+    if isinstance(value, UUID):
+        return True
+    if isinstance(value, str):
+        try:
+            UUID(value)
+        except ValueError:
+            return False
+        return True
+    return False
+
+
+def canonicalize_tree_ids(data: Any) -> Any:
+    """Rewrite slug ids in a TreeKnowledge-shaped mapping to UUID strings.
+
+    Copy-on-write: the input is never mutated. Anything that is not a ``dict``
+    carrying a ``nodes`` dict is returned unchanged. Each distinct slug maps to
+    one fresh ``uuid4`` string, so cross-references (``parent_id``,
+    ``children_ids``, citation anchors, ``nodes`` keys) stay consistent. A
+    missing or non-list ``citations`` value is left as-is rather than injected
+    or iterated, so malformed payloads reach normal validation untouched.
+    """
+    if not isinstance(data, dict):
+        return data
+    nodes = data.get("nodes")
+    if not isinstance(nodes, dict):
+        return data
+
+    mapping: dict[str, str] = {}
+
+    def resolve(raw: Any) -> Any:
+        """Map one id slot: slug -> uuid; UUID / None / non-str pass through."""
+        if not isinstance(raw, str) or _looks_like_uuid(raw):
+            return raw
+        if raw not in mapping:
+            mapping[raw] = str(uuid4())
+        return mapping[raw]
+
+    # Seed from the authoritative identity slots (the node keys) first so a
+    # reference resolves to the same UUID no matter where it is first seen.
+    for slug in nodes:
+        resolve(slug)
+
+    out = dict(data)
+    for slot in ("id", "root_node_id"):
+        if slot in out:
+            out[slot] = resolve(out[slot])
+    citations = out.get("citations")
+    if isinstance(citations, list):
+        out["citations"] = [_rewrite_citation(c, resolve) for c in citations]
+    out["nodes"] = {
+        resolve(key): _rewrite_node(node, resolve)
+        for key, node in nodes.items()
+    }
+    return out
+
+
+def _rewrite_node(node: Any, resolve: _ResolveFn) -> Any:
+    """Copy a node dict, resolving its own id, parent, children, citations."""
+    if not isinstance(node, dict):
+        return node
+    copied = dict(node)
+    # Both scalar id slots share the same resolve-if-present handling.
+    for slot in ("node_id", "parent_id"):
+        if slot in copied:
+            copied[slot] = resolve(copied[slot])
+    child_ids = copied.get("children_ids")
+    if isinstance(child_ids, list):
+        copied["children_ids"] = list(map(resolve, child_ids))
+    cites = copied.get("citations")
+    if isinstance(cites, list):
+        copied["citations"] = [_rewrite_citation(c, resolve) for c in cites]
+    return copied
+
+
+def _rewrite_citation(cit: Any, resolve: _ResolveFn) -> Any:
+    """Copy a citation dict, resolving its ``tree_id``/``node_id`` anchors."""
+    if not isinstance(cit, dict):
+        return cit
+    copied = dict(cit)
+    # Anchor slots are resolved only when present; other keys pass through.
+    for anchor in ("tree_id", "node_id"):
+        if anchor in copied:
+            copied[anchor] = resolve(copied[anchor])
+    return copied
diff --git a/quantmind/knowledge/paper.py b/quantmind/knowledge/paper.py
@@ -8,11 +8,12 @@
 summarisation), then a `PaperKnowledgeCard` derived from the root summary.
 """
 
-from typing import Literal
+from typing import Any, Literal
 from uuid import UUID
 
-from pydantic import Field
+from pydantic import Field, model_validator
 
+from quantmind.knowledge._extraction import canonicalize_tree_ids
 from quantmind.knowledge._flatten import FlattenKnowledge
 from quantmind.knowledge._tree import TreeKnowledge
 
@@ -31,6 +32,23 @@ class Paper(TreeKnowledge):
     asset_classes: list[str] = Field(default_factory=list)
 
 
+class PaperExtraction(Paper):
+    """Slug-tolerant ``Paper`` used where LLM output enters the domain model.
+
+    Declares no fields of its own. The only addition is a ``mode="before"``
+    validator that passes the raw payload through ``canonicalize_tree_ids``
+    ahead of the inherited ``Paper`` validation. Being a ``Paper`` subclass,
+    an instance is usable wherever a ``Paper`` is expected.
+    """
+
+    @model_validator(mode="before")
+    @classmethod
+    def _canonicalize_ids(cls, data: Any) -> Any:
+        """Canonicalise slug ids in the raw input before field validation."""
+        canonical = canonicalize_tree_ids(data)
+        return canonical
+
+
 class PaperKnowledgeCard(FlattenKnowledge):
     """Distilled summary card of a `Paper`.
 

diff --git a/quantmind/magic.py b/quantmind/magic.py
@@ -19,7 +19,7 @@
 from collections.abc import Awaitable, Callable
 from typing import Any, Generic, TypeVar, Union, get_args, get_origin
 
-from agents import Agent, Runner
+from agents import Agent, AgentOutputSchema, Runner
 from pydantic import BaseModel
 
 from quantmind.configs.base import BaseFlowCfg
@@ -91,7 +91,15 @@ async def resolve_magic_input(
         name=f"magic_resolver_{target_flow.__name__}",
         instructions=instructions,
         model=resolver_model,
-        output_type=ResolvedFlowConfig[input_type, cfg_type],  # type: ignore[valid-type]
+        output_type=AgentOutputSchema(
+            # Two layers, both required: model_settings is SkipJsonSchema'd in
+            # BaseFlowCfg so the schema builds at all (ModelSettings has callable
+            # fields), and strict_json_schema=False accepts the
+            # additionalProperties the discriminated-union + knowledge models
+            # emit (same rationale as flows/paper.py).
+            ResolvedFlowConfig[input_type, cfg_type],  # type: ignore[valid-type]
+            strict_json_schema=False,
+        ),
     )
     result = await Runner.run(resolver, natural_language)
     out = result.final_output

diff --git a/tests/flows/test_paper.py b/tests/flows/test_paper.py
@@ -7,7 +7,7 @@
 from unittest.mock import AsyncMock, MagicMock, patch
 from uuid import uuid4
 
-from agents import RunHooks
+from agents import AgentOutputSchema, RunHooks
 
 from quantmind.configs import PaperFlowCfg
 from quantmind.configs.paper import (
@@ -25,7 +25,7 @@
     _format_input,
     paper_flow,
 )
-from quantmind.knowledge import Paper, SourceRef, TreeNode
+from quantmind.knowledge import Paper, PaperExtraction, SourceRef, TreeNode
 from quantmind.preprocess.fetch import Fetched, RawPaper
 
 
@@ -273,7 +273,31 @@ def _capture_agent(*_a: Any, **kwargs: Any) -> Any:
             _patch_runner(_stub_paper()),
         ):
             await paper_flow(RawText(text="x"), output_type=MyPaper)
-        self.assertIs(seen["output_type"], MyPaper)
+        # output_type is wrapped in AgentOutputSchema(strict_json_schema=False)
+        # so QuantMind's non-strict knowledge schema is accepted; the override
+        # type is preserved inside the wrapper.
+        self.assertIsInstance(seen["output_type"], AgentOutputSchema)
+        self.assertIs(seen["output_type"].output_type, MyPaper)
+        self.assertFalse(seen["output_type"].is_strict_json_schema())
+
+    async def test_default_output_type_is_slug_tolerant_extraction(
+        self,
+    ) -> None:
+        captured: dict[str, Any] = {}
+
+        def _record_agent_kwargs(*_args: Any, **kwargs: Any) -> Any:
+            captured.update(kwargs)
+            return MagicMock()
+
+        agent_patch = patch(
+            "quantmind.flows.paper.Agent", side_effect=_record_agent_kwargs
+        )
+        with agent_patch, _patch_runner(_stub_paper()):
+            await paper_flow(RawText(text="x"))
+        # No override: Agent gets an AgentOutputSchema wrapping PaperExtraction.
+        schema = captured["output_type"]
+        self.assertIsInstance(schema, AgentOutputSchema)
+        self.assertIs(schema.output_type, PaperExtraction)
 
     async def test_extra_tools_and_guardrails_forwarded(self) -> None:
         seen: dict[str, Any] = {}