Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion quantmind/configs/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from agents import ModelSettings
from pydantic import BaseModel, ConfigDict
from pydantic.json_schema import SkipJsonSchema


class BaseFlowCfg(BaseModel):
Expand All @@ -17,7 +18,12 @@ class BaseFlowCfg(BaseModel):

# Model & execution
model: str = "gpt-4o"
model_settings: ModelSettings | None = None
# ``ModelSettings`` carries callable fields that cannot be rendered to JSON
# schema. It is an execution knob (set programmatically), never something the
# magic NL resolver should populate, so skip it during schema generation —
# otherwise the resolver's structured-output schema build raises
# PydanticInvalidForJsonSchema. The field still validates and round-trips.
model_settings: SkipJsonSchema[ModelSettings | None] = None
max_turns: int = 10
timeout_seconds: float = 300.0

Expand Down
17 changes: 12 additions & 5 deletions quantmind/flows/paper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from typing import Any, TypeVar

from agents import Agent, RunHooks, Tool
from agents import Agent, AgentOutputSchema, RunHooks, Tool

from quantmind.configs import PaperFlowCfg
from quantmind.configs.paper import (
Expand All @@ -25,7 +25,7 @@
RawText,
)
from quantmind.flows._runner import run_with_observability
from quantmind.knowledge import Paper
from quantmind.knowledge import Paper, PaperExtraction
from quantmind.preprocess.fetch import (
Fetched,
fetch_arxiv,
Expand Down Expand Up @@ -85,7 +85,10 @@ async def paper_flow(
unpaywall fallback is its own follow-up issue).
"""
cfg = cfg or PaperFlowCfg()
out_type: type[Paper] = output_type or Paper # type: ignore[assignment]
# Default to the slug-tolerant extraction model so the LLM's human-readable
# node ids are canonicalised to UUIDs (see knowledge._extraction). Callers
# passing their own output_type own that concern themselves.
out_type: type[Paper] = output_type or PaperExtraction # type: ignore[assignment]

raw_md, source_meta = await _fetch_and_format(input)

Expand All @@ -98,7 +101,11 @@ async def paper_flow(
),
"model": cfg.model,
"tools": list(extra_tools or []),
"output_type": out_type,
# QuantMind's knowledge models (Paper, etc.) emit `additionalProperties`
# in their JSON schema, which the openai-agents SDK's strict-schema mode
# rejects, crashing agent setup in paper_flow. Wrapping the type with
# strict_json_schema=False selects non-strict structured output instead
# (the SDK-recommended fix).
"output_type": AgentOutputSchema(out_type, strict_json_schema=False),
"input_guardrails": list(extra_input_guardrails or []),
"output_guardrails": list(extra_output_guardrails or []),
}
Expand Down Expand Up @@ -140,7 +147,7 @@ async def _fetch_and_format(
md = await _format_by_content_type(raw)
return md, {
"source": "local",
"path": str(input.path),
"path": input.path.as_posix(),
"content_type": raw.content_type,
}
if isinstance(input, RawText):
Expand Down
7 changes: 6 additions & 1 deletion quantmind/knowledge/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@
from quantmind.knowledge.earnings import Earnings
from quantmind.knowledge.factor import Factor
from quantmind.knowledge.news import News
from quantmind.knowledge.paper import Paper, PaperKnowledgeCard
from quantmind.knowledge.paper import (
Paper,
PaperExtraction,
PaperKnowledgeCard,
)
from quantmind.knowledge.thesis import Thesis

__all__ = [
Expand All @@ -43,6 +47,7 @@
"Factor",
"News",
"Paper",
"PaperExtraction",
"PaperKnowledgeCard",
"Thesis",
]
106 changes: 106 additions & 0 deletions quantmind/knowledge/_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""Slug -> UUID canonicalisation for LLM-extracted TreeKnowledge.

With ``strict_json_schema=False`` the openai-agents SDK lets the model fill
``UUID`` id fields with human-readable slugs ("root", "intro"). Slugs are the
right shape for one-shot tree generation: the model references nodes it is
creating in the same response, and self-consistent slugs beat invented UUIDs.
The domain model still stores ``UUID``: a deliberate, tested invariant (node-id
uniqueness + UUID-typed JSON round-trips in ``tests/knowledge``) and the basis
for stable dedup/identity across re-runs. This module bridges the
two: it maps each distinct slug to a fresh ``UUID`` and rewrites every id slot,
leaving values that are already valid UUIDs untouched.
"""

from collections.abc import Callable
from typing import Any
from uuid import UUID, uuid4

# Signature of the id-slot resolver handed to the rewrite helpers below:
# it receives one raw id value and returns the value to store in its place.
_ResolveFn = Callable[[Any], Any]


def _looks_like_uuid(value: Any) -> bool:
"""True if ``value`` is a UUID or a string already in UUID form."""
if isinstance(value, UUID):
return True
if not isinstance(value, str):
return False
try:
UUID(value)
except ValueError:
return False
return True


def canonicalize_tree_ids(data: Any) -> Any:
    """Rewrite slug ids in a TreeKnowledge-shaped mapping to UUIDs.

    The input is never mutated: the top-level mapping is shallow-copied, and
    node and citation dicts are copied by the rewrite helpers before any id
    slot is changed. Anything that is not a ``dict`` carrying a ``nodes``
    dict is returned unchanged. Every distinct slug maps to one UUID string,
    so cross-references (``parent_id``, ``children_ids``, citation anchors,
    and ``nodes`` keys) stay internally consistent. Values that already look
    like UUIDs, ``None`` and non-string values are left as they are.

    The top-level ``citations`` entry is rewritten only when it is present
    and is a ``list``, mirroring the per-node handling. A missing key is not
    injected, and a malformed value (for example ``None``) is passed through
    for downstream validation to reject instead of raising ``TypeError`` here.
    """
    if not isinstance(data, dict):
        return data
    nodes = data.get("nodes")
    if not isinstance(nodes, dict):
        return data

    # slug -> UUID string, shared by every id slot in this payload.
    mapping: dict[str, str] = {}

    def resolve(raw: Any) -> Any:
        """Map one id slot: slug -> uuid; UUID / None / non-str pass through."""
        if raw is None or not isinstance(raw, str) or _looks_like_uuid(raw):
            return raw
        if raw not in mapping:
            mapping[raw] = str(uuid4())
        return mapping[raw]

    # Seed from the authoritative identity slots (the node keys) first so a
    # reference resolves to the same UUID no matter where it is first seen.
    for slug in nodes:
        resolve(slug)

    out = dict(data)
    if "id" in out:
        out["id"] = resolve(out["id"])
    if "root_node_id" in out:
        out["root_node_id"] = resolve(out["root_node_id"])
    # Guarded like the node-level citations: iterating ``None`` would raise
    # TypeError, and iterating a str/dict would silently corrupt the value.
    citations = out.get("citations")
    if isinstance(citations, list):
        out["citations"] = [_rewrite_citation(c, resolve) for c in citations]
    out["nodes"] = {
        resolve(key): _rewrite_node(node, resolve)
        for key, node in nodes.items()
    }
    return out


def _rewrite_node(node: Any, resolve: _ResolveFn) -> Any:
"""Return a copy of one node dict with its id slots resolved."""
if not isinstance(node, dict):
return node
out = dict(node)
if "node_id" in out:
out["node_id"] = resolve(out["node_id"])
if "parent_id" in out:
out["parent_id"] = resolve(out["parent_id"])
children = out.get("children_ids")
if isinstance(children, list):
out["children_ids"] = [resolve(c) for c in children]
citations = out.get("citations")
if isinstance(citations, list):
out["citations"] = [_rewrite_citation(c, resolve) for c in citations]
return out


def _rewrite_citation(cit: Any, resolve: _ResolveFn) -> Any:
"""Return a copy of one citation dict with its anchor ids resolved."""
if not isinstance(cit, dict):
return cit
out = dict(cit)
if "tree_id" in out:
out["tree_id"] = resolve(out["tree_id"])
if "node_id" in out:
out["node_id"] = resolve(out["node_id"])
return out
22 changes: 20 additions & 2 deletions quantmind/knowledge/paper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
summarisation), then a `PaperKnowledgeCard` derived from the root summary.
"""

from typing import Literal
from typing import Any, Literal
from uuid import UUID

from pydantic import Field
from pydantic import Field, model_validator

from quantmind.knowledge._extraction import canonicalize_tree_ids
from quantmind.knowledge._flatten import FlattenKnowledge
from quantmind.knowledge._tree import TreeKnowledge

Expand All @@ -31,6 +32,23 @@ class Paper(TreeKnowledge):
asset_classes: list[str] = Field(default_factory=list)


class PaperExtraction(Paper):
    """``Paper`` subclass used where LLM output is parsed.

    It adds no fields. Its only addition is a ``mode="before"`` model
    validator that sends the raw payload through ``canonicalize_tree_ids``,
    so slug ids emitted under ``strict_json_schema=False`` are replaced with
    UUIDs before the inherited ``Paper`` validation sees them. Instances are
    therefore also ``Paper`` instances. Intended as ``paper_flow``'s default
    ``output_type``.
    """

    @model_validator(mode="before")
    @classmethod
    def _canonicalize_ids(cls, data: Any) -> Any:
        """Return ``data`` with slug ids rewritten to UUIDs."""
        canonical = canonicalize_tree_ids(data)
        return canonical


class PaperKnowledgeCard(FlattenKnowledge):
"""Distilled summary card of a `Paper`.

Expand Down
12 changes: 10 additions & 2 deletions quantmind/magic.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from collections.abc import Awaitable, Callable
from typing import Any, Generic, TypeVar, Union, get_args, get_origin

from agents import Agent, Runner
from agents import Agent, AgentOutputSchema, Runner
from pydantic import BaseModel

from quantmind.configs.base import BaseFlowCfg
Expand Down Expand Up @@ -91,7 +91,15 @@ async def resolve_magic_input(
name=f"magic_resolver_{target_flow.__name__}",
instructions=instructions,
model=resolver_model,
output_type=ResolvedFlowConfig[input_type, cfg_type], # type: ignore[valid-type]
output_type=AgentOutputSchema(
# Two layers, both required: model_settings is SkipJsonSchema'd in
# BaseFlowCfg so the schema builds at all (ModelSettings has callable
# fields), and strict_json_schema=False accepts the
# additionalProperties the discriminated-union + knowledge models
# emit (same rationale as flows/paper.py).
ResolvedFlowConfig[input_type, cfg_type], # type: ignore[valid-type]
strict_json_schema=False,
),
)
result = await Runner.run(resolver, natural_language)
out = result.final_output
Expand Down
30 changes: 27 additions & 3 deletions tests/flows/test_paper.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from unittest.mock import AsyncMock, MagicMock, patch
from uuid import uuid4

from agents import RunHooks
from agents import AgentOutputSchema, RunHooks

from quantmind.configs import PaperFlowCfg
from quantmind.configs.paper import (
Expand All @@ -25,7 +25,7 @@
_format_input,
paper_flow,
)
from quantmind.knowledge import Paper, SourceRef, TreeNode
from quantmind.knowledge import Paper, PaperExtraction, SourceRef, TreeNode
from quantmind.preprocess.fetch import Fetched, RawPaper


Expand Down Expand Up @@ -273,7 +273,31 @@ def _capture_agent(*_a: Any, **kwargs: Any) -> Any:
_patch_runner(_stub_paper()),
):
await paper_flow(RawText(text="x"), output_type=MyPaper)
self.assertIs(seen["output_type"], MyPaper)
# output_type is wrapped in AgentOutputSchema(strict_json_schema=False)
# so QuantMind's non-strict knowledge schema is accepted; the override
# type is preserved inside the wrapper.
self.assertIsInstance(seen["output_type"], AgentOutputSchema)
self.assertIs(seen["output_type"].output_type, MyPaper)
self.assertFalse(seen["output_type"].is_strict_json_schema())

async def test_default_output_type_is_slug_tolerant_extraction(
    self,
) -> None:
    """With no ``output_type`` override, the agent gets ``PaperExtraction``."""
    captured: dict[str, Any] = {}

    def _record_agent_kwargs(*_args: Any, **kwargs: Any) -> Any:
        # Stand-in for the Agent constructor: remember what it was given.
        captured.update(kwargs)
        return MagicMock()

    agent_patch = patch(
        "quantmind.flows.paper.Agent", side_effect=_record_agent_kwargs
    )
    with agent_patch, _patch_runner(_stub_paper()):
        await paper_flow(RawText(text="x"))

    # No override -> the flow must default to the slug-tolerant extraction
    # model so the LLM's slug ids get canonicalised to UUIDs.
    schema = captured["output_type"]
    self.assertIsInstance(schema, AgentOutputSchema)
    self.assertIs(schema.output_type, PaperExtraction)

async def test_extra_tools_and_guardrails_forwarded(self) -> None:
seen: dict[str, Any] = {}
Expand Down
Loading