Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions apps/miroflow-agent/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ ANTHROPIC_BASE_URL=https://api.anthropic.com
TENCENTCLOUD_SECRET_ID=your_tencent_cloud_secret_id
TENCENTCLOUD_SECRET_KEY=your_tencent_cloud_secret_key

# API for Sofya Search, Scrape, and Research (optional)
# Enable with tool-sofya-search in your agent config. Get a key at https://sofya.co
SOFYA_API_KEY=your_sofya_key
SOFYA_BASE_URL="https://sofya.co"

# API for Summary LLM (optional)
SUMMARY_LLM_BASE_URL="https://your_summary_llm_base_url/v1/chat/completions"
SUMMARY_LLM_MODEL_NAME=your_summary_llm_model_name
Expand Down
31 changes: 31 additions & 0 deletions apps/miroflow-agent/src/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@
TENCENTCLOUD_SECRET_ID = os.environ.get("TENCENTCLOUD_SECRET_ID")
TENCENTCLOUD_SECRET_KEY = os.environ.get("TENCENTCLOUD_SECRET_KEY")

# API for Sofya Search, Scrape, and Research
SOFYA_API_KEY = os.environ.get("SOFYA_API_KEY")
SOFYA_BASE_URL = os.environ.get("SOFYA_BASE_URL", "https://sofya.co")

# API for Summary LLM
SUMMARY_LLM_API_KEY = os.environ.get("SUMMARY_LLM_API_KEY")
SUMMARY_LLM_BASE_URL = os.environ.get("SUMMARY_LLM_BASE_URL")
Expand Down Expand Up @@ -136,6 +140,32 @@ def create_mcp_server_parameters(cfg: DictConfig, agent_cfg: DictConfig):
}
)

if (
agent_cfg.get("tools", None) is not None
and "tool-sofya-search" in agent_cfg["tools"]
):
if not SOFYA_API_KEY:
raise ValueError(
"SOFYA_API_KEY not set, tool-sofya-search will be unavailable."
)

configs.append(
{
"name": "tool-sofya-search",
"params": StdioServerParameters(
command=sys.executable,
args=[
"-m",
"miroflow_tools.mcp_servers.searching_sofya_mcp_server",
],
env={
"SOFYA_API_KEY": SOFYA_API_KEY,
"SOFYA_BASE_URL": SOFYA_BASE_URL,
},
),
}
)

if agent_cfg.get("tools", None) is not None and "tool-python" in agent_cfg["tools"]:
configs.append(
{
Expand Down Expand Up @@ -467,6 +497,7 @@ def get_env_info(cfg: DictConfig) -> dict:
"has_e2b_api_key": bool(E2B_API_KEY),
"has_tencent_secret_id": bool(TENCENTCLOUD_SECRET_ID),
"has_tencent_secret_key": bool(TENCENTCLOUD_SECRET_KEY),
"has_sofya_api_key": bool(SOFYA_API_KEY),
"has_summary_llm_api_key": bool(SUMMARY_LLM_API_KEY),
# Base URLs
"openai_base_url": OPENAI_BASE_URL,
Expand Down
65 changes: 65 additions & 0 deletions libs/miroflow-tools/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ The following tools are implemented but were not used in the MiroThinker v1.0/v1
|-----------------------------|----------------------|---------------------------------------------------|---------------------------------------------------------------------|--------------------------------|
| **Web Searching** | `tool-google-search` | `google_search`, `scrape_website` | `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL` | [Details](#tool-google-search) |
| **Web Searching (Sogou)** | `tool-sogou-search` | `sogou_search`, `scrape_website` | `TENCENTCLOUD_SECRET_ID`, `TENCENTCLOUD_SECRET_KEY`, `JINA_API_KEY`, `JINA_BASE_URL` | [Details](#tool-sogou-search) |
| **Web Searching (Sofya)** | `tool-sofya-search` | `sofya_search`, `scrape_website`, `sofya_research` | `SOFYA_API_KEY`, `SOFYA_BASE_URL` | [Details](#tool-sofya-search) |
| **Vision Processing** | `tool-vqa` | `visual_question_answering` | `ANTHROPIC_API_KEY`, `ANTHROPIC_BASE_URL` | [Details](#tool-vqa) |
| **Vision Processing** | `tool-vqa-os` | `visual_question_answering` | `VISION_API_KEY`, `VISION_BASE_URL`, `VISION_MODEL_NAME` | [Details](#tool-vqa-os) |
| **Audio Processing** | `tool-transcribe` | `audio_transcription`, `audio_question_answering` | `OPENAI_API_KEY`, `OPENAI_BASE_URL` | [Details](#tool-transcribe) |
Expand Down Expand Up @@ -910,6 +911,70 @@ if __name__ == "__main__":

</details>

### Server: tool-sofya-search

Web search, scraping, and deep research through the [Sofya](https://sofya.co) API. Search returns extracted page content instead of snippets, scrape fetches a URL as clean markdown (including PDF and DOCX), and research returns a cited multi-source report. *Optional: Not used in the MiroThinker v1.0/v1.5 evaluation*

**Tools**:

- 🔍 `sofya_search(query, max_results=10)`: Web search with extracted page content (`max_results` is clamped to the range 1–20)
- 🌐 `scrape_website(url)`: Fetch a page as clean markdown
- 📚 `sofya_research(query)`: Multi-source deep research with a cited report

**Environment Variables**:

- 🔑 `SOFYA_API_KEY`: Sofya API key (required). Get one at https://sofya.co
- 🌐 `SOFYA_BASE_URL`: Sofya API base URL (default: `https://sofya.co`)

**Example**:

<details>
<summary>Click to expand code example</summary>

```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters

async def main():
server_configs = [
{
"name": "tool-sofya-search",
"params": StdioServerParameters(
command="python",
args=["-m", "miroflow_tools.mcp_servers.searching_sofya_mcp_server"],
env={
"SOFYA_API_KEY": "your_sofya_api_key",
"SOFYA_BASE_URL": "https://sofya.co"
}
)
}
]

manager = ToolManager(server_configs)

# Web search
result = await manager.execute_tool_call(
server_name="tool-sofya-search",
tool_name="sofya_search",
arguments={"query": "Model Context Protocol", "max_results": 10}
)
print(result)

# Scrape website
result = await manager.execute_tool_call(
server_name="tool-sofya-search",
tool_name="scrape_website",
arguments={"url": "https://example.com/article"}
)
print(result)

if __name__ == "__main__":
asyncio.run(main())
```

</details>

## 🚀 Development

### Adding a New MCP Server
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
Sofya search, scrape, and research MCP server.

Sofya (https://sofya.co) is a web tools API for AI agents. This server exposes
three tools backed by the Sofya REST API:
- sofya_search: web search that returns extracted page content, not just snippets
- scrape_website: fetch a URL as clean markdown (also handles PDF, DOCX, and more)
- sofya_research: decompose a question, read many sources, and return a cited report

Bring your own Sofya API key via the SOFYA_API_KEY environment variable.
"""

import asyncio
import json
import os

import requests
from fastmcp import FastMCP

# Credentials and endpoint are read from the environment once, at import time.
# An unset key becomes "" so each tool can report it instead of crashing.
SOFYA_API_KEY = os.getenv("SOFYA_API_KEY", "")
SOFYA_BASE_URL = os.getenv("SOFYA_BASE_URL", "https://sofya.co")

# Upper bound on the number of attempts made for a single Sofya request.
MAX_RETRIES = 3

# FastMCP server instance that the @mcp.tool() functions below register with.
mcp = FastMCP("searching-sofya-mcp-server")


async def _post_sofya(path: str, payload: dict) -> dict:
    """POST ``payload`` as JSON to a Sofya REST endpoint, retrying transient failures.

    The blocking ``requests`` call is executed in a worker thread so that the
    event loop is not stalled for the duration of the (up to 180 s) request.

    Args:
        path: Endpoint path below ``/v1/``, e.g. ``"search"``.
        payload: JSON-serializable request body.

    Returns:
        The parsed JSON response body.

    Raises:
        requests.HTTPError: Immediately on a 4xx response, or after the last
            attempt when the server keeps answering with another error status.
        requests.ConnectionError: If the last attempt failed to connect.
        requests.Timeout: If the last attempt timed out.
        RuntimeError: If no attempt was made at all (``MAX_RETRIES`` <= 0).
    """
    # Normalise the join so a base URL configured with a trailing slash does
    # not produce "//v1/...".
    url = f"{SOFYA_BASE_URL.rstrip('/')}/v1/{path.lstrip('/')}"
    headers = {
        "Authorization": f"Bearer {SOFYA_API_KEY}",
        "Content-Type": "application/json",
        "User-Agent": "miroflow-sofya-mcp",
    }

    def _send() -> dict:
        # Synchronous request; only ever invoked from a worker thread.
        response = requests.post(url, json=payload, headers=headers, timeout=180)
        response.raise_for_status()
        return response.json()

    last_error: Exception | None = None
    for attempt in range(MAX_RETRIES):
        try:
            return await asyncio.to_thread(_send)
        except requests.HTTPError as e:
            # Client errors (bad key, bad request) will not succeed on retry.
            status = e.response.status_code if e.response is not None else None
            if status is not None and 400 <= status < 500:
                raise
            last_error = e
        except (requests.ConnectionError, requests.Timeout) as e:
            last_error = e

        # Exponential back-off (2 s, 4 s, ... capped at 10 s) between attempts
        # only; sleeping after the final failure would just delay the error.
        if attempt < MAX_RETRIES - 1:
            await asyncio.sleep(min(2 ** (attempt + 1), 10))

    if last_error is not None:
        raise last_error
    raise RuntimeError("Sofya request failed")


@mcp.tool()
async def sofya_search(query: str, max_results: int = 10) -> str:
    """Search the web with Sofya and get extracted page content, not just snippets.

    Use this for general web search. Each result includes the page title, URL, and
    cleaned main content, so the agent can read sources without a separate scrape step.

    Args:
        query: The search query string. Be specific to improve relevance.
        max_results: Number of results to return (1-20, default: 10).

    Returns:
        The search results in JSON format, including a top-level "answer" when
        available and a "results" array of {title, url, content, description,
        published_date} objects.
    """
    # Failures are reported as "[ERROR]: ..." strings rather than raised.
    if not SOFYA_API_KEY:
        return "[ERROR]: SOFYA_API_KEY is not set, sofya_search tool is not available."

    cleaned_query = query.strip() if query else ""
    if not cleaned_query:
        return "[ERROR]: Search query is required and cannot be empty."

    # Clamp the requested count into the 1-20 range.
    capped = min(max_results, 20)
    result_count = max(1, capped)

    request_body = {
        "query": cleaned_query,
        "max_results": result_count,
        "search_depth": "basic",
    }

    try:
        response_data = await _post_sofya("search", request_body)
        # Serialization stays inside the try so its failures are reported too.
        return json.dumps(response_data, ensure_ascii=False)
    except requests.HTTPError as exc:
        http_status = (
            "unknown" if exc.response is None else exc.response.status_code
        )
        return f"[ERROR]: sofya_search failed with HTTP {http_status}: {exc!s}"
    except Exception as exc:
        return f"[ERROR]: sofya_search failed: {exc!s}"


@mcp.tool()
async def scrape_website(url: str) -> str:
    """Fetch a single web page as clean markdown using Sofya. Also handles PDF, DOCX, and more.

    Search engines are not supported by this tool. Use sofya_search to find pages,
    then scrape_website to read a specific URL in full.

    Args:
        url: The URL of the page to fetch. Must start with http:// or https://.

    Returns:
        The page content as markdown, or an error string.
    """
    if not SOFYA_API_KEY:
        return (
            "[ERROR]: SOFYA_API_KEY is not set, scrape_website tool is not available."
        )

    # Only absolute http(s) URLs are forwarded to the API.
    has_http_scheme = bool(url) and url.startswith(("http://", "https://"))
    if not has_http_scheme:
        return f"Invalid URL: '{url}'. URL must start with http:// or https://"

    try:
        response_data = await _post_sofya("fetch", {"urls": [url]})
    except requests.HTTPError as exc:
        http_status = (
            "unknown" if exc.response is None else exc.response.status_code
        )
        return f"[ERROR]: scrape_website failed with HTTP {http_status}: {exc!s}"
    except Exception as exc:
        return f"[ERROR]: scrape_website failed: {exc!s}"

    entries = response_data.get("results")
    if not entries:
        return f"No content retrieved from URL: {url}"

    # One URL was submitted, so only the first entry is inspected.
    first = entries[0]
    succeeded = first.get("success", True)  # a missing flag counts as success
    if not succeeded:
        reason = first.get("error", "unknown error")
        return f"[ERROR]: Failed to fetch '{url}': {reason}"

    raw_content = first.get("content") or ""
    markdown = raw_content.strip()
    if markdown:
        return markdown

    return f"No content retrieved from URL: {url}"


@mcp.tool()
async def sofya_research(query: str) -> str:
    """Run multi-source deep research with Sofya and get back a cited report.

    Sofya decomposes the question into sub-queries, reads many sources in parallel,
    and synthesizes a single report with citations. Use this for open-ended questions
    that need several sources, not for a single lookup (use sofya_search for that).
    This is slower and costs more than a plain search.

    Args:
        query: The research question.

    Returns:
        The research report in JSON format, including "report" (the cited write-up)
        and "sources" (the references used).
    """
    # Failures are reported as "[ERROR]: ..." strings rather than raised.
    if not SOFYA_API_KEY:
        return (
            "[ERROR]: SOFYA_API_KEY is not set, sofya_research tool is not available."
        )

    question = query.strip() if query else ""
    if not question:
        return "[ERROR]: Research query is required and cannot be empty."

    try:
        report = await _post_sofya("research", {"query": question})
        # Serialization stays inside the try so its failures are reported too.
        return json.dumps(report, ensure_ascii=False)
    except requests.HTTPError as exc:
        http_status = (
            "unknown" if exc.response is None else exc.response.status_code
        )
        return f"[ERROR]: sofya_research failed with HTTP {http_status}: {exc!s}"
    except Exception as exc:
        return f"[ERROR]: sofya_research failed: {exc!s}"


if __name__ == "__main__":
    # When executed as a script, start the FastMCP server on the stdio transport.
    mcp.run(transport="stdio")