From 9d34817081eff52d0fecd39c38a2da44b248fdea Mon Sep 17 00:00:00 2001 From: yusufgurdogan Date: Sat, 27 Jun 2026 18:23:38 +0300 Subject: [PATCH] Add Sofya search, scrape, and research MCP server Add tool-sofya-search, a web tools provider backed by the Sofya API (https://sofya.co), alongside the existing Serper and Sogou search servers. It exposes three tools: - sofya_search: web search that returns extracted page content, not snippets - scrape_website: fetch a URL as clean markdown (also handles PDF and DOCX) - sofya_research: multi-source deep research that returns a cited report The server is opt-in: add tool-sofya-search to an agent's tools list and set SOFYA_API_KEY. Registered in settings.py and documented in the tools README and .env.example, mirroring the existing search providers. --- apps/miroflow-agent/.env.example | 5 + apps/miroflow-agent/src/config/settings.py | 31 +++ libs/miroflow-tools/README.md | 65 +++++++ .../mcp_servers/searching_sofya_mcp_server.py | 183 ++++++++++++++++++ 4 files changed, 284 insertions(+) create mode 100644 libs/miroflow-tools/src/miroflow_tools/mcp_servers/searching_sofya_mcp_server.py diff --git a/apps/miroflow-agent/.env.example b/apps/miroflow-agent/.env.example index 7b46f867..2ed3df83 100644 --- a/apps/miroflow-agent/.env.example +++ b/apps/miroflow-agent/.env.example @@ -36,6 +36,11 @@ ANTHROPIC_BASE_URL=https://api.anthropic.com TENCENTCLOUD_SECRET_ID=your_tencent_cloud_secret_id TENCENTCLOUD_SECRET_KEY=your_tencent_cloud_secret_key +# API for Sofya Search, Scrape, and Research (optional) +# Enable with tool-sofya-search in your agent config. 
Get a key at https://sofya.co +SOFYA_API_KEY=your_sofya_key +SOFYA_BASE_URL="https://sofya.co" + # API for Summary LLM (optional) SUMMARY_LLM_BASE_URL="https://your_summary_llm_base_url/v1/chat/completions" SUMMARY_LLM_MODEL_NAME=your_summary_llm_model_name diff --git a/apps/miroflow-agent/src/config/settings.py b/apps/miroflow-agent/src/config/settings.py index d5489ddd..4f49709d 100644 --- a/apps/miroflow-agent/src/config/settings.py +++ b/apps/miroflow-agent/src/config/settings.py @@ -59,6 +59,10 @@ TENCENTCLOUD_SECRET_ID = os.environ.get("TENCENTCLOUD_SECRET_ID") TENCENTCLOUD_SECRET_KEY = os.environ.get("TENCENTCLOUD_SECRET_KEY") +# API for Sofya Search, Scrape, and Research +SOFYA_API_KEY = os.environ.get("SOFYA_API_KEY") +SOFYA_BASE_URL = os.environ.get("SOFYA_BASE_URL", "https://sofya.co") + # API for Summary LLM SUMMARY_LLM_API_KEY = os.environ.get("SUMMARY_LLM_API_KEY") SUMMARY_LLM_BASE_URL = os.environ.get("SUMMARY_LLM_BASE_URL") @@ -136,6 +140,32 @@ def create_mcp_server_parameters(cfg: DictConfig, agent_cfg: DictConfig): } ) + if ( + agent_cfg.get("tools", None) is not None + and "tool-sofya-search" in agent_cfg["tools"] + ): + if not SOFYA_API_KEY: + raise ValueError( + "SOFYA_API_KEY not set, tool-sofya-search will be unavailable." 
+ ) + + configs.append( + { + "name": "tool-sofya-search", + "params": StdioServerParameters( + command=sys.executable, + args=[ + "-m", + "miroflow_tools.mcp_servers.searching_sofya_mcp_server", + ], + env={ + "SOFYA_API_KEY": SOFYA_API_KEY, + "SOFYA_BASE_URL": SOFYA_BASE_URL, + }, + ), + } + ) + if agent_cfg.get("tools", None) is not None and "tool-python" in agent_cfg["tools"]: configs.append( { @@ -467,6 +497,7 @@ def get_env_info(cfg: DictConfig) -> dict: "has_e2b_api_key": bool(E2B_API_KEY), "has_tencent_secret_id": bool(TENCENTCLOUD_SECRET_ID), "has_tencent_secret_key": bool(TENCENTCLOUD_SECRET_KEY), + "has_sofya_api_key": bool(SOFYA_API_KEY), "has_summary_llm_api_key": bool(SUMMARY_LLM_API_KEY), # Base URLs "openai_base_url": OPENAI_BASE_URL, diff --git a/libs/miroflow-tools/README.md b/libs/miroflow-tools/README.md index 08a444c3..4aa070b2 100644 --- a/libs/miroflow-tools/README.md +++ b/libs/miroflow-tools/README.md @@ -44,6 +44,7 @@ The following tools are implemented but were not used in the MiroThinker v1.0/v1 |-----------------------------|----------------------|---------------------------------------------------|---------------------------------------------------------------------|--------------------------------| | **Web Searching** | `tool-google-search` | `google_search`, `scrape_website` | `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL` | [Details](#tool-google-search) | | **Web Searching (Sogou)** | `tool-sogou-search` | `sogou_search`, `scrape_website` | `TENCENTCLOUD_SECRET_ID`, `TENCENTCLOUD_SECRET_KEY`, `JINA_API_KEY`, `JINA_BASE_URL` | [Details](#tool-sogou-search) | +| **Web Searching (Sofya)** | `tool-sofya-search` | `sofya_search`, `scrape_website`, `sofya_research` | `SOFYA_API_KEY`, `SOFYA_BASE_URL` | [Details](#tool-sofya-search) | | **Vision Processing** | `tool-vqa` | `visual_question_answering` | `ANTHROPIC_API_KEY`, `ANTHROPIC_BASE_URL` | [Details](#tool-vqa) | | **Vision Processing** | `tool-vqa-os` | 
`visual_question_answering` | `VISION_API_KEY`, `VISION_BASE_URL`, `VISION_MODEL_NAME` | [Details](#tool-vqa-os) |
 | **Audio Processing** | `tool-transcribe` | `audio_transcription`, `audio_question_answering` | `OPENAI_API_KEY`, `OPENAI_BASE_URL` | [Details](#tool-transcribe) |
@@ -910,6 +911,70 @@ if __name__ == "__main__":
 
 </details>
 
+### Server: tool-sofya-search
+
+Web search, scraping, and deep research through the [Sofya](https://sofya.co) API. Search returns extracted page content instead of snippets, scrape fetches a URL as clean markdown (including PDF and DOCX), and research returns a cited multi-source report. *Optional: Not used in the MiroThinker v1.0/v1.5 evaluation*
+
+**Tools**:
+
+- 🔍 `sofya_search(query, max_results=10)`: Web search with extracted page content
+- 🌐 `scrape_website(url)`: Fetch a page as clean markdown
+- 📚 `sofya_research(query)`: Multi-source deep research with a cited report
+
+**Environment Variables**:
+
+- 🔑 `SOFYA_API_KEY`: Sofya API key (required). Get one at https://sofya.co
+- 🌐 `SOFYA_BASE_URL`: Sofya API base URL (default: `https://sofya.co`)
+
+**Example**:
+
+<details>
+<summary>Click to expand code example</summary>
+
+```python
+import asyncio
+from miroflow_tools import ToolManager
+from mcp import StdioServerParameters
+
+async def main():
+    server_configs = [
+        {
+            "name": "tool-sofya-search",
+            "params": StdioServerParameters(
+                command="python",
+                args=["-m", "miroflow_tools.mcp_servers.searching_sofya_mcp_server"],
+                env={
+                    "SOFYA_API_KEY": "your_sofya_api_key",
+                    "SOFYA_BASE_URL": "https://sofya.co"
+                }
+            )
+        }
+    ]
+
+    manager = ToolManager(server_configs)
+
+    # Web search
+    result = await manager.execute_tool_call(
+        server_name="tool-sofya-search",
+        tool_name="sofya_search",
+        arguments={"query": "Model Context Protocol", "max_results": 10}
+    )
+    print(result)
+
+    # Scrape website
+    result = await manager.execute_tool_call(
+        server_name="tool-sofya-search",
+        tool_name="scrape_website",
+        arguments={"url": "https://example.com/article"}
+    )
+    print(result)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+</details>
+
 ## 🚀 Development
 
 ### Adding a New MCP Server
diff --git a/libs/miroflow-tools/src/miroflow_tools/mcp_servers/searching_sofya_mcp_server.py b/libs/miroflow-tools/src/miroflow_tools/mcp_servers/searching_sofya_mcp_server.py
new file mode 100644
index 00000000..a40fe2d4
--- /dev/null
+++ b/libs/miroflow-tools/src/miroflow_tools/mcp_servers/searching_sofya_mcp_server.py
@@ -0,0 +1,226 @@
+# Copyright (c) 2025 MiroMind
+# This source code is licensed under the Apache 2.0 License.
+
+"""
+Sofya search, scrape, and research MCP server.
+
+Sofya (https://sofya.co) is a web tools API for AI agents. This server exposes
+three tools backed by the Sofya REST API:
+- sofya_search: web search that returns extracted page content, not just snippets
+- scrape_website: fetch a URL as clean markdown (also handles PDF, DOCX, and more)
+- sofya_research: decompose a question, read many sources, and return a cited report
+
+Bring your own Sofya API key via the SOFYA_API_KEY environment variable.
+"""
+
+import asyncio
+import json
+import os
+
+import requests
+from fastmcp import FastMCP
+
+SOFYA_API_KEY = os.environ.get("SOFYA_API_KEY", "")
+# Fall back to the public endpoint when the variable is unset or empty, and
+# drop a trailing slash so the joined URL never contains "//v1/".
+SOFYA_BASE_URL = (os.environ.get("SOFYA_BASE_URL") or "https://sofya.co").rstrip("/")
+
+MAX_RETRIES = 3
+# Per-attempt timeout, in seconds, passed to requests.post.
+REQUEST_TIMEOUT_SECONDS = 180
+# 4xx statuses that can succeed on a later attempt (request timeout, rate limit).
+RETRYABLE_CLIENT_STATUSES = frozenset({408, 429})
+
+# Initialize FastMCP server
+mcp = FastMCP("searching-sofya-mcp-server")
+
+
+async def _post_sofya(path: str, payload: dict) -> dict:
+    """POST a JSON payload to a Sofya REST endpoint, retrying transient failures.
+
+    The blocking ``requests.post`` call runs in a worker thread so the asyncio
+    event loop is not stalled while a request is in flight.
+
+    Args:
+        path: Endpoint name appended to ``{SOFYA_BASE_URL}/v1/``.
+        payload: JSON-serializable request body.
+
+    Returns:
+        The parsed JSON response.
+
+    Raises:
+        requests.HTTPError: Immediately on a non-retryable 4xx response, or
+            after MAX_RETRIES attempts on 5xx and retryable 4xx responses.
+        requests.ConnectionError, requests.Timeout: After MAX_RETRIES attempts.
+    """
+    url = f"{SOFYA_BASE_URL}/v1/{path}"
+    headers = {
+        "Authorization": f"Bearer {SOFYA_API_KEY}",
+        "Content-Type": "application/json",
+        "User-Agent": "miroflow-sofya-mcp",
+    }
+
+    last_error: Exception | None = None
+    for attempt in range(MAX_RETRIES):
+        try:
+            # requests is synchronous; run it in a worker thread, not on the loop.
+            response = await asyncio.to_thread(
+                requests.post,
+                url,
+                json=payload,
+                headers=headers,
+                timeout=REQUEST_TIMEOUT_SECONDS,
+            )
+            response.raise_for_status()
+            return response.json()
+        except requests.HTTPError as e:
+            status = e.response.status_code if e.response is not None else None
+            # Client errors (bad key, bad request) will not succeed on retry.
+            if (
+                status is not None
+                and 400 <= status < 500
+                and status not in RETRYABLE_CLIENT_STATUSES
+            ):
+                raise
+            last_error = e
+        except (requests.ConnectionError, requests.Timeout) as e:
+            last_error = e
+        # Back off before the next attempt; skip the wait after the final one.
+        if attempt < MAX_RETRIES - 1:
+            await asyncio.sleep(min(2 ** (attempt + 1), 10))
+
+    raise last_error if last_error else RuntimeError("Sofya request failed")
+
+
+def _format_error(tool_name: str, error: Exception) -> str:
+    """Format a failed Sofya call as the "[ERROR]: ..." text given to the agent."""
+    if isinstance(error, requests.HTTPError):
+        response = error.response
+        status = response.status_code if response is not None else "unknown"
+        return f"[ERROR]: {tool_name} failed with HTTP {status}: {error}"
+    return f"[ERROR]: {tool_name} failed: {error}"
+
+
+@mcp.tool()
+async def sofya_search(query: str, max_results: int = 10) -> str:
+    """Search the web with Sofya and get extracted page content, not just snippets.
+
+    Use this for general web search. Each result includes the page title, URL, and
+    cleaned main content, so the agent can read sources without a separate scrape step.
+
+    Args:
+        query: The search query string. Be specific to improve relevance.
+        max_results: Number of results to return (1-20, default: 10).
+
+    Returns:
+        The search results in JSON format, including a top-level "answer" when
+        available and a "results" array of {title, url, content, description,
+        published_date} objects.
+    """
+    if not SOFYA_API_KEY:
+        return "[ERROR]: SOFYA_API_KEY is not set, sofya_search tool is not available."
+
+    if not query or not query.strip():
+        return "[ERROR]: Search query is required and cannot be empty."
+
+    payload = {
+        "query": query.strip(),
+        "max_results": max(1, min(max_results, 20)),
+        "search_depth": "basic",
+    }
+
+    try:
+        data = await _post_sofya("search", payload)
+        return json.dumps(data, ensure_ascii=False)
+    except Exception as e:
+        # Tool boundary: report the failure as text instead of raising.
+        return _format_error("sofya_search", e)
+
+
+@mcp.tool()
+async def scrape_website(url: str) -> str:
+    """Fetch a single web page as clean markdown using Sofya. Also handles PDF, DOCX, and more.
+
+    Search engines are not supported by this tool. Use sofya_search to find pages,
+    then scrape_website to read a specific URL in full.
+
+    Args:
+        url: The URL of the page to fetch. Must start with http:// or https://.
+
+    Returns:
+        The page content as markdown, or an error string.
+    """
+    if not SOFYA_API_KEY:
+        return (
+            "[ERROR]: SOFYA_API_KEY is not set, scrape_website tool is not available."
+        )
+
+    if not url or not url.startswith(("http://", "https://")):
+        return (
+            f"[ERROR]: Invalid URL: '{url}'. "
+            "URL must start with http:// or https://"
+        )
+
+    try:
+        data = await _post_sofya("fetch", {"urls": [url]})
+    except Exception as e:
+        # Tool boundary: report the failure as text instead of raising.
+        return _format_error("scrape_website", e)
+
+    no_content = f"No content retrieved from URL: {url}"
+
+    # Validate the response shape so a malformed payload is reported as
+    # "no content" rather than raising out of the tool.
+    results = data.get("results") if isinstance(data, dict) else None
+    if not isinstance(results, list) or not results:
+        return no_content
+
+    result = results[0]
+    if not isinstance(result, dict):
+        return no_content
+    if not result.get("success", True):
+        return (
+            f"[ERROR]: Failed to fetch '{url}': {result.get('error', 'unknown error')}"
+        )
+
+    content = result.get("content")
+    if not isinstance(content, str) or not content.strip():
+        return no_content
+
+    return content.strip()
+
+
+@mcp.tool()
+async def sofya_research(query: str) -> str:
+    """Run multi-source deep research with Sofya and get back a cited report.
+
+    Sofya decomposes the question into sub-queries, reads many sources in parallel,
+    and synthesizes a single report with citations. Use this for open-ended questions
+    that need several sources, not for a single lookup (use sofya_search for that).
+    This is slower and costs more than a plain search.
+
+    Args:
+        query: The research question.
+
+    Returns:
+        The research report in JSON format, including "report" (the cited write-up)
+        and "sources" (the references used).
+    """
+    if not SOFYA_API_KEY:
+        return (
+            "[ERROR]: SOFYA_API_KEY is not set, sofya_research tool is not available."
+        )
+
+    if not query or not query.strip():
+        return "[ERROR]: Research query is required and cannot be empty."
+
+    try:
+        data = await _post_sofya("research", {"query": query.strip()})
+        return json.dumps(data, ensure_ascii=False)
+    except Exception as e:
+        # Tool boundary: report the failure as text instead of raising.
+        return _format_error("sofya_research", e)
+
+
+if __name__ == "__main__":
+    mcp.run(transport="stdio")