From 2a5ff4b33dd21f7e062d40f742f32d914452e611 Mon Sep 17 00:00:00 2001
From: us
Date: Mon, 15 Jun 2026 01:52:03 +0300
Subject: [PATCH] feat: add fastCRW search tool

---
 src/tool/default_tools/search/__init__.py   |   2 +
 src/tool/default_tools/search/crw_search.py | 230 ++++++++++++++++++++
 src/tool/default_tools/web_searcher.py      |  12 +-
 3 files changed, 239 insertions(+), 5 deletions(-)
 create mode 100644 src/tool/default_tools/search/crw_search.py

diff --git a/src/tool/default_tools/search/__init__.py b/src/tool/default_tools/search/__init__.py
index 02005c55..7d9831f0 100644
--- a/src/tool/default_tools/search/__init__.py
+++ b/src/tool/default_tools/search/__init__.py
@@ -1,5 +1,6 @@
 from .types import SearchItem
 from .firecrawl_search import FirecrawlSearch
+from .crw_search import CrwSearch
 from .brave_search import BraveSearch
 from .bing_search import BingSearch
 from .google_search import GoogleSearch
@@ -9,6 +10,7 @@
 __all__ = [
     "SearchItem",
     "FirecrawlSearch",
+    "CrwSearch",
     "BraveSearch",
     "BingSearch",
     "GoogleSearch",
diff --git a/src/tool/default_tools/search/crw_search.py b/src/tool/default_tools/search/crw_search.py
new file mode 100644
index 00000000..f5f10952
--- /dev/null
+++ b/src/tool/default_tools/search/crw_search.py
@@ -0,0 +1,230 @@
+from __future__ import annotations
+from typing import Any, Optional, Dict, List, Type
+import json
+import os
+from pydantic import ConfigDict, Field
+from firecrawl import AsyncFirecrawlApp
+from dotenv import load_dotenv
+load_dotenv()
+
+from src.tool.default_tools.search.types import SearchItem, SearchToolArgs
+from src.tool.types import Tool, ToolResponse, ToolExtra
+from src.logger import logger
+from src.registry import TOOL
+
+# Default fastCRW cloud base URL. fastCRW is a Firecrawl-compatible web scraper
+# (single binary; self-host or cloud), so the official Firecrawl client can be
+# pointed at it by overriding the base URL. Set CRW_API_URL to use a self-hosted
+# server (e.g. http://localhost:3000).
+DEFAULT_CRW_API_URL = "https://fastcrw.com/api"
+
+@TOOL.register_module(force=True)
+class CrwSearch(Tool):
+    """Tool that queries the fastCRW search engine.
+
+    fastCRW is a Firecrawl-compatible web data engine (single binary; self-host
+    or cloud). It exposes the same /v1/search API as Firecrawl, so the official
+    Firecrawl client is reused with the fastCRW base URL.
+
+    Example usages:
+    .. code-block:: python
+        # basic usage
+        tool = CrwSearch()
+
+    .. code-block:: python
+        # with custom search kwargs
+        tool = CrwSearch.from_search_kwargs({"limit": 5})
+    """
+    model_config = ConfigDict(arbitrary_types_allowed=True, extra="allow")
+
+    name: str = "crw_search"
+    description: str = (
+        "a search engine. "
+        "useful for when you need to answer questions about current events."
+        " input should be a search query."
+    )
+    metadata: Dict[str, Any] = Field(default={}, description="The metadata of the tool")
+    api_key: Optional[str] = Field(default=None, description="fastCRW API key")
+    api_url: Optional[str] = Field(default=None, description="fastCRW base URL (override for self-host)")
+
+    def __init__(self, **kwargs):
+        """Initialize the tool; explicit api_key/api_url take precedence over the environment."""
+        # Fallback order: constructor argument, then CRW_API_KEY / CRW_API_URL, then the default URL.
+        super().__init__(**kwargs)
+        self.api_key = self.api_key or os.getenv("CRW_API_KEY")
+        self.api_url = self.api_url or os.getenv("CRW_API_URL") or DEFAULT_CRW_API_URL
+
+    @classmethod
+    def from_search_kwargs(cls, search_kwargs: dict, **kwargs: Any) -> CrwSearch:
+        """Create a tool from search kwargs.
+
+        Args:
+            search_kwargs: Any additional kwargs to pass to the search function.
+            **kwargs: Any additional kwargs to pass to the tool.
+
+        Returns:
+            A tool.
+        """
+        return cls(search_kwargs=search_kwargs, **kwargs)
+
+    async def _search_crw(self,
+                          query: str,
+                          num_results: int = 10,
+                          filter_year: Optional[int] = 2025) -> List[SearchItem]:
+        """
+        Query fastCRW for `query`; raises ValueError when no API key is configured.
+        Returns a list of SearchItem objects (empty if the API call fails or yields nothing).
+        """
+        if not self.api_key:
+            raise ValueError("CRW_API_KEY environment variable is required")
+
+        results = []
+
+        app = AsyncFirecrawlApp(api_key=self.api_key, api_url=self.api_url)
+        search_kwargs = {
+            "query": query,
+            "limit": num_results,
+        }
+
+        # Add date filter if year is valid (1900-2100)
+        # Handle None case (when explicitly passed from caller)
+        if filter_year is None:
+            filter_year = 2025  # Use default if None
+
+        if 1900 <= filter_year <= 2100:
+            search_kwargs["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}"
+        else:
+            logger.warning(f"Invalid filter_year: {filter_year}. Expected 1900-2100. Ignoring date filter.")
+
+        try:
+            response = await app.search(**search_kwargs)
+        except Exception as e:
+            logger.error(f"fastCRW API call failed: {e}")
+            return results
+
+        # Check if response and response.web exist and are not None
+        if response is None:
+            logger.warning("fastCRW search returned None response")
+            return results
+
+        # Log response structure for debugging
+        logger.debug(f"fastCRW response type: {type(response)}")
+        logger.debug(f"fastCRW response attributes: {dir(response) if hasattr(response, '__dict__') else 'N/A'}")
+
+        # Check for different possible response formats
+        web_results = None
+
+        # Try to access web results from response object
+        if hasattr(response, 'web') and response.web is not None:
+            web_results = response.web
+        elif hasattr(response, 'data') and response.data is not None:
+            # Some API versions might use 'data' instead of 'web'
+            web_results = response.data
+        elif hasattr(response, 'results') and response.results is not None:
+            # Try 'results' attribute
+            web_results = response.results
+        elif isinstance(response, dict):
+            # Response might be a dict
+            web_results = response.get('web') or response.get('data') or response.get('results')
+        elif isinstance(response, list):
+            # Response might be a list directly
+            web_results = response
+        else:
+            # Try to convert response to dict if it's a Pydantic model
+            try:
+                if hasattr(response, 'model_dump'):
+                    response_dict = response.model_dump()
+                    web_results = response_dict.get('web') or response_dict.get('data') or response_dict.get('results')
+                elif hasattr(response, 'dict'):
+                    response_dict = response.dict()
+                    web_results = response_dict.get('web') or response_dict.get('data') or response_dict.get('results')
+            except Exception as e:
+                logger.debug(f"fastCRW response could not be converted to a dict: {e}")
+
+        if web_results is None:
+            # Log full response structure for debugging
+            logger.warning(
+                f"fastCRW search response has no accessible results. "
+                f"Response type: {type(response)}, Response: {str(response)[:200]}"
+            )
+            # Try to log all attributes
+            if hasattr(response, '__dict__'):
+                logger.debug(f"Response attributes: {list(response.__dict__.keys())}")
+            elif hasattr(response, '__fields__'):
+                logger.debug(f"Response fields: {list(response.__fields__.keys())}")
+            return results
+
+        # Safely iterate over web_results
+        try:
+            for item in web_results:
+                if item is None:
+                    continue
+
+                # Handle both object and dict formats
+                if isinstance(item, dict):
+                    title = item.get('title', '') or ""
+                    url = item.get('url', '') or ""
+                    description = item.get('description', '') or item.get('snippet', '') or ""
+                else:
+                    title = getattr(item, 'title', None) or ""
+                    url = getattr(item, 'url', None) or ""
+                    description = getattr(item, 'description', None) or getattr(item, 'snippet', None) or ""
+
+                if url:  # Only add items with valid URLs
+                    results.append(SearchItem(
+                        title=title,
+                        url=url,
+                        description=description
+                    ))
+        except (TypeError, AttributeError) as e:
+            logger.error(f"Error iterating over fastCRW search results: {e}, web_results type: {type(web_results)}")
+            return results
+
+        return results
+
+    async def __call__(
+        self,
+        query: str,
+        num_results: Optional[int] = 5,
+        country: Optional[str] = "us",
+        lang: Optional[str] = "en",
+        filter_year: Optional[int] = 2025,
+        **kwargs
+    ) -> ToolResponse:
+        """
+        fastCRW search tool.
+
+        Args:
+            query (str): The query to search for.
+            num_results (Optional[int]): The number of search results to return.
+            country (Optional[str]): Accepted but not forwarded to the search call.
+            lang (Optional[str]): Accepted but not forwarded to the search call.
+            filter_year (Optional[int]): The year to filter results by. None falls back to 2025.
+        """
+
+        try:
+
+            # Perform search
+            search_items = await self._search_crw(query, num_results=num_results, filter_year=filter_year)
+
+            # Format results as JSON string
+            results_json = json.dumps([{
+                "title": item.title,
+                "url": item.url,
+                "description": item.description or ""
+            } for item in search_items], ensure_ascii=False, indent=4)
+
+            message = f"fastCRW search results for query: {query}\n\n{results_json}"
+
+            return ToolResponse(success=True, message=message, extra=ToolExtra(
+                data={
+                    "query": query,
+                    "num_results": len(search_items),
+                    "search_items": search_items,
+                    "engine": "crw"
+                }
+            ))
+
+        except Exception as e:
+            logger.error(f"Error in fastCRW search: {e}")
+            return ToolResponse(success=False, message=f"Error in fastCRW search: {str(e)}")
diff --git a/src/tool/default_tools/web_searcher.py b/src/tool/default_tools/web_searcher.py
index 6ba714c1..233deec1 100644
--- a/src/tool/default_tools/web_searcher.py
+++ b/src/tool/default_tools/web_searcher.py
@@ -5,11 +5,12 @@
 from src.tool.default_tools.web_fetcher import WebFetcherTool
 
 from src.tool.default_tools.search import (
-    FirecrawlSearch,
-    SearchItem,
-    BraveSearch,
-    BingSearch,
-    GoogleSearch,
+    FirecrawlSearch,
+    CrwSearch,
+    SearchItem,
+    BraveSearch,
+    BingSearch,
+    GoogleSearch,
     DDGSSearch
 )
 from src.logger import logger
@@ -68,6 +69,7 @@ def __init__(self, model_name: Optional[str] = None, require_grad: bool = False,
         # Initialize search engines and content fetcher
         self.search_tools = {
             # "firecrawl_search": FirecrawlSearch(),
+            # "crw_search": CrwSearch(),
             # "bing_search": BingSearch(),
             "ddgs_search": DDGSSearch(),
         }