From e487db643b95d0982ab75fdd4d2ffa710a2fc4b2 Mon Sep 17 00:00:00 2001 From: Reason-Wang Date: Sat, 30 Aug 2025 16:35:18 +0000 Subject: [PATCH 1/2] Update docs --- docs/conf.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index f5682eb..906589b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -33,14 +33,6 @@ html_static_path = ["_static"] - -# html_theme_options = { -# "logo": { -# "text": "🪽AgentFly\n", -# "image_light": "_static/logo-light.png", -# "image_dark": "_static/logo-dark.png", -# } -# } html_theme_options = { # "path_to_docs": "docs", "repository_url": "https://github.com/executablebooks/sphinx-book-theme", @@ -64,10 +56,6 @@ "navigation_depth": 4, "collapse_navigation": False, "globaltoc_collapse": False, - "announcement": ( - "⚠️The latest release refactored our HTML, " - "so double-check your custom CSS rules!⚠️" - ), "logo": { "image_dark": "_static/logo-wide-dark.svg", "text": "🪽AgentFly Document", # Uncomment to try text with logo From 3a94bded35fadb9131a4518b6513909c1b00e141 Mon Sep 17 00:00:00 2001 From: Reason-Wang Date: Sat, 30 Aug 2025 20:10:40 +0000 Subject: [PATCH 2/2] Update llm backends and docs --- agentfly/__init__.py | 4 +- agentfly/agents/agent_base.py | 9 +- agentfly/agents/backend_config.py | 66 ------- agentfly/agents/llm_backends/__init__.py | 15 ++ .../agents/llm_backends/backend_configs.py | 122 +++++++++++++ .../llm_backends.py} | 114 ++++++++++-- agentfly/agents/templates/utils.py | 2 +- docs/api_references/agents/index.rst | 1 + docs/api_references/agents/llm_backends.rst | 164 ++++++++++++++++++ 9 files changed, 411 insertions(+), 86 deletions(-) delete mode 100644 agentfly/agents/backend_config.py create mode 100644 agentfly/agents/llm_backends/__init__.py create mode 100644 agentfly/agents/llm_backends/backend_configs.py rename agentfly/agents/{llm_backend.py => llm_backends/llm_backends.py} (81%) create mode 100644 docs/api_references/agents/llm_backends.rst diff --git 
a/agentfly/__init__.py b/agentfly/__init__.py index f21686e..3d54ad9 100644 --- a/agentfly/__init__.py +++ b/agentfly/__init__.py @@ -27,4 +27,6 @@ AGENT_CONFIG_DIR = os.getenv("AGENT_CONFIG_DIR", AGENT_CONFIG_DIR) -ENROOT_HOME = os.getenv("ENROOT_HOME", ENROOT_HOME) \ No newline at end of file +ENROOT_HOME = os.getenv("ENROOT_HOME", ENROOT_HOME) + +os.environ["VLLM_CONFIGURE_LOGGING"] = "0" \ No newline at end of file diff --git a/agentfly/agents/agent_base.py b/agentfly/agents/agent_base.py index 82505e9..4284e4d 100644 --- a/agentfly/agents/agent_base.py +++ b/agentfly/agents/agent_base.py @@ -4,7 +4,13 @@ from .utils.messages import MessagesList from .templates.templates import get_template from ..__init__ import AGENT_DATA_DIR -from .llm_backend import AsyncVLLMBackend, AsyncVerlBackend, ClientBackend, TransformersBackend, VLLMBackend +from .llm_backends import ( + AsyncVLLMBackend, + AsyncVerlBackend, + ClientBackend, + TransformersBackend, +) +from .llm_backends.backend_configs import BACKEND_CONFIGS from ..utils.logging import get_logger from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np @@ -18,7 +24,6 @@ import logging from .chain.streaming_observer import ConsoleStreamObserver, StreamingManager from .utils.tokenizer import create_processor, create_tokenizer -from .backend_config import BACKEND_CONFIGS try: from verl.protocol import DataProto except ImportError: diff --git a/agentfly/agents/backend_config.py b/agentfly/agents/backend_config.py deleted file mode 100644 index 54a5a62..0000000 --- a/agentfly/agents/backend_config.py +++ /dev/null @@ -1,66 +0,0 @@ -from dataclasses import dataclass -from typing import Optional, Dict, Any, List -import asyncio - - -@dataclass -class TransformersConfig: - """Configuration for Transformers backend""" - temperature: float = 1.0 - max_new_tokens: int = 1024 - trust_remote_code: bool = True - device_map: str = "auto" - - -@dataclass -class VLLMConfig: - """Configuration for VLLM 
backend""" - temperature: float = 1.0 - max_new_tokens: int = 1024 - # Add other vLLM specific parameters as needed - - -@dataclass -class AsyncVLLMConfig: - """Configuration for Async VLLM backend""" - temperature: float = 1.0 - max_new_tokens: int = 1024 - # Add other async vLLM specific parameters as needed - - -@dataclass -class VerlConfig: - """Configuration for Verl backend""" - temperature: float = 1.0 - max_new_tokens: int = 1024 - # Add other Verl specific parameters as needed - - -@dataclass -class AsyncVerlConfig: - """Configuration for Async Verl backend""" - temperature: float = 1.0 - max_new_tokens: int = 1024 - # Add other async Verl specific parameters as needed - - -@dataclass -class ClientConfig: - """Configuration for Client backend (OpenAI-compatible)""" - base_url: str = "http://localhost:8000/v1" - max_requests_per_minute: int = 100 - timeout: int = 600 - api_key: str = "EMPTY" - max_new_tokens: int = 1024 - temperature: float = 1.0 - - -# Backend configuration mapping -BACKEND_CONFIGS = { - "transformers": TransformersConfig, - "vllm": VLLMConfig, - "async_vllm": AsyncVLLMConfig, - "verl": VerlConfig, - "async_verl": AsyncVerlConfig, - "client": ClientConfig, -} \ No newline at end of file diff --git a/agentfly/agents/llm_backends/__init__.py b/agentfly/agents/llm_backends/__init__.py new file mode 100644 index 0000000..d5045e2 --- /dev/null +++ b/agentfly/agents/llm_backends/__init__.py @@ -0,0 +1,15 @@ +from .backend_configs import ( + TransformersConfig, + VLLMConfig, + AsyncVLLMConfig, + AsyncVerlConfig, + ClientConfig, +) + +from .llm_backends import ( + TransformersBackend, + VLLMBackend, + AsyncVLLMBackend, + AsyncVerlBackend, + ClientBackend, +) \ No newline at end of file diff --git a/agentfly/agents/llm_backends/backend_configs.py b/agentfly/agents/llm_backends/backend_configs.py new file mode 100644 index 0000000..a8974b1 --- /dev/null +++ b/agentfly/agents/llm_backends/backend_configs.py @@ -0,0 +1,122 @@ +from dataclasses import 
dataclass +from typing import Optional, Dict, Any, List + +from vllm import AsyncEngineArgs + + +@dataclass +class TransformersConfig: + """Configuration for Transformers backend using Hugging Face models. + + Attributes: + temperature (float): Sampling temperature for text generation. Controls randomness. + Higher values (e.g., 1.0) make output more random, lower values (e.g., 0.1) make it more deterministic. + Defaults to 1.0. + max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024. + trust_remote_code (bool): Whether to trust remote code when loading models. + This is required for some custom models. Defaults to True. + device_map (str): Device mapping strategy for model placement. + Options include "auto", "cpu", "cuda:0", etc. Defaults to "auto". + """ + temperature: float = 1.0 + max_new_tokens: int = 1024 + trust_remote_code: bool = True + device_map: str = "auto" + + +@dataclass +class VLLMConfig: + """Configuration for VLLM backend for high-performance inference. + + Attributes: + temperature (float): Sampling temperature for text generation. Controls randomness. + Higher values (e.g., 1.0) make output more random, lower values (e.g., 0.1) make it more deterministic. + Defaults to 1.0. + max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024. + """ + temperature: float = 1.0 + max_new_tokens: int = 1024 + + + +@dataclass +class AsyncVLLMConfig: + """Configuration for Async VLLM backend with engine arguments. Arguments are the same as vLLM's arguments, which can + be found at https://docs.vllm.ai/en/latest/configuration/engine_args.html. Here listed some important arguments: + + Attributes: + gpu_memory_utilization (float): The fraction of GPU memory to be used for the model executor, which can range from 0 to 1. + max_model_len (int): Model context length (prompt and output). If unspecified, will be automatically derived from the model config. + rope_scaling (dict): Rope scaling. 
For example, {"rope_type":"dynamic","factor":2.0}. + trust_remote_code (bool): Whether to trust remote code when loading models. + pipeline_parallel_size (int): Pipeline parallel size. + data_parallel_size (int): Data parallel size. + tensor_parallel_size (int): Tensor parallel size. + """ + engine_args: AsyncEngineArgs = AsyncEngineArgs() + + def __init__(self, **kwargs): + self.engine_args = AsyncEngineArgs(**kwargs) + + +@dataclass +class VerlConfig: + """Configuration for Verl backend. + + Attributes: + temperature (float): Sampling temperature for text generation. Controls randomness. + Higher values (e.g., 1.0) make output more random, lower values (e.g., 0.1) make it more deterministic. + Defaults to 1.0. + max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024. + """ + temperature: float = 1.0 + max_new_tokens: int = 1024 + + +@dataclass +class AsyncVerlConfig: + """Configuration for Async Verl backend. + + Attributes: + temperature (float): Sampling temperature for text generation. Controls randomness. + Higher values (e.g., 1.0) make output more random, lower values (e.g., 0.1) make it more deterministic. + Defaults to 1.0. + max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024. + """ + temperature: float = 1.0 + max_new_tokens: int = 1024 + + +@dataclass +class ClientConfig: + """Configuration for Client backend (OpenAI-compatible) + + This configuration class provides settings for connecting to OpenAI-compatible + API endpoints, such as local models served via vLLM, Ollama, or other + compatible servers. + + Attributes: + base_url: The base URL for the API endpoint. Defaults to localhost:8000. + max_requests_per_minute: Rate limiting for API requests. Defaults to 100. + timeout: Request timeout in seconds. Defaults to 600 (10 minutes). + api_key: API key for authentication. Defaults to "EMPTY" for local servers. + max_new_tokens: Maximum number of tokens to generate. Defaults to 1024. 
+ temperature: Sampling temperature for text generation. Defaults to 1.0. + """ + base_url: str = "http://localhost:8000/v1" + max_requests_per_minute: int = 100 + timeout: int = 600 + api_key: str = "EMPTY" + max_new_tokens: int = 1024 + temperature: float = 1.0 + + +# Backend configuration mapping +BACKEND_CONFIGS = { + "transformers": TransformersConfig, + "vllm": VLLMConfig, + "async_vllm": AsyncVLLMConfig, + "verl": VerlConfig, + "async_verl": AsyncVerlConfig, + "client": ClientConfig, +} \ No newline at end of file diff --git a/agentfly/agents/llm_backend.py b/agentfly/agents/llm_backends/llm_backends.py similarity index 81% rename from agentfly/agents/llm_backend.py rename to agentfly/agents/llm_backends/llm_backends.py index 3a65f00..340e985 100644 --- a/agentfly/agents/llm_backend.py +++ b/agentfly/agents/llm_backends/llm_backends.py @@ -14,13 +14,11 @@ from tenacity import retry, stop_after_attempt, wait_exponential import torch from transformers import AutoTokenizer, AutoModelForCausalLM -from ..utils.verl import pad_tensor_to_rank_size -import os -os.environ["VLLM_USE_V1"] = "1" +from ...utils.verl import pad_tensor_to_rank_size from vllm import LLM, AsyncLLMEngine, SamplingParams, AsyncEngineArgs import openai -from .templates.templates import Chat -from .templates.vision_processor import get_processor +from ..templates.templates import Chat +from ..templates.vision_processor import get_processor import logging import PIL @@ -35,7 +33,14 @@ pass class LLMBackend: - """Base class for LLM backends""" + """Base class for LLM backends. + + This abstract base class provides a unified interface for different LLM implementations. + All backend implementations must inherit from this class and implement the required methods. + + Attributes: + config: Configuration dictionary containing backend-specific parameters. 
+ """ def __init__(self, **kwargs): self.config = kwargs @@ -61,9 +66,23 @@ async def generate_streaming(self, messages_list: List[List[Dict]], streaming_ca raise NotImplementedError("Subclasses must implement generate_streaming()") class TransformersBackend(LLMBackend): - """HuggingFace Transformers implementation""" + """HuggingFace Transformers implementation for local model inference. + + This backend uses the Hugging Face Transformers library to load and run models locally. + It supports both synchronous and asynchronous text generation with streaming capabilities. + """ def __init__(self, model_name_or_path: str, template: str, max_length: int=8192, temperature: float=1.0, max_new_tokens: int=1024, **kwargs): + """Initialize TransformersBackend. + + Args: + model_name_or_path (str): Name or path of the pre-trained model to load. + template (str): Chat template to use for formatting messages. + max_length (int): Maximum sequence length for input/output. Defaults to 8192. + temperature (float): Sampling temperature for text generation. Defaults to 1.0. + max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024. + **kwargs: Additional configuration parameters. + """ super().__init__(**kwargs) self.model_name = model_name_or_path @@ -151,9 +170,23 @@ async def generate_streaming(self, messages_list: List[List[Dict]], streaming_ca inputs['attention_mask'] = torch.cat([inputs['attention_mask'], torch.ones(1, 1, device=inputs['attention_mask'].device)], dim=1) class VLLMBackend(LLMBackend): - """vLLM implementation""" + """vLLM implementation for high-performance model inference. + + This backend uses the vLLM library for optimized inference of large language models. + vLLM provides efficient memory management and high throughput for model serving. + """ def __init__(self, model_name_or_path: str, template: str, max_length: int=8192, temperature: float=1.0, max_new_tokens: int=1024, **kwargs): + """Initialize VLLMBackend. 
+ + Args: + model_name_or_path (str): Name or path of the pre-trained model to load. + template (str): Chat template to use for formatting messages. + max_length (int): Maximum sequence length for input/output. Defaults to 8192. + temperature (float): Sampling temperature for text generation. Defaults to 1.0. + max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024. + **kwargs: Additional configuration parameters. + """ super().__init__(**kwargs) self.model_name = model_name_or_path @@ -234,9 +267,23 @@ async def generate_streaming(self, messages_list: List[List[Dict]], streaming_ca yield sequence.text class AsyncVLLMBackend(LLMBackend): - """Async vLLM implementation""" + """Asynchronous vLLM implementation for high-performance model inference. + + This backend uses the vLLM AsyncLLMEngine for asynchronous inference, providing + better resource utilization and scalability for concurrent requests. + """ def __init__(self, model_name_or_path: str, template: str, max_length: int=8192, temperature: float=1.0, max_new_tokens: int=1024, **kwargs): + """Initialize AsyncVLLMBackend. + + Args: + model_name_or_path (str): Name or path of the pre-trained model to load. + template (str): Chat template to use for formatting messages. + max_length (int): Maximum sequence length for input/output. Defaults to 8192. + temperature (float): Sampling temperature for text generation. Defaults to 1.0. + max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024. + **kwargs: Additional configuration parameters that will be passed to AsyncEngineArgs. 
+ """ super().__init__(**kwargs) self.model_name = model_name_or_path @@ -244,11 +291,17 @@ def __init__(self, model_name_or_path: str, template: str, max_length: int=8192, self.temperature = temperature self.max_new_tokens = max_new_tokens self.template = template - # Load model - self.llm_engine = AsyncLLMEngine.from_engine_args( - AsyncEngineArgs( + + if 'engine_args' in kwargs: + engine_args = kwargs.pop('engine_args') + engine_args.model = self.model_name + else: + engine_args = AsyncEngineArgs( model=self.model_name, + **kwargs, ) + self.llm_engine = AsyncLLMEngine.from_engine_args( + engine_args ) def _process_inputs(self, prompts: List[str], vision_inputs: Dict[str, List[PIL.Image.Image]]): @@ -327,9 +380,23 @@ async def generate_streaming(self, messages_list: List[List[Dict]], **kwargs) -> yield sequence.text class AsyncVerlBackend(LLMBackend): - """Verl implementation""" + """Asynchronous Verl implementation for distributed model inference. + + This backend uses the Verl framework for distributed and asynchronous model inference. + Verl provides capabilities for running models across multiple workers and handling + complex inference pipelines. + """ def __init__(self, llm_engine, model_name_or_path: str, template: str, max_length: int=8192, **kwargs): + """Initialize AsyncVerlBackend. + + Args: + llm_engine: Verl engine instance for distributed inference. + model_name_or_path (str): Name or path of the pre-trained model to load. + template (str): Chat template to use for formatting messages. + max_length (int): Maximum sequence length for input/output. Defaults to 8192. + **kwargs: Additional configuration parameters. + """ super().__init__(**kwargs) self.model_name = model_name_or_path self.max_length = max_length @@ -394,9 +461,11 @@ async def generate_async(self, messages_list: str, **kwargs) -> str: class ClientBackend(LLMBackend): - """ - Thin async/sync wrapper around OpenAI-compatible chat API. 
- Call `generate(...)` with *one* or *many* message lists. + """OpenAI-compatible client backend for remote API inference. + + This backend provides a thin wrapper around OpenAI-compatible chat APIs, + supporting both synchronous and asynchronous operations. It includes built-in + rate limiting and retry mechanisms for reliable API communication. """ def __init__( @@ -411,6 +480,19 @@ def __init__( max_new_tokens: int = 1024, **kwargs, ): + """Initialize ClientBackend. + + Args: + model_name_or_path (str): Name of the model to use for inference. + template (str): Chat template to use for formatting messages. + base_url (str): Base URL for the API endpoint. Defaults to localhost:8000. + max_requests_per_minute (int): Rate limiting for API requests. Defaults to 100. + timeout (int): Request timeout in seconds. Defaults to 600. + api_key (str): API key for authentication. Defaults to "EMPTY" for local servers. + max_length (int): Maximum sequence length for input/output. Defaults to 8192. + max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024. + **kwargs: Additional configuration parameters. 
+ """ super().__init__(**kwargs) # --- connection diff --git a/agentfly/agents/templates/utils.py b/agentfly/agents/templates/utils.py index 46faa0c..ab6c07c 100644 --- a/agentfly/agents/templates/utils.py +++ b/agentfly/agents/templates/utils.py @@ -358,7 +358,7 @@ def vllm_serve(model_name_or_path, template, tp, pp, dp): if __name__=="__main__": "python -m agents.agents.templates.utils" # model = "/mnt/sharefs/users/haonan.li/models/Qwen2.5-7B-instruct-am_think_v1_distilled" - model = "Qwen/Qwen2.5-7B-Instruct" + model = "Qwen/Qwen2.5-3B-Instruct" # vllm_serve(model, "qwen2.5-think", 2, 1, 4) vllm_serve(model, "qwen2.5", 1, 1, 1) diff --git a/docs/api_references/agents/index.rst b/docs/api_references/agents/index.rst index 93853fe..23c5f2c 100644 --- a/docs/api_references/agents/index.rst +++ b/docs/api_references/agents/index.rst @@ -16,6 +16,7 @@ Base Agent :maxdepth: 2 agent + llm_backends Core Classes =========== diff --git a/docs/api_references/agents/llm_backends.rst b/docs/api_references/agents/llm_backends.rst new file mode 100644 index 0000000..8f57918 --- /dev/null +++ b/docs/api_references/agents/llm_backends.rst @@ -0,0 +1,164 @@ +LLM Backends +============ + +Overview +-------- + +AgentFly supports multiple LLM backends for text generation, each with their own configuration options. +This module provides configuration classes for different backend types including Transformers, vLLM, Verl, and OpenAI-compatible clients. + +Configuration Classes +--------------------- + +Transformers Backend +~~~~~~~~~~~~~~~~~~~~ + +Configuration for the Transformers backend using Hugging Face models: + +.. autoclass:: agentfly.agents.llm_backends.backend_configs.TransformersConfig + :show-inheritance: + :special-members: !__init__ + + +Async VLLM Backend +~~~~~~~~~~~~~~~~~~ + +Configuration for asynchronous vLLM backend with engine arguments: + +.. 
autoclass:: agentfly.agents.llm_backends.backend_configs.AsyncVLLMConfig + :show-inheritance: + :special-members: !__init__ + + +Async Verl Backend +~~~~~~~~~~~~~~~~~~ + +Configuration for asynchronous Verl backend: + +.. autoclass:: agentfly.agents.llm_backends.backend_configs.AsyncVerlConfig + :show-inheritance: + :special-members: !__init__ + +Client Backend +~~~~~~~~~~~~~~ + +Configuration for OpenAI-compatible client backends: + +.. autoclass:: agentfly.agents.llm_backends.backend_configs.ClientConfig + :show-inheritance: + :special-members: !__init__ + +Backend Implementations +----------------------- + +Base Backend +~~~~~~~~~~~~ + +Abstract base class for all LLM backends: + +.. autoclass:: agentfly.agents.llm_backends.llm_backends.LLMBackend + :members: + :show-inheritance: + +Transformers Backend +~~~~~~~~~~~~~~~~~~~~ + +HuggingFace Transformers implementation for local model inference: + +.. autoclass:: agentfly.agents.llm_backends.llm_backends.TransformersBackend + :members: + :show-inheritance: + +VLLM Backend +~~~~~~~~~~~~ + +vLLM implementation for high-performance model inference: + +.. autoclass:: agentfly.agents.llm_backends.llm_backends.VLLMBackend + :members: + :show-inheritance: + +Async VLLM Backend +~~~~~~~~~~~~~~~~~~ + +Asynchronous vLLM implementation for high-performance model inference: + +.. autoclass:: agentfly.agents.llm_backends.llm_backends.AsyncVLLMBackend + :members: + :show-inheritance: + +Async Verl Backend +~~~~~~~~~~~~~~~~~~ + +Asynchronous Verl implementation for distributed model inference: + +.. autoclass:: agentfly.agents.llm_backends.llm_backends.AsyncVerlBackend + :members: + :show-inheritance: + +Client Backend +~~~~~~~~~~~~~~ + +OpenAI-compatible client backend for remote API inference: + +.. autoclass:: agentfly.agents.llm_backends.llm_backends.ClientBackend + :members: + :show-inheritance: + + +Usage Examples +-------------- + +Backends are designed to work together with agents. 
Here are examples showing how to configure different backends when creating agents: + + +Async VLLM Backend +~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from agentfly.agents import HFAgent + from agentfly.tools import calculator + from agentfly.rewards import math_reward_string_equal + from agentfly.agents.llm_backends import AsyncVLLMConfig + + agent = HFAgent( + model_name_or_path="Qwen/Qwen2.5-3B-Instruct", + tools=[calculator], + reward_fn=math_reward_string_equal, + template="qwen2.5", + backend="async_vllm", + backend_config=AsyncVLLMConfig( + pipeline_parallel_size=2, + data_parallel_size=1, + tensor_parallel_size=1, + gpu_memory_utilization=0.8 + ) + ) + +Client Backend (OpenAI-compatible) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from agentfly.agents import HFAgent + from agentfly.tools import calculator + from agentfly.rewards import math_reward_string_equal + from agentfly.agents.llm_backends import ClientConfig + + agent = HFAgent( + model_name_or_path="Qwen/Qwen2.5-3B-Instruct", + tools=[calculator], + reward_fn=math_reward_string_equal, + template="qwen2.5", + backend="client", + backend_config=ClientConfig( + base_url="http://localhost:8000/v1", + api_key="your-api-key", + max_requests_per_minute=200, + timeout=300, + temperature=0.7, + max_new_tokens=1024 + ) + ) +