From e487db643b95d0982ab75fdd4d2ffa710a2fc4b2 Mon Sep 17 00:00:00 2001
From: Reason-Wang <reason-wang@foxmail.com>
Date: Sat, 30 Aug 2025 16:35:18 +0000
Subject: [PATCH 1/2] Update docs

---
 docs/conf.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index f5682eb..906589b 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -33,14 +33,6 @@
 html_static_path = ["_static"]
 
 
-
-# html_theme_options = {
-#     "logo": {
-#         "text": "🪽AgentFly\n",
-#         "image_light": "_static/logo-light.png",
-#         "image_dark": "_static/logo-dark.png",
-#     }
-# }
 html_theme_options = {
     # "path_to_docs": "docs",
     "repository_url": "https://github.com/executablebooks/sphinx-book-theme",
@@ -64,10 +56,6 @@
     "navigation_depth": 4,
     "collapse_navigation": False,
     "globaltoc_collapse": False,
-    "announcement": (
-        "⚠️The latest release refactored our HTML, "
-        "so double-check your custom CSS rules!⚠️"
-    ),
     "logo": {
         "image_dark": "_static/logo-wide-dark.svg",
         "text": "🪽AgentFly Document",  # Uncomment to try text with logo

From 3a94bded35fadb9131a4518b6513909c1b00e141 Mon Sep 17 00:00:00 2001
From: Reason-Wang <reason-wang@foxmail.com>
Date: Sat, 30 Aug 2025 20:10:40 +0000
Subject: [PATCH 2/2] Update llm backends and docs

---
 agentfly/__init__.py                          |   4 +-
 agentfly/agents/agent_base.py                 |   9 +-
 agentfly/agents/backend_config.py             |  66 -------
 agentfly/agents/llm_backends/__init__.py      |  15 ++
 .../agents/llm_backends/backend_configs.py    | 122 +++++++++++++
 .../llm_backends.py}                          | 114 ++++++++++--
 agentfly/agents/templates/utils.py            |   2 +-
 docs/api_references/agents/index.rst          |   1 +
 docs/api_references/agents/llm_backends.rst   | 164 ++++++++++++++++++
 9 files changed, 411 insertions(+), 86 deletions(-)
 delete mode 100644 agentfly/agents/backend_config.py
 create mode 100644 agentfly/agents/llm_backends/__init__.py
 create mode 100644 agentfly/agents/llm_backends/backend_configs.py
 rename agentfly/agents/{llm_backend.py => llm_backends/llm_backends.py} (81%)
 create mode 100644 docs/api_references/agents/llm_backends.rst

diff --git a/agentfly/__init__.py b/agentfly/__init__.py
index f21686e..3d54ad9 100644
--- a/agentfly/__init__.py
+++ b/agentfly/__init__.py
@@ -27,4 +27,6 @@
 
 AGENT_CONFIG_DIR = os.getenv("AGENT_CONFIG_DIR", AGENT_CONFIG_DIR)
 
-ENROOT_HOME = os.getenv("ENROOT_HOME", ENROOT_HOME)
\ No newline at end of file
+ENROOT_HOME = os.getenv("ENROOT_HOME", ENROOT_HOME)
+
+os.environ.setdefault("VLLM_CONFIGURE_LOGGING", "0")  # quiet vLLM's logging setup by default, but keep an explicit user setting
\ No newline at end of file
diff --git a/agentfly/agents/agent_base.py b/agentfly/agents/agent_base.py
index 82505e9..4284e4d 100644
--- a/agentfly/agents/agent_base.py
+++ b/agentfly/agents/agent_base.py
@@ -4,7 +4,13 @@
 from .utils.messages import MessagesList
 from .templates.templates import get_template
 from ..__init__ import AGENT_DATA_DIR
-from .llm_backend import AsyncVLLMBackend, AsyncVerlBackend, ClientBackend, TransformersBackend, VLLMBackend
+from .llm_backends import (
+    AsyncVLLMBackend,
+    AsyncVerlBackend,
+    ClientBackend,
+    TransformersBackend,
+)
+from .llm_backends.backend_configs import BACKEND_CONFIGS
 from ..utils.logging import get_logger
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import numpy as np
@@ -18,7 +24,6 @@
 import logging
 from .chain.streaming_observer import ConsoleStreamObserver, StreamingManager
 from .utils.tokenizer import create_processor, create_tokenizer
-from .backend_config import BACKEND_CONFIGS
 try:
     from verl.protocol import DataProto
 except ImportError:
diff --git a/agentfly/agents/backend_config.py b/agentfly/agents/backend_config.py
deleted file mode 100644
index 54a5a62..0000000
--- a/agentfly/agents/backend_config.py
+++ /dev/null
@@ -1,66 +0,0 @@
-from dataclasses import dataclass
-from typing import Optional, Dict, Any, List
-import asyncio
-
-
-@dataclass
-class TransformersConfig:
-    """Configuration for Transformers backend"""
-    temperature: float = 1.0
-    max_new_tokens: int = 1024
-    trust_remote_code: bool = True
-    device_map: str = "auto"
-
-
-@dataclass
-class VLLMConfig:
-    """Configuration for VLLM backend"""
-    temperature: float = 1.0
-    max_new_tokens: int = 1024
-    # Add other vLLM specific parameters as needed
-
-
-@dataclass
-class AsyncVLLMConfig:
-    """Configuration for Async VLLM backend"""
-    temperature: float = 1.0
-    max_new_tokens: int = 1024
-    # Add other async vLLM specific parameters as needed
-
-
-@dataclass
-class VerlConfig:
-    """Configuration for Verl backend"""
-    temperature: float = 1.0
-    max_new_tokens: int = 1024
-    # Add other Verl specific parameters as needed
-
-
-@dataclass
-class AsyncVerlConfig:
-    """Configuration for Async Verl backend"""
-    temperature: float = 1.0
-    max_new_tokens: int = 1024
-    # Add other async Verl specific parameters as needed
-
-
-@dataclass
-class ClientConfig:
-    """Configuration for Client backend (OpenAI-compatible)"""
-    base_url: str = "http://localhost:8000/v1"
-    max_requests_per_minute: int = 100
-    timeout: int = 600
-    api_key: str = "EMPTY"
-    max_new_tokens: int = 1024
-    temperature: float = 1.0
-
-
-# Backend configuration mapping
-BACKEND_CONFIGS = {
-    "transformers": TransformersConfig,
-    "vllm": VLLMConfig,
-    "async_vllm": AsyncVLLMConfig,
-    "verl": VerlConfig,
-    "async_verl": AsyncVerlConfig,
-    "client": ClientConfig,
-} 
\ No newline at end of file
diff --git a/agentfly/agents/llm_backends/__init__.py b/agentfly/agents/llm_backends/__init__.py
new file mode 100644
index 0000000..d5045e2
--- /dev/null
+++ b/agentfly/agents/llm_backends/__init__.py
@@ -0,0 +1,15 @@
+from .backend_configs import (
+    TransformersConfig,
+    VLLMConfig,
+    AsyncVLLMConfig,
+    AsyncVerlConfig,
+    ClientConfig,
+)
+
+from .llm_backends import (
+    TransformersBackend,
+    VLLMBackend,
+    AsyncVLLMBackend,
+    AsyncVerlBackend,
+    ClientBackend,
+)
\ No newline at end of file
diff --git a/agentfly/agents/llm_backends/backend_configs.py b/agentfly/agents/llm_backends/backend_configs.py
new file mode 100644
index 0000000..a8974b1
--- /dev/null
+++ b/agentfly/agents/llm_backends/backend_configs.py
@@ -0,0 +1,122 @@
+from dataclasses import dataclass
+from typing import Optional, Dict, Any, List
+
+from vllm import AsyncEngineArgs
+
+
+@dataclass
+class TransformersConfig:
+    """Configuration for Transformers backend using Hugging Face models.
+    
+    Attributes:
+        temperature (float): Sampling temperature for text generation. Controls randomness.
+            Higher values (e.g., 1.0) make output more random, lower values (e.g., 0.1) make it more deterministic.
+            Defaults to 1.0.
+        max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024.
+        trust_remote_code (bool): Whether to trust remote code when loading models.
+            This is required for some custom models. Defaults to True.
+        device_map (str): Device mapping strategy for model placement.
+            Options include "auto", "cpu", "cuda:0", etc. Defaults to "auto".
+    """
+    temperature: float = 1.0
+    max_new_tokens: int = 1024
+    trust_remote_code: bool = True
+    device_map: str = "auto"
+
+
+@dataclass
+class VLLMConfig:
+    """Configuration for VLLM backend for high-performance inference.
+    
+    Attributes:
+        temperature (float): Sampling temperature for text generation. Controls randomness.
+            Higher values (e.g., 1.0) make output more random, lower values (e.g., 0.1) make it more deterministic.
+            Defaults to 1.0.
+        max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024.
+    """
+    temperature: float = 1.0
+    max_new_tokens: int = 1024
+
+
+
+@dataclass
+class AsyncVLLMConfig:
+    """Configuration for Async VLLM backend with engine arguments. Arguments are the same as vLLM's arguments, which can
+    be found at https://docs.vllm.ai/en/latest/configuration/engine_args.html. Some important arguments are listed here:
+    
+    Attributes:
+        gpu_memory_utilization (float): The fraction of GPU memory to be used for the model executor, which can range from 0 to 1.
+        max_model_len (int): Model context length (prompt and output). If unspecified, will be automatically derived from the model config.
+        rope_scaling (dict): Rope scaling. For example, {"rope_type":"dynamic","factor":2.0}.
+        trust_remote_code (bool): Whether to trust remote code when loading models.
+        pipeline_parallel_size (int): Pipeline parallel size.
+        data_parallel_size (int): Data parallel size.
+        tensor_parallel_size (int): Tensor parallel size.
+    """
+    engine_args: AsyncEngineArgs  # no class-level default: an unhashable instance default is rejected by @dataclass on Python >= 3.11
+
+    def __init__(self, **kwargs):  # keyword arguments are forwarded unchanged to AsyncEngineArgs
+        self.engine_args = AsyncEngineArgs(**kwargs)
+
+
+@dataclass
+class VerlConfig:
+    """Configuration for Verl backend.
+    
+    Attributes:
+        temperature (float): Sampling temperature for text generation. Controls randomness.
+            Higher values (e.g., 1.0) make output more random, lower values (e.g., 0.1) make it more deterministic.
+            Defaults to 1.0.
+        max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024.
+    """
+    temperature: float = 1.0
+    max_new_tokens: int = 1024
+
+
+@dataclass
+class AsyncVerlConfig:
+    """Configuration for Async Verl backend.
+    
+    Attributes:
+        temperature (float): Sampling temperature for text generation. Controls randomness.
+            Higher values (e.g., 1.0) make output more random, lower values (e.g., 0.1) make it more deterministic.
+            Defaults to 1.0.
+        max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024.
+    """
+    temperature: float = 1.0
+    max_new_tokens: int = 1024
+
+
+@dataclass
+class ClientConfig:
+    """Configuration for Client backend (OpenAI-compatible).
+    
+    This configuration class provides settings for connecting to OpenAI-compatible
+    API endpoints, such as local models served via vLLM, Ollama, or other
+    compatible servers.
+    
+    Attributes:
+        base_url: The base URL for the API endpoint. Defaults to "http://localhost:8000/v1".
+        max_requests_per_minute: Rate limiting for API requests. Defaults to 100.
+        timeout: Request timeout in seconds. Defaults to 600 (10 minutes).
+        api_key: API key for authentication. Defaults to "EMPTY" for local servers.
+        max_new_tokens: Maximum number of tokens to generate. Defaults to 1024.
+        temperature: Sampling temperature for text generation. Defaults to 1.0.
+    """
+    base_url: str = "http://localhost:8000/v1"
+    max_requests_per_minute: int = 100
+    timeout: int = 600
+    api_key: str = "EMPTY"
+    max_new_tokens: int = 1024
+    temperature: float = 1.0
+
+
+# Backend configuration mapping
+BACKEND_CONFIGS = {
+    "transformers": TransformersConfig,
+    "vllm": VLLMConfig,
+    "async_vllm": AsyncVLLMConfig,
+    "verl": VerlConfig,
+    "async_verl": AsyncVerlConfig,
+    "client": ClientConfig,
+} 
\ No newline at end of file
diff --git a/agentfly/agents/llm_backend.py b/agentfly/agents/llm_backends/llm_backends.py
similarity index 81%
rename from agentfly/agents/llm_backend.py
rename to agentfly/agents/llm_backends/llm_backends.py
index 3a65f00..340e985 100644
--- a/agentfly/agents/llm_backend.py
+++ b/agentfly/agents/llm_backends/llm_backends.py
@@ -14,13 +14,11 @@
 from tenacity import retry, stop_after_attempt, wait_exponential
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
-from ..utils.verl import pad_tensor_to_rank_size
-import os
-os.environ["VLLM_USE_V1"] = "1"
+from ...utils.verl import pad_tensor_to_rank_size
 from vllm import LLM, AsyncLLMEngine, SamplingParams, AsyncEngineArgs
 import openai
-from .templates.templates import Chat
-from .templates.vision_processor import get_processor
+from ..templates.templates import Chat
+from ..templates.vision_processor import get_processor
 import logging
 import PIL
 
@@ -35,7 +33,14 @@
     pass
 
 class LLMBackend:
-    """Base class for LLM backends"""
+    """Base class for LLM backends.
+    
+    This abstract base class provides a unified interface for different LLM implementations.
+    All backend implementations must inherit from this class and implement the required methods.
+    
+    Attributes:
+        config: Configuration dictionary containing backend-specific parameters.
+    """
     
     def __init__(self, **kwargs):
         self.config = kwargs
@@ -61,9 +66,23 @@ async def generate_streaming(self, messages_list: List[List[Dict]], streaming_ca
         raise NotImplementedError("Subclasses must implement generate_streaming()")
 
 class TransformersBackend(LLMBackend):
-    """HuggingFace Transformers implementation"""
+    """HuggingFace Transformers implementation for local model inference.
+    
+    This backend uses the Hugging Face Transformers library to load and run models locally.
+    It supports both synchronous and asynchronous text generation with streaming capabilities.
+    """
     
     def __init__(self, model_name_or_path: str, template: str, max_length: int=8192, temperature: float=1.0, max_new_tokens: int=1024, **kwargs):
+        """Initialize TransformersBackend.
+        
+        Args:
+            model_name_or_path (str): Name or path of the pre-trained model to load.
+            template (str): Chat template to use for formatting messages.
+            max_length (int): Maximum sequence length for input/output. Defaults to 8192.
+            temperature (float): Sampling temperature for text generation. Defaults to 1.0.
+            max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024.
+            **kwargs: Additional configuration parameters.
+        """
         super().__init__(**kwargs)
         
         self.model_name = model_name_or_path
@@ -151,9 +170,23 @@ async def generate_streaming(self, messages_list: List[List[Dict]], streaming_ca
             inputs['attention_mask'] = torch.cat([inputs['attention_mask'], torch.ones(1, 1, device=inputs['attention_mask'].device)], dim=1)
 
 class VLLMBackend(LLMBackend):
-    """vLLM implementation"""
+    """vLLM implementation for high-performance model inference.
+    
+    This backend uses the vLLM library for optimized inference of large language models.
+    vLLM provides efficient memory management and high throughput for model serving.
+    """
     
     def __init__(self, model_name_or_path: str, template: str, max_length: int=8192, temperature: float=1.0, max_new_tokens: int=1024, **kwargs):
+        """Initialize VLLMBackend.
+        
+        Args:
+            model_name_or_path (str): Name or path of the pre-trained model to load.
+            template (str): Chat template to use for formatting messages.
+            max_length (int): Maximum sequence length for input/output. Defaults to 8192.
+            temperature (float): Sampling temperature for text generation. Defaults to 1.0.
+            max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024.
+            **kwargs: Additional configuration parameters.
+        """
         super().__init__(**kwargs)
 
         self.model_name = model_name_or_path
@@ -234,9 +267,23 @@ async def generate_streaming(self, messages_list: List[List[Dict]], streaming_ca
                         yield sequence.text
 
 class AsyncVLLMBackend(LLMBackend):
-    """Async vLLM implementation"""
+    """Asynchronous vLLM implementation for high-performance model inference.
+    
+    This backend uses the vLLM AsyncLLMEngine for asynchronous inference, providing
+    better resource utilization and scalability for concurrent requests.
+    """
     
     def __init__(self, model_name_or_path: str, template: str, max_length: int=8192, temperature: float=1.0, max_new_tokens: int=1024, **kwargs):
+        """Initialize AsyncVLLMBackend.
+        
+        Args:
+            model_name_or_path (str): Name or path of the pre-trained model to load.
+            template (str): Chat template to use for formatting messages.
+            max_length (int): Maximum sequence length for input/output. Defaults to 8192.
+            temperature (float): Sampling temperature for text generation. Defaults to 1.0.
+            max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024.
+            **kwargs: Additional configuration parameters that will be passed to AsyncEngineArgs.
+        """
         super().__init__(**kwargs)
 
         self.model_name = model_name_or_path
@@ -244,11 +291,17 @@ def __init__(self, model_name_or_path: str, template: str, max_length: int=8192,
         self.temperature = temperature
         self.max_new_tokens = max_new_tokens
         self.template = template
-        # Load model
-        self.llm_engine = AsyncLLMEngine.from_engine_args(
-            AsyncEngineArgs(
+        
+        if 'engine_args' in kwargs:
+            engine_args = kwargs.pop('engine_args')
+            engine_args.model = self.model_name
+        else:
+            engine_args = AsyncEngineArgs(
                 model=self.model_name,
+                **kwargs,
             )
+        self.llm_engine = AsyncLLMEngine.from_engine_args(
+            engine_args
         )
         
     def _process_inputs(self, prompts: List[str], vision_inputs: Dict[str, List[PIL.Image.Image]]):
@@ -327,9 +380,23 @@ async def generate_streaming(self, messages_list: List[List[Dict]], **kwargs) ->
                         yield sequence.text
 
 class AsyncVerlBackend(LLMBackend):
-    """Verl implementation"""
+    """Asynchronous Verl implementation for distributed model inference.
+    
+    This backend uses the Verl framework for distributed and asynchronous model inference.
+    Verl provides capabilities for running models across multiple workers and handling
+    complex inference pipelines.
+    """
     
     def __init__(self, llm_engine, model_name_or_path: str, template: str, max_length: int=8192, **kwargs):
+        """Initialize AsyncVerlBackend.
+        
+        Args:
+            llm_engine: Verl engine instance for distributed inference.
+            model_name_or_path (str): Name or path of the pre-trained model to load.
+            template (str): Chat template to use for formatting messages.
+            max_length (int): Maximum sequence length for input/output. Defaults to 8192.
+            **kwargs: Additional configuration parameters.
+        """
         super().__init__(**kwargs)
         self.model_name = model_name_or_path
         self.max_length = max_length
@@ -394,9 +461,11 @@ async def generate_async(self, messages_list: str, **kwargs) -> str:
 
 
 class ClientBackend(LLMBackend):
-    """
-    Thin async/sync wrapper around OpenAI-compatible chat API.
-    Call `generate(...)` with *one* or *many* message lists.
+    """OpenAI-compatible client backend for remote API inference.
+    
+    This backend provides a thin wrapper around OpenAI-compatible chat APIs,
+    supporting both synchronous and asynchronous operations. It includes built-in
+    rate limiting and retry mechanisms for reliable API communication.
     """
 
     def __init__(
@@ -411,6 +480,19 @@ def __init__(
         max_new_tokens: int = 1024,
         **kwargs,
     ):
+        """Initialize ClientBackend.
+        
+        Args:
+            model_name_or_path (str): Name of the model to use for inference.
+            template (str): Chat template to use for formatting messages.
+            base_url (str): Base URL for the API endpoint. Defaults to localhost:8000.
+            max_requests_per_minute (int): Rate limiting for API requests. Defaults to 100.
+            timeout (int): Request timeout in seconds. Defaults to 600.
+            api_key (str): API key for authentication. Defaults to "EMPTY" for local servers.
+            max_length (int): Maximum sequence length for input/output. Defaults to 8192.
+            max_new_tokens (int): Maximum number of new tokens to generate. Defaults to 1024.
+            **kwargs: Additional configuration parameters.
+        """
         super().__init__(**kwargs)
 
         # --- connection
diff --git a/agentfly/agents/templates/utils.py b/agentfly/agents/templates/utils.py
index 46faa0c..ab6c07c 100644
--- a/agentfly/agents/templates/utils.py
+++ b/agentfly/agents/templates/utils.py
@@ -358,7 +358,7 @@ def vllm_serve(model_name_or_path, template, tp, pp, dp):
 if __name__=="__main__":
     "python -m agents.agents.templates.utils"
     # model = "/mnt/sharefs/users/haonan.li/models/Qwen2.5-7B-instruct-am_think_v1_distilled"
-    model = "Qwen/Qwen2.5-7B-Instruct"
+    model = "Qwen/Qwen2.5-3B-Instruct"
     # vllm_serve(model, "qwen2.5-think", 2, 1, 4)
     vllm_serve(model, "qwen2.5", 1, 1, 1)
 
diff --git a/docs/api_references/agents/index.rst b/docs/api_references/agents/index.rst
index 93853fe..23c5f2c 100644
--- a/docs/api_references/agents/index.rst
+++ b/docs/api_references/agents/index.rst
@@ -16,6 +16,7 @@ Base Agent
    :maxdepth: 2
 
    agent
+   llm_backends
 
 Core Classes
 ===========
diff --git a/docs/api_references/agents/llm_backends.rst b/docs/api_references/agents/llm_backends.rst
new file mode 100644
index 0000000..8f57918
--- /dev/null
+++ b/docs/api_references/agents/llm_backends.rst
@@ -0,0 +1,164 @@
+LLM Backends
+============
+
+Overview
+--------
+
+AgentFly supports multiple LLM backends for text generation, each with their own configuration options.
+This module provides configuration classes for different backend types including Transformers, vLLM, Verl, and OpenAI-compatible clients.
+
+Configuration Classes
+---------------------
+
+Transformers Backend
+~~~~~~~~~~~~~~~~~~~~
+
+Configuration for the Transformers backend using Hugging Face models:
+
+.. autoclass:: agentfly.agents.llm_backends.backend_configs.TransformersConfig
+   :show-inheritance:
+   :special-members: !__init__
+
+
+Async VLLM Backend
+~~~~~~~~~~~~~~~~~~
+
+Configuration for asynchronous vLLM backend with engine arguments:
+
+.. autoclass:: agentfly.agents.llm_backends.backend_configs.AsyncVLLMConfig
+   :show-inheritance:
+   :special-members: !__init__
+
+
+Async Verl Backend
+~~~~~~~~~~~~~~~~~~
+
+Configuration for asynchronous Verl backend:
+
+.. autoclass:: agentfly.agents.llm_backends.backend_configs.AsyncVerlConfig
+   :show-inheritance:
+   :special-members: !__init__
+
+Client Backend
+~~~~~~~~~~~~~~
+
+Configuration for OpenAI-compatible client backends:
+
+.. autoclass:: agentfly.agents.llm_backends.backend_configs.ClientConfig
+   :show-inheritance:
+   :special-members: !__init__
+
+Backend Implementations
+-----------------------
+
+Base Backend
+~~~~~~~~~~~~
+
+Abstract base class for all LLM backends:
+
+.. autoclass:: agentfly.agents.llm_backends.llm_backends.LLMBackend
+   :members:
+   :show-inheritance:
+
+Transformers Backend
+~~~~~~~~~~~~~~~~~~~~
+
+HuggingFace Transformers implementation for local model inference:
+
+.. autoclass:: agentfly.agents.llm_backends.llm_backends.TransformersBackend
+   :members:
+   :show-inheritance:
+
+VLLM Backend
+~~~~~~~~~~~~
+
+vLLM implementation for high-performance model inference:
+
+.. autoclass:: agentfly.agents.llm_backends.llm_backends.VLLMBackend
+   :members:
+   :show-inheritance:
+
+Async VLLM Backend
+~~~~~~~~~~~~~~~~~~
+
+Asynchronous vLLM implementation for high-performance model inference:
+
+.. autoclass:: agentfly.agents.llm_backends.llm_backends.AsyncVLLMBackend
+   :members:
+   :show-inheritance:
+
+Async Verl Backend
+~~~~~~~~~~~~~~~~~~
+
+Asynchronous Verl implementation for distributed model inference:
+
+.. autoclass:: agentfly.agents.llm_backends.llm_backends.AsyncVerlBackend
+   :members:
+   :show-inheritance:
+
+Client Backend
+~~~~~~~~~~~~~~
+
+OpenAI-compatible client backend for remote API inference:
+
+.. autoclass:: agentfly.agents.llm_backends.llm_backends.ClientBackend
+   :members:
+   :show-inheritance:
+
+
+Usage Examples
+--------------
+
+Backends are designed to work together with agents. Here are examples showing how to configure different backends when creating agents:
+
+
+Async VLLM Backend
+~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+   from agentfly.agents import HFAgent
+   from agentfly.tools import calculator
+   from agentfly.rewards import math_reward_string_equal
+   from agentfly.agents.llm_backends import AsyncVLLMConfig
+   
+   agent = HFAgent(
+       model_name_or_path="Qwen/Qwen2.5-3B-Instruct",
+       tools=[calculator],
+       reward_fn=math_reward_string_equal,
+       template="qwen2.5",
+       backend="async_vllm",
+       backend_config=AsyncVLLMConfig(
+           pipeline_parallel_size=2,
+           data_parallel_size=1,
+           tensor_parallel_size=1,
+           gpu_memory_utilization=0.8
+       )
+   )
+
+Client Backend (OpenAI-compatible)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+   from agentfly.agents import HFAgent
+   from agentfly.tools import calculator
+   from agentfly.rewards import math_reward_string_equal
+   from agentfly.agents.llm_backends import ClientConfig
+   
+   agent = HFAgent(
+       model_name_or_path="Qwen/Qwen2.5-3B-Instruct",
+       tools=[calculator],
+       reward_fn=math_reward_string_equal,
+       template="qwen2.5",
+       backend="client",
+       backend_config=ClientConfig(
+           base_url="http://localhost:8000/v1",
+           api_key="your-api-key",
+           max_requests_per_minute=200,
+           timeout=300,
+           temperature=0.7,
+           max_new_tokens=1024
+       )
+   )
+