From cb3a8627f6e79527f67e1f5c8b7805b132f900a0 Mon Sep 17 00:00:00 2001
From: Reason-Wang <reason-wang@foxmail.com>
Date: Fri, 15 Aug 2025 21:31:15 +0000
Subject: [PATCH] Make tool message content the pure observation, fix tests

---
 agents/agents/agents/agent_base.py            |  9 ++--
 agents/agents/agents/chain/chain_base.py      | 13 ++---
 agents/agents/tools/utils/schema.py           |  4 +-
 agents/tests/scripts/test_gpu_runs.sh         | 10 ++++
 agents/tests/unit/agents/test_auto_agent.py   | 12 ++---
 agents/tests/unit/agents/test_chain.py        | 42 ---------------
 ..._agent_with_code.py => test_code_agent.py} |  5 +-
 .../tests/unit/agents/test_initialization.py  | 51 ++++++++++++++-----
 agents/tests/unit/agents/test_react_agent.py  | 45 ++++++++--------
 agents/tests/unit/agents/test_vision_agent.py |  2 +
 agents/tests/unit/agents/test_vllm_backend.py | 42 ---------------
 .../tests/unit/agents/test_webshop_agent.py   | 22 ++++++--
 pyproject.toml                                |  5 ++
 13 files changed, 117 insertions(+), 145 deletions(-)
 create mode 100644 agents/tests/scripts/test_gpu_runs.sh
 rename agents/tests/unit/agents/{test_agent_with_code.py => test_code_agent.py} (95%)
 delete mode 100644 agents/tests/unit/agents/test_vllm_backend.py

diff --git a/agents/agents/agents/agent_base.py b/agents/agents/agents/agent_base.py
index 43b01f2..89bf41f 100644
--- a/agents/agents/agents/agent_base.py
+++ b/agents/agents/agents/agent_base.py
@@ -254,12 +254,9 @@ def extract_final_response(self, messages: List[Dict[str, Any]]) -> str:
         if last_message_role == "assistant":
             return last_message_content
         elif last_message_role == "tool":
-            try:
-                response = json.loads(last_message_content)
-                if "content" in response:
-                    return response["content"]
-            except json.JSONDecodeError:
-                return last_message_content
+            return last_message_content
+        else:
+            raise ValueError(f"The last message role must be assistant or tool, but got {last_message_role}")
 
     @abstractmethod
     def parse(self, responses: List[str], tools: List[Any], **args) -> Tuple[dict, int, int]:
diff --git a/agents/agents/agents/chain/chain_base.py b/agents/agents/agents/chain/chain_base.py
index 669be97..faf8f71 100644
--- a/agents/agents/agents/chain/chain_base.py
+++ b/agents/agents/agents/chain/chain_base.py
@@ -312,6 +312,10 @@ async def _run_single_chain(self,
                 )
                 thought_node.is_terminal = new_msg.get("status", "continue") in self.terminal_status
                 current_node = thought_node
+                
+                # Check if the thought node is terminal - if so, break the loop
+                if current_node.is_terminal:
+                    break
 
             # Handle tool calls
             if current_node.messages[-1].get("tool_calls"):
@@ -331,16 +335,13 @@ async def _run_single_chain(self,
                     
                     # Process observation
                     observation = result["observation"]
-                    observation_json = json.dumps({
-                        "name": result["name"],
-                        "content": observation,
-                    }, indent=4)
                     
-                    action_input_node.observation = observation_json
+                    action_input_node.observation = observation
                     action_input_node.observation_code = result["status"]
                     newest_messages.append({
                         "role": "tool",
                         "tool_call_id": tool_call["id"],
+                        "tool_name": result["name"],
                         "content": [{"type": "text", "text": observation}],
                     })
                     action_input_node.messages = deepcopy(newest_messages)
@@ -535,7 +536,7 @@ def monitor_step(self) -> None:
                     avg_turns += 1
                 if msg['role'] == 'tool':
                     avg_tool_calls += 1
-                    tool_call_name = json.loads(msg['content'][0]['text'])['name']
+                    tool_call_name = msg['tool_name']
                     tool_calls_by_name[tool_call_name] += 1
 
         avg_turns /= len(messages)
diff --git a/agents/agents/tools/utils/schema.py b/agents/agents/tools/utils/schema.py
index 28b4815..37016ce 100644
--- a/agents/agents/tools/utils/schema.py
+++ b/agents/agents/tools/utils/schema.py
@@ -129,7 +129,9 @@ def validate_schema(name, description, signature, docs):
     docs_description = docs['description']
     if description and docs_description and docs_description != description:
         # raise ValueError(f"Description mismatch: {description} != {docs_description}")
-        warnings.warn(f"Description mismatch: {description} != {docs_description}, use the specified description by default.")
+        # warnings.warn(f"Description mismatch: {description} != {docs_description}, use the specified description by default.")
+        # TODO: currently we don't do anything here and prioritize the specified description by default.
+        pass
 
 
     docs_params = docs['params']
diff --git a/agents/tests/scripts/test_gpu_runs.sh b/agents/tests/scripts/test_gpu_runs.sh
new file mode 100644
index 0000000..e033a51
--- /dev/null
+++ b/agents/tests/scripts/test_gpu_runs.sh
@@ -0,0 +1,10 @@
+#! /bin/bash
+
+# Test GPU runs
+
+python -m pytest -x tests/unit/agents/test_initialization.py || exit 1
+python -m pytest -x tests/unit/agents/test_auto_agent.py || exit 1
+python -m pytest -x tests/unit/agents/test_code_agent.py || exit 1
+python -m pytest -x tests/unit/agents/test_react_agent.py || exit 1
+python -m pytest -x tests/unit/agents/test_webshop_agent.py || exit 1
+python -m pytest -x tests/unit/agents/test_vision_agent.py || exit 1
\ No newline at end of file
diff --git a/agents/tests/unit/agents/test_auto_agent.py b/agents/tests/unit/agents/test_auto_agent.py
index 5b5a5c7..7159a4c 100644
--- a/agents/tests/unit/agents/test_auto_agent.py
+++ b/agents/tests/unit/agents/test_auto_agent.py
@@ -8,7 +8,7 @@ def test_auto_agent_from_config_react():
     config = {
         "agent_type": "react",
         "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct",
-        "template": "qwen-7b-chat",
+        "template": "qwen2.5",
         "tools": ["google_search", "answer"],
         "backend": "client"
     }
@@ -17,7 +17,7 @@ def test_auto_agent_from_config_react():
     
     assert isinstance(agent, ReactAgent)
     assert agent.model_name_or_path == "Qwen/Qwen2.5-3B-Instruct"
-    assert agent.template == "qwen-7b-chat"
+    assert agent.template == "qwen2.5"
     assert len(agent.tools) == 2
     assert agent.backend == "client"
 
@@ -26,7 +26,7 @@ def test_auto_agent_from_config_code():
     config = {
         "agent_type": "code",
         "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct",
-        "template": "qwen-7b-chat",
+        "template": "qwen2.5",
         "tools": ["code_interpreter"],
         "backend": "client"
     }
@@ -43,7 +43,7 @@ def test_auto_agent_from_pretrained():
     agent = AutoAgent.from_pretrained(
         model_name_or_path="Qwen/Qwen2.5-3B-Instruct",
         agent_type="react",
-        template="qwen-7b-chat",
+        template="qwen2.5",
         tools=["google_search", "answer"],
         debug=True,
         backend="client"
@@ -55,7 +55,7 @@ def test_auto_agent_with_reward():
     config = {
         "agent_type": "react",
         "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct",
-        "template": "qwen-7b-chat",
+        "template": "qwen2.5",
         "tools": ["google_search", "answer"],
         "reward_name": "qa_f1_reward",
         "backend": "client"
@@ -71,7 +71,7 @@ def test_auto_agent_invalid_type():
     config = {
         "agent_type": "invalid_type",
         "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct",
-        "template": "qwen-7b-chat",
+        "template": "qwen2.5",
         "tools": ["google_search", "answer"],
         "backend": "client"
     }
diff --git a/agents/tests/unit/agents/test_chain.py b/agents/tests/unit/agents/test_chain.py
index 4997283..5861ea4 100644
--- a/agents/tests/unit/agents/test_chain.py
+++ b/agents/tests/unit/agents/test_chain.py
@@ -81,45 +81,3 @@ def test_chain_to_json():
     assert json_data[1]["type"] == "Action"
 
 
-def test_multi_level_chain():
-    chain = Chain(info={"question": "test question"})
-    
-    # Level 0
-    root = chain.add_node(type="Thought", description="Initial thought")
-    
-    # Level 1
-    action = chain.add_node(type="Action", description="google_search")
-    
-    # Level 2
-    action_input = chain.add_node(type="Action Input", description='{"query": "test"}')
-    
-    # Level 3
-    observation = chain.add_node(
-        type="Observation",
-        description="Result",
-        observation="Search results here"
-    )
-    
-    assert root.depth == 0
-    assert action.depth == 1
-    assert action_input.depth == 2
-    assert observation.depth == 3
-    
-    # Check parent-child relationships
-    assert root.children[0] == action
-    assert action.children[0] == action_input
-    assert action_input.children[0] == observation
-
-
-def test_node_to_json_recursive():
-    # Create a chain with multiple nodes
-    chain = Chain(info={"question": "test question"})
-    root = chain.add_node(type="Thought", description="Initial thought")
-    action = chain.add_node(type="Action", description="google_search")
-    
-    # Get recursive JSON
-    json_data = root.to_json_recursive()
-    
-    assert json_data["type"] == "Thought"
-    assert len(json_data["children"]) == 1
-    assert json_data["children"][0]["type"] == "Action" 
\ No newline at end of file
diff --git a/agents/tests/unit/agents/test_agent_with_code.py b/agents/tests/unit/agents/test_code_agent.py
similarity index 95%
rename from agents/tests/unit/agents/test_agent_with_code.py
rename to agents/tests/unit/agents/test_code_agent.py
index b6b191e..662b33e 100644
--- a/agents/tests/unit/agents/test_agent_with_code.py
+++ b/agents/tests/unit/agents/test_code_agent.py
@@ -10,9 +10,8 @@ async def test_code_agent_end_to_end():
     agent = CodeAgent(
         "Qwen/Qwen2.5-3B-Instruct",
         tools=tools,
-        template="qwen-7b-chat",
+        template="qwen2.5",
         backend="async_vllm",
-        debug=True
     )
 
     question1 = "Every morning Aya goes for a $9$-kilometer-long walk and stops at a coffee shop afterwards. When she walks at a constant speed of $s$ kilometers per hour, the walk takes her 4 hours, including $t$ minutes spent in the coffee shop. When she walks $s+2$ kilometers per hour, the walk takes her 2 hours and 24 minutes, including $t$ minutes spent in the coffee shop. Suppose Aya walks at $s+\frac{1}{2}$ kilometers per hour. Find the number of minutes the walk takes her, including the $t$ minutes spent in the coffee shop."
@@ -39,7 +38,7 @@ async def test_code_agent_end_to_end():
     await agent.run_async(
         max_steps=4,
         start_messages=messages,
-        num_chains=5
+        num_chains=2
     )
 
     messages = agent.get_messages()
diff --git a/agents/tests/unit/agents/test_initialization.py b/agents/tests/unit/agents/test_initialization.py
index c76cf9a..61055ee 100644
--- a/agents/tests/unit/agents/test_initialization.py
+++ b/agents/tests/unit/agents/test_initialization.py
@@ -1,10 +1,13 @@
 from agents.agents.agent_base import BaseAgent
 from agents.agents.specialized.code_agent import CodeAgent
-from agents.tools import code_interpreter
+from agents.agents.react.react_agent import ReactAgent
+from agents.agents.specialized.think_agent import ThinkAgent
+from agents.tools import code_interpreter, google_search_serper, answer_qa
 import pytest
 
 
-@pytest.mark.parametrize("backend", ["vllm", "client"])
+@pytest.mark.gpu
+@pytest.mark.parametrize("backend", ["async_vllm", "client"])
 def test_agent_initialization_backend(backend: str):
     # Initialize the code agent
     print(f"Testing {backend} backend")
@@ -14,7 +17,7 @@ def test_agent_initialization_backend(backend: str):
         agent = CodeAgent(
             "Qwen/Qwen2.5-3B-Instruct",
             tools=tools,
-            template="qwen-7b-chat",
+            template="qwen2.5",
             backend=backend
         )
         print("Agent initialized successfully")
@@ -26,25 +29,49 @@ def test_agent_initialization_backend(backend: str):
     assert agent.backend == backend
     assert agent.tools == tools
     assert agent.model_name_or_path == "Qwen/Qwen2.5-3B-Instruct"
-    assert agent.template == "qwen-7b-chat"
+    assert agent.template == "qwen2.5"
     
     # Test basic methods
     messages = agent.get_messages()
     assert isinstance(messages, list)
 
-
-def test_code_agent_initialization():
+@pytest.mark.gpu
+@pytest.mark.parametrize("backend", ["async_vllm", "client"])
+def test_code_agent_initialization(backend: str):
     tools = [code_interpreter]
     agent = CodeAgent(
         "Qwen/Qwen2.5-3B-Instruct",
         tools=tools,
-        template="qwen-7b-chat",
-        backend="client"
+        template="qwen2.5",
+        backend=backend
     )
     
-    # Check system prompt is set correctly
-    assert "multi-turn manner" in agent.system_prompt
-    assert agent.max_length == 8192
 
+@pytest.mark.gpu
+@pytest.mark.parametrize("backend", ["async_vllm", "client"])
+def test_react_agent_initialization(backend: str):
+    tools = [google_search_serper, answer_qa]
+    task_info = "Test search task"
+    agent = ReactAgent(
+        "Qwen/Qwen2.5-3B-Instruct",
+        tools=tools,
+        template="qwen2.5",
+        task_info=task_info,
+        backend=backend
+    )
+    
+    # Check system prompt contains task info and tools
+    assert task_info in agent.system_prompt
+    assert "google_search" in agent.system_prompt
+    assert "answer" in agent.system_prompt
 
-    
\ No newline at end of file
+@pytest.mark.gpu
+@pytest.mark.parametrize("backend", ["async_vllm", "client"])
+def test_think_agent_initialization(backend: str):
+    tools = [code_interpreter]
+    agent = ThinkAgent(
+        "Qwen/Qwen2.5-3B-Instruct",
+        tools=tools,
+        template="qwen2.5",
+        backend=backend
+    )
diff --git a/agents/tests/unit/agents/test_react_agent.py b/agents/tests/unit/agents/test_react_agent.py
index dcb2043..82a19bb 100644
--- a/agents/tests/unit/agents/test_react_agent.py
+++ b/agents/tests/unit/agents/test_react_agent.py
@@ -1,25 +1,7 @@
 import pytest
 from agents.agents.react.react_agent import ReactAgent, parse_react_step
 from agents.tools.src.search.google_search import google_search_serper
-from agents.tools.src.react.tools import answer
-
-
-def test_react_agent_initialization():
-    tools = [google_search_serper, answer]
-    task_info = "Test search task"
-    agent = ReactAgent(
-        "Qwen/Qwen2.5-3B-Instruct",
-        tools=tools,
-        template="qwen2.5",
-        task_info=task_info,
-        backend="client"
-    )
-    
-    # Check system prompt contains task info and tools
-    assert task_info in agent.system_prompt
-    assert "google_search" in agent.system_prompt
-    assert "answer" in agent.system_prompt
-    
+from agents.tools import answer_qa
 
 def test_parse_react_step():
     # Test with a valid ReAct step
@@ -40,13 +22,15 @@ def test_parse_react_step():
     assert result_missing["input"] is None
 
 
-def test_react_agent_parse():
-    tools = [google_search_serper, answer]
+@pytest.mark.gpu
+@pytest.mark.asyncio(loop_scope="session")
+async def test_react_agent_parse_run():
+    tools = [google_search_serper, answer_qa]
     agent = ReactAgent(
         "Qwen/Qwen2.5-3B-Instruct",
         tools=tools,
         template="qwen2.5",
-        backend="client"
+        backend="async_vllm"
     )
     
     responses = ["""Thought: I need to search for information.
@@ -60,4 +44,19 @@ def test_react_agent_parse():
     assert "Thought: I need to search for information." in result[0]["content"][0]["text"]
     assert len(result[0]["tool_calls"]) == 1
     assert result[0]["tool_calls"][0]["function"]["name"] == "google_search"
-    assert result[0]["tool_calls"][0]["function"]["arguments"] == {"query": "test query"}
\ No newline at end of file
+    assert result[0]["tool_calls"][0]["function"]["arguments"] == {"query": "test query"}
+
+    messages = [
+        {
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is the capital of France?"
+                }
+            ]
+        }
+    ]
+    await agent.run_async(start_messages=messages, max_steps=4, num_chains=1)
+    messages_list = agent.get_messages()
+    print(messages_list[0])
+    
\ No newline at end of file
diff --git a/agents/tests/unit/agents/test_vision_agent.py b/agents/tests/unit/agents/test_vision_agent.py
index 0f67ab5..eedc521 100644
--- a/agents/tests/unit/agents/test_vision_agent.py
+++ b/agents/tests/unit/agents/test_vision_agent.py
@@ -2,6 +2,8 @@
 from agents.tools import answer_qa
 import pytest
 
+
+@pytest.mark.gpu
 @pytest.mark.asyncio(loop_scope="session")
 async def test_vision_agent():
     tools = [answer_qa]
diff --git a/agents/tests/unit/agents/test_vllm_backend.py b/agents/tests/unit/agents/test_vllm_backend.py
deleted file mode 100644
index 09ddec6..0000000
--- a/agents/tests/unit/agents/test_vllm_backend.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from agents.agents.llm_backend import AsyncVLLMBackend, VLLMBackend
-import pytest
-
-def test_vllm_backend():
-    backend = VLLMBackend(model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", template="qwen2.5-vl")
-    messages_list = [
-        [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "image",
-                        "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
-                    },
-                    {"type": "text", "text": "Describe this image."},
-                ],
-            },
-        ]
-    ]
-    result = backend.generate(messages_list)
-    print(result)
-
-
-@pytest.mark.asyncio(loop_scope="session")
-async def test_async_vllm_backend():
-    backend = AsyncVLLMBackend(model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", template="qwen2.5-vl")
-    messages_list = [
-        [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "image",
-                        "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
-                    },
-                    {"type": "text", "text": "Describe this image."},
-                ],
-            },
-        ]
-    ]
-    result = await backend.generate_async(messages_list)
-    print(result)
\ No newline at end of file
diff --git a/agents/tests/unit/agents/test_webshop_agent.py b/agents/tests/unit/agents/test_webshop_agent.py
index 49ff192..59f5c91 100644
--- a/agents/tests/unit/agents/test_webshop_agent.py
+++ b/agents/tests/unit/agents/test_webshop_agent.py
@@ -2,17 +2,18 @@
 from agents.agents.react.react_agent import ReactAgent
 from agents.tools.src.webshop.tools import webshop_browser
 from agents.tools.src.react.tools import answer
-from rewards.webshop_reward import WebshopReward
+from agents.rewards import webshop_reward
 
 
+@pytest.mark.gpu
 @pytest.mark.asyncio
 async def test_webshop_agent_call():
     tools = [webshop_browser, answer]
     agent = ReactAgent(
         "Qwen/Qwen2.5-3B-Instruct",
         tools=tools,
-        reward_fn=WebshopReward,
-        template="qwen-7b-chat",
+        reward_fn=webshop_reward,
+        template="qwen2.5",
         backend="async_vllm",
         debug=True
     )
@@ -24,6 +25,19 @@ async def test_webshop_agent_call():
                 {"role": "user", "content": f"{question}"}
             ],
             "question": f"{question}",
+            "asin": "B07FYPSNH8",
+            "category": "grocery",
+            "query": "beverages",
+            "name": "OWYN - 100% Vegan Plant-Based Protein Shakes | Cold Brew Coffee, 12 Fl Oz | Dairy-Free, Gluten-Free, Soy-Free, Tree Nut-Free, Egg-Free, Allergy-Free, Vegetarian",
+            "product_category": "Grocery & Gourmet Food \u203a Beverages \u203a Bottled Beverages, Water & Drink Mixes \u203a Meal Replacement & Protein Drinks \u203a Protein Drinks",
+            "instruction_text": "i am looking for a gluten free, 100% vegan plant based protein shake that is soy-free, and price lower than 40.00 dollars",
+            "attributes": [
+            "gluten free"
+            ],
+            "price_upper": 40.0,
+            "goal_options": [],
+            "weight": 1,
+            "task_id": 0,
         },
     ]
 
@@ -35,4 +49,4 @@ async def test_webshop_agent_call():
         )
 
     messages = agent.get_messages()
-    print(messages)
\ No newline at end of file
+    print(messages)
diff --git a/pyproject.toml b/pyproject.toml
index 4003daa..92d6504 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,6 +5,11 @@ build-backend = "setuptools.build_meta"
 [tool.setuptools.packages.find]
 include = ["agents", "verl"]
 
+[tool.pytest.ini_options]
+markers = [
+    "gpu: marks tests as requiring GPU resources"
+]
+
 [project]
 name = "AgentFly"
 version = "0.0.1"