diff --git a/apps/miroflow-agent/benchmarks/common_benchmark.py b/apps/miroflow-agent/benchmarks/common_benchmark.py
index a2e74164..4b26138a 100644
--- a/apps/miroflow-agent/benchmarks/common_benchmark.py
+++ b/apps/miroflow-agent/benchmarks/common_benchmark.py
@@ -23,9 +23,12 @@
execute_task_pipeline,
)
from src.logging.summary_time_cost import generate_summary
-
-# Constants for format error detection
-FORMAT_ERROR_MESSAGE = "No \\boxed{} content found in the final answer."
+from src.utils.prompt_utils import (
+ FAILURE_EXPERIENCE_FOOTER,
+ FAILURE_EXPERIENCE_HEADER,
+ FAILURE_EXPERIENCE_ITEM,
+ FORMAT_ERROR_MESSAGE,
+)
def _task_worker(task_dict, cfg_dict, evaluator_kwargs):
@@ -135,7 +138,7 @@ def __init__(self, data_dir: str, benchmark_name: str, cfg: DictConfig):
# Format error tracking and retry configuration
# Read from agent config as it's part of context management
- self.format_error_retry_limit = cfg.agent.get("format_error_retry_limit", 0)
+ self.context_compress_limit = cfg.agent.get("context_compress_limit", 0)
# Get LLM provider and model from the config object
self.llm_provider = cfg.llm.provider
@@ -290,7 +293,57 @@ def extract_timestamp(file_path):
# Try to get a valid response with format retry
print(f"TASK ID: {task.task_id}, ATTEMPT: {attempt}")
- max_format_retries = self.format_error_retry_limit
+ max_format_retries = self.context_compress_limit
+
+ # Track accumulated failure experiences for this attempt
+ # Start with the original task description
+ current_task_description = task_description
+ failure_experiences = []
+
+ # Resume: Recover failure experiences from previous retry logs
+ if format_retry_count > 0 and logs_dir.exists():
+ print(
+ f" Resuming from retry {format_retry_count}, recovering previous failure experiences..."
+ )
+ for prev_retry in range(format_retry_count):
+ prev_log_pattern = f"task_{task.task_id}_attempt-{attempt}_format-retry-{prev_retry}_*.json"
+ prev_logs = sorted(list(logs_dir.glob(prev_log_pattern)))
+ if prev_logs:
+ prev_log_file = prev_logs[-1] # Get the latest one
+ try:
+ with open(
+ prev_log_file, "r", encoding="utf-8"
+ ) as f:
+ prev_log_data = json.load(f)
+ # Extract failure experience from trace_data
+ trace_data = prev_log_data.get("trace_data", {})
+ prev_failure_exp = trace_data.get(
+ "failure_experience_summary"
+ )
+ if prev_failure_exp:
+ failure_experiences.append(prev_failure_exp)
+ print(
+ f" Recovered failure experience from retry {prev_retry}"
+ )
+ except Exception as e:
+ print(
+ f" Warning: Failed to load previous log {prev_log_file}: {e}"
+ )
+
+ # Rebuild enhanced task description with recovered failure experiences
+ if failure_experiences:
+ current_task_description += FAILURE_EXPERIENCE_HEADER
+ for idx, exp in enumerate(failure_experiences, 1):
+ current_task_description += (
+ FAILURE_EXPERIENCE_ITEM.format(
+ attempt_number=idx,
+ failure_summary=exp,
+ )
+ )
+ current_task_description += FAILURE_EXPERIENCE_FOOTER
+ print(
+ f" Recovered {len(failure_experiences)} failure experience(s) from previous retries"
+ )
while format_retry_count <= max_format_retries:
try:
@@ -298,11 +351,12 @@ def extract_timestamp(file_path):
response,
final_boxed_answer,
log_file_path,
+ failure_experience_summary,
) = await execute_task_pipeline(
cfg=self.cfg,
task_id=f"{task.task_id}_attempt-{attempt}_format-retry-{format_retry_count}",
task_file_name=task_file_path,
- task_description=task_description,
+ task_description=current_task_description,
main_agent_tool_manager=self.main_agent_tool_manager,
sub_agent_tool_managers=self.sub_agent_tool_managers,
output_formatter=self.output_formatter,
@@ -322,12 +376,48 @@ def extract_timestamp(file_path):
):
format_retry_count += 1
if format_retry_count <= max_format_retries:
+ # Use the model-generated failure experience summary
+ print(
+ f" Format error detected, using model-generated failure summary for retry {format_retry_count}..."
+ )
+
+ if failure_experience_summary:
+ failure_experiences.append(
+ failure_experience_summary
+ )
+
+ # Build enhanced task description with accumulated failure experiences
+ # Start fresh from original task_description each time
+ current_task_description = task_description
+ current_task_description += (
+ FAILURE_EXPERIENCE_HEADER
+ )
+ for idx, exp in enumerate(
+ failure_experiences, 1
+ ):
+ current_task_description += (
+ FAILURE_EXPERIENCE_ITEM.format(
+ attempt_number=idx,
+ failure_summary=exp,
+ )
+ )
+ current_task_description += (
+ FAILURE_EXPERIENCE_FOOTER
+ )
+
+ print(
+ f" Enhanced task description with {len(failure_experiences)} failure experience(s)"
+ )
+ else:
+ print(
+ " No failure experience summary generated, retrying without enhancement..."
+ )
continue
else:
# Exceeded format retry limit
attempt_result["status"] = "success"
attempt_result["model_boxed_answer"] = (
- "No \\boxed{} content found after format error retry limit exceeded."
+ f"{FORMAT_ERROR_MESSAGE} (after {max_format_retries} retries)"
)
attempt_result["error_message"] = (
f"Exceeded format error retry limit ({max_format_retries})"
diff --git a/apps/miroflow-agent/conf/agent/default.yaml b/apps/miroflow-agent/conf/agent/default.yaml
index 7ad6d72d..2d32ce1e 100644
--- a/apps/miroflow-agent/conf/agent/default.yaml
+++ b/apps/miroflow-agent/conf/agent/default.yaml
@@ -21,4 +21,4 @@ sub_agents:
# Settings for context management
keep_tool_result: -1
-format_error_retry_limit: 0 # Maximum number of retries for format errors in context management
\ No newline at end of file
+context_compress_limit: 0 # Max retries with a compressed failure summary after a format error; 0 disables context compression
\ No newline at end of file
diff --git a/apps/miroflow-agent/conf/agent/mirothinker_v1.0.yaml b/apps/miroflow-agent/conf/agent/mirothinker_v1.0.yaml
index 455ec927..5cf8230e 100644
--- a/apps/miroflow-agent/conf/agent/mirothinker_v1.0.yaml
+++ b/apps/miroflow-agent/conf/agent/mirothinker_v1.0.yaml
@@ -19,4 +19,4 @@ sub_agents:
# Settings for context management
keep_tool_result: -1
-format_error_retry_limit: 0 # Maximum number of retries for format errors in context management
\ No newline at end of file
+context_compress_limit: 0 # Max retries with a compressed failure summary after a format error; 0 disables context compression
\ No newline at end of file
diff --git a/apps/miroflow-agent/conf/agent/mirothinker_v1.0_keep5.yaml b/apps/miroflow-agent/conf/agent/mirothinker_v1.0_keep5.yaml
index 8efa6e65..273e14ce 100644
--- a/apps/miroflow-agent/conf/agent/mirothinker_v1.0_keep5.yaml
+++ b/apps/miroflow-agent/conf/agent/mirothinker_v1.0_keep5.yaml
@@ -19,4 +19,4 @@ sub_agents:
# Settings for context management
keep_tool_result: 5
-format_error_retry_limit: 10 # Maximum number of retries for format errors in context management
\ No newline at end of file
+context_compress_limit: 10 # Max retries with a compressed failure summary after a format error; 0 disables context compression
\ No newline at end of file
diff --git a/apps/miroflow-agent/conf/agent/mirothinker_v1.5.yaml b/apps/miroflow-agent/conf/agent/mirothinker_v1.5.yaml
index ddce2f10..67ddd661 100644
--- a/apps/miroflow-agent/conf/agent/mirothinker_v1.5.yaml
+++ b/apps/miroflow-agent/conf/agent/mirothinker_v1.5.yaml
@@ -19,4 +19,4 @@ sub_agents:
# Settings for context management
keep_tool_result: -1
-format_error_retry_limit: 0 # Maximum number of retries for format errors in context management
\ No newline at end of file
+context_compress_limit: 0 # Max retries with a compressed failure summary after a format error; 0 disables context compression
\ No newline at end of file
diff --git a/apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max200.yaml b/apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max200.yaml
index dd42f9ae..a602ef8f 100644
--- a/apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max200.yaml
+++ b/apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max200.yaml
@@ -19,4 +19,4 @@ sub_agents:
# Settings for context management
keep_tool_result: 5
-format_error_retry_limit: 10 # Maximum number of retries for format errors in context management
\ No newline at end of file
+context_compress_limit: 10 # Max retries with a compressed failure summary after a format error; 0 disables context compression
\ No newline at end of file
diff --git a/apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max400.yaml b/apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max400.yaml
index 8dd348fd..011359a8 100644
--- a/apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max400.yaml
+++ b/apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max400.yaml
@@ -19,4 +19,4 @@ sub_agents:
# Settings for context management
keep_tool_result: 5
-format_error_retry_limit: 10 # Maximum number of retries for format errors in context management
\ No newline at end of file
+context_compress_limit: 10 # Max retries with a compressed failure summary after a format error; 0 disables context compression
\ No newline at end of file
diff --git a/apps/miroflow-agent/conf/agent/multi_agent.yaml b/apps/miroflow-agent/conf/agent/multi_agent.yaml
index b4d46614..ed642c82 100644
--- a/apps/miroflow-agent/conf/agent/multi_agent.yaml
+++ b/apps/miroflow-agent/conf/agent/multi_agent.yaml
@@ -25,4 +25,4 @@ sub_agents:
# Settings for context management
keep_tool_result: -1
-format_error_retry_limit: 0 # Maximum number of retries for format errors in context management
+context_compress_limit: 0 # Max retries with a compressed failure summary after a format error; 0 disables context compression
diff --git a/apps/miroflow-agent/conf/agent/multi_agent_os.yaml b/apps/miroflow-agent/conf/agent/multi_agent_os.yaml
index d2fd46c9..1fbe557a 100644
--- a/apps/miroflow-agent/conf/agent/multi_agent_os.yaml
+++ b/apps/miroflow-agent/conf/agent/multi_agent_os.yaml
@@ -25,4 +25,4 @@ sub_agents:
# Settings for context management
keep_tool_result: -1
-format_error_retry_limit: 0 # Maximum number of retries for format errors in context management
+context_compress_limit: 0 # Max retries with a compressed failure summary after a format error; 0 disables context compression
diff --git a/apps/miroflow-agent/conf/agent/single_agent.yaml b/apps/miroflow-agent/conf/agent/single_agent.yaml
index 349f1f58..a04a60fe 100644
--- a/apps/miroflow-agent/conf/agent/single_agent.yaml
+++ b/apps/miroflow-agent/conf/agent/single_agent.yaml
@@ -19,4 +19,4 @@ sub_agents:
# Settings for context management
keep_tool_result: -1
-format_error_retry_limit: 0 # Maximum number of retries for format errors in context management
\ No newline at end of file
+context_compress_limit: 0 # Max retries with a compressed failure summary after a format error; 0 disables context compression
\ No newline at end of file
diff --git a/apps/miroflow-agent/conf/agent/single_agent_keep5.yaml b/apps/miroflow-agent/conf/agent/single_agent_keep5.yaml
index 0233358a..4a074360 100644
--- a/apps/miroflow-agent/conf/agent/single_agent_keep5.yaml
+++ b/apps/miroflow-agent/conf/agent/single_agent_keep5.yaml
@@ -19,4 +19,4 @@ sub_agents:
# Settings for context management
keep_tool_result: 5
-format_error_retry_limit: 10 # Maximum number of retries for format errors in context management
\ No newline at end of file
+context_compress_limit: 10 # Max retries with a compressed failure summary after a format error; 0 disables context compression
\ No newline at end of file
diff --git a/apps/miroflow-agent/main.py b/apps/miroflow-agent/main.py
index 75e61e18..c3d7676b 100644
--- a/apps/miroflow-agent/main.py
+++ b/apps/miroflow-agent/main.py
@@ -33,7 +33,7 @@ async def amain(cfg: DictConfig) -> None:
task_file_name = ""
# Execute task using the pipeline
- final_summary, final_boxed_answer, log_file_path = await execute_task_pipeline(
+ final_summary, final_boxed_answer, log_file_path, _ = await execute_task_pipeline(
cfg=cfg,
task_id=task_id,
task_file_name=task_file_name,
diff --git a/apps/miroflow-agent/src/core/orchestrator.py b/apps/miroflow-agent/src/core/orchestrator.py
index 68284b89..05a00195 100644
--- a/apps/miroflow-agent/src/core/orchestrator.py
+++ b/apps/miroflow-agent/src/core/orchestrator.py
@@ -23,8 +23,14 @@
TaskLog,
get_utc_plus_8_time,
)
-from ..utils.parsing_utils import extract_llm_response_text
+from ..utils.parsing_utils import (
+ extract_failure_experience_summary,
+ extract_llm_response_text,
+)
from ..utils.prompt_utils import (
+ FAILURE_SUMMARY_ASSISTANT_PREFIX,
+ FAILURE_SUMMARY_PROMPT,
+ FORMAT_ERROR_MESSAGE,
generate_agent_specific_system_prompt,
generate_agent_summarize_prompt,
mcp_tags,
@@ -92,6 +98,8 @@ def __init__(
# Retry loop protection limits
self.MAX_CONSECUTIVE_ROLLBACKS = 5
self.MAX_FINAL_ANSWER_RETRIES = 3 if cfg.agent.keep_tool_result == -1 else 1
+ # When context_compress_limit > 0, enables a context compression mechanism
+ self.context_compress_limit = cfg.agent.get("context_compress_limit", 0)
async def _stream_update(self, event_type: str, data: dict):
"""Send streaming update in new SSE protocol format"""
@@ -228,6 +236,29 @@ async def _stream_tool_call(
return tool_call_id
+ def _is_google_search_empty_result(self, tool_name: str, tool_result: dict) -> bool:
+ """
+ Report whether a ``google_search`` tool result carries no organic hits.
+
+ Callers use a True result as one of the conditions for rolling back the
+ turn so that the search can be retried.
+
+ Args:
+ tool_name: Name of the tool that produced ``tool_result``.
+ tool_result: Tool output mapping; its ``"result"`` entry may be a JSON
+ string or an already-parsed object.
+
+ Returns:
+ True only when the tool is ``google_search`` and the parsed payload's
+ ``"organic"`` entry is missing or has length zero. False for any other
+ tool, for a falsy ``"result"``, or when the payload cannot be inspected.
+ """
+ # Only google_search results are inspected; other tools are never "empty".
+ if tool_name != "google_search":
+ return False
+
+ # A missing or falsy payload is not reported as an empty search.
+ result = tool_result.get("result")
+ if not result:
+ return False
+
+ try:
+ # The payload may arrive serialized; decode it before looking up keys.
+ if isinstance(result, str):
+ result_dict = json.loads(result)
+ else:
+ result_dict = result
+
+ # A missing "organic" key defaults to [] and therefore counts as empty.
+ organic = result_dict.get("organic", [])
+ return len(organic) == 0
+ except (json.JSONDecodeError, TypeError, AttributeError):
+ # Malformed JSON, a non-mapping payload, or an unsized "organic" value:
+ # do not flag the result as empty.
+ return False
+
def get_scrape_result(self, result: str) -> str:
"""
Process scrape result and truncate if too long to support more conversation turns.
@@ -383,6 +414,426 @@ async def _handle_llm_call(
# Return empty response with should_break=False, need to retry
return "", False, None, original_message_history
+ async def _generate_failure_summary(
+ self,
+ system_prompt: str,
+ message_history: List[Dict[str, Any]],
+ tool_definitions: List[Dict],
+ turn_count: int,
+ ) -> Optional[str]:
+ """Generate a failure experience summary for context compression.
+
+ This is the core of the context management mechanism. When a task attempt fails
+ (i.e., the task is not completed within the given turns and context window),
+ we compress the entire conversation history into a structured summary containing:
+ - Failure type: incomplete / blocked / misdirected / format_missed
+ - What happened: the approach taken and why a final answer was not reached
+ - Useful findings: facts, intermediate results, or conclusions to be reused
+
+ This summary will be injected into the task description for the next retry,
+ effectively compressing potentially thousands of tokens of conversation into
+ a focused summary of ~500-1000 tokens.
+
+ The caller's ``message_history`` list is not modified: the prompt and the
+ assistant prefix are appended to a copy of it.
+
+ Args:
+ system_prompt: The system prompt used in the conversation.
+ message_history: The full conversation history to be compressed.
+ tool_definitions: Available tool definitions.
+ turn_count: Current turn count for step ID.
+
+ Returns:
+ The extracted failure experience summary, or None when the LLM call
+ returned no text.
+ """
+ self.task_log.log_step(
+ "info",
+ "Main Agent | Failure Summary",
+ "Generating failure experience summary for potential retry...",
+ )
+
+ # Build failure summary history
+ # Shallow copy: the list is independent of the caller's, while the message
+ # dicts themselves are shared. A trailing user turn is dropped so that the
+ # summary prompt becomes the last user message.
+ failure_summary_history = message_history.copy()
+ if failure_summary_history and failure_summary_history[-1]["role"] == "user":
+ failure_summary_history.pop()
+
+ # Add failure summary prompt and assistant prefix for structured output
+ failure_summary_history.append(
+ {"role": "user", "content": FAILURE_SUMMARY_PROMPT}
+ )
+ failure_summary_history.append(
+ {"role": "assistant", "content": FAILURE_SUMMARY_ASSISTANT_PREFIX}
+ )
+
+ # Call LLM to generate failure summary (auto-detects assistant prefix for vLLM continuation)
+ # Only the response text is used; the other three return values are ignored.
+ (
+ failure_summary_text,
+ _,
+ _,
+ _,
+ ) = await self._handle_llm_call(
+ system_prompt,
+ failure_summary_history,
+ tool_definitions,
+ turn_count + 10, # Use a different step id
+ "Main Agent | Failure Experience Summary",
+ agent_type="main",
+ )
+
+ # Prepend the assistant prefix to the response for complete output
+ if failure_summary_text:
+ failure_summary_text = (
+ FAILURE_SUMMARY_ASSISTANT_PREFIX + failure_summary_text
+ )
+ failure_experience_summary = extract_failure_experience_summary(
+ failure_summary_text
+ )
+ # NOTE(review): the slice below assumes extract_failure_experience_summary
+ # returns a string; if it can return None this log call raises TypeError.
+ # Confirm against the helper's implementation.
+ self.task_log.log_step(
+ "info",
+ "Main Agent | Failure Summary",
+ f"Generated failure experience summary:\n{failure_experience_summary[:500]}...",
+ )
+ return failure_experience_summary
+ else:
+ self.task_log.log_step(
+ "warning",
+ "Main Agent | Failure Summary",
+ "Failed to generate failure experience summary",
+ )
+ return None
+
+ async def _generate_final_answer_with_retries(
+ self,
+ system_prompt: str,
+ message_history: List[Dict[str, Any]],
+ tool_definitions: List[Dict],
+ turn_count: int,
+ task_description: str,
+ ) -> Tuple[Optional[str], str, Optional[str], str, List[Dict[str, Any]]]:
+ """Generate final answer with retry mechanism.
+
+ Appends the summary prompt to ``message_history`` (replacing a trailing
+ user turn, if any) and calls the LLM up to ``MAX_FINAL_ANSWER_RETRIES``
+ times, stopping at the first response that yields a boxed answer.
+ Between attempts a trailing assistant message is removed so that the
+ next attempt answers the same prompt again.
+
+ Note that the passed-in ``message_history`` list is modified in place
+ before the first LLM call.
+
+ Args:
+ system_prompt: The system prompt used in the conversation.
+ message_history: Conversation history; may be empty.
+ tool_definitions: Available tool definitions.
+ turn_count: Current turn count, used as the base for step IDs.
+ task_description: Task text used to build the summary prompt.
+
+ Returns:
+ Tuple of (final_answer_text, final_summary, final_boxed_answer, usage_log, message_history).
+ ``final_boxed_answer`` is FORMAT_ERROR_MESSAGE when no attempt
+ produced a boxed answer; it is never None.
+ """
+ # Generate summary prompt
+ summary_prompt = generate_agent_summarize_prompt(
+ task_description,
+ agent_type="main",
+ )
+
+ # Replace a trailing user turn with the summary prompt. The emptiness check
+ # keeps an empty history from raising IndexError, matching the guarded
+ # look-ups used elsewhere in this class.
+ if message_history and message_history[-1]["role"] == "user":
+ message_history.pop(-1)
+ message_history.append({"role": "user", "content": summary_prompt})
+
+ final_answer_text = None
+ final_boxed_answer = None
+ final_summary = ""
+ usage_log = ""
+
+ for retry_idx in range(self.MAX_FINAL_ANSWER_RETRIES):
+ # Only the response text and the updated history are needed here.
+ (
+ final_answer_text,
+ _,
+ _,
+ message_history,
+ ) = await self._handle_llm_call(
+ system_prompt,
+ message_history,
+ tool_definitions,
+ turn_count + 1 + retry_idx,
+ f"Main agent | Final Summary (attempt {retry_idx + 1}/{self.MAX_FINAL_ANSWER_RETRIES})",
+ agent_type="main",
+ )
+
+ if final_answer_text:
+ final_summary, final_boxed_answer, usage_log = (
+ self.output_formatter.format_final_summary_and_log(
+ final_answer_text, self.llm_client
+ )
+ )
+
+ if final_boxed_answer != FORMAT_ERROR_MESSAGE:
+ self.task_log.log_step(
+ "info",
+ "Main Agent | Final Answer",
+ f"Boxed answer found on attempt {retry_idx + 1}",
+ )
+ break
+ else:
+ self.task_log.log_step(
+ "warning",
+ "Main Agent | Final Answer",
+ f"No boxed answer on attempt {retry_idx + 1}, retrying...",
+ )
+ # Drop the unusable assistant reply before the next attempt; the
+ # last attempt's reply is kept in the history.
+ if retry_idx < self.MAX_FINAL_ANSWER_RETRIES - 1:
+ if (
+ message_history
+ and message_history[-1]["role"] == "assistant"
+ ):
+ message_history.pop()
+ else:
+ self.task_log.log_step(
+ "warning",
+ "Main Agent | Final Answer",
+ f"Failed to generate answer on attempt {retry_idx + 1}",
+ )
+ if retry_idx < self.MAX_FINAL_ANSWER_RETRIES - 1:
+ if message_history and message_history[-1]["role"] == "assistant":
+ message_history.pop()
+
+ # Ensure final_boxed_answer is never None - treat it as FORMAT_ERROR_MESSAGE
+ if final_boxed_answer is None:
+ final_boxed_answer = FORMAT_ERROR_MESSAGE
+
+ return (
+ final_answer_text,
+ final_summary,
+ final_boxed_answer,
+ usage_log,
+ message_history,
+ )
+
+ def _handle_no_context_management_fallback(
+ self,
+ final_answer_text: Optional[str],
+ final_summary: str,
+ final_boxed_answer: Optional[str],
+ ) -> Tuple[str, str, str]:
+ """Handle fallback when context_compress_limit == 0 (no context management).
+
+ In this mode, the model has only one chance to answer.
+ We should try to use intermediate answers as fallback to maximize accuracy.
+
+ Args:
+ final_answer_text: Raw final answer text from the LLM; may be None or empty.
+ final_summary: Formatted summary produced for that text.
+ final_boxed_answer: Extracted boxed answer, FORMAT_ERROR_MESSAGE, or None.
+
+ Returns:
+ Tuple of (final_answer_text, final_summary, final_boxed_answer).
+ When no answer text exists, the text and the summary are both the
+ placeholder "No final answer generated.". The boxed answer is the
+ last entry of ``self.intermediate_boxed_answers`` when no valid boxed
+ answer exists and that list is non-empty; otherwise it is the given
+ value, with None replaced by FORMAT_ERROR_MESSAGE.
+ """
+ # Validate final_answer_text
+ if not final_answer_text:
+ final_answer_text = "No final answer generated."
+ final_summary = final_answer_text
+ final_boxed_answer = FORMAT_ERROR_MESSAGE
+ self.task_log.log_step(
+ "error",
+ "Main Agent | Final Answer",
+ "Unable to generate final answer after all retries",
+ )
+ else:
+ self.task_log.log_step(
+ "info",
+ "Main Agent | Final Answer",
+ f"Final answer content:\n\n{final_answer_text}",
+ )
+
+ # Fallback to intermediate answer if no valid boxed answer
+ # This is important when context management is disabled to maximize answer accuracy
+ # Check for both FORMAT_ERROR_MESSAGE and None (defensive)
+ # The most recent intermediate answer (last list element) is the one used.
+ if (
+ final_boxed_answer == FORMAT_ERROR_MESSAGE or final_boxed_answer is None
+ ) and self.intermediate_boxed_answers:
+ final_boxed_answer = self.intermediate_boxed_answers[-1]
+ self.task_log.log_step(
+ "info",
+ "Main Agent | Final Answer (No Context Management)",
+ f"Using intermediate boxed answer as fallback: {final_boxed_answer}",
+ )
+
+ # Ensure final_boxed_answer is never None
+ if final_boxed_answer is None:
+ final_boxed_answer = FORMAT_ERROR_MESSAGE
+
+ return final_answer_text, final_summary, final_boxed_answer
+
+ def _handle_context_management_no_fallback(
+ self,
+ final_answer_text: Optional[str],
+ final_summary: str,
+ final_boxed_answer: Optional[str],
+ ) -> Tuple[str, str, str]:
+ """Handle failure when context_compress_limit > 0 (context management enabled).
+
+ In this mode, the model has multiple chances to retry with context management.
+ We should NOT guess or use intermediate answers, because:
+ - A wrong guess can reduce accuracy
+ - The model will have another chance to answer with failure experience
+
+ Args:
+ final_answer_text: Raw final answer text from the LLM; may be None or empty.
+ final_summary: Formatted summary produced for that text.
+ final_boxed_answer: Extracted boxed answer, FORMAT_ERROR_MESSAGE, or None.
+
+ Returns:
+ Tuple of (final_answer_text, final_summary, final_boxed_answer).
+ When no answer text exists, the text and the summary are both the
+ placeholder "No final answer generated." and the boxed answer is
+ FORMAT_ERROR_MESSAGE. A None boxed answer is also normalized to
+ FORMAT_ERROR_MESSAGE; ``self.intermediate_boxed_answers`` is never read.
+ """
+ # Validate final_answer_text
+ if not final_answer_text:
+ final_answer_text = "No final answer generated."
+ final_summary = final_answer_text
+ final_boxed_answer = FORMAT_ERROR_MESSAGE
+ self.task_log.log_step(
+ "error",
+ "Main Agent | Final Answer",
+ "Unable to generate final answer after all retries",
+ )
+ else:
+ self.task_log.log_step(
+ "info",
+ "Main Agent | Final Answer",
+ f"Final answer content:\n\n{final_answer_text}",
+ )
+
+ # Ensure final_boxed_answer is never None
+ if final_boxed_answer is None:
+ final_boxed_answer = FORMAT_ERROR_MESSAGE
+
+ # With context management, do NOT fallback to intermediate answers
+ # Keep FORMAT_ERROR_MESSAGE to trigger failure summary and allow retry
+ # This branch only logs; the returned values are unchanged by it.
+ if final_boxed_answer == FORMAT_ERROR_MESSAGE:
+ self.task_log.log_step(
+ "info",
+ "Main Agent | Final Answer (Context Management Mode)",
+ "No valid boxed answer found. Not using intermediate fallback - will generate failure summary for retry.",
+ )
+
+ return final_answer_text, final_summary, final_boxed_answer
+
+ async def _generate_and_finalize_answer(
+ self,
+ system_prompt: str,
+ message_history: List[Dict[str, Any]],
+ tool_definitions: List[Dict],
+ turn_count: int,
+ task_description: str,
+ reached_max_turns: bool = False,
+ ) -> Tuple[str, str, Optional[str], str, List[Dict[str, Any]]]:
+ """Generate final answer and handle fallback based on context management settings.
+
+ Context Management (context_compress_limit > 0) is essentially a context compression
+ mechanism that enables multi-attempt problem solving:
+
+ 1. When the task is not completed within the given turns and context window,
+ we generate a "failure_experience_summary" - a structured post-mortem containing:
+ - Failure type: incomplete / blocked / misdirected / format_missed
+ - What happened: the approach taken and why it didn't reach a final answer
+ - Useful findings: facts, intermediate results, or conclusions to be reused
+
+ 2. On retry, this compressed summary is injected into the task description,
+ allowing the model to:
+ - Start fresh with a clean context window
+ - Learn from previous failures without context overflow
+ - Try different approaches informed by past experience
+
+ This is more effective than simply continuing with a long context because:
+ - Avoids context window limits
+ - Focuses attention on key learnings rather than raw conversation
+ - Allows multiple "fresh starts" with accumulated wisdom
+
+ Decision table based on (context_management, reached_max_turns):
+
+ | Context Management | Reached Max Turns | Behavior |
+ |--------------------|-------------------|---------------------------------------------|
+ | OFF (limit=0) | No | Generate answer → fallback to intermediate |
+ | OFF (limit=0) | Yes | Generate answer → fallback to intermediate |
+ | ON (limit>0) | No | Generate answer → no fallback, fail summary |
+ | ON (limit>0) | Yes | SKIP generation → fail summary directly |
+
+ Args:
+ system_prompt: The system prompt used in the conversation.
+ message_history: The conversation history accumulated by the main loop.
+ tool_definitions: Available tool definitions.
+ turn_count: Current turn count, used as the base for step IDs.
+ task_description: Task text used to build the final summary prompt.
+ reached_max_turns: Whether the main loop ended due to reaching max turns or context limit.
+
+ Returns:
+ Tuple of (final_summary, final_boxed_answer, failure_experience_summary, usage_log, message_history)
+ ``failure_experience_summary`` is None whenever context management is
+ off or a valid boxed answer was produced.
+ """
+ context_management_enabled = self.context_compress_limit > 0
+ failure_experience_summary = None
+ usage_log = ""
+
+ # =============================================================================
+ # CASE: Context management ON + reached max turns
+ # Skip answer generation entirely - any answer would be a blind guess
+ # =============================================================================
+ # NOTE(review): this branch cannot tell whether the caller still has a retry
+ # left, so it returns FORMAT_ERROR_MESSAGE even on what may be the last
+ # attempt - confirm that this is intended for the final retry.
+ if context_management_enabled and reached_max_turns:
+ self.task_log.log_step(
+ "info",
+ "Main Agent | Final Answer (Context Management Mode)",
+ "Reached max turns. Skipping answer generation to avoid blind guessing.",
+ )
+
+ # Persist the history before the extra LLM call for the failure summary.
+ self._save_message_history(system_prompt, message_history)
+
+ failure_experience_summary = await self._generate_failure_summary(
+ system_prompt, message_history, tool_definitions, turn_count
+ )
+
+ # usage_log is still the empty string here: no final-answer call was made.
+ return (
+ "Task incomplete - reached maximum turns. Will retry with failure experience.",
+ FORMAT_ERROR_MESSAGE,
+ failure_experience_summary,
+ usage_log,
+ message_history,
+ )
+
+ # =============================================================================
+ # ALL OTHER CASES: Generate final answer first
+ # =============================================================================
+ (
+ final_answer_text,
+ final_summary,
+ final_boxed_answer,
+ usage_log,
+ message_history,
+ ) = await self._generate_final_answer_with_retries(
+ system_prompt=system_prompt,
+ message_history=message_history,
+ tool_definitions=tool_definitions,
+ turn_count=turn_count,
+ task_description=task_description,
+ )
+
+ self._save_message_history(system_prompt, message_history)
+
+ # =============================================================================
+ # CASE: Context management OFF
+ # Try to use intermediate answers as fallback to maximize accuracy
+ # =============================================================================
+ if not context_management_enabled:
+ final_answer_text, final_summary, final_boxed_answer = (
+ self._handle_no_context_management_fallback(
+ final_answer_text, final_summary, final_boxed_answer
+ )
+ )
+ # No failure summary needed - won't be used without context management
+ return (
+ final_summary,
+ final_boxed_answer,
+ None,
+ usage_log,
+ message_history,
+ )
+
+ # =============================================================================
+ # CASE: Context management ON + normal completion (not reached max turns)
+ # Don't use fallback - wrong guess would reduce accuracy
+ # =============================================================================
+ final_answer_text, final_summary, final_boxed_answer = (
+ self._handle_context_management_no_fallback(
+ final_answer_text, final_summary, final_boxed_answer
+ )
+ )
+
+ # Only a missing boxed answer triggers the failure-summary LLM call.
+ if final_boxed_answer == FORMAT_ERROR_MESSAGE:
+ failure_experience_summary = await self._generate_failure_summary(
+ system_prompt, message_history, tool_definitions, turn_count
+ )
+
+ return (
+ final_summary,
+ final_boxed_answer,
+ failure_experience_summary,
+ usage_log,
+ message_history,
+ )
+
+ def _save_message_history(
+ self, system_prompt: str, message_history: List[Dict[str, Any]]
+ ):
+ """Store the main agent's prompt and history on the task log and save it.
+
+ Args:
+ system_prompt: The system prompt used in the conversation.
+ message_history: The conversation history to record; the list object
+ itself is stored, not a copy.
+ """
+ self.task_log.main_agent_message_history = {
+ "system_prompt": system_prompt,
+ "message_history": message_history,
+ }
+ # Persist right away via the task log's own save().
+ self.task_log.save()
+
async def run_sub_agent(
self,
sub_agent_name: str,
@@ -634,9 +1085,13 @@ async def run_sub_agent(
else tool_result.get("error")
)
- # Check for "Unknown tool:" error and rollback
- if str(result).startswith("Unknown tool:"):
- # If we haven't reached rollback limit, rollback and retry
+ # Check for errors that should trigger rollback
+ should_rollback_result = (
+ str(result).startswith("Unknown tool:")
+ or str(result).startswith("Error executing tool")
+ or self._is_google_search_empty_result(tool_name, tool_result)
+ )
+ if should_rollback_result:
if consecutive_rollbacks < self.MAX_CONSECUTIVE_ROLLBACKS - 1:
message_history.pop()
turn_count -= 1
@@ -645,15 +1100,14 @@ async def run_sub_agent(
self.task_log.log_step(
"warning",
f"{sub_agent_name} | Turn: {turn_count} | Rollback",
- f"Unknown tool error - tool: {tool_name}, error: '{str(result)[:200]}'. Consecutive rollbacks: {consecutive_rollbacks}/{self.MAX_CONSECUTIVE_ROLLBACKS}, Total attempts: {total_attempts}/{max_attempts}",
+ f"Tool result error - tool: {tool_name}, result: '{str(result)[:200]}'. Consecutive rollbacks: {consecutive_rollbacks}/{self.MAX_CONSECUTIVE_ROLLBACKS}, Total attempts: {total_attempts}/{max_attempts}",
)
- break # Exit inner for loop, then continue outer while loop
+ break
else:
- # Reached rollback limit, allow error to be sent to LLM as feedback
self.task_log.log_step(
"warning",
f"{sub_agent_name} | Turn: {turn_count} | Allow Error Feedback",
- f"Allowing unknown tool error to be sent to LLM after {consecutive_rollbacks} rollbacks - tool: {tool_name}, error: '{str(result)[:200]}'",
+ f"Allowing error result after {consecutive_rollbacks} rollbacks - tool: {tool_name}, result: '{str(result)[:200]}'",
)
await self._stream_tool_call(
@@ -943,8 +1397,10 @@ async def run_main_agent(
)
break
else:
+ # LLM call failed, don't count this as a valid turn
+ turn_count -= 1
self.task_log.log_step(
- "info",
+ "warning",
f"Main Agent | Turn: {turn_count} | LLM Call",
"No valid response from LLM, retrying",
)
@@ -1147,9 +1603,15 @@ async def run_main_agent(
else tool_result.get("error")
)
- # Check for "Unknown tool:" error and rollback
- if str(result).startswith("Unknown tool:"):
- # If we haven't reached rollback limit, rollback and retry
+ # Check for errors that should trigger rollback
+ should_rollback_result = (
+ str(result).startswith("Unknown tool:")
+ or str(result).startswith("Error executing tool")
+ or self._is_google_search_empty_result(
+ tool_name, tool_result
+ )
+ )
+ if should_rollback_result:
if (
consecutive_rollbacks
< self.MAX_CONSECUTIVE_ROLLBACKS - 1
@@ -1161,15 +1623,14 @@ async def run_main_agent(
self.task_log.log_step(
"warning",
f"Main Agent | Turn: {turn_count} | Rollback",
- f"Unknown tool error - tool: {tool_name}, error: '{str(result)[:200]}'. Consecutive rollbacks: {consecutive_rollbacks}/{self.MAX_CONSECUTIVE_ROLLBACKS}, Total attempts: {total_attempts}/{max_attempts}",
+ f"Tool result error - tool: {tool_name}, result: '{str(result)[:200]}'. Consecutive rollbacks: {consecutive_rollbacks}/{self.MAX_CONSECUTIVE_ROLLBACKS}, Total attempts: {total_attempts}/{max_attempts}",
)
- break # Exit inner for loop, then continue outer while loop
+ break
else:
- # Reached rollback limit, allow error to be sent to LLM as feedback
self.task_log.log_step(
"warning",
f"Main Agent | Turn: {turn_count} | Allow Error Feedback",
- f"Allowing unknown tool error to be sent to LLM after {consecutive_rollbacks} rollbacks - tool: {tool_name}, error: '{str(result)[:200]}'",
+ f"Allowing error result after {consecutive_rollbacks} rollbacks - tool: {tool_name}, result: '{str(result)[:200]}'",
)
await self._stream_tool_call(
@@ -1277,8 +1738,9 @@ async def run_main_agent(
await self._stream_end_llm("main")
await self._stream_end_agent("main", self.current_agent_id)
- # Record main loop end
- if turn_count >= max_turns:
+ # Record main loop end and determine if max turns was reached
+ reached_max_turns = turn_count >= max_turns
+ if reached_max_turns:
self.task_log.log_step(
"warning",
"Main Agent | Max Turns Reached / Context Limit Reached",
@@ -1299,116 +1761,24 @@ async def run_main_agent(
self.current_agent_id = await self._stream_start_agent("Final Summary")
await self._stream_start_llm("Final Summary")
- # Generate summary prompt (generate only once)
- summary_prompt = generate_agent_summarize_prompt(
- task_description,
- agent_type="main",
+ # Generate final answer and handle fallback based on context_compress_limit
+ # If reached_max_turns is True and context management is enabled,
+ # skip answer generation to avoid blind guessing
+ (
+ final_summary,
+ final_boxed_answer,
+ failure_experience_summary,
+ usage_log,
+ message_history,
+ ) = await self._generate_and_finalize_answer(
+ system_prompt=system_prompt,
+ message_history=message_history,
+ tool_definitions=tool_definitions,
+ turn_count=turn_count,
+ task_description=task_description,
+ reached_max_turns=reached_max_turns,
)
- if message_history[-1]["role"] == "user":
- message_history.pop(-1)
- message_history.append({"role": "user", "content": summary_prompt})
-
- # Retry mechanism for generating boxed answer
- final_answer_text = None
- final_boxed_answer = None
- final_summary = ""
- usage_log = ""
-
- for retry_idx in range(self.MAX_FINAL_ANSWER_RETRIES):
- # Use unified LLM call processing
- (
- final_answer_text,
- should_break,
- tool_calls_info,
- message_history,
- ) = await self._handle_llm_call(
- system_prompt,
- message_history,
- tool_definitions,
- turn_count + 1 + retry_idx,
- f"Main agent | Final Summary (attempt {retry_idx + 1}/{self.MAX_FINAL_ANSWER_RETRIES})",
- agent_type="main",
- )
-
- if final_answer_text:
- # Try to extract boxed answer
- final_summary, final_boxed_answer, usage_log = (
- self.output_formatter.format_final_summary_and_log(
- final_answer_text, self.llm_client
- )
- )
-
- # Check if we got a valid boxed answer
- if (
- final_boxed_answer
- != "No \\boxed{} content found in the final answer."
- ):
- self.task_log.log_step(
- "info",
- "Main Agent | Final Answer",
- f"Boxed answer found on attempt {retry_idx + 1}",
- )
- break
- else:
- self.task_log.log_step(
- "warning",
- "Main Agent | Final Answer",
- f"No boxed answer on attempt {retry_idx + 1}, retrying...",
- )
- # Remove the failed assistant response before retry
- if retry_idx < self.MAX_FINAL_ANSWER_RETRIES - 1:
- if (
- message_history
- and message_history[-1]["role"] == "assistant"
- ):
- message_history.pop()
- else:
- self.task_log.log_step(
- "warning",
- "Main Agent | Final Answer",
- f"Failed to generate answer on attempt {retry_idx + 1}",
- )
- # Remove the failed assistant response before retry
- if retry_idx < self.MAX_FINAL_ANSWER_RETRIES - 1:
- if message_history and message_history[-1]["role"] == "assistant":
- message_history.pop()
-
- self.task_log.main_agent_message_history = {
- "system_prompt": system_prompt,
- "message_history": message_history,
- }
- self.task_log.save()
-
- # Final validation and fallback
- if not final_answer_text:
- final_answer_text = "No final answer generated."
- final_summary = final_answer_text
- final_boxed_answer = "No \\boxed{} content found in the final answer."
- self.task_log.log_step(
- "error",
- "Main Agent | Final Answer",
- "Unable to generate final answer after all retries",
- )
- else:
- self.task_log.log_step(
- "info",
- "Main Agent | Final Answer",
- f"Final answer content:\n\n{final_answer_text}",
- )
-
- # Fallback to intermediate answer if still no boxed answer
- if (
- final_boxed_answer == "No \\boxed{} content found in the final answer."
- and self.intermediate_boxed_answers
- ):
- final_boxed_answer = self.intermediate_boxed_answers[-1]
- self.task_log.log_step(
- "info",
- "Main Agent | Final Answer",
- f"Using intermediate boxed answer as fallback: {final_boxed_answer}",
- )
-
await self._stream_tool_call("show_text", {"text": final_boxed_answer})
await self._stream_end_llm("Final Summary")
await self._stream_end_agent("Final Summary", self.current_agent_id)
@@ -1430,4 +1800,4 @@ async def run_main_agent(
f"Main agent task {task_id} completed successfully",
)
gc.collect()
- return final_summary, final_boxed_answer
+ return final_summary, final_boxed_answer, failure_experience_summary
diff --git a/apps/miroflow-agent/src/core/pipeline.py b/apps/miroflow-agent/src/core/pipeline.py
index 78d41402..b0cc1e6c 100644
--- a/apps/miroflow-agent/src/core/pipeline.py
+++ b/apps/miroflow-agent/src/core/pipeline.py
@@ -98,7 +98,11 @@ async def execute_task_pipeline(
sub_agent_tool_definitions=sub_agent_tool_definitions,
)
- final_summary, final_boxed_answer = await orchestrator.run_main_agent(
+ (
+ final_summary,
+ final_boxed_answer,
+ failure_experience_summary,
+ ) = await orchestrator.run_main_agent(
task_description=task_description,
task_file_name=task_file_name,
task_id=task_id,
@@ -109,8 +113,19 @@ async def execute_task_pipeline(
task_log.final_boxed_answer = final_boxed_answer
task_log.status = "success"
+ # Store failure experience summary in task log if available
+ if failure_experience_summary:
+ task_log.trace_data["failure_experience_summary"] = (
+ failure_experience_summary
+ )
+
log_file_path = task_log.save()
- return final_summary, final_boxed_answer, log_file_path
+ return (
+ final_summary,
+ final_boxed_answer,
+ log_file_path,
+ failure_experience_summary,
+ )
except Exception as e:
error_details = traceback.format_exc()
@@ -134,7 +149,7 @@ async def execute_task_pipeline(
log_file_path = task_log.save()
- return error_message, "", log_file_path
+ return error_message, "", log_file_path, None
finally:
task_log.end_time = get_utc_plus_8_time()
diff --git a/apps/miroflow-agent/src/io/output_formatter.py b/apps/miroflow-agent/src/io/output_formatter.py
index 3c1d0fc5..8e63b046 100644
--- a/apps/miroflow-agent/src/io/output_formatter.py
+++ b/apps/miroflow-agent/src/io/output_formatter.py
@@ -3,6 +3,8 @@
import re
+from ..utils.prompt_utils import FORMAT_ERROR_MESSAGE
+
class OutputFormatter:
def _extract_boxed_content(self, text: str) -> str:
@@ -117,7 +119,7 @@ def format_final_summary_and_log(self, final_answer_text, client=None):
summary_lines.append(boxed_result)
elif final_answer_text:
summary_lines.append("No \\boxed{} content found.")
- boxed_result = "No \\boxed{} content found in the final answer."
+ boxed_result = FORMAT_ERROR_MESSAGE
# Token usage statistics and cost estimation - use client method
if client and hasattr(client, "format_token_usage_summary"):
diff --git a/apps/miroflow-agent/src/llm/providers/openai_client.py b/apps/miroflow-agent/src/llm/providers/openai_client.py
index 2eda891f..a81bc76d 100644
--- a/apps/miroflow-agent/src/llm/providers/openai_client.py
+++ b/apps/miroflow-agent/src/llm/providers/openai_client.py
@@ -137,6 +137,11 @@ async def _create_message(
if "deepseek-v3-1" in self.model_name:
params["extra_body"]["thinking"] = {"type": "enabled"}
+ # auto-detect if we need to continue from the last assistant message
+ if messages_for_llm and messages_for_llm[-1].get("role") == "assistant":
+ params["extra_body"]["continue_final_message"] = True
+ params["extra_body"]["add_generation_prompt"] = False
+
try:
if self.async_client:
response = await self.client.chat.completions.create(**params)
diff --git a/apps/miroflow-agent/src/utils/parsing_utils.py b/apps/miroflow-agent/src/utils/parsing_utils.py
index 5f6e5274..04733ae6 100644
--- a/apps/miroflow-agent/src/utils/parsing_utils.py
+++ b/apps/miroflow-agent/src/utils/parsing_utils.py
@@ -100,6 +100,57 @@ def safe_json_loads(arguments_str: str) -> dict:
}
+def extract_failure_experience_summary(text: str) -> str:
+    """
+    Extract failure experience summary from LLM response text.
+
+    The text may contain:
+    - <think>...</think> block (thinking content)
+    - Main content after </think> and before <use_mcp_tool>
+    - <use_mcp_tool>...</use_mcp_tool> block (tool call, ignored)
+
+    Examples:
+        "<think>\n{xxx}\n</think>\n\n{content}\n\n<use_mcp_tool>..."
+        "<think>\n{xxx}\n</think>\n\n{content}"
+        "{content}" (no think block)
+
+    Returns:
+        - If content is empty after strip, return think_content
+        - If both think_content and content are non-empty, return content
+        - mcp_block is never used
+    """
+    if not text:
+        return ""
+
+    think_content = ""
+    content = ""
+
+    # Extract think content (non-greedy: only the first <think>...</think> pair)
+    think_match = re.search(r"<think>([\s\S]*?)</think>", text)
+    if think_match:
+        think_content = think_match.group(1).strip()
+        # Get content after </think>
+        after_think = text[think_match.end() :]
+    else:
+        # No think block, entire text is potential content
+        after_think = text
+
+    # Drop everything from the first <use_mcp_tool> tag to the end of the text
+    mcp_match = re.search(r"<use_mcp_tool>[\s\S]*", after_think)
+    if mcp_match:
+        content = after_think[: mcp_match.start()].strip()
+    else:
+        content = after_think.strip()
+
+    # Apply the rules:
+    # - If content is empty, use think_content
+    # - If both are non-empty, use content
+    if content:
+        return content
+    else:
+        return think_content
+
+
def extract_llm_response_text(llm_response):
"""
Extract text from LLM response, excluding tags. Stop immediately when this opening tag is encountered.
diff --git a/apps/miroflow-agent/src/utils/prompt_utils.py b/apps/miroflow-agent/src/utils/prompt_utils.py
index 1157eb0e..0c4b62cc 100644
--- a/apps/miroflow-agent/src/utils/prompt_utils.py
+++ b/apps/miroflow-agent/src/utils/prompt_utils.py
@@ -1,6 +1,61 @@
# Copyright (c) 2025 MiroMind
# This source code is licensed under the MIT License.
+# ============================================================================
+# Format Error Messages (sentinel text used when no \boxed{} answer is found)
+# ============================================================================
+
+FORMAT_ERROR_MESSAGE = "No \\boxed{} content found in the final answer."
+
+# ============================================================================
+# Failure Experience Templates (for format error retry): header, items, footer
+# ============================================================================
+
+# Header: emitted once, before all failure experience items
+FAILURE_EXPERIENCE_HEADER = """
+
+=== Previous Attempts Analysis ===
+The following summarizes what was tried before and why it didn't work. Use this to guide a NEW approach.
+
+"""
+
+# Item: one per previous attempt; placeholders {attempt_number}, {failure_summary}
+FAILURE_EXPERIENCE_ITEM = """[Attempt {attempt_number}]
+{failure_summary}
+
+"""
+
+# Footer: emitted once, after all failure experience items
+FAILURE_EXPERIENCE_FOOTER = """=== End of Analysis ===
+
+Based on the above, you should try a different strategy this time.
+"""
+
+FAILURE_SUMMARY_PROMPT = """The task was not completed successfully. Do NOT call any tools. Provide a summary:
+
+Failure type: [incomplete / blocked / misdirected / format_missed]
+ - incomplete: ran out of turns before finishing
+ - blocked: got stuck due to tool failure or missing information
+ - misdirected: went down the wrong path
+ - format_missed: found the answer but forgot to use \\boxed{}
+What happened: [describe the approach taken and why a final answer was not reached]
+Useful findings: [list any facts, intermediate results, or conclusions discovered that should be reused]"""
+
+# Assistant prefix for failure summary generation: a pre-filled <think> block that guides the model to follow the structured format
+FAILURE_SUMMARY_THINK_CONTENT = """We need to write a structured post-mortem style summary **without calling any tools**, explaining why the task was not completed, using these required sections:
+
+* **Failure type**: pick one from **incomplete / blocked / misdirected / format_missed**
+* **What happened**: describe the approach taken and why it didn't reach a final answer
+* **Useful findings**: list any facts, intermediate results, or conclusions that can be reused"""
+
+FAILURE_SUMMARY_ASSISTANT_PREFIX = (
+    f"<think>\n{FAILURE_SUMMARY_THINK_CONTENT}\n</think>\n\n"
+)
+
+# ============================================================================
+# MCP Tags for Parsing
+# ============================================================================
+
mcp_tags = [
"",
"",