def build_command():
    """Create the top-level ``evaluator`` CLI command.

    This is the factory named by the plugin manifest entry
    (``"metadata": {"factory": "build_command"}``) whose target is this
    module, ``cli_commands/evaluator.py``.

    Returns:
        A new ``EvaluatorTopLevelCommand`` instance.
    """
    return EvaluatorTopLevelCommand()
f5875ae2e..a0d8da80d 100644 --- a/aworld-cli/src/aworld_cli/commands/__init__.py +++ b/aworld-cli/src/aworld_cli/commands/__init__.py @@ -13,6 +13,7 @@ - /cron: Manage scheduled tasks (tool command) - /dispatch: Submit task to background execution (tool command) - /tasks: Manage background tasks (tool command) +- /evaluation: Run evaluator flows (tool command) Usage: # Import to register all commands @@ -33,6 +34,7 @@ from . import dispatch from . import tasks from . import plugins_cmd +from . import evaluation_cmd __all__ = [ "help_cmd", @@ -44,4 +46,5 @@ "dispatch", "tasks", "plugins_cmd", + "evaluation_cmd", ] diff --git a/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py b/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py new file mode 100644 index 000000000..74e8cb9ec --- /dev/null +++ b/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py @@ -0,0 +1,113 @@ +""" +/evaluation command - Run evaluator flows from chat. +""" +from __future__ import annotations + +import argparse +import asyncio +import shlex + +from aworld_cli.core.command_system import Command, CommandContext, register_command +from aworld_cli.evaluator_rendering import render_evaluator_summary +from aworld_cli.evaluator_runtime import run_evaluator_source_cli + + +def _usage() -> str: + return """Usage: + /evaluation --input --kind task --judge-agent [--agent ] [--out-dir ] + /evaluation --input --kind answer --judge-agent [--out-dir ] + /evaluation --input --kind trajectory --judge-agent [--agent ] [--out-dir ] + /evaluation --input --kind trajectory --task-id --judge-agent [--out-dir ] + +Examples: + /evaluation --input ./tasks.jsonl --kind task --judge-agent ./judge_agents/answer_judge.md + /evaluation --input ./task_answers.jsonl --kind answer --judge-agent ./judge_agents/answer_judge.md + /evaluation --input ./tasks.jsonl --kind trajectory --judge-agent ./judge_agents/trajectory_judge.md + /evaluation --input ~/Documents/logs/trajectory.log --kind trajectory --task-id task_123 
@register_command
class EvaluationCommand(Command):
    """Chat slash command ``/evaluation`` that runs a source-backed evaluator flow.

    The raw argument string is tokenized with ``shlex`` and parsed with the
    parser from ``_build_parser()``.  The evaluation itself is delegated to
    ``run_evaluator_source_cli``.
    """

    @property
    def name(self) -> str:
        """Command name as typed after the leading slash."""
        return "evaluation"

    @property
    def description(self) -> str:
        """One-line description of the command."""
        return "Run evaluator flows"

    @property
    def command_type(self) -> str:
        """Command category; this is a ``tool`` command."""
        return "tool"

    @property
    def completion_items(self) -> dict[str, str]:
        """Completion entries, mapping a command prefix to its help text."""
        return {
            "/evaluation --kind task": "Run tasks with the default agent, then evaluate the produced state",
            "/evaluation --kind answer": "Evaluate existing task+answer JSONL records",
            "/evaluation --kind trajectory": "Evaluate generated or replayed trajectories",
        }

    async def execute(self, context: CommandContext) -> str:
        """Parse ``context.user_args`` and run the evaluator.

        Args:
            context: Command context; only ``user_args`` is read here.

        Returns:
            The usage text (no args, a help request, or an argparse failure),
            an ``"Evaluator error: ..."`` message, or the rendered evaluation
            summary, which already contains the report path.
        """
        raw_args = (context.user_args or "").strip()
        if not raw_args:
            return _usage()

        try:
            parts = shlex.split(raw_args)
        except ValueError as exc:
            # Unbalanced quotes and similar tokenization problems.
            return f"Evaluator error: {exc}\n\n{_usage()}"

        if not parts or parts[0] in {"help", "--help", "-h"}:
            return _usage()

        parser = _build_parser()
        try:
            args = parser.parse_args(parts)
        except SystemExit:
            # argparse exits on invalid/missing arguments; show usage instead
            # of letting the exit propagate into the chat session.
            return _usage()

        if args.help:
            return _usage()

        try:
            # run_evaluator_source_cli drives its own event loop through
            # asyncio.run(), which cannot be started from the thread that is
            # already running this coroutine's loop; use a worker thread.
            report = await asyncio.to_thread(
                run_evaluator_source_cli,
                input=args.input,
                kind=args.kind,
                judge_agent=args.judge_agent,
                out_dir=args.out_dir,
                output=args.output,
                task_id=args.task_id,
                agent=args.agent,
                id_field=args.id_field,
                task_field=args.task_field,
                answer_field=args.answer_field,
                interactive_approval=args.interactive_approval,
            )
        except (FileNotFoundError, ValueError, KeyError) as exc:
            return f"Evaluator error: {exc}"

        # render_evaluator_summary() already appends "Report: <path>" when the
        # report carries a report_path; appending it again here duplicated
        # that line in the chat output.
        return render_evaluator_summary(report)
def _sanitize_path_token(value: str) -> str:
    """Turn *value* into a token that is safe to embed in a file name.

    Alphanumeric characters and ``-``, ``_``, ``.`` are kept; every other
    character becomes ``-``.  Leading and trailing dashes are trimmed, and an
    empty result falls back to ``"target"``.
    """
    allowed_punctuation = frozenset("-_.")
    pieces = []
    for char in value:
        is_safe = char.isalnum() or char in allowed_punctuation
        pieces.append(char if is_safe else "-")
    token = "".join(pieces).strip("-")
    if not token:
        return "target"
    return token
def evaluator_exit_code(report: dict) -> int:
    """Map an evaluation report to a process exit code.

    Args:
        report: Evaluation report dict.  ``gate`` and ``approval`` may be
            missing or ``None``; both are then treated as empty.

    Returns:
        ``2`` when the gate status is ``"fail"``; ``3`` when the gate status
        is ``"needs_approval"`` and the approval is not (truthily) approved;
        ``0`` otherwise.
    """
    # Use `or {}` rather than a `.get` default: a key that is present with a
    # None value would otherwise raise AttributeError on the chained `.get`.
    gate = report.get("gate") or {}
    approval = report.get("approval") or {}
    gate_status = gate.get("status")
    if gate_status == "fail":
        return 2
    if gate_status == "needs_approval" and not approval.get("approved", False):
        return 3
    return 0
def _load_evaluator_hooks() -> dict[str, tuple[object, ...]]:
    """Discover plugins and load their hooks.

    Plugin roots come from ``PluginManager.get_runtime_plugin_roots()`` when
    the manager exposes that method; otherwise the builtin plugin roots are
    used.  Every root is resolved to an absolute path before discovery.
    """
    builtin_roots = [Path(root).resolve() for root in get_builtin_plugin_roots()]
    manager = PluginManager()
    if not hasattr(manager, "get_runtime_plugin_roots"):
        return load_plugin_hooks(discover_plugins(builtin_roots))
    runtime_roots = [Path(root).resolve() for root in manager.get_runtime_plugin_roots()]
    return load_plugin_hooks(discover_plugins(runtime_roots))
def _looks_like_aworld_trajectory_log(path: Path) -> bool:
    """Tell whether the first non-blank line of *path* looks like a trajectory record.

    The line must start with ``{`` and contain both the ``'trajectory'`` and
    ``'task_id'`` markers (single-quoted).  Files that cannot be opened or
    read, and files with no non-blank line, yield ``False``.
    """
    required_markers = ("'trajectory'", "'task_id'")
    try:
        with path.open(encoding="utf-8", errors="replace") as stream:
            # Lazily pull only as many lines as needed to find the first
            # non-blank one.
            first_record = next((text for text in map(str.strip, stream) if text), None)
    except OSError:
        return False
    if first_record is None:
        return False
    return first_record.startswith("{") and all(marker in first_record for marker in required_markers)
def _case_query(case) -> str:
    """Return the query text of *case* as a string.

    Looks at ``case.input`` (treated as empty when missing or falsy) and
    returns the first of ``input``, ``query``, ``prompt`` whose value is not
    ``None``.

    Raises:
        ValueError: if none of those keys carries a non-``None`` value.
    """
    fields = getattr(case, "input", {}) or {}
    candidates = (
        fields[name]
        for name in ("input", "query", "prompt")
        if name in fields and fields[name] is not None
    )
    # Yielded values are never None, so None is a safe "nothing found" marker.
    chosen = next(candidates, None)
    if chosen is None:
        raise ValueError("task source case is missing input/query/prompt")
    return str(chosen)
Exception as exc: + duration_ms = int((time.monotonic() - started_at) * 1000) + state = RolloutState( + case_id=str(getattr(case, "case_id", "case")), + status="failed", + turns=turns, + trajectory=[turn.to_dict() for turn in turns], + timing={"duration_ms": duration_ms}, + error={"type": exc.__class__.__name__, "message": str(exc)}, + outcome={"has_answer": False, "agent": self.agent_name}, + metadata={**source_metadata, "agent": self.agent_name}, + ) + state.standard_metrics.update(derive_standard_metrics(state)) + return state + + duration_ms = int((time.monotonic() - started_at) * 1000) + eval_state = normalize_task_response_to_eval_state( + case_id=str(getattr(case, "case_id", "case")), + response=answer, + target=target, + metadata={**source_metadata, "agent": self.agent_name}, + ) + assistant_turn = RolloutTurn(role="assistant", content=eval_state.answer) + turns.append(assistant_turn) + trajectory = list(eval_state.trajectory) or [turn.to_dict() for turn in turns] + extracted_trajectory = {} + if trajectory: + try: + extracted_trajectory = extract_aworld_trajectory_payload( + trajectory, + task_id=eval_state.case_id, + is_sub_task=False, + ) + except Exception: + extracted_trajectory = {} + evidence_blocks = len(extracted_trajectory.get("evidence") or []) + is_finished = any( + bool(step.get("is_agent_finished")) + for step in extracted_trajectory.get("steps", []) + if isinstance(step, Mapping) + ) + state = RolloutState( + case_id=eval_state.case_id, + status=eval_state.status, + answer=eval_state.answer, + turns=turns, + trajectory=trajectory, + tool_calls=list(eval_state.tool_calls), + usage=dict(eval_state.usage), + timing={**dict(eval_state.timing), "duration_ms": duration_ms}, + error=eval_state.error, + outcome={ + "has_answer": eval_state.answer is not None, + "agent": self.agent_name, + "task_id": eval_state.case_id, + "question": query, + "evidence_blocks": evidence_blocks, + "num_steps": len(trajectory), + "is_finished": is_finished or 
async def _load_cli_agent_executor(agent_name: str):
    """Load the CLI agent named *agent_name* and build an executor for it.

    Args:
        agent_name: Exact name of the agent to select among the agents the
            CLI runtime loads.

    Returns:
        The executor created by the runtime for the selected agent.

    Raises:
        ValueError: if no loaded agent has that name (the message lists the
            available agent names), or if the runtime returns no executor.
    """
    # Imported inside the function rather than at module level.
    # NOTE(review): presumably to avoid an import cycle with aworld_cli.main
    # and heavy imports at module load time; confirm.
    from aworld.core.scheduler import get_scheduler
    from aworld_cli.main import _resolve_agent_dirs
    from aworld_cli.runtime.cli import CliRuntime

    # One-time process bootstrap (guarded by a module-level flag).
    _ensure_cli_agent_runtime_bootstrapped()
    runtime = CliRuntime(
        agent_name=agent_name,
        local_dirs=_resolve_agent_dirs(None),
        disable_live_display=True,
    )
    all_agents = await runtime._load_agents()
    selected_agent = next((item for item in all_agents if item.name == agent_name), None)
    if selected_agent is None:
        available = ", ".join(sorted(item.name for item in all_agents)) or "none"
        raise ValueError(f"agent '{agent_name}' not found; available agents: {available}")

    # The scheduler is attached and bound to the selected agent before the
    # executor is created; keep this order.
    runtime._scheduler = get_scheduler()
    runtime._bind_scheduler_default_agent(selected_agent.name)
    executor = await runtime._create_executor(selected_agent)
    if executor is None:
        raise ValueError(f"failed to create executor for agent '{agent_name}'")
    # Keep a reference to the runtime on the executor and silence the
    # interactive loading status for this non-interactive evaluation run.
    executor._base_runtime = runtime
    executor._suppress_interactive_loading_status = True
    return executor
def _build_trajectory_prompt(case_input: dict, target: dict, suite) -> str:
    """Build the JSON prompt sent to the trajectory judge.

    The extracted trajectory is taken, in order of preference, from the JSON
    file at ``target["artifacts"]["outcome"]["extracted_path"]``, or from
    ``extract_aworld_trajectory_payload`` applied to a non-empty
    ``target["trajectory"]`` list; otherwise it starts empty.  A missing
    ``final_answer`` / ``question`` is then filled from the target answer and
    the case input.  Case keys starting with ``_`` are left out of the prompt.

    Args:
        case_input: Case fields.
        target: Evaluated state (answer, trajectory, artifacts, ...).
        suite: Unused; part of the prompt-builder call signature.

    Returns:
        The prompt as an indented JSON string (non-ASCII kept as-is).
    """
    artifacts = target.get("artifacts") or {}
    outcome = artifacts.get("outcome") or {}
    extracted_path = outcome.get("extracted_path")
    raw_trajectory = target.get("trajectory")

    if extracted_path:
        extracted = json.loads(Path(str(extracted_path)).read_text(encoding="utf-8"))
    elif isinstance(raw_trajectory, list) and raw_trajectory:
        task_id = str(
            target.get("case_id")
            or case_input.get("id")
            or case_input.get("input_id")
            or case_input.get("_case_id")
            or "case"
        )
        extracted = extract_aworld_trajectory_payload(
            raw_trajectory,
            task_id=task_id,
            is_sub_task=False,
        )
    else:
        extracted = {}

    # Backfill the answer and question only when extraction left them empty.
    if not extracted.get("final_answer") and target.get("answer") is not None:
        extracted["final_answer"] = target.get("answer")
    question_source = case_input.get("input") or case_input.get("query") or case_input.get("prompt")
    if not extracted.get("question") and question_source is not None:
        extracted["question"] = str(question_source)

    visible_case = {}
    for field_name, field_value in case_input.items():
        if str(field_name).startswith("_"):
            continue
        visible_case[field_name] = field_value

    output_schema = {
        "score": "number, weighted score from 0 to 100",
        "verdict": "Excellent|Pass|Marginal|Fail",
        "A1_groundedness": "integer 1-5",
        "A2_completeness": "integer 1-5",
        "A3_relevance": "integer 1-5",
        "A4_readability": "integer 1-5",
        "B1_tool_use": "integer 1-5",
        "B2_efficiency": "integer 1-5",
        "B3_compliance": "integer 1-5",
        "B4_robustness": "integer 1-5",
        "veto_triggered": "boolean",
    }
    instruction = (
        "Apply the trajectory evaluator contract to the extracted trajectory. "
        "Do not call tools and do not re-read the raw log; all required evidence is in extracted_trajectory. "
        "Return exactly one JSON object matching required_output_schema, with no markdown."
    )
    prompt = {
        "case": visible_case,
        "extracted_trajectory": extracted,
        "required_output_schema": output_schema,
        "instruction": instruction,
    }
    return json.dumps(prompt, ensure_ascii=False, indent=2)
def run_evaluator_source_cli(
    *,
    input: str,
    kind: str,
    judge_agent: str,
    out_dir: str | None = None,
    output: str | None = None,
    task_id: str | None = None,
    agent: str | None = None,
    id_field: str = "id",
    task_field: str = "input",
    answer_field: str = "answer",
    interactive_approval: bool = False,
) -> dict:
    """Run a source-backed evaluation and write its JSON report.

    Args:
        input: Path to the source file (task/answer JSONL or trajectory log).
        kind: Source kind; normalized with ``strip().lower()`` and validated
            by ``_build_source_suite``.
        judge_agent: Path to the judge agent definition.
        out_dir: Directory for the report (and trajectory extraction) when
            ``output`` is not given.
        output: Explicit report file path; takes precedence over ``out_dir``.
        task_id: Evaluate only this task id of a trajectory log.
        agent: Name of the agent to execute; defaults to ``"Aworld"`` when an
            agent is actually executed.
        id_field: Record field holding the case id.
        task_field: Record field holding the task input.
        answer_field: Record field holding the answer (``answer`` kind).
        interactive_approval: Prompt on stdin when the gate needs approval.

    Returns:
        The report dict, extended with ``approval``, ``source_selection``,
        ``automation`` and ``report_path``.

    Raises:
        FileNotFoundError: if ``input`` or ``judge_agent`` does not exist.
        ValueError: propagated from suite construction (e.g. unsupported kind).
    """
    hooks = _load_evaluator_hooks()
    kind = (kind or "").strip().lower()
    input_path = Path(input).expanduser().resolve()
    if not input_path.exists():
        raise FileNotFoundError(f"source input does not exist: {input_path}")
    judge_agent_path = Path(judge_agent).expanduser().resolve()
    if not judge_agent_path.exists():
        raise FileNotFoundError(f"judge agent does not exist: {judge_agent_path}")

    workspace_path = str(input_path.parent if input_path.is_file() else input_path)
    event_base = {
        "mode": "source",
        "input": str(input_path),
        "kind": kind,
        "task_id": task_id,
        "judge_agent": str(judge_agent_path),
        "agent": agent,
        "workspace_path": workspace_path,
        "output_path": str(Path(output).expanduser().resolve()) if output else None,
    }
    hook_state = _run_evaluator_hooks(
        hooks,
        "evaluator.pre_run",
        event=event_base,
        state={
            "mode": "source",
            "input": str(input_path),
            "kind": kind,
            "task_id": task_id,
            "judge_agent": str(judge_agent_path),
            "agent": agent,
            "interactive_approval": interactive_approval,
        },
    )
    suite = _build_source_suite(
        kind=kind,
        input_path=input_path,
        judge_agent_path=judge_agent_path,
        task_id=task_id,
        id_field=id_field,
        task_field=task_field,
        answer_field=answer_field,
        out_dir=out_dir,
        agent=agent,
    )
    agent_name = agent or "Aworld"
    # Mirror _build_source_suite: a trajectory source is a *replay* (no agent
    # is executed) when a task id is given OR the input is itself a trajectory
    # log.  Checking only task_id labelled whole-log replays as having been
    # executed by the default agent.
    replays_log = kind == "trajectory" and (
        bool(task_id) or _looks_like_aworld_trajectory_log(input_path)
    )
    executes_agent = kind == "task" or (kind == "trajectory" and not replays_log)
    reported_agent = agent_name if executes_agent else agent
    target_info = {
        "target_kind": "source",
        "target_path": str(input_path),
        "source_kind": kind,
        "task_id": task_id,
        "judge_agent": str(judge_agent_path),
        "agent": reported_agent,
    }
    # Hooks may add lightweight metadata, but not override the core fields.
    for key, value in hook_state.items():
        if key not in {"mode", "input", "kind", "task_id", "judge_agent", "agent", "interactive_approval", "summary_suffix"}:
            target_info[key] = value
    flow = EvaluationFlowDef(
        target=target_info,
        suite=suite,
        interactive_approval=interactive_approval,
        output_path=output,
    )
    report = asyncio.run(run_evaluation_flow(flow))
    if hasattr(report, "to_dict"):
        report = report.to_dict()
    approval = dict(report.get("approval") or {})
    # `or {}` keeps this safe when the report carries an explicit gate=None.
    approval.setdefault("required", (report.get("gate") or {}).get("status") == "needs_approval")
    approval.setdefault("resolved", False)
    approval.setdefault("approved", None)
    if approval["required"] and interactive_approval:
        # `input` is shadowed by the keyword parameter, hence builtins.input.
        approved = builtins.input("Evaluation requires approval. Approve? [y/N]: ").strip().lower() in {"y", "yes"}
        approval["resolved"] = True
        approval["approved"] = approved
    report["approval"] = approval
    report["source_selection"] = {
        "mode": "source",
        "input": str(input_path),
        "kind": kind,
        "task_id": task_id,
        "judge_agent": str(judge_agent_path),
        "agent": reported_agent,
    }
    report["automation"] = _build_automation_summary(report)
    output_path = _source_report_path(
        input_path=input_path,
        suite_id=report["suite_id"],
        task_id=task_id,
        output=output,
        out_dir=out_dir,
    )
    output_path.parent.mkdir(parents=True, exist_ok=True)
    report["report_path"] = str(output_path)
    post_event = {
        **event_base,
        "output_path": str(output_path),
        "report": report,
    }
    # post_run hooks run before the report is written to disk.
    _run_evaluator_hooks(
        hooks,
        "evaluator.post_run",
        event=post_event,
        state=hook_state,
    )
    output_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
    return report
str(target_path), + "suite": suite, + "interactive_approval": interactive_approval, + }, + ) + target_info = describe_eval_target(target_path) + for key, value in hook_state.items(): + if key not in {"target", "suite", "interactive_approval", "summary_suffix", "suite_names"}: + target_info[key] = value + flow = EvaluationFlowDef( + target=target_info, + suite=suite_def, + interactive_approval=interactive_approval, + output_path=output, + ) + report = asyncio.run(run_evaluation_flow(flow)) + if hasattr(report, "to_dict"): + report = report.to_dict() + approval = dict(report.get("approval") or {}) + approval.setdefault("required", report.get("gate", {}).get("status") == "needs_approval") + approval.setdefault("resolved", False) + approval.setdefault("approved", None) + if approval["required"] and interactive_approval: + approved = builtins.input("Evaluation requires approval. Approve? [y/N]: ").strip().lower() in {"y", "yes"} + approval["resolved"] = True + approval["approved"] = approved + report["approval"] = approval + report["suite_selection"] = suite_selection + report["automation"] = _build_automation_summary(report) + output_path = ( + Path(output).expanduser().resolve() + if output + else default_evaluator_report_path(target_path=target_path, suite_id=report["suite_id"]) + ) + output_path.parent.mkdir(parents=True, exist_ok=True) + report["report_path"] = str(output_path) + _run_evaluator_hooks( + hooks, + "evaluator.post_run", + event={ + "mode": "target", + "report": report, + "target": str(target_path), + "suite": suite_selection["resolved"], + "workspace_path": workspace_path, + }, + state=hook_state, + ) + output_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8") + return report + + +def render_evaluator_summary(report: dict) -> str: + hooks = _load_evaluator_hooks() + workspace_path = str(Path(report.get("report_path", report.get("target", {}).get("target_path", Path.cwd()))).resolve().parent) + hook_state = 
def _workspace_root(target_path: Path) -> Path:
    """Return the directory used for suite discovery for *target_path*."""
    if target_path.is_file():
        return target_path.parent
    return target_path


def resolve_cli_target_path(target: str) -> Path:
    """Expand and resolve *target*, raising ``FileNotFoundError`` if it is missing."""
    resolved = Path(target).expanduser().resolve()
    if resolved.exists():
        return resolved
    raise FileNotFoundError(f"evaluation target does not exist: {resolved}")


def discover_workspace_suites(target: str | None = None) -> list[str]:
    """List suite names, either all of them or those matching *target*.

    Without a target the declared suites are loaded with no workspace argument
    and every suite is listed; with a target the declared suites are loaded
    from the target's workspace directory and only matching suites are listed.
    """
    if target is not None:
        target_path = resolve_cli_target_path(target)
        load_declared_eval_suites(_workspace_root(target_path))
        return list_matching_eval_suites(target_path)
    load_declared_eval_suites()
    return list_eval_suites()


def resolve_workspace_suite_selection(
    *,
    target: str,
    suite: str | None = None,
) -> dict[str, str | None]:
    """Resolve which suite applies to *target* and describe the selection.

    Returns a dict with the ``requested`` suite (as passed in), the
    ``resolved`` suite id and the selection ``mode``.
    """
    target_path = resolve_cli_target_path(target)
    load_declared_eval_suites(_workspace_root(target_path))
    chosen = resolve_eval_suite_selection(suite, target_path)
    summary = {"requested": suite}
    summary["resolved"] = chosen.suite_id
    summary["mode"] = chosen.mode
    return summary
class EvaluatorTopLevelCommand:
    """Top-level ``aworld-cli evaluator`` command.

    The command has two mutually exclusive modes:

    * source mode (``--input`` with ``--kind`` and ``--judge-agent``), and
    * target mode (``--target`` with an optional ``--suite``),

    plus three utility switches: ``--print-report-schema``,
    ``--validate-report`` and ``--list-suites``.
    """

    @property
    def name(self) -> str:
        """Name under which the command is registered."""
        return "evaluator"

    @property
    def description(self) -> str:
        """One-line help text shown by the argument parser."""
        return "Run a suite-backed evaluation flow for a local target."

    @property
    def aliases(self) -> tuple[str, ...]:
        """Alternative command names (none)."""
        return tuple()

    def register_parser(self, subparsers) -> None:
        """Add the ``evaluator`` sub-parser and all of its options."""
        parser = subparsers.add_parser(
            "evaluator",
            help=self.description,
            description=self.description,
            prog="aworld-cli evaluator",
        )
        parser.add_argument("--target", type=str)
        parser.add_argument("--suite", type=str)
        parser.add_argument("--output", type=str)
        parser.add_argument("--interactive-approval", action="store_true")
        parser.add_argument("--list-suites", action="store_true")
        parser.add_argument("--print-report-schema", action="store_true")
        parser.add_argument("--validate-report", type=str)
        parser.add_argument("--input", type=str)
        parser.add_argument("--kind", type=str)
        parser.add_argument("--judge-agent", type=str)
        parser.add_argument("--out-dir", type=str)
        parser.add_argument("--task-id", type=str)
        parser.add_argument("--agent", type=str)
        parser.add_argument("--id-field", default="id")
        parser.add_argument("--task-field", default="input")
        parser.add_argument("--answer-field", default="answer")

    def run(self, args, context) -> int:
        """Execute the command and return the process exit code.

        Returns:
            ``0`` for the utility switches on success, ``1`` for usage errors
            and for failures reported as "Evaluator error", ``4`` when
            ``--validate-report`` finds an invalid report, and otherwise the
            value of ``evaluator_exit_code(report)`` for an evaluation run.
        """
        if getattr(args, "input", None):
            # Source mode: reject every flag that only makes sense in target mode.
            incompatible_args = (
                ("target", "--target"),
                ("suite", "--suite"),
                ("list_suites", "--list-suites"),
                ("print_report_schema", "--print-report-schema"),
                ("validate_report", "--validate-report"),
            )
            for attr_name, flag_name in incompatible_args:
                if getattr(args, attr_name, None):
                    print(f"Evaluator error: {flag_name} cannot be used with --input")
                    return 1
            if not getattr(args, "kind", None):
                print("Evaluator error: --kind is required with --input")
                return 1
            if not getattr(args, "judge_agent", None):
                print("Evaluator error: --judge-agent is required with --input")
                return 1
            try:
                report = run_evaluator_source_cli(
                    input=args.input,
                    kind=args.kind,
                    judge_agent=args.judge_agent,
                    out_dir=args.out_dir,
                    output=args.output,
                    task_id=args.task_id,
                    agent=args.agent,
                    id_field=args.id_field,
                    task_field=args.task_field,
                    answer_field=args.answer_field,
                    interactive_approval=args.interactive_approval,
                )
            except (FileNotFoundError, ValueError, KeyError) as exc:
                print(f"Evaluator error: {exc}")
                return 1
            print(render_evaluator_summary(report))
            return evaluator_exit_code(report)

        # Without --input, source-only flags are a usage error.
        source_only_args = (
            ("kind", "--kind"),
            ("judge_agent", "--judge-agent"),
            ("out_dir", "--out-dir"),
            ("task_id", "--task-id"),
            ("agent", "--agent"),
        )
        for attr_name, flag_name in source_only_args:
            if getattr(args, attr_name, None):
                print(f"Evaluator error: --input is required when using {flag_name}")
                return 1

        if getattr(args, "print_report_schema", False):
            print(json.dumps(get_evaluator_report_schema(), ensure_ascii=False, indent=2))
            return 0

        if getattr(args, "validate_report", None):
            report_path = Path(args.validate_report).expanduser().resolve()
            # Reading and parsing can fail before validation even starts; report
            # those failures like every other branch instead of a raw traceback.
            try:
                report = json.loads(report_path.read_text(encoding="utf-8"))
            except OSError as exc:
                # Missing file, directory, permission problem, ...
                print(f"Evaluator error: {exc}")
                return 1
            except ValueError as exc:
                # json.JSONDecodeError and UnicodeDecodeError are ValueErrors:
                # the file exists but is not a parseable report.
                print(f"Report is invalid: {exc}")
                return 4
            try:
                validate_evaluator_report(report)
            except ValueError as exc:
                print(f"Report is invalid: {exc}")
                return 4
            print(f"Report is valid: {report_path}")
            return 0

        if getattr(args, "list_suites", False):
            try:
                if getattr(args, "target", None):
                    print("Available evaluator suites for target:")
                    suite_names = available_evaluator_suites(target=args.target)
                else:
                    print("Available evaluator suites:")
                    suite_names = available_evaluator_suites()
                for suite_name in suite_names:
                    print(f"  - {suite_name}")
                if getattr(args, "target", None) and suite_names:
                    selection = get_evaluator_suite_selection(target=args.target, suite=args.suite)
                    print(f"Default suite: {selection['resolved']}")
            except (FileNotFoundError, ValueError, KeyError) as exc:
                print(f"Evaluator error: {exc}")
                return 1
            return 0

        if not getattr(args, "target", None):
            print("❌ --target is required unless --list-suites is used")
            return 1

        try:
            report = run_evaluator_cli(
                target=args.target,
                suite=args.suite,
                output=args.output,
                interactive_approval=args.interactive_approval,
            )
        except (FileNotFoundError, ValueError, KeyError) as exc:
            print(f"Evaluator error: {exc}")
            return 1
        print(render_evaluator_summary(report))
        return evaluator_exit_code(report)
+### Suite-Backed Evaluation Definitions + +Suite-backed evaluation adds a definition layer on top of the existing runtime skeleton: + +- `EvalSuiteDef`: suite identity, cases, judge schema, gate policy, trajectory scorers, toolset hints, execution spec +- `EvalCaseDef`: input plus optional expected output and per-case runtime hints +- `EvalHarnessDef`: reusable execution defaults for suite-backed flows +- `EvalExecutionSpec`: runtime execution mode and target/task configuration +- `EvalState`: normalized execution result containing final answer, completion view, trajectory, usage, timing, and errors + +These live under `aworld/evaluations/**`, not in `aworld-cli`, so the same substrate can be reused by framework callers, +official CLI flows, and custom evaluation agents. + +Ownership is explicit: + +- suite and case definitions own evaluation intent: input, expected outcome, task-domain tool hints, tags, and judge/gate semantics +- harnesses and execution specs own runtime behavior: whether execution is static, agent-backed, task-backed, or program-backed, plus task/runner configuration +- `aworld-cli` only assembles workspace inputs into these framework objects; it does not redefine evaluator semantics + +Declarative JSON manifests are intentionally narrower than in-memory framework APIs. They do not accept `execution`, +`target_ref`, `task_builder_ref`, live agent/task objects, or program callables. In-memory callers may still pass live +AWorld agent/task instances through `EvalExecutionSpec.target_config` for compatibility, but that is not a persisted +suite contract. 
+ +`EvalState` intentionally separates: + +- `answer`: the final deliverable or normalized terminal answer +- `completion`: completion-oriented view used by outcome scorers that only care about the final assistant output +- `trajectory`: captured execution history used by process, tool-use, and efficiency scorers + ## Scorers ### Scorer Registry @@ -119,12 +150,25 @@ The module includes several pre-built scorers for common evaluation tasks: - **SummarizeQuality**: Assesses the quality of generated summaries - And more... +### Execution-State Helpers + +`aworld.evaluations.scorers.state_extractors` provides reusable helpers for execution-backed scoring: + +- `get_eval_state(output)` +- `get_completion(output)` +- `get_assistant_messages(output)` +- `get_messages_by_role(output, role)` +- `get_tool_calls(output)` +- `get_trajectory(output)` + +Use these helpers instead of hand-parsing raw trajectory payloads in every scorer. + ## Evaluation Targets ### AworldAgentEvalTarget `AworldAgentEvalTarget` enables evaluating AWorld agents by running them on evaluation datasets and capturing their -responses. +responses. In execution-backed flows it returns both the final answer and a normalized `state` payload. ```python class AworldAgentEvalTarget(EvalTarget[dict]): @@ -140,7 +184,52 @@ class AworldAgentEvalTarget(EvalTarget[dict]): ### AworldTaskEvalTarget `AworldTaskEvalTarget` provides a framework for evaluating task-based systems by building and running tasks for each -evaluation case. +evaluation case. In execution-backed flows it normalizes `TaskResponse` output into `EvalState`. 
+ +## Execution-Backed Suite Evaluation + +Execution-backed suite flows reuse the existing AWorld runtime instead of introducing a parallel evaluator stack: + +- suite/case definitions specify what is being evaluated +- `EvalExecutionSpec` specifies how runtime execution happens +- `EvalTarget -> Evaluator -> EvaluateRunner` remains the core orchestration skeleton +- scorers read normalized execution state for outcome and trajectory evaluation + +The current execution modes are: + +- `static`: judge-only evaluation with no runtime execution +- `agent`: execute through `AworldAgentEvalTarget` +- `task`: execute through `AworldTaskEvalTarget` +- `program`: execute a trusted importable callable through the evaluator adapter layer and normalize the result into `EvalState` + +This gives AWorld a framework-native evaluator path that can assess final artifacts, structured outputs, and captured +trajectory quality through one substrate. It is a single-shot evaluator flow: rollout-owning harnesses, user simulators, +lifecycle hooks, child-state composition, and step-level training rewards are separate runtime-composition work. + +`program` and TASK builder references are trusted in-process extension points. Importing a module can execute top-level +code, so these references should only point at evaluator code controlled by the runner or workspace owner. They are not +sandboxed and are not exposed through declared JSON manifests. 
+ +Suite-backed evaluation also supports: + +- typed judge schemas: Pydantic-backed validation with JSON schema export and required-field compatibility +- composite gates: structured metric conditions with `pass`, `fail`, and `needs_approval` outcomes +- trajectory scorers: suite-declared process metrics that lower into normal evaluator criteria and reports + +## Suite, Case, and Execution Mapping + +The evaluator v2 path is intentionally close to AWorld's existing runner model: + +- suite -> describes the evaluation contract and default gate/judge behavior +- case -> provides per-row input, optional references, and case-local execution hints +- lightweight harness / execution spec -> describes how a case becomes a runnable AWorld execution +- eval target -> adapts the execution spec into an existing target implementation +- evaluator / runner -> executes cases and produces normalized outputs +- scorers -> read final answer, completion, and trajectory from `EvalState` + +In practice this means outcome evaluation and trajectory evaluation share one execution pipeline. A suite can score only +the final artifact, only the captured trajectory, or both. The lightweight harness boundary selects execution defaults +and adapters; it does not own multi-turn rollout lifecycle in this version. ## Recorder @@ -165,6 +254,33 @@ class EvaluateRunner(Runner): # Evaluation orchestration logic ``` +`EvaluateRunner` remains the orchestration layer. Suite-backed evaluation compiles into it rather than replacing it. 
+ +## Framework vs CLI + +`aworld.evaluations` owns evaluation semantics: + +- suite definitions +- execution-backed compilation +- normalized execution state +- scoring helpers +- report and declared-suite schemas + +`aworld-cli` owns product entrypoints: + +- `aworld-cli evaluator` +- workspace suite discovery +- report file writing +- evaluator lifecycle hooks for peripheral CLI customization + +`aworld-cli evaluator` is now plugin-backed, but the reusable evaluator substrate remains framework-owned. + +The intended layering is: + +- build evaluator capabilities in `aworld/evaluations/**` +- expose a convenient official entrypoint in `aworld-cli` +- allow other agents or products to reuse the framework substrate without depending on the CLI command shape + ## Usage Examples ### Basic Evaluation @@ -245,4 +361,4 @@ Evaluation behavior can be customized through the `EvaluationConfig` class, whic - Evaluation targets and their configurations - Datasets and loading parameters - Evaluation criteria and metrics -- Execution parameters like parallelism and repetition count \ No newline at end of file +- Execution parameters like parallelism and repetition count diff --git a/aworld/evaluations/base.py b/aworld/evaluations/base.py index c54ebe147..5ff609717 100644 --- a/aworld/evaluations/base.py +++ b/aworld/evaluations/base.py @@ -29,7 +29,7 @@ class EvalStatus(Enum): NOT_EVALUATED = 3 -MetricValueType = Union[int, float, bool] +MetricValueType = Union[int, float, bool, str] @dataclass @@ -209,6 +209,8 @@ def _do_summarize(self, scores: list[Any]) -> dict: score_dict['std'] = statistics.stdev(scores) else: score_dict['std'] = 0.0 + elif isinstance(score, str): + score_dict['value'] = score if all(item == score for item in scores) else "mixed" elif isinstance(score, dict): all_keys = list( dict.fromkeys([k for score in scores if isinstance(score, dict) for k in score.keys()]) diff --git a/aworld/evaluations/eval_targets/agent_eval.py 
async def predict(self, index: int, input: Union[EvalDataCase[dict], dict]) -> dict:
    """Run the configured agent on one case via the execution adapter layer.

    Returns a dict with the final ``answer`` and the serialized execution
    ``state`` produced by the adapter.
    """
    query_column = self.eval_config.eval_dataset_query_column or self.query_column
    if isinstance(input, EvalDataCase):
        case_data = input.case_data
    else:
        case_data = input

    # Lightweight stand-in exposing the two attributes the adapter reads.
    case_attrs = {
        "case_id": getattr(input, "eval_case_id", str(index)),
        "input": dict(case_data),
    }
    adapter_case = type("AdapterCase", (), case_attrs)()

    spec = EvalExecutionSpec(
        mode=EvalExecutionMode.AGENT,
        target_config={"agent": self.agent},
        query_column=query_column,
    )
    adapter = resolve_execution_adapter(spec)
    target_info = dict(case_data.get("_target", {}))
    state = await adapter.execute(case=adapter_case, target=target_info, spec=spec)
    return {"answer": state.answer, "state": state.to_dict()}
class EvalExecutionMode(str, Enum):
    """Execution mode used to pick the adapter that runs a case before scoring."""

    STATIC = "static"  # no runtime execution
    AGENT = "agent"  # run the case query through an agent
    TASK = "task"  # run a task (supplied directly or built per case)
    PROGRAM = "program"  # call an importable callable


@dataclass(frozen=True)
class EvalExecutionSpec:
    """Immutable description of how a case should be executed."""

    # Which execution path to use; defaults to no execution.
    mode: EvalExecutionMode = EvalExecutionMode.STATIC
    # Importable callable reference ("module:attr" or dotted path); required
    # in program mode.
    target_ref: str | None = None
    # Mode-specific configuration; the agent path reads "agent" and the task
    # path reads an optional "task".
    target_config: dict[str, Any] = field(default_factory=dict)
    # Key of the case input holding the agent query; the agent path falls back
    # to "query" when this is unset.
    query_column: str | None = None
    # Importable callable used to build the task when target_config has no "task".
    task_builder_ref: str | None = None
    # Must stay None in program mode. NOTE(review): other consumers are not
    # visible in this module - confirm.
    runner_method: str | None = None
    # NOTE(review): not read by the code visible here - presumably an execution
    # time limit; confirm where it is enforced.
    timeout_seconds: float | None = None
    # Free-form extra data attached to the spec.
    metadata: dict[str, Any] = field(default_factory=dict)
def _extract_tool_calls_from_trajectory(trajectory: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collect tool calls from every trajectory step, in order.

    For each mapping step, calls listed under the step's own ``tool_calls``
    come first, followed by those under ``step["action"]["tool_calls"]``.
    Non-mapping steps and non-mapping call entries are ignored; each call is
    copied into a plain dict.
    """
    collected: list[dict[str, Any]] = []
    for entry in trajectory:
        if not isinstance(entry, Mapping):
            continue
        for holder in (entry, entry.get("action")):
            if not isinstance(holder, Mapping):
                continue
            raw_calls = holder.get("tool_calls")
            if isinstance(raw_calls, list):
                collected.extend(dict(item) for item in raw_calls if isinstance(item, Mapping))
    return collected


def _merge_eval_metadata(
    response_metadata: Any,
    invocation_metadata: Mapping[str, Any] | None,
    target: Mapping[str, Any] | None,
) -> dict[str, Any]:
    """Layer invocation metadata over response metadata and attach ``_target``.

    ``response_metadata`` is used only when it is a mapping; the ``_target``
    entry is always overwritten with a copy of *target* (empty when falsy).
    """
    merged: dict[str, Any] = {}
    if isinstance(response_metadata, Mapping):
        merged.update(response_metadata)
    if invocation_metadata:
        merged.update(invocation_metadata)
    merged["_target"] = dict(target) if target else {}
    return merged


def _list_field_from_response(response: Mapping[str, Any], field_name: str, default: list[Any]) -> list[Any]:
    """Return a copy of the list stored under *field_name*.

    A missing or ``None`` value yields *default* itself; any other non-list
    value raises ``ValueError``.
    """
    candidate = response.get(field_name)
    if isinstance(candidate, list):
        return list(candidate)
    if candidate is None:
        return default
    raise ValueError(f"{field_name} must be a list")
def _validate_importable_callable_ref(ref: str) -> tuple[str, str]:
    """Split an importable callable reference into ``(module, attribute)``.

    Accepts ``"package.module:attr"`` or a dotted ``"package.module.attr"``.
    Empty references, references containing whitespace or path separators,
    and anything that looks like a ``.py`` file path raise ``ValueError``.
    """
    message = "program execution requires an importable callable reference"
    if not ref:
        raise ValueError(message)
    if any(ch.isspace() or ch in "/\\" for ch in ref):
        raise ValueError(message)

    if ":" in ref:
        # Explicit form: everything before the first colon is the module.
        module_name, _, attr_name = ref.partition(":")
    elif "." in ref:
        # Dotted form: the last component is the attribute.
        module_name, _, attr_name = ref.rpartition(".")
    else:
        raise ValueError(message)

    if not module_name or not attr_name:
        raise ValueError(message)
    # A "py" component means a file name was passed instead of a module path.
    if attr_name == "py" or "py" in module_name.split("."):
        raise ValueError(message)
    return module_name, attr_name


def load_program_callable(ref: str):
    """Import the module named by *ref* and return the referenced callable.

    Raises ``ValueError`` when the reference is malformed or the resolved
    attribute is not callable; import and attribute lookup errors propagate.
    """
    module_name, attr_name = _validate_importable_callable_ref(ref)
    module = importlib.import_module(module_name)
    candidate = getattr(module, attr_name)
    if callable(candidate):
        return candidate
    raise ValueError(f"program reference is not callable: {ref}")
@dataclass(frozen=True)
class AgentExecutionAdapter:
    """Adapter that runs a case query through an agent."""

    async def execute(self, *, case: Any, target: dict[str, Any], spec: EvalExecutionSpec) -> EvalState:
        """Run ``spec.target_config["agent"]`` on the case query and normalize the result.

        Raises:
            ValueError: if ``target_config`` has no ``"agent"`` entry.
        """
        # The query is looked up before the agent check, so a failed lookup
        # is raised ahead of the configuration error.
        prompt = case.input[spec.query_column or "query"]
        config = spec.target_config
        if "agent" not in config:
            raise ValueError("agent execution requires target_config['agent']")
        agent_response = await Runners.run(prompt, agent=config["agent"])
        return normalize_task_response_to_eval_state(
            case_id=case.case_id,
            response=agent_response,
            target=target,
            metadata=_execution_metadata(mode=spec.mode),
        )


@dataclass(frozen=True)
class TaskExecutionAdapter:
    """Adapter that runs a task, either supplied directly or built per case."""

    async def execute(self, *, case: Any, target: dict[str, Any], spec: EvalExecutionSpec) -> EvalState:
        """Run the task and normalize its result.

        The task comes from ``spec.target_config["task"]`` when present;
        otherwise it is built by the callable named in ``task_builder_ref``,
        which may return the task directly or as an awaitable.

        Raises:
            ValueError: if neither a task nor ``task_builder_ref`` is given.
        """
        task = spec.target_config.get("task")
        if task is None:
            builder_ref = spec.task_builder_ref
            if not builder_ref:
                raise ValueError("task execution requires task_builder_ref")
            task = load_program_callable(builder_ref)(case=case, target=target, spec=spec)
            if inspect.isawaitable(task):
                task = await task

        outcome = await Runners.run_task(task=task)
        if isinstance(outcome, dict):
            if getattr(task, "id", None) in outcome:
                # Result keyed by task id: unwrap this task's entry.
                outcome = outcome[task.id]
            elif len(outcome) == 1 and not {"status", "answer", "completion"} & outcome.keys():
                # A single-entry dict that does not itself look like a
                # response payload is treated as a one-item wrapper.
                outcome = next(iter(outcome.values()))

        return normalize_task_response_to_eval_state(
            case_id=case.case_id,
            response=outcome,
            target=target,
            metadata=_execution_metadata(mode=spec.mode),
        )
def _validate_program_execution_spec(spec: EvalExecutionSpec) -> None:
    """Reject program-mode specs that are incomplete or ask for unsupported features.

    Raises:
        ValueError: if ``target_ref`` is missing or malformed, if
            ``runner_method`` is set, or if ``target_config`` uses one of the
            unsupported keys.
    """
    target_ref = spec.target_ref
    if not target_ref:
        raise ValueError("program execution requires target_ref")
    _validate_importable_callable_ref(target_ref)

    unsupported_config_keys = {"command", "commands", "workflow", "workflow_engine", "sandbox"}
    uses_unsupported_key = not unsupported_config_keys.isdisjoint(spec.target_config)
    if spec.runner_method is not None or uses_unsupported_key:
        raise ValueError("unsupported program execution configuration")


def resolve_execution_adapter(spec: EvalExecutionSpec) -> ExecutionAdapter:
    """Return a fresh adapter instance for ``spec.mode``.

    Program-mode specs are validated before the adapter is created.

    Raises:
        ValueError: for an unknown mode or an invalid program spec.
    """
    mode = spec.mode
    if mode == EvalExecutionMode.PROGRAM:
        _validate_program_execution_spec(spec)
        return ProgramExecutionAdapter()

    # Equality is used instead of a dict lookup so that plain-string modes
    # keep matching the str-based enum members.
    simple_adapters = (
        (EvalExecutionMode.STATIC, StaticExecutionAdapter),
        (EvalExecutionMode.AGENT, AgentExecutionAdapter),
        (EvalExecutionMode.TASK, TaskExecutionAdapter),
    )
    for candidate_mode, adapter_cls in simple_adapters:
        if mode == candidate_mode:
            return adapter_cls()
    raise ValueError(f"unsupported execution mode: {spec.mode}")
"array", + "items": { + "type": "string", + "enum": ["file", "directory", "image"], + }, + "minItems": 1, + "uniqueItems": True, + "description": "Optional target kinds matched by this declared suite.", + }, + "gate_policy": { + "type": "object", + "properties": { + "metric_name": {"type": "string"}, + "pass_threshold": {"type": "number"}, + "approval_threshold": {"type": ["number", "null"]}, + }, + "additionalProperties": False, + "description": "Optional simple gate override layered on top of the base suite defaults.", + }, + "metadata": { + "type": "object", + "description": "Optional suite metadata copied into the resolved suite definition.", + }, + "priority": { + "type": "integer", + "description": "Optional suite selection priority. Larger values win automatic selection.", + }, + }, + "additionalProperties": False, + "description": "Declared evaluator suites are metadata-only overlays; executable refs and runtime handles are not accepted.", + } + + +def validate_declared_eval_suite_manifest(payload: dict[str, Any]) -> None: + import jsonschema + + try: + jsonschema.validate(instance=payload, schema=get_declared_eval_suite_schema()) + except jsonschema.ValidationError as exc: + path = ".".join(str(part) for part in exc.absolute_path) + location = f" at '{path}'" if path else "" + raise ValueError(f"declared evaluator suite validation failed{location}: {exc.message}") from exc diff --git a/aworld/evaluations/report.py b/aworld/evaluations/report.py new file mode 100644 index 000000000..49749784f --- /dev/null +++ b/aworld/evaluations/report.py @@ -0,0 +1,238 @@ +# coding: utf-8 +from __future__ import annotations + +from typing import Any + + +EVALUATOR_REPORT_FORMAT_ID = "aworld.evaluator.report" +EVALUATOR_REPORT_FORMAT_VERSION = 1 + + +class CaseEvaluationReport(dict): + def __init__( + self, + *, + case_id: str, + input: dict[str, Any], + metrics: dict[str, Any], + judge: dict[str, Any], + judge_backend: dict[str, Any] | None = None, + state_summary: 
class EvaluatorReport(dict):
    """Dictionary-backed evaluator report with a plain-``dict`` export."""

    def to_dict(self) -> dict[str, Any]:
        """Return a shallow plain-dict copy with every result entry normalised.

        Entries under ``"results"`` that expose ``to_dict`` are converted via
        that method; anything else is passed through ``dict()``. A missing or
        falsy ``"results"`` value becomes an empty list.
        """
        plain = dict(self)
        normalised = []
        for entry in plain.get("results") or []:
            if hasattr(entry, "to_dict"):
                normalised.append(entry.to_dict())
            else:
                normalised.append(dict(entry))
        plain["results"] = normalised
        return plain
"caseMetric": { + "type": "object", + "properties": { + "value": {"$ref": "#/$defs/metricScalar"}, + "status": {"$ref": "#/$defs/evalStatus"}, + }, + "required": ["value"], + "additionalProperties": False, + }, + "gateDecision": { + "type": "object", + "required": ["status", "metric_name", "value"], + "properties": { + "status": { + "type": "string", + "enum": ["pass", "fail", "needs_approval"], + }, + "metric_name": {"type": ["string", "null"]}, + "value": {"type": ["number", "string", "boolean", "null"]}, + "matched_conditions": {"type": "array"}, + "failed_conditions": {"type": "array"}, + }, + "additionalProperties": False, + }, + "automationSummary": { + "type": "object", + "required": [ + "gate_status", + "metric_name", + "metric_value", + "approval_required", + "approval_resolved", + "approved", + "suggested_exit_code", + "case_count", + "judge_backend", + ], + "properties": { + "gate_status": { + "type": ["string", "null"], + "enum": ["pass", "fail", "needs_approval", None], + }, + "metric_name": {"type": ["string", "null"]}, + "metric_value": {"type": ["number", "string", "boolean", "null"]}, + "approval_required": {"type": "boolean"}, + "approval_resolved": {"type": "boolean"}, + "approved": {"type": ["boolean", "null"]}, + "suggested_exit_code": {"type": "integer", "enum": [0, 2, 3]}, + "case_count": {"type": "integer", "minimum": 0}, + "judge_backend": {"type": ["string", "null"]}, + "source_kind": {"type": ["string", "null"]}, + "source_input": {"type": ["string", "null"]}, + "task_id": {"type": ["string", "null"]}, + "agent": {"type": ["string", "null"]}, + }, + "additionalProperties": False, + }, + }, + "required": [ + "report_version", + "report_format", + "generated_at", + "suite_id", + "target", + "summary", + "metrics", + "results", + "result_counts", + "approval", + ], + "properties": { + "report_version": {"type": "integer", "const": EVALUATOR_REPORT_FORMAT_VERSION}, + "report_format": { + "type": "object", + "required": ["id", "version"], + 
"properties": { + "id": {"type": "string", "const": EVALUATOR_REPORT_FORMAT_ID}, + "version": {"type": "integer", "const": EVALUATOR_REPORT_FORMAT_VERSION}, + }, + "additionalProperties": False, + }, + "generated_at": {"type": "string", "format": "date-time"}, + "suite_id": {"type": "string"}, + "target": {"type": "object"}, + "summary": {"type": "object"}, + "metrics": { + "type": "object", + "additionalProperties": {"$ref": "#/$defs/metricAggregate"}, + }, + "results": { + "type": "array", + "items": { + "type": "object", + "required": ["case_id", "input", "metrics", "judge"], + "properties": { + "case_id": {"type": "string"}, + "input": {"type": "object"}, + "metrics": { + "type": "object", + "additionalProperties": {"$ref": "#/$defs/caseMetric"}, + }, + "judge": {"type": "object"}, + "judge_backend": { + "type": ["object", "null"], + "properties": {"backend_id": {"type": "string"}}, + "required": ["backend_id"], + "additionalProperties": False, + }, + "state_summary": {"type": "object"}, + }, + "additionalProperties": True, + }, + }, + "result_counts": { + "type": "object", + "required": ["cases_total", "cases_with_metrics", "cases_with_judge"], + "properties": { + "cases_total": {"type": "integer", "minimum": 0}, + "cases_with_metrics": {"type": "integer", "minimum": 0}, + "cases_with_judge": {"type": "integer", "minimum": 0}, + }, + "additionalProperties": False, + }, + "gate": {"$ref": "#/$defs/gateDecision"}, + "approval": {"type": "object"}, + "judge_backend": {"type": "object"}, + "suite_selection": {"type": "object"}, + "automation": {"$ref": "#/$defs/automationSummary"}, + "report_path": {"type": "string"}, + "judge_schema": {"type": "object"}, + }, + "additionalProperties": True, + } + + +def validate_evaluator_report(report: dict[str, Any]) -> None: + import jsonschema + + try: + jsonschema.validate(instance=report, schema=get_evaluator_report_schema()) + except jsonschema.ValidationError as exc: + path = ".".join(str(part) for part in 
exc.absolute_path) + location = f" at '{path}'" if path else "" + raise ValueError(f"evaluator report validation failed{location}: {exc.message}") from exc diff --git a/aworld/evaluations/runtime_composition.py b/aworld/evaluations/runtime_composition.py new file mode 100644 index 000000000..be59bacdc --- /dev/null +++ b/aworld/evaluations/runtime_composition.py @@ -0,0 +1,652 @@ +# coding: utf-8 +from __future__ import annotations + +import inspect +from dataclasses import dataclass, field, replace +from typing import Any, Callable, Mapping, Protocol + +from aworld.evaluations.execution import EvalExecutionSpec, EvalState + + +_SCALAR_TYPES = (str, int, float, bool, type(None)) + + +def _is_serializable_value(value: Any) -> bool: + if isinstance(value, _SCALAR_TYPES): + return True + if isinstance(value, list): + return all(_is_serializable_value(item) for item in value) + if isinstance(value, tuple): + return all(_is_serializable_value(item) for item in value) + if isinstance(value, Mapping): + return all(isinstance(key, str) and _is_serializable_value(item) for key, item in value.items()) + return False + + +def _serializable_dict(payload: Mapping[str, Any] | None) -> dict[str, Any]: + return { + str(key): value + for key, value in dict(payload or {}).items() + if isinstance(key, str) and _is_serializable_value(value) + } + + +@dataclass(frozen=True) +class RolloutTurn: + role: str + content: Any | None = None + metadata: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + payload = { + "role": self.role, + "content": self.content, + } + metadata = _serializable_dict(self.metadata) + if metadata: + payload["metadata"] = metadata + return payload + + +@dataclass(frozen=True) +class OutcomeCheckResult: + metric_name: str + value: float + passed: bool + reason: str = "" + metadata: dict[str, Any] = field(default_factory=dict) + + def to_metric_result(self) -> dict[str, Any]: + return { + "value": self.value, + "metadata": { + 
"passed": self.passed, + "reason": self.reason, + **_serializable_dict(self.metadata), + }, + } + + +@dataclass(frozen=True) +class StepReward: + metric_name: str + step_index: int + value: float + weight: float = 1.0 + partial_credit: bool = False + reason: str = "" + metadata: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return { + "metric_name": self.metric_name, + "step_index": self.step_index, + "value": self.value, + "weight": self.weight, + "partial_credit": self.partial_credit, + "reason": self.reason, + "metadata": _serializable_dict(self.metadata), + } + + +@dataclass(frozen=True) +class EnvironmentSnapshot: + environment_id: str + trial_id: str | None = None + metadata: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return { + "environment_id": self.environment_id, + "trial_id": self.trial_id, + "metadata": _serializable_dict(self.metadata), + } + + +def _resolve_path(source: Mapping[str, Any], path: tuple[str, ...]) -> Any: + current: Any = source + for part in path: + if not isinstance(current, Mapping) or part not in current: + raise KeyError(".".join(path)) + current = current[part] + return current + + +def _compare_values(value: Any, op: str, expected: Any) -> bool: + if op == "==": + return value == expected + if op == "!=": + return value != expected + if op == ">=": + return float(value) >= float(expected) + if op == "<=": + return float(value) <= float(expected) + if op == ">": + return float(value) > float(expected) + if op == "<": + return float(value) < float(expected) + raise ValueError(f"unsupported state-check operator: {op}") + + +@dataclass(frozen=True) +class StateCheckGrader: + metric_name: str + path: tuple[str, ...] 
@dataclass
class RolloutState:
    """Mutable record of one rollout: turns, tool calls, rewards and outcome."""

    case_id: str
    status: str = "success"
    answer: Any | None = None
    # Conversation turns; used as the trajectory when no explicit one is set.
    turns: list[RolloutTurn] = field(default_factory=list)
    messages: list[dict[str, Any]] = field(default_factory=list)
    trajectory: list[dict[str, Any]] = field(default_factory=list)
    tool_calls: list[dict[str, Any]] = field(default_factory=list)
    step_rewards: list[StepReward] = field(default_factory=list)
    outcome: dict[str, Any] = field(default_factory=dict)
    # Nested rollouts; they are serialised one level deep only.
    attempts: list["RolloutState"] = field(default_factory=list)
    child_states: list["RolloutState"] = field(default_factory=list)
    usage: dict[str, Any] = field(default_factory=dict)
    timing: dict[str, Any] = field(default_factory=dict)
    standard_metrics: dict[str, Any] = field(default_factory=dict)
    error: dict[str, Any] | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    @staticmethod
    def _without_children(states: list["RolloutState"]) -> list[dict[str, Any]]:
        """Serialise *states* without descending into their own nested states."""
        return [nested.to_dict(include_children=False) for nested in states]

    def to_eval_state(self, target: Mapping[str, Any] | None = None) -> EvalState:
        """Convert this rollout into an ``EvalState``.

        The explicit trajectory is used when non-empty, otherwise it is
        derived from the recorded turns. A copy of *target* is stored under
        ``metadata["_target"]``; standard metrics are added to the metadata
        only when any are present.
        """
        steps = list(self.trajectory) or [turn.to_dict() for turn in self.turns]

        eval_metadata = _serializable_dict(self.metadata)
        eval_metadata["_target"] = dict(target or {})
        if self.standard_metrics:
            eval_metadata["standard_metrics"] = _serializable_dict(self.standard_metrics)

        completion = [] if self.answer is None else [self.answer]
        return EvalState(
            case_id=self.case_id,
            status=self.status,
            answer=self.answer,
            completion=completion,
            artifacts={
                "outcome": _serializable_dict(self.outcome),
                "attempts": self._without_children(self.attempts),
                "child_states": self._without_children(self.child_states),
            },
            trajectory=steps,
            tool_calls=list(self.tool_calls),
            usage=_serializable_dict(self.usage),
            timing=_serializable_dict(self.timing),
            error=self.error,
            raw_response=self.to_dict(include_children=False),
            metadata=eval_metadata,
        )

    def to_dict(self, *, include_children: bool = True) -> dict[str, Any]:
        """Return a dictionary view of the rollout.

        Args:
            include_children: when true, ``attempts`` and ``child_states`` are
                included, each serialised without their own nested states.
        """
        payload = {
            "case_id": self.case_id,
            "status": self.status,
            "answer": self.answer,
            "turns": [turn.to_dict() for turn in self.turns],
            "messages": list(self.messages),
            "trajectory": list(self.trajectory),
            "tool_calls": list(self.tool_calls),
            "step_rewards": [reward.to_dict() for reward in self.step_rewards],
            "outcome": _serializable_dict(self.outcome),
            "usage": _serializable_dict(self.usage),
            "timing": _serializable_dict(self.timing),
            "standard_metrics": _serializable_dict(self.standard_metrics),
            "error": self.error,
            "metadata": _serializable_dict(self.metadata),
        }
        if not include_children:
            return payload
        payload["attempts"] = self._without_children(self.attempts)
        payload["child_states"] = self._without_children(self.child_states)
        return payload
+ + +def _case_input(case: Any) -> dict[str, Any]: + if hasattr(case, "input") and isinstance(case.input, Mapping): + return dict(case.input) + if hasattr(case, "case_data") and isinstance(case.case_data, Mapping): + return dict(case.case_data) + if isinstance(case, Mapping): + return dict(case) + return {} + + +class ScriptedUserSimulator: + def next_turn( + self, + *, + case: Any, + target: Mapping[str, Any], + state: RolloutState, + last_output: Any | None = None, + ) -> RolloutTurn | None: + turns = _case_input(case).get("turns") or [] + user_turn_count = sum(1 for turn in state.turns if turn.role == "user") + if user_turn_count >= len(turns): + return None + return RolloutTurn(role="user", content=turns[user_turn_count]) + + +class SinglePromptUserSimulator: + def __init__(self, query_key: str = "query"): + self.query_key = query_key + + def next_turn( + self, + *, + case: Any, + target: Mapping[str, Any], + state: RolloutState, + last_output: Any | None = None, + ) -> RolloutTurn | None: + if any(turn.role == "user" for turn in state.turns): + return None + case_input = _case_input(case) + content = case_input.get(self.query_key, case_input.get("prompt")) + if content is None: + return None + return RolloutTurn(role="user", content=content) + + +class LLMUserSimulator: + def __init__(self, *, turn_generator: Callable[..., Any]): + self.turn_generator = turn_generator + + def next_turn( + self, + *, + case: Any, + target: Mapping[str, Any], + state: RolloutState, + last_output: Any | None = None, + ) -> RolloutTurn | None | Any: + user_turn_count = sum(1 for turn in state.turns if turn.role == "user") + generated = self.turn_generator( + case=case, + target=target, + state=state, + last_output=last_output, + turn_index=user_turn_count, + ) + if inspect.isawaitable(generated): + return self._await_turn(generated) + return self._normalize_turn(generated) + + async def _await_turn(self, generated: Any) -> RolloutTurn | None: + return self._normalize_turn(await 
class EnvironmentIsolatedRuntimeHarness:
    """Wrap a harness so each rollout runs between fixture reset and cleanup."""

    def __init__(self, *, base_harness: RuntimeHarness, fixture: EnvironmentFixture):
        self.base_harness = base_harness
        self.fixture = fixture

    async def _cleanup(
        self,
        *,
        snapshot: EnvironmentSnapshot,
        case: Any,
        target: Mapping[str, Any],
        state: RolloutState,
    ) -> Any:
        """Invoke the fixture cleanup hook, awaiting it when it is awaitable."""
        return await _maybe_await(
            self.fixture.cleanup(
                snapshot=snapshot,
                case=case,
                target=target,
                state=state,
            )
        )

    async def run_rollout(self, *, case: Any, target: Mapping[str, Any]) -> RolloutState:
        """Reset the environment, run the wrapped rollout, then clean up.

        The environment snapshot is attached to the case and to the target
        (under ``"_environment"``) before delegating. If the wrapped harness
        raises, cleanup is attempted with a placeholder failed state and the
        original exception is re-raised. If cleanup fails after a completed
        rollout, the state is marked ``"failed"`` and the error is recorded on
        the state instead of being raised.
        """
        snapshot = _environment_snapshot_from(
            await _maybe_await(self.fixture.reset(case=case, target=target)),
            case=case,
        )
        environment = snapshot.to_dict()
        isolated_case = _case_with_environment(case, snapshot)
        isolated_target = {**dict(target), "_environment": environment}
        cleanup_context = {
            "snapshot": snapshot,
            "case": isolated_case,
            "target": isolated_target,
        }

        try:
            state = await self.base_harness.run_rollout(case=isolated_case, target=isolated_target)
        except Exception:
            placeholder = RolloutState(case_id=str(getattr(case, "case_id", "case")), status="failed")
            try:
                await self._cleanup(state=placeholder, **cleanup_context)
            except Exception:
                # Deliberate best effort: a cleanup failure must not mask the
                # rollout error that is re-raised below.
                pass
            raise

        state.metadata = {**state.metadata, "environment": environment}

        try:
            cleanup_value = await self._cleanup(state=state, **cleanup_context)
        except Exception as exc:
            failure = {
                "type": exc.__class__.__name__,
                "message": str(exc),
                "phase": "environment_cleanup",
            }
            state.status = "failed"
            state.error = failure
            state.metadata = {
                **state.metadata,
                "environment_cleanup_error": dict(failure),
            }
            return state

        if cleanup_value is None:
            return state

        cleanup_snapshot = _environment_snapshot_from(cleanup_value, case=isolated_case)
        state.metadata = {
            **state.metadata,
            "environment_cleanup": cleanup_snapshot.to_dict(),
        }
        return state
target: Mapping[str, Any]) -> RolloutState: + case_id = getattr(case, "case_id", None) or getattr(case, "eval_case_id", "case") + state = RolloutState(case_id=str(case_id)) + last_output: Any | None = None + for _ in range(self.max_turns): + user_turn = await _maybe_await( + self.simulator.next_turn( + case=case, + target=target, + state=state, + last_output=last_output, + ) + ) + if user_turn is None: + break + state.turns.append(user_turn) + state.trajectory.append(user_turn.to_dict()) + step_output = await _maybe_await( + self.assistant_step( + user_turn=user_turn, + state=state, + case=case, + target=target, + ) + ) + assistant_turn = self._assistant_turn(step_output) + state.turns.append(assistant_turn) + state.trajectory.append(assistant_turn.to_dict()) + if isinstance(step_output, Mapping): + if "answer" in step_output: + state.answer = step_output["answer"] + last_output = step_output["answer"] + for call in step_output.get("tool_calls") or []: + if isinstance(call, Mapping): + state.tool_calls.append(dict(call)) + if isinstance(step_output.get("outcome"), Mapping): + state.outcome.update(dict(step_output["outcome"])) + if isinstance(step_output.get("usage"), Mapping): + state.usage.update(dict(step_output["usage"])) + if isinstance(step_output.get("timing"), Mapping): + state.timing.update(dict(step_output["timing"])) + for reward in step_output.get("step_rewards") or []: + if isinstance(reward, StepReward): + state.step_rewards.append(reward) + elif isinstance(reward, Mapping): + state.step_rewards.append( + StepReward( + metric_name=str(reward["metric_name"]), + step_index=int(reward["step_index"]), + value=float(reward["value"]), + weight=float(reward.get("weight", 1.0)), + partial_credit=bool(reward.get("partial_credit", False)), + reason=str(reward.get("reason", "")), + metadata=dict(reward.get("metadata") or {}), + ) + ) + else: + state.answer = step_output + last_output = step_output + state.standard_metrics.update(derive_standard_metrics(state)) + 
def derive_standard_metrics(state: RolloutState) -> dict[str, Any]:
    """Derive the standard turn, tool-call, token and duration metrics.

    Args:
        state: rollout whose ``turns``, ``tool_calls``, ``usage`` and
            ``timing`` are inspected.

    Returns:
        A dictionary with ``n_turns``, ``n_tool_calls``, ``n_tokens`` and
        ``duration_ms``. Token and duration values default to ``0`` when no
        usable figure is recorded.
    """
    token_total = state.usage.get("total_tokens")
    if token_total is None and isinstance(state.usage.get("tokens"), (int, float)):
        # Fall back to the legacy "tokens" counter, but only when it is numeric.
        token_total = state.usage["tokens"]

    # Treat an explicit ``duration_ms: None`` like a missing key, matching the
    # token fallback above. Previously the ``time_cost_ms`` fallback was
    # skipped whenever the key was present, even with a null value.
    duration = state.timing.get("duration_ms")
    if duration is None:
        duration = state.timing.get("time_cost_ms")

    return {
        "n_turns": len(state.turns),
        "n_tool_calls": len(state.tool_calls),
        "n_tokens": token_total or 0,
        "duration_ms": duration or 0,
    }
max_attempts: int = 2): + if max_attempts < 1: + raise ValueError("max_attempts must be >= 1") + self.base_harness = base_harness + self.max_attempts = max_attempts + + async def run_rollout(self, *, case: Any, target: Mapping[str, Any]) -> RolloutState: + attempts: list[RolloutState] = [] + terminal: RolloutState | None = None + for _ in range(self.max_attempts): + attempt = await self.base_harness.run_rollout(case=case, target=target) + attempts.append(attempt) + terminal = attempt + if attempt.status == "success": + break + assert terminal is not None + terminal.attempts = attempts + terminal.child_states = attempts[:-1] + terminal.metadata = { + **terminal.metadata, + "runtime_composition": "retry", + "attempt_count": len(attempts), + } + terminal.standard_metrics.update(derive_standard_metrics(terminal)) + return terminal diff --git a/aworld/evaluations/scorers/__init__.py b/aworld/evaluations/scorers/__init__.py index d4619379f..82bd3a09e 100644 --- a/aworld/evaluations/scorers/__init__.py +++ b/aworld/evaluations/scorers/__init__.py @@ -14,6 +14,7 @@ class ScorerFactory(Factory): def __init__(self, type_name: str = None): super().__init__(type_name) self._metric_to_scorers: Dict[str, Type[Scorer]] = {} + self._name_to_scorers: Dict[str, Type[Scorer]] = {} self._default_scorer_params: Dict[int, Dict[str, Any]] = {} def __call__(self, name: str = None, criterias: Union[EvalCriteria, List[EvalCriteria]] = None, *args, **kwargs): @@ -41,6 +42,8 @@ def register(self, name: str, desc: str = '', scorer_cls: Type[Scorer] = None, * if name not in self._metric_to_scorers: self._metric_to_scorers[name] = scorer_cls + self._name_to_scorers[scorer_cls.__name__] = scorer_cls + self._name_to_scorers[f"{scorer_cls.__module__}.{scorer_cls.__name__}"] = scorer_cls else: raise ValueError(f'Scorer class {scorer_cls.__name__} already registered for metric {name}') @@ -51,6 +54,9 @@ def unregister(self, name: str): if scorer_id in self._metric_to_scorers: del 
def _rollout_state_from_output(input: EvalDataCase[dict], output: Any) -> RolloutState:
    """Rebuild a ``RolloutState`` from a scorer's *output* payload.

    The eval state is taken from *output* via ``get_eval_state``. The outcome
    is read from ``raw_response["outcome"]`` when that is a mapping, otherwise
    from ``artifacts["outcome"]``. Step rewards are rebuilt from the mapping
    entries of ``raw_response["step_rewards"]``; non-mapping entries are
    ignored. The case id prefers ``input.eval_case_id`` and otherwise uses the
    state's ``case_id``.
    """

    def _mapping_or_empty(candidate: Any) -> Mapping[str, Any]:
        return candidate if isinstance(candidate, Mapping) else {}

    eval_state = get_eval_state(output)
    raw_response = _mapping_or_empty(eval_state.get("raw_response"))
    artifacts = _mapping_or_empty(eval_state.get("artifacts"))

    raw_outcome = raw_response.get("outcome")
    outcome = raw_outcome if isinstance(raw_outcome, Mapping) else artifacts.get("outcome", {})

    step_rewards = [
        StepReward(
            metric_name=str(entry["metric_name"]),
            step_index=int(entry["step_index"]),
            value=float(entry["value"]),
            weight=float(entry.get("weight", 1.0)),
            partial_credit=bool(entry.get("partial_credit", False)),
            reason=str(entry.get("reason", "")),
            metadata=dict(entry.get("metadata") or {}),
        )
        for entry in raw_response.get("step_rewards") or []
        if isinstance(entry, Mapping)
    ]

    state_metadata = eval_state.get("metadata") or {}
    fallback_case_id = str(eval_state.get("case_id", ""))
    return RolloutState(
        case_id=getattr(input, "eval_case_id", fallback_case_id),
        status=str(eval_state.get("status", "success")),
        answer=eval_state.get("answer"),
        outcome=dict(outcome or {}),
        step_rewards=step_rewards,
        usage=dict(eval_state.get("usage") or {}),
        timing=dict(eval_state.get("timing") or {}),
        standard_metrics=dict(state_metadata.get("standard_metrics") or {}),
        metadata=dict(state_metadata),
    )
def get_eval_state(output: Any) -> dict[str, Any]:
    """Return the eval-state dictionary carried by a scorer *output*.

    When *output* is a mapping whose ``"state"`` entry is itself a mapping,
    that nested mapping is returned; any other mapping is treated as the state
    itself. Non-mapping outputs yield an empty dictionary. The result is
    always a new ``dict``.
    """
    if not isinstance(output, Mapping):
        return {}
    nested = output.get("state")
    return dict(nested if isinstance(nested, Mapping) else output)
get_answer(output: Any) -> Any: + state = get_eval_state(output) + if "answer" in state: + return state["answer"] + return None + + +def get_completion(output: Any) -> list[Any]: + state = get_eval_state(output) + return list(state.get("completion") or []) + + +def get_trajectory(output: Any) -> list[dict[str, Any]]: + state = get_eval_state(output) + if "trajectory" in state: + return list(state.get("trajectory") or []) + return [] + + +def get_messages_by_role(output: Any, role: str) -> list[dict[str, Any]]: + return [ + dict(message) + for message in get_trajectory(output) + if isinstance(message, Mapping) and message.get("role") == role + ] + + +def get_assistant_messages(output: Any) -> list[dict[str, Any]]: + completion = get_completion(output) + if completion and all(isinstance(item, Mapping) for item in completion): + return [dict(item) for item in completion] + return get_messages_by_role(output, "assistant") + + +def get_tool_calls(output: Any) -> list[dict[str, Any]]: + tool_calls: list[dict[str, Any]] = [] + for message in get_trajectory(output): + if not isinstance(message, Mapping): + continue + for call in message.get("tool_calls") or []: + if isinstance(call, Mapping): + tool_calls.append(dict(call)) + action = message.get("action") + if isinstance(action, Mapping): + for call in action.get("tool_calls") or []: + if isinstance(call, Mapping): + tool_calls.append(dict(call)) + return tool_calls diff --git a/aworld/evaluations/scorers/suite_judge.py b/aworld/evaluations/scorers/suite_judge.py new file mode 100644 index 000000000..ad62d7405 --- /dev/null +++ b/aworld/evaluations/scorers/suite_judge.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import annotations + +from aworld.evaluations.base import EvalDataCase, MetricResult, ScorerResult +from aworld.evaluations.scorers import scorer_register +from aworld.evaluations.base import Scorer +from aworld.evaluations.scorers.state_extractors import get_eval_state + + +@scorer_register("score") 
class SuiteJudgeScorer(Scorer):
    """Scorer that delegates judging of a case to the suite's judge backend.

    The validated judge payload always produces a ``score`` metric; remaining
    scalar payload fields become extra metrics unless the suite already
    declares a scorer/metric of that name.
    """

    def __init__(self, suite=None, name: str = None, **kwargs):
        # Fall back to the suite id when no explicit scorer name is given.
        resolved_name = name or getattr(suite, "suite_id", None)
        super().__init__(name=resolved_name, **kwargs)
        self.suite = suite

    async def score(self, index: int, input: EvalDataCase[dict], output: dict) -> ScorerResult:
        """Run the suite judge for one case and convert its payload to metrics.

        Raises:
            ValueError: If the scorer was constructed without a suite.
        """
        suite = self.suite
        if suite is None:
            raise ValueError("suite judge is required for suite-backed evaluation")

        case_input = dict(input.case_data)
        target = dict(case_input.get("_target", {}))
        state = get_eval_state(output)
        if state:
            # State values override the declared target on key clashes.
            target.update(state)

        backend = suite.resolve_judge_backend()
        execution = await backend.execute(case_input, target, suite)
        payload = suite.judge_schema.validate_payload(dict(execution.payload))

        metadata = dict(payload)
        metadata["_judge_backend"] = execution.backend_id
        metric_result: MetricResult = {
            "value": float(payload["score"]),
            "metadata": metadata,
        }
        metric_results = {"score": metric_result}

        # Names owned by other scorers must not be re-emitted from the payload.
        reserved = {
            scorer.metric_name
            for scorer in getattr(suite, "trajectory_scorers", tuple())
        }
        reserved |= {
            scorer.metric_name
            for scorer in getattr(suite, "outcome_scorers", tuple())
        }
        reserved |= set(getattr(suite, "reward_metrics", tuple()))
        reserved |= set(getattr(suite, "standard_metrics", tuple()))

        for metric_name, value in payload.items():
            if metric_name == "score" or metric_name in reserved:
                continue
            if isinstance(value, (int, float, bool, str)):
                metric_results[metric_name] = {"value": value, "metadata": metadata}

        return ScorerResult(scorer_name=self.name, metric_results=metric_results)
aworld.evaluations.base import Scorer, ScorerResult, EvalStatus, MetricResult, EvalDataCase from aworld.evaluations.scorers import scorer_register from aworld.evaluations.scorers.base_validator import RuleScorer, LLMAsJudgeScorer +from aworld.evaluations.scorers.state_extractors import get_trajectory from aworld.evaluations.types import MetricNames from aworld.logs.util import logger @@ -70,6 +71,9 @@ class TrajectoryValidator(RuleScorer): def _parse_trajectory(self, output: Any) -> Dict: + trajectory = get_trajectory(output) + if trajectory: + return trajectory if isinstance(output, dict): if "trajectory" in output: output = output["trajectory"] diff --git a/aworld/evaluations/sources.py b/aworld/evaluations/sources.py new file mode 100644 index 000000000..23d2c16c5 --- /dev/null +++ b/aworld/evaluations/sources.py @@ -0,0 +1,375 @@ +# coding: utf-8 +from __future__ import annotations + +import ast +import json +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Iterable, Mapping, Protocol + + +_SCALAR_TYPES = (str, int, float, bool, type(None)) + + +def _is_serializable_value(value: Any) -> bool: + if isinstance(value, _SCALAR_TYPES): + return True + if isinstance(value, list): + return all(_is_serializable_value(item) for item in value) + if isinstance(value, tuple): + return all(_is_serializable_value(item) for item in value) + if isinstance(value, Mapping): + return all(isinstance(key, str) and _is_serializable_value(item) for key, item in value.items()) + return False + + +def _serializable_dict(payload: Mapping[str, Any] | None) -> dict[str, Any]: + return { + str(key): value + for key, value in dict(payload or {}).items() + if isinstance(key, str) and _is_serializable_value(value) + } + + +@dataclass(frozen=True) +class EvalSourceRecord: + case_id: str + input: Mapping[str, Any] + expected: Any | None = None + answer: Any | None = None + state: Mapping[str, Any] | None = None + metadata: Mapping[str, Any] 
= field(default_factory=dict) + raw_payload: Mapping[str, Any] = field(default_factory=dict) + + def to_dict(self, *, include_raw_payload: bool = False) -> dict[str, Any]: + payload = { + "case_id": self.case_id, + "input": _serializable_dict(self.input), + "expected": self.expected, + "answer": self.answer, + "state": _serializable_dict(self.state), + "metadata": _serializable_dict(self.metadata), + } + if include_raw_payload: + payload["raw_payload"] = _serializable_dict(self.raw_payload) + return {key: value for key, value in payload.items() if value not in (None, {}, [])} + + @classmethod + def from_dict(cls, payload: Mapping[str, Any]) -> "EvalSourceRecord": + return cls( + case_id=str(payload["case_id"]), + input=dict(payload.get("input") or {}), + expected=payload.get("expected"), + answer=payload.get("answer"), + state=dict(payload.get("state") or {}) if isinstance(payload.get("state"), Mapping) else None, + metadata=dict(payload.get("metadata") or {}), + raw_payload=dict(payload.get("raw_payload") or {}), + ) + + def to_case(self): + from aworld.evaluations.substrate import EvalCaseDef + + return EvalCaseDef( + case_id=self.case_id, + input=dict(self.input), + expected=self.expected, + metadata={ + **dict(self.metadata or {}), + "source_record": self.to_dict(), + }, + ) + + +class EvalSource(Protocol): + def iter_records(self) -> Iterable[EvalSourceRecord]: + ... + + def to_cases(self): + ... + + def default_adapter(self): + ... 
+ + +class _BaseEvalSource: + def to_cases(self): + return tuple(record.to_case() for record in self.iter_records()) + + +@dataclass(frozen=True) +class JsonlTaskAnswerSource(_BaseEvalSource): + path: str | Path + id_field: str = "id" + input_field: str = "input" + answer_field: str = "answer" + expected_field: str | None = None + metadata_field: str | None = None + + def iter_records(self) -> Iterable[EvalSourceRecord]: + path = Path(self.path).expanduser() + with path.open(encoding="utf-8") as handle: + for line_number, line in enumerate(handle, start=1): + stripped = line.strip() + if not stripped: + continue + payload = json.loads(stripped) + if not isinstance(payload, Mapping): + raise ValueError(f"{path}:{line_number} must contain a JSON object") + for field_name in (self.id_field, self.input_field, self.answer_field): + if field_name not in payload: + raise ValueError(f"{path}:{line_number} missing required field: {field_name}") + metadata = {} + if self.metadata_field is not None and isinstance(payload.get(self.metadata_field), Mapping): + metadata.update(dict(payload[self.metadata_field])) + metadata.update({"source_kind": "answer", "source_path": str(path), "line_number": line_number}) + expected = payload.get(self.expected_field) if self.expected_field else None + yield EvalSourceRecord( + case_id=str(payload[self.id_field]), + input={"input": payload[self.input_field]}, + expected=expected, + answer=payload[self.answer_field], + metadata=metadata, + raw_payload=dict(payload), + ) + + def default_adapter(self): + from aworld.evaluations.state_adapters import AnswerStateAdapter + + return AnswerStateAdapter() + + +@dataclass(frozen=True) +class JsonlTaskSource(_BaseEvalSource): + path: str | Path + id_field: str = "id" + input_field: str = "input" + expected_field: str | None = None + metadata_field: str | None = None + + def iter_records(self) -> Iterable[EvalSourceRecord]: + path = Path(self.path).expanduser() + with path.open(encoding="utf-8") as 
handle: + for line_number, line in enumerate(handle, start=1): + stripped = line.strip() + if not stripped: + continue + payload = json.loads(stripped) + if not isinstance(payload, Mapping): + raise ValueError(f"{path}:{line_number} must contain a JSON object") + for field_name in (self.id_field, self.input_field): + if field_name not in payload: + raise ValueError(f"{path}:{line_number} missing required field: {field_name}") + metadata = {} + if self.metadata_field is not None and isinstance(payload.get(self.metadata_field), Mapping): + metadata.update(dict(payload[self.metadata_field])) + metadata.update({"source_kind": "task", "source_path": str(path), "line_number": line_number}) + expected = payload.get(self.expected_field) if self.expected_field else None + yield EvalSourceRecord( + case_id=str(payload[self.id_field]), + input={"input": payload[self.input_field]}, + expected=expected, + metadata=metadata, + raw_payload=dict(payload), + ) + + def default_adapter(self): + raise ValueError("task source requires a runtime_harness") + + +def _truthy_string(value: Any) -> bool: + return str(value).strip().lower() in {"true", "1", "yes"} + + +def _tool_calls_from_action(action: Mapping[str, Any]) -> list[dict[str, Any]]: + calls: list[dict[str, Any]] = [] + for tool_call in action.get("tool_calls") or []: + if not isinstance(tool_call, Mapping): + continue + function = tool_call.get("function") or {} + if isinstance(function, Mapping): + calls.append({"name": function.get("name"), "arguments": str(function.get("arguments"))}) + return calls + + +def extract_aworld_trajectory_payload( + trajectory: Iterable[Mapping[str, Any]], + *, + task_id: str, + is_sub_task: Any | None = None, +) -> dict[str, Any]: + trajectory = list(trajectory) + if not isinstance(trajectory, list): + raise ValueError(f"task_id {task_id} trajectory must be a list") + + question = None + system_prompt = "" + if trajectory: + first_state = trajectory[0].get("state", {}) if 
isinstance(trajectory[0], Mapping) else {} + question = (first_state.get("input", {}) or {}).get("content") if isinstance(first_state, Mapping) else None + first_messages = first_state.get("messages", []) if isinstance(first_state, Mapping) else [] + if first_messages and isinstance(first_messages[0], Mapping) and first_messages[0].get("role") == "system": + system_prompt = str(first_messages[0].get("content") or "") + + steps = [] + final_answer = None + for item in trajectory: + if not isinstance(item, Mapping): + continue + meta = item.get("meta", {}) if isinstance(item.get("meta"), Mapping) else {} + action = item.get("action", {}) if isinstance(item.get("action"), Mapping) else {} + finished = _truthy_string(action.get("is_agent_finished")) + content = str(action.get("content") or "") + steps.append( + { + "step": meta.get("step"), + "pre_agent": meta.get("pre_agent"), + "agent_id": meta.get("agent_id"), + "tool_calls": _tool_calls_from_action(action), + "assistant_content": content, + "is_agent_finished": finished, + } + ) + if finished and content: + final_answer = content + + final_messages = [] + if trajectory and isinstance(trajectory[-1], Mapping): + final_state = trajectory[-1].get("state", {}) + if isinstance(final_state, Mapping): + final_messages = final_state.get("messages", []) or [] + evidence = [ + {"msg_index": index, "content": str(message.get("content") or "")} + for index, message in enumerate(final_messages) + if isinstance(message, Mapping) and message.get("role") == "tool" + ] + + return { + "task_id": task_id, + "is_sub_task": is_sub_task, + "num_steps": len(trajectory), + "question": question, + "system_prompt_excerpt": system_prompt[:8000], + "steps": steps, + "final_answer": final_answer, + "evidence": evidence, + } + + +def _parse_aworld_trajectory_log_line(line: str) -> Mapping[str, Any]: + clean = re.sub(r"\x1b\[[0-9;]*m", "", line).strip() + record = ast.literal_eval(clean) + if not isinstance(record, Mapping): + raise 
ValueError("trajectory log line must contain a mapping") + return record + + +def _extract_aworld_trajectory_record_payload(record: Mapping[str, Any], *, task_id: str) -> dict[str, Any]: + trajectory = json.loads(record["trajectory"]) + return extract_aworld_trajectory_payload( + trajectory, + task_id=task_id, + is_sub_task=record.get("is_sub_task"), + ) + + +def iter_aworld_trajectory_records(log_path: str | Path) -> Iterable[tuple[str, dict[str, Any]]]: + path = Path(log_path).expanduser() + with path.open(encoding="utf-8", errors="replace") as handle: + for line_number, line in enumerate(handle, start=1): + if not line.strip(): + continue + try: + record = _parse_aworld_trajectory_log_line(line) + except (SyntaxError, ValueError) as exc: + raise ValueError(f"{path}:{line_number} is not a valid AWorld trajectory log record") from exc + task_id = record.get("task_id") + if task_id is None: + raise ValueError(f"{path}:{line_number} missing required field: task_id") + yield str(task_id), _extract_aworld_trajectory_record_payload(record, task_id=str(task_id)) + + +def extract_aworld_trajectory_record(log_path: str | Path, task_id: str) -> dict[str, Any]: + path = Path(log_path).expanduser() + with path.open(encoding="utf-8", errors="replace") as handle: + for line in handle: + if task_id not in line: + continue + record = _parse_aworld_trajectory_log_line(line) + if str(record.get("task_id")) == str(task_id): + return _extract_aworld_trajectory_record_payload(record, task_id=str(task_id)) + raise ValueError(f"task_id {task_id} not found in {path}") + + +@dataclass(frozen=True) +class AWorldTrajectoryLogSource(_BaseEvalSource): + path: str | Path + task_ids: Iterable[str] | None + extraction_dir: str | Path | None = None + + def iter_records(self) -> Iterable[EvalSourceRecord]: + path = Path(self.path).expanduser() + items = iter_aworld_trajectory_records(path) if self.task_ids is None else ( + (str(task_id), extract_aworld_trajectory_record(path, str(task_id))) + for 
def create_source_eval_suite(
    *,
    suite_id: str,
    source: EvalSource,
    judge_backend,
    judge_schema,
    gate_policy=None,
    state_adapter=None,
    runtime_harness=None,
    outcome_scorers=tuple(),
    reward_metrics=tuple(),
    standard_metrics=tuple(),
    trajectory_scorers=tuple(),
    metadata: Mapping[str, Any] | None = None,
):
    """Build an ``EvalSuiteDef`` whose cases come from *source*.

    When no ``runtime_harness`` is supplied, a ``ReplayRuntimeHarness`` is
    created over the source records using ``state_adapter`` or, failing that,
    ``source.default_adapter()``. The suite metadata always gets
    ``source_backed=True``.
    """
    from aworld.evaluations.state_adapters import ReplayRuntimeHarness
    from aworld.evaluations.substrate import EvalSuiteDef

    source_records = tuple(source.iter_records())

    if runtime_harness is not None:
        harness = runtime_harness
    else:
        adapter = state_adapter if state_adapter is not None else source.default_adapter()
        harness = ReplayRuntimeHarness(adapter=adapter, records=source_records)

    cases = [record.to_case() for record in source_records]
    suite_metadata = dict(metadata or {})
    suite_metadata["source_backed"] = True

    return EvalSuiteDef(
        suite_id=suite_id,
        cases=cases,
        runtime_harness=harness,
        judge_backend=judge_backend,
        judge_schema=judge_schema,
        gate_policy=gate_policy,
        outcome_scorers=tuple(outcome_scorers),
        reward_metrics=tuple(reward_metrics),
        standard_metrics=tuple(standard_metrics),
        trajectory_scorers=tuple(trajectory_scorers),
        metadata=suite_metadata,
    )
__future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Mapping, Protocol + +from aworld.evaluations.runtime_composition import RolloutState +from aworld.evaluations.sources import EvalSourceRecord + + +class EvalStateAdapter(Protocol): + def adapt(self, *, record: EvalSourceRecord, case: Any, target: Mapping[str, Any]) -> RolloutState: + ... + + +@dataclass(frozen=True) +class AnswerStateAdapter: + def adapt(self, *, record: EvalSourceRecord, case: Any, target: Mapping[str, Any]) -> RolloutState: + return RolloutState( + case_id=str(getattr(case, "case_id", record.case_id)), + status="success", + answer=record.answer, + outcome={"has_answer": record.answer is not None}, + metadata={ + **dict(record.metadata or {}), + "source_case_id": record.case_id, + }, + ) + + +@dataclass(frozen=True) +class TrajectoryLogStateAdapter: + extraction_dir: str | Path | None = None + + def adapt(self, *, record: EvalSourceRecord, case: Any, target: Mapping[str, Any]) -> RolloutState: + extracted = dict(record.raw_payload or {}) + final_answer = extracted.get("final_answer") or "" + steps = list(extracted.get("steps") or []) + is_finished = any(bool(step.get("is_agent_finished")) for step in steps if isinstance(step, Mapping)) + tool_calls = [ + dict(tool_call) + for step in steps + if isinstance(step, Mapping) + for tool_call in step.get("tool_calls", []) + if isinstance(tool_call, Mapping) + ] + usage = {"total_tokens": 0} + timing = {"duration_ms": 0} + standard_metrics = { + "n_turns": len(steps), + "n_tool_calls": len(tool_calls), + "n_tokens": usage["total_tokens"], + "duration_ms": timing["duration_ms"], + } + extracted_path = self._write_extracted(record, extracted) + metadata = { + **dict(record.metadata or {}), + "source_case_id": record.case_id, + } + if extracted_path is not None: + metadata["extracted_path"] = str(extracted_path) + return RolloutState( + case_id=str(getattr(case, "case_id", 
record.case_id)), + status="success" if is_finished and final_answer else "failed", + answer=final_answer, + trajectory=steps, + tool_calls=tool_calls, + usage=usage, + timing=timing, + standard_metrics=standard_metrics, + outcome={ + "task_id": record.case_id, + "question": extracted.get("question"), + "evidence_blocks": len(extracted.get("evidence") or []), + "num_steps": extracted.get("num_steps", len(steps)), + "is_finished": is_finished, + "final_answer_len": len(final_answer), + **({"extracted_path": str(extracted_path)} if extracted_path is not None else {}), + }, + metadata=metadata, + ) + + def _write_extracted(self, record: EvalSourceRecord, extracted: Mapping[str, Any]) -> Path | None: + extraction_dir = self.extraction_dir or record.metadata.get("extraction_dir") + if not extraction_dir: + return None + out_dir = Path(str(extraction_dir)).expanduser() + out_dir.mkdir(parents=True, exist_ok=True) + path = out_dir / f"extracted_{record.case_id}.json" + path.write_text(json.dumps(dict(extracted), ensure_ascii=False, indent=2), encoding="utf-8") + return path + + +@dataclass(frozen=True) +class ReplayRuntimeHarness: + adapter: EvalStateAdapter + records: tuple[EvalSourceRecord, ...] 
= tuple() + + async def run_rollout(self, *, case: Any, target: Mapping[str, Any]) -> RolloutState: + metadata = getattr(case, "metadata", {}) or {} + record_payload = metadata.get("source_record") + if not isinstance(record_payload, Mapping): + record_payload = (getattr(case, "input", {}) or {}).get("_source_record") + if not isinstance(record_payload, Mapping): + raise ValueError("replay source case is missing source_record metadata") + record = self._resolve_record(record_payload) + return self.adapter.adapt(record=record, case=case, target=target) + + def _resolve_record(self, record_payload: Mapping[str, Any]) -> EvalSourceRecord: + case_id = str(record_payload.get("case_id")) + for record in self.records: + if record.case_id == case_id: + return record + return EvalSourceRecord.from_dict(record_payload) diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py new file mode 100644 index 000000000..a873e6b4c --- /dev/null +++ b/aworld/evaluations/substrate.py @@ -0,0 +1,1826 @@ +# coding: utf-8 +from __future__ import annotations + +import asyncio +import base64 +import json +import math +import inspect +import os +import re +import tempfile +import uuid +from dataclasses import dataclass, field, replace +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Awaitable, Callable, ClassVar, Mapping + +from pydantic import BaseModel, ValidationError + +from aworld.config.conf import EvaluationConfig +from aworld.evaluations.base import EvalDataCase, EvalDataset, EvalTarget +from aworld.evaluations.base import NoActionEvalTarget +from aworld.evaluations.eval_targets.agent_eval import AworldAgentEvalTarget, AworldTaskEvalTarget +from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec, load_program_callable +from aworld.evaluations.manifests import validate_declared_eval_suite_manifest +from aworld.evaluations.runtime_composition import ( + CallableRuntimeHarness, + RuntimeHarness, + 
SinglePromptUserSimulator, + StateCheckGrader, + StepReward, +) +from aworld.evaluations.scorers import scorer_factory +from aworld.evaluations.types import MetricNames +from aworld.evaluations.execution_adapters import resolve_execution_adapter +from aworld.evaluations.report import ( + CaseEvaluationReport, + EVALUATOR_REPORT_FORMAT_ID, + EVALUATOR_REPORT_FORMAT_VERSION, + EvaluatorReport, +) +from aworld.runners.evaluate_runner import EvaluateRunner + + +JudgeCallable = Callable[[dict[str, Any], dict[str, Any]], Mapping[str, Any] | Awaitable[Mapping[str, Any]]] +JudgePrompt = str | tuple[str, list[str]] +JudgeExecutor = Callable[[JudgePrompt, str], Mapping[str, Any] | str | Awaitable[Mapping[str, Any] | str]] +EvalSuiteFactory = Callable[[dict[str, Any]], "EvalSuiteDef"] +EvalSuiteMatcher = Callable[[dict[str, Any]], bool] + +_IMAGE_SUFFIX_TO_MIME = { + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".png": "image/png", + ".gif": "image/gif", + ".webp": "image/webp", + ".bmp": "image/bmp", + ".svg": "image/svg+xml", +} + +@dataclass(frozen=True) +class EvalCaseDef: + case_id: str + input: dict[str, Any] + expected: Any | None = None + max_turns: int | None = None + timeout_seconds: float | None = None + metadata: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class EvalHarnessDef: + harness_id: str + execution: EvalExecutionSpec = field(default_factory=EvalExecutionSpec) + metadata: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class TrajectoryScorerDef: + metric_name: str + scorer_class: str | None = None + threshold: float = 0.0 + scorer_params: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class TrialPolicyDef: + num_trials: int = 1 + pass_at_k: tuple[int, ...] = tuple() + pass_caret_k: tuple[int, ...] 
= tuple() + success_metric: str | None = None + + def validate(self) -> None: + if self.num_trials < 1: + raise ValueError("num_trials must be >= 1") + invalid = [ + k + for k in (*self.pass_at_k, *self.pass_caret_k) + if k < 1 or k > self.num_trials + ] + if invalid: + raise ValueError("k values must be between 1 and num_trials") + + def to_dict(self) -> dict[str, Any]: + return { + "num_trials": self.num_trials, + "pass_at_k": list(self.pass_at_k), + "pass_caret_k": list(self.pass_caret_k), + "success_metric": self.success_metric, + } + + +@dataclass(frozen=True) +class JudgeSchemaDef: + required_fields: tuple[str, ...] = tuple() + output_model: type[BaseModel] | None = None + normalizer: Callable[[Mapping[str, Any]], Mapping[str, Any]] | None = None + + def validate(self, payload: Mapping[str, Any]) -> None: + self.validate_payload(payload) + + def validate_payload(self, payload: Mapping[str, Any]) -> dict[str, Any]: + if self.normalizer is not None: + payload = self.normalizer(dict(payload)) + if not isinstance(payload, Mapping): + raise ValueError("judge schema normalizer must return a mapping") + + if self.output_model is not None: + try: + model = self.output_model.model_validate(dict(payload)) + except ValidationError as exc: + raise ValueError(str(exc)) from exc + return model.model_dump(mode="json", by_alias=True) + + missing = [field for field in self.required_fields if field not in payload] + if missing: + joined = ", ".join(missing) + raise ValueError(f"missing required judge fields: {joined}") + return dict(payload) + + def json_schema(self) -> dict[str, Any]: + if self.output_model is not None: + return self.output_model.model_json_schema() + if self.required_fields: + return { + "type": "object", + "required": list(self.required_fields), + "properties": {field: {} for field in self.required_fields}, + } + return {} + + +@dataclass(frozen=True) +class GateDecision: + status: str + metric_name: str | None + value: float | int | str | bool | None + 
matched_conditions: list[dict[str, Any]] = field(default_factory=list) + failed_conditions: list[dict[str, Any]] = field(default_factory=list) + + +@dataclass(frozen=True) +class GateMetricCondition: + metric_name: str + op: str + threshold: float | int | str | bool + + def to_dict(self) -> dict[str, Any]: + return { + "metric_name": self.metric_name, + "op": self.op, + "threshold": self.threshold, + } + + def matches(self, metrics: Mapping[str, Any]) -> bool: + if self.metric_name not in metrics: + raise KeyError(f"metric {self.metric_name} is missing") + value = metrics[self.metric_name] + if self.op == ">=": + return float(value) >= float(self.threshold) + if self.op == "<=": + return float(value) <= float(self.threshold) + if self.op == ">": + return float(value) > float(self.threshold) + if self.op == "<": + return float(value) < float(self.threshold) + if self.op == "==": + return value == self.threshold + if self.op == "!=": + return value != self.threshold + raise ValueError(f"unsupported gate operator: {self.op}") + + +@dataclass(frozen=True) +class GatePolicyDef: + metric_name: str | None = None + pass_threshold: float | None = None + approval_threshold: float | None = None + pass_all: tuple[GateMetricCondition, ...] = tuple() + approval_all: tuple[GateMetricCondition, ...] 
= tuple() + + def normalized_conditions(self) -> tuple[tuple[GateMetricCondition, ...], tuple[GateMetricCondition, ...]]: + pass_all = self.pass_all + approval_all = self.approval_all + if not pass_all and self.metric_name is not None and self.pass_threshold is not None: + pass_all = (GateMetricCondition(metric_name=self.metric_name, op=">=", threshold=self.pass_threshold),) + if not approval_all and self.metric_name is not None and self.approval_threshold is not None: + approval_all = (GateMetricCondition(metric_name=self.metric_name, op=">=", threshold=self.approval_threshold),) + return pass_all, approval_all + + def primary_metric_name(self) -> str: + if self.metric_name is not None: + return self.metric_name + pass_all, approval_all = self.normalized_conditions() + for condition in (*pass_all, *approval_all): + if condition.metric_name == "score": + return condition.metric_name + for condition in (*pass_all, *approval_all): + return condition.metric_name + return "score" + + def evaluate(self, metrics: Mapping[str, Any]) -> GateDecision: + pass_all, approval_all = self.normalized_conditions() + matched_pass: list[dict[str, Any]] = [] + failed_pass: list[dict[str, Any]] = [] + for condition in pass_all: + try: + matched = condition.matches(metrics) + except KeyError: + failed_pass.append({**condition.to_dict(), "reason": "missing_metric"}) + continue + if matched: + matched_pass.append(condition.to_dict()) + else: + failed_pass.append(condition.to_dict()) + + metric_name = self.metric_name + value = metrics.get(metric_name) if metric_name is not None else None + if pass_all and not failed_pass: + return GateDecision( + status="pass", + metric_name=metric_name, + value=value, + matched_conditions=matched_pass, + failed_conditions=[], + ) + if any(condition.get("reason") == "missing_metric" for condition in failed_pass): + return GateDecision( + status="fail", + metric_name=metric_name, + value=value, + matched_conditions=matched_pass, + 
failed_conditions=failed_pass, + ) + + matched_approval: list[dict[str, Any]] = [] + failed_approval: list[dict[str, Any]] = [] + for condition in approval_all: + try: + matched = condition.matches(metrics) + except KeyError: + failed_approval.append({**condition.to_dict(), "reason": "missing_metric"}) + continue + if matched: + matched_approval.append(condition.to_dict()) + else: + failed_approval.append(condition.to_dict()) + + if approval_all and not failed_approval: + return GateDecision( + status="needs_approval", + metric_name=metric_name, + value=value, + matched_conditions=[*matched_pass, *matched_approval], + failed_conditions=failed_pass, + ) + return GateDecision( + status="fail", + metric_name=metric_name, + value=value, + matched_conditions=[*matched_pass, *matched_approval], + failed_conditions=[*failed_pass, *failed_approval], + ) + + +@dataclass(frozen=True) +class JudgeExecution: + backend_id: str + payload: dict[str, Any] + + +class _RuntimeCompositionJudgeOutput(BaseModel): + score: float + verdict: str + + +class JudgeBackend: + backend_id: ClassVar[str] = "judge-backend" + + def is_available(self) -> bool: + return True + + async def execute(self, case_input: dict[str, Any], target: dict[str, Any], suite: "EvalSuiteDef") -> JudgeExecution: + raise NotImplementedError + + +@dataclass(frozen=True) +class CallableJudgeBackend: + backend_id: str + judge: JudgeCallable + + def is_available(self) -> bool: + return True + + async def execute(self, case_input: dict[str, Any], target: dict[str, Any], suite: "EvalSuiteDef") -> JudgeExecution: + payload = await _maybe_await_judge(self.judge, case_input, target) + return JudgeExecution(backend_id=self.backend_id, payload=dict(payload)) + + +@dataclass(frozen=True) +class AgentJudgeBackend: + backend_id: str + system_prompt: str + executor: JudgeExecutor | None = None + prompt_builder: Callable[[dict[str, Any], dict[str, Any], "EvalSuiteDef"], JudgePrompt] | None = None + timeout_seconds: float | None = 
None + + @classmethod + def from_agent_markdown( + cls, + path: str | Path, + *, + backend_id: str | None = None, + prompt_builder: Callable[[dict[str, Any], dict[str, Any], "EvalSuiteDef"], JudgePrompt] | None = None, + timeout_seconds: float | None = None, + ) -> "AgentJudgeBackend": + agent_markdown_path = Path(path).expanduser() + resolved_backend_id = backend_id or agent_markdown_path.stem + + async def _executor(prompt: JudgePrompt, system_prompt: str) -> str: + if isinstance(prompt, tuple): + raise ValueError("agent markdown judge backend only supports text prompts") + from aworld.runner import Runners + + agent = await load_agent_markdown(agent_markdown_path, agent_id=resolved_backend_id) + response = await Runners.run(input=str(prompt), agent=agent) + return str(getattr(response, "answer", response)) + + return cls( + backend_id=resolved_backend_id, + system_prompt=f"Agent loaded from {agent_markdown_path}", + executor=_executor, + prompt_builder=prompt_builder, + timeout_seconds=timeout_seconds, + ) + + def is_available(self) -> bool: + if self.executor is not None: + return True + model_name = os.getenv("LLM_MODEL_NAME") + api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") + return bool(model_name and api_key) + + async def execute(self, case_input: dict[str, Any], target: dict[str, Any], suite: "EvalSuiteDef") -> JudgeExecution: + if not self.is_available(): + raise RuntimeError(f"judge backend '{self.backend_id}' is not available") + prompt_builder = self.prompt_builder or _build_default_judge_prompt + prompt = prompt_builder(case_input, target, suite) + executor = self.executor or _default_agent_judge_executor + async def _run_executor(): + result = executor(prompt, self.system_prompt) + if inspect.isawaitable(result): + return await result + return result + + if self.timeout_seconds is not None: + task = asyncio.create_task(_run_executor()) + try: + response = await asyncio.wait_for(task, timeout=self.timeout_seconds) + except 
Exception: + task.cancel() + try: + await task + except BaseException: + pass + raise + else: + response = await _run_executor() + payload = _coerce_judge_payload(response) + return JudgeExecution(backend_id=self.backend_id, payload=payload) + + async def judge(self, case_input: dict[str, Any], target: dict[str, Any], suite: "EvalSuiteDef") -> dict[str, Any]: + execution = await self.execute(case_input, target, suite) + return execution.payload + + +def _safe_agent_markdown_name(value: str) -> str: + return re.sub(r"[^A-Za-z0-9_.-]+", "-", value).strip("-._") or "markdown-agent" + + +def _frontmatter_scalar(value: Any, default: str) -> str: + text = str(value if value not in (None, "") else default) + return " ".join(text.splitlines()).strip() + + +def _normalize_markdown_tool_list(value: Any) -> dict[str, Any]: + if isinstance(value, Mapping): + return dict(value) + if isinstance(value, str) and value.strip(): + try: + parsed = json.loads(value) + except json.JSONDecodeError: + return {} + if isinstance(parsed, Mapping): + return dict(parsed) + return {} + + +def _materialize_agent_markdown_as_skill( + agent_markdown_path: Path, + *, + skills_root: Path, + skill_name: str, +) -> Path: + from aworld.utils.skill_loader import extract_front_matter + + lines = agent_markdown_path.read_text(encoding="utf-8").splitlines() + frontmatter, body_start = extract_front_matter(lines) + body = "\n".join(lines[body_start:]).strip() + description = _frontmatter_scalar( + frontmatter.get("description", frontmatter.get("desc")), + f"Agent loaded from {agent_markdown_path}", + ) + tool_list = _normalize_markdown_tool_list(frontmatter.get("tool_list", {})) + + skill_dir = skills_root / skill_name + skill_dir.mkdir(parents=True, exist_ok=True) + skill_path = skill_dir / "SKILL.md" + skill_path.write_text( + "---\n" + f"name: {_frontmatter_scalar(frontmatter.get('name'), skill_name)}\n" + f"description: {description}\n" + "type: agent\n" + f"tool_list: {json.dumps(tool_list, 
@dataclass(frozen=True)
class FallbackJudgeBackend:
    """Judge backend that tries several candidate backends in order.

    The first available backend whose ``execute`` call succeeds provides the
    result. Unavailable backends are skipped and failing backends are
    recorded; if none succeeds a ``RuntimeError`` summarising every attempt
    is raised.
    """

    # Identifier of the fallback chain itself.
    backend_id: str
    # Candidates, consulted strictly in this order.
    backends: tuple[JudgeBackend, ...]

    def is_available(self) -> bool:
        """Return True when at least one candidate backend reports availability."""
        return any(backend.is_available() for backend in self.backends)

    async def execute(self, case_input: dict[str, Any], target: dict[str, Any], suite: "EvalSuiteDef") -> JudgeExecution:
        """Return the result of the first candidate that executes successfully.

        Args:
            case_input: Input of the case being judged.
            target: Description of the evaluation target.
            suite: Suite being executed; forwarded unchanged to candidates.

        Returns:
            Whatever the first successful candidate's ``execute`` returns.

        Raises:
            RuntimeError: When every candidate is unavailable or raises. The
                message lists one ``<backend_id>:<reason>`` entry per
                candidate, and the most recent candidate exception (if any)
                is attached as ``__cause__``.
        """
        errors: list[str] = []
        last_error: Exception | None = None
        for backend in self.backends:
            if not backend.is_available():
                errors.append(f"{backend.backend_id}:unavailable")
                continue
            try:
                return await backend.execute(case_input, target, suite)
            except Exception as exc:
                last_error = exc
                # Some exceptions (e.g. a bare TimeoutError) stringify to "",
                # which would leave the entry without any reason; fall back to
                # the exception type name in that case.
                detail = str(exc) or type(exc).__name__
                errors.append(f"{backend.backend_id}:{detail}")
        joined = "; ".join(errors) if errors else "no candidate backend"
        # Chain the last failure so the original traceback stays reachable.
        raise RuntimeError(f"no judge backend succeeded: {joined}") from last_error
@dataclass(frozen=True)
class EvalSuiteRegistration:
    """Immutable registry entry describing one registered evaluation suite."""

    # Identifier the suite is registered under.
    suite_id: str
    # Factory stored alongside the id; not invoked by this class.
    factory: EvalSuiteFactory
    # Optional predicate over a target; None means "matches every target".
    matcher: EvalSuiteMatcher | None = None
    # Numeric priority stored with the registration.
    priority: int = 0
    # Workspace the registration is scoped to, or None when unscoped.
    workspace_root: str | None = None

    def matches(self, target: dict[str, Any]) -> bool:
        """Return whether this registration applies to ``target``.

        Without a matcher the registration applies to everything; otherwise
        the matcher's result is coerced to ``bool``.
        """
        predicate = self.matcher
        return True if predicate is None else bool(predicate(target))
tuple[str | None, str]: + return workspace_root, suite_id + + +def _target_workspace_root(target: Mapping[str, Any]) -> str | None: + target_path = target.get("target_path") + if target_path is None: + return None + path = Path(str(target_path)).expanduser().resolve() + target_kind = target.get("target_kind") + if target_kind in {"file", "image"}: + return str(path.parent) + return str(path) + + +def _visible_eval_suite_registrations(target: Mapping[str, Any]) -> list[EvalSuiteRegistration]: + workspace_root = _target_workspace_root(target) + visible: list[EvalSuiteRegistration] = [] + for registration in _EVAL_SUITE_REGISTRY.values(): + if registration.workspace_root is not None and registration.workspace_root != workspace_root: + continue + visible.append(registration) + return visible + + +def register_eval_suite( + suite_id: str, + factory: EvalSuiteFactory, + *, + matcher: EvalSuiteMatcher | None = None, + priority: int = 0, + workspace_root: str | None = None, +) -> None: + _EVAL_SUITE_REGISTRY[_eval_suite_registry_key(suite_id, workspace_root)] = EvalSuiteRegistration( + suite_id=suite_id, + factory=factory, + matcher=matcher, + priority=priority, + workspace_root=workspace_root, + ) + + +def list_eval_suites() -> list[str]: + return sorted({registration.suite_id for registration in _EVAL_SUITE_REGISTRY.values()}) + + +def _build_declared_eval_suite(manifest: Mapping[str, Any]) -> EvalSuiteDef: + base_suite = str(manifest.get("base_suite") or "").strip() + if base_suite != "app-evaluator": + raise ValueError(f"unsupported base_suite: {base_suite}") + + suite = get_builtin_eval_suite(base_suite) + suite_id = str(manifest.get("suite_id") or "").strip() + if not suite_id: + raise ValueError("suite_id is required") + if suite_id in _BUILTIN_EVAL_SUITE_IDS: + raise ValueError(f"reserved suite_id: {suite_id}") + + gate_manifest = manifest.get("gate_policy") or {} + if gate_manifest: + suite = replace( + suite, + gate_policy=GatePolicyDef( + 
def load_declared_eval_suites(workspace: str | Path | None = None) -> list[str]:
    """Load suite manifests declared under ``<workspace>/.aworld/evaluators``.

    Every ``*.json`` manifest in that directory is parsed, validated with
    ``validate_declared_eval_suite_manifest``, turned into a suite with
    ``_build_declared_eval_suite`` and registered through
    ``register_eval_suite`` scoped to the resolved workspace path. Suites that
    were registered by a previous call for the same workspace but are no
    longer declared are removed from ``_EVAL_SUITE_REGISTRY``.

    Loading is two-phase: all manifests are parsed and validated first, and
    the registry is only modified once every manifest has been accepted. A
    broken manifest therefore cannot leave the registry half-updated with
    suites that the per-workspace bookkeeping does not know about.

    Args:
        workspace: Workspace root; defaults to the current working directory.

    Returns:
        The suite ids registered by this call, in sorted manifest-path order.
        Empty when the manifest directory does not exist.

    Raises:
        ValueError: When two manifests declare the same ``suite_id``, or when
            parsing, validation or suite construction rejects a manifest.
    """
    root = Path(workspace or Path.cwd()).expanduser().resolve()
    manifest_dir = root / ".aworld" / "evaluators"
    workspace_key = str(root)
    previous_suite_ids = _DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE.get(workspace_key, set())
    if not manifest_dir.is_dir():
        # No manifests any more: drop everything this workspace declared before.
        for registry_key in previous_suite_ids:
            _EVAL_SUITE_REGISTRY.pop(registry_key, None)
        _DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE.pop(workspace_key, None)
        return []

    # Phase 1: parse and validate everything without touching global state.
    pending: list[tuple[str, Any, tuple[str, ...], int]] = []
    seen_suite_ids: set[str] = set()
    for manifest_path in sorted(manifest_dir.glob("*.json")):
        manifest_key = str(manifest_path.resolve())
        manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
        validate_declared_eval_suite_manifest(manifest)
        suite = _build_declared_eval_suite(manifest)
        if suite.suite_id in seen_suite_ids:
            raise ValueError(f"duplicate suite_id in workspace manifests: {suite.suite_id}")
        seen_suite_ids.add(suite.suite_id)
        target_kinds = tuple(str(kind) for kind in (manifest.get("target_kinds") or ["file", "directory", "image"]))
        priority = int(manifest.get("priority", 100))
        pending.append((manifest_key, suite, target_kinds, priority))

    # Phase 2: every manifest is valid, so the registry can be updated safely.
    loaded: list[str] = []
    current_suite_ids: set[tuple[str | None, str]] = set()
    for manifest_key, suite, target_kinds, priority in pending:
        register_eval_suite(
            suite.suite_id,
            # Default arguments pin the per-iteration values (late-binding guard).
            lambda target, _suite=suite: _suite,
            matcher=lambda target, _target_kinds=target_kinds: target.get("target_kind") in _target_kinds,
            priority=priority,
            workspace_root=workspace_key,
        )
        _LOADED_EVAL_MANIFEST_PATHS.add(manifest_key)
        current_suite_ids.add(_eval_suite_registry_key(suite.suite_id, workspace_key))
        loaded.append(suite.suite_id)
    for removed_suite_id in previous_suite_ids - current_suite_ids:
        _EVAL_SUITE_REGISTRY.pop(removed_suite_id, None)
    _DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE[workspace_key] = current_suite_ids
    return loaded
dict(case.metadata), + "_expected": case.expected, + "_max_turns": case.max_turns, + "_timeout_seconds": case.timeout_seconds, + }, + ) + for case in cases + ] + return EvalDataset(eval_dataset_id=dataset_id, eval_dataset_name="suite_eval_dataset", eval_cases=eval_cases) + + +def _expand_trial_cases(cases: list[EvalCaseDef], trial_policy: TrialPolicyDef) -> list[EvalCaseDef]: + trial_policy.validate() + if trial_policy.num_trials == 1: + return cases + + expanded: list[EvalCaseDef] = [] + for case in cases: + for trial_index in range(1, trial_policy.num_trials + 1): + trial_id = f"{case.case_id}::trial-{trial_index}" + trial_metadata = { + "original_case_id": case.case_id, + "trial_index": trial_index, + "trial_id": trial_id, + } + expanded.append( + replace( + case, + case_id=trial_id, + input={**case.input, "_trial": trial_metadata}, + metadata={**case.metadata, "_trial": trial_metadata}, + ) + ) + return expanded + + +def resolve_eval_harness(suite: EvalSuiteDef) -> EvalHarnessDef: + if suite.harness is not None: + return suite.harness + if suite.execution is not None: + return EvalHarnessDef( + harness_id=f"{suite.suite_id}-execution", + execution=suite.execution, + metadata={"lowered_from": "suite.execution"}, + ) + return EvalHarnessDef(harness_id=f"{suite.suite_id}-static") + + +class _ConfiguredTaskEvalTarget(AworldTaskEvalTarget): + def __init__(self, *, target: dict[str, Any], execution: EvalExecutionSpec): + super().__init__() + self._target = dict(target) + self._execution = execution + + async def build_task(self, index: int, input: EvalDataCase[dict]): + builder = _load_callable(self._execution.task_builder_ref) + task = builder(index=index, input=input, target=self._target, execution=self._execution) + return await _maybe_await(task) + + +class _AdapterExecutionEvalTarget(EvalTarget[dict]): + def __init__(self, *, target: dict[str, Any], harness: EvalHarnessDef): + super().__init__() + self._target = dict(target) + self._harness = harness + 
class _RuntimeCompositionEvalTarget(EvalTarget[dict]):
    """Eval target that runs each case through a ``RuntimeHarness`` rollout."""

    def __init__(self, *, target: dict[str, Any], harness: RuntimeHarness):
        """Store a shallow copy of ``target`` and the harness used for rollouts."""
        super().__init__()
        self._target = dict(target)
        self._harness = harness

    async def predict(self, index: int, input: EvalDataCase[dict]) -> dict:
        """Run one rollout for ``input`` and return its answer and state.

        Args:
            index: Position of the case; used as the case id when ``input``
                has no ``eval_case_id`` attribute.
            input: Either an ``EvalDataCase`` or a plain mapping of case data.

        Returns:
            ``{"answer": ..., "state": ...}`` built from the eval state that
            the rollout state converts to.
        """
        if isinstance(input, EvalDataCase):
            # ``case_data`` may be missing; normalise once so the input copy,
            # the expected value and the metadata all see the same mapping
            # (previously only the latter two tolerated ``None``).
            case_data = input.case_data or {}
            expected = case_data.get("_expected")
            metadata = case_data.get("_case_metadata", {})
        else:
            case_data = input
            expected = None
            metadata = {}
        case = EvalCaseDef(
            case_id=getattr(input, "eval_case_id", str(index)),
            input=dict(case_data),
            expected=expected,
            metadata=metadata,
        )
        rollout_state = await self._harness.run_rollout(case=case, target=self._target)
        eval_state = rollout_state.to_eval_state(target=self._target)
        return {"answer": eval_state.answer, "state": eval_state.to_dict()}
query_column=execution.query_column or "query", + ) + if execution.mode == EvalExecutionMode.TASK: + if "task" in execution.target_config: + return _AdapterExecutionEvalTarget(target=target, harness=harness) + return _ConfiguredTaskEvalTarget(target=target, execution=execution) + if execution.mode == EvalExecutionMode.PROGRAM: + return _AdapterExecutionEvalTarget(target=target, harness=harness) + raise ValueError(f"unsupported execution mode: {execution.mode}") + + +def _trajectory_eval_criteria(suite: EvalSuiteDef) -> list[dict[str, Any]]: + criteria: list[dict[str, Any]] = [] + for scorer in suite.trajectory_scorers: + _validate_trajectory_scorer_def(scorer) + item: dict[str, Any] = { + "metric_name": scorer.metric_name, + "threshold": scorer.threshold, + "scorer_params": dict(scorer.scorer_params), + } + if scorer.scorer_class is not None: + item["scorer_class"] = scorer.scorer_class + criteria.append(item) + return criteria + + +def _runtime_eval_criteria(suite: EvalSuiteDef) -> list[dict[str, Any]]: + criteria: list[dict[str, Any]] = [] + for scorer in suite.outcome_scorers: + criteria.append( + { + "metric_name": scorer.metric_name, + "threshold": 1.0, + "scorer_class": "RuntimeOutcomeScorer", + "scorer_params": {"grader": scorer.to_dict()}, + } + ) + for metric_name in suite.reward_metrics: + criteria.append( + { + "metric_name": metric_name, + "threshold": 0.0, + "scorer_class": "RuntimeRewardScorer", + } + ) + for metric_name in suite.standard_metrics: + criteria.append( + { + "metric_name": metric_name, + "threshold": 0.0, + "scorer_class": "RuntimeStandardMetricScorer", + } + ) + return criteria + + +def _validate_trajectory_scorer_def(scorer: TrajectoryScorerDef) -> None: + scorer_class = scorer_factory.get_scorer_class(scorer.metric_name) + if scorer_class is None: + raise ValueError(f"unknown trajectory metric: {scorer.metric_name}") + if scorer.scorer_class is not None and scorer.scorer_class != scorer_class.__name__: + raise ValueError( + 
def compile_evaluation_flow(flow: EvaluationFlowDef) -> CompiledEvaluationPlan:
    """Turn an evaluation flow into a ready-to-run compiled plan.

    The target is normalised, the suite's cases are expanded per its trial
    policy and packed into a dataset, the harness is resolved, and an
    ``EvaluationConfig`` is assembled whose criteria are the ``"score"``
    criterion followed by the trajectory and runtime criteria of the suite.

    The ``"score"`` bounds come from the suite's gate policy; when the suite
    has none, a ``score >= 0.0`` style default policy is used for the bounds
    only; the returned plan still carries the suite's own (possibly ``None``)
    gate policy.
    """
    suite = flow.suite
    target = _normalize_target(flow.target)
    dataset = build_eval_dataset(_expand_trial_cases(suite.cases, suite.trial_policy), target)
    harness = resolve_eval_harness(suite)

    effective_gate = suite.gate_policy or GatePolicyDef(metric_name="score", pass_threshold=0.0)
    judge_criterion: dict[str, Any] = {"metric_name": "score"}
    judge_criterion.update(_gate_metric_eval_bounds(effective_gate, "score"))
    judge_criterion["scorer_params"] = {"suite": suite, "name": suite.suite_id}

    suite_id = suite.suite_id
    eval_target = _build_eval_target(flow, target)
    criteria = [judge_criterion]
    criteria.extend(_trajectory_eval_criteria(suite))
    criteria.extend(_runtime_eval_criteria(suite))

    eval_config = EvaluationConfig(
        eval_suite_id=suite_id,
        eval_target=eval_target,
        eval_criterias=criteria,
        eval_dataset=dataset,
    )
    return CompiledEvaluationPlan(
        suite=suite,
        target=target,
        dataset=dataset,
        eval_config=eval_config,
        gate_policy=suite.gate_policy,
        harness=harness,
    )
return metric_summary["value"] + raise KeyError(f"metric {metric_name} is missing aggregate summary") + + +def _extract_metric_value_from_result_summary(summary: Mapping[str, Any], metric_name: str) -> float: + try: + return _extract_metric_value(summary, metric_name) + except KeyError: + pass + for scorer_summary in summary.values(): + if not isinstance(scorer_summary, Mapping): + continue + try: + return _extract_metric_value(scorer_summary, metric_name) + except KeyError: + continue + raise KeyError(f"metric {metric_name} is missing aggregate summary") + + +def _case_trial_metadata(case_result: Any) -> dict[str, Any]: + input_obj = getattr(case_result, "input", None) + case_data = getattr(input_obj, "case_data", {}) if input_obj is not None else {} + trial = case_data.get("_trial") if isinstance(case_data, Mapping) else None + return dict(trial or {}) + + +def _case_metric_value(case_result: Any, metric_name: str) -> Any: + for score_row in getattr(case_result, "score_rows", {}).values(): + metric_result = getattr(score_row, "metric_results", {}).get(metric_name) + if isinstance(metric_result, Mapping) and "value" in metric_result: + return metric_result["value"] + if metric_result is not None: + return metric_result + raise KeyError(metric_name) + + +def _metric_value_passed(value: Any) -> bool: + if isinstance(value, bool): + return value + if isinstance(value, (int, float)): + return float(value) > 0.0 + return bool(value) + + +def _summarize_binary_values(values: list[float]) -> dict[str, Any]: + if not values: + return {"mean": 0.0, "min": 0.0, "max": 0.0, "std": 0.0} + mean = sum(values) / len(values) + return { + "mean": mean, + "min": min(values), + "max": max(values), + "std": 0.0, + } + + +def _trial_base_success_metric(metric_name: str) -> str: + for marker in ("_pass@", "_pass^"): + if marker in metric_name: + return metric_name.split(marker, 1)[0] + return metric_name + + +def _apply_trial_metrics(eval_result: Any, suite: EvalSuiteDef, gate_policy: 
GatePolicyDef | None) -> dict[str, Any]: + policy = suite.trial_policy + if policy.num_trials == 1 and not policy.pass_at_k and not policy.pass_caret_k: + return { + "original_cases": len(eval_result.eval_case_results), + "trials_total": len(eval_result.eval_case_results), + } + + configured_metric = policy.success_metric or (gate_policy.primary_metric_name() if gate_policy else "score") + success_metric = _trial_base_success_metric(configured_metric) + groups: dict[str, list[Any]] = {} + for case_result in eval_result.eval_case_results: + trial = _case_trial_metadata(case_result) + original_case_id = trial.get("original_case_id") or case_result.eval_case_id + groups.setdefault(str(original_case_id), []).append(case_result) + + trial_metrics: dict[str, dict[str, Any]] = {} + for k in policy.pass_at_k: + values: list[float] = [] + for results in groups.values(): + ordered = sorted(results, key=lambda result: int(_case_trial_metadata(result).get("trial_index", 1))) + selected = ordered[:k] + passed = any(_metric_value_passed(_case_metric_value(result, success_metric)) for result in selected) + values.append(1.0 if passed else 0.0) + trial_metrics[f"{success_metric}_pass@{k}"] = _summarize_binary_values(values) + + for k in policy.pass_caret_k: + values = [] + for results in groups.values(): + ordered = sorted(results, key=lambda result: int(_case_trial_metadata(result).get("trial_index", 1))) + selected = ordered[:k] + passed = len(selected) >= k and all( + _metric_value_passed(_case_metric_value(result, success_metric)) + for result in selected + ) + values.append(1.0 if passed else 0.0) + trial_metrics[f"{success_metric}_pass^{k}"] = _summarize_binary_values(values) + + if trial_metrics: + eval_result.summary["trial_metrics"] = trial_metrics + return { + "original_cases": len(groups), + "trials_total": len(eval_result.eval_case_results), + } + + +def _flatten_result_metrics(summary: Mapping[str, Any]) -> dict[str, Any]: + metrics: dict[str, Any] = {} + for 
scorer_summary in summary.values(): + if not isinstance(scorer_summary, Mapping): + continue + for metric_name, metric_summary in scorer_summary.items(): + if isinstance(metric_summary, Mapping): + metrics[metric_name] = dict(metric_summary) + return metrics + + +def _gate_pass_conditions_by_metric(policy: GatePolicyDef | None) -> dict[str, tuple[GateMetricCondition, ...]]: + if policy is None: + return {} + pass_all, _ = policy.normalized_conditions() + by_metric: dict[str, list[GateMetricCondition]] = {} + for condition in pass_all: + by_metric.setdefault(condition.metric_name, []).append(condition) + return {metric_name: tuple(conditions) for metric_name, conditions in by_metric.items()} + + +def _gate_metric_status(value: Any, conditions: tuple[GateMetricCondition, ...]) -> str: + for condition in conditions: + if not condition.matches({condition.metric_name: value}): + return "FAILED" + return "PASSED" + + +def _gate_policy_conditions(policy: GatePolicyDef) -> tuple[GateMetricCondition, ...]: + pass_all, approval_all = policy.normalized_conditions() + seen: set[str] = set() + conditions: list[GateMetricCondition] = [] + for condition in (*pass_all, *approval_all): + key = f"{condition.metric_name}:{condition.op}:{condition.threshold}" + if key in seen: + continue + seen.add(key) + conditions.append(condition) + return tuple(conditions) + + +def _gate_metric_eval_bounds(policy: GatePolicyDef, metric_name: str) -> dict[str, float]: + bounds: dict[str, float] = {} + pass_all, _ = policy.normalized_conditions() + for condition in pass_all: + if condition.metric_name != metric_name: + continue + if condition.op == ">=": + bounds["threshold"] = float(condition.threshold) + elif condition.op == ">": + bounds["threshold"] = math.nextafter(float(condition.threshold), math.inf) + elif condition.op == "<=": + bounds["threshold"] = float("-inf") + bounds["max_value"] = float(condition.threshold) + elif condition.op == "<": + bounds["threshold"] = float("-inf") + 
bounds["max_value"] = math.nextafter(float(condition.threshold), -math.inf) + break + if "threshold" not in bounds: + if policy.metric_name == metric_name and policy.pass_threshold is not None: + bounds["threshold"] = float(policy.pass_threshold) + else: + bounds["threshold"] = 0.0 + return bounds + + +def _normalize_metric_status(status: Any) -> str | None: + if status is None: + return None + return getattr(status, "name", str(status)) + + +def _format_report_timestamp(timestamp: float) -> str: + return datetime.fromtimestamp(timestamp, tz=timezone.utc).isoformat().replace("+00:00", "Z") + + +def _build_state_summary(output: Mapping[str, Any] | Any) -> dict[str, Any]: + if not isinstance(output, Mapping): + return {} + state = output.get("state") if isinstance(output.get("state"), Mapping) else output + trajectory = state.get("trajectory") if isinstance(state, Mapping) else None + completion = state.get("completion") if isinstance(state, Mapping) else None + return { + "answer": state.get("answer") if isinstance(state, Mapping) else None, + "completion_count": len(completion or []) if isinstance(completion, list) else 0, + "trajectory_steps": len(trajectory or []) if isinstance(trajectory, list) else 0, + "tool_call_count": len(state.get("tool_calls") or []) if isinstance(state, Mapping) else 0, + "usage": dict(state.get("usage") or {}) if isinstance(state, Mapping) else {}, + "timing": dict(state.get("timing") or {}) if isinstance(state, Mapping) else {}, + "standard_metrics": dict((state.get("metadata") or {}).get("standard_metrics") or {}) if isinstance(state, Mapping) else {}, + "error": state.get("error") if isinstance(state, Mapping) else None, + } + + +def _build_state_artifacts(output: Mapping[str, Any] | Any) -> dict[str, Any]: + if not isinstance(output, Mapping): + return {} + state = output.get("state") if isinstance(output.get("state"), Mapping) else output + if not isinstance(state, Mapping): + return {} + return dict(state.get("artifacts") or {}) + 
async def run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport:
    """Compile and run an evaluation flow and assemble its report.

    Steps, in order: compile the flow, run it with ``EvaluateRunner``, add
    trial metrics, evaluate the gate policy (if any) against the aggregate
    metrics that can be found in the result summary, build one
    ``CaseEvaluationReport`` per case, and finally build the
    ``EvaluatorReport``.

    Per case, the judge payload is the metadata of the ``"score"`` metric in
    the score row keyed by the suite id, with the internal ``"_judge_backend"``
    entry removed. That entry is now stripped for *every* case; previously it
    was only stripped until the first backend id had been seen, so later
    cases leaked it into their ``judge`` payload. Missing or non-mapping
    ``"score"`` results/metadata now yield an empty judge payload instead of
    raising.

    Args:
        flow: The evaluation flow to execute.

    Returns:
        The populated ``EvaluatorReport``. ``judge_schema``, ``judge_backend``
        and ``gate`` are only present when there is something to report.
    """
    compiled = compile_evaluation_flow(flow)
    eval_result = await EvaluateRunner(config=compiled.eval_config).run()
    trial_counts = _apply_trial_metrics(eval_result, compiled.suite, compiled.gate_policy)

    gate_metrics = {}
    gate = None
    if compiled.gate_policy is not None:
        for condition in _gate_policy_conditions(compiled.gate_policy):
            if condition.metric_name not in gate_metrics:
                try:
                    gate_metrics[condition.metric_name] = _extract_metric_value_from_result_summary(
                        eval_result.summary,
                        condition.metric_name,
                    )
                except KeyError:
                    # Metrics absent from the summary are left out; the gate
                    # policy decides how to treat a missing metric.
                    continue
        gate = compiled.gate_policy.evaluate(gate_metrics)

    results: list[CaseEvaluationReport] = []
    report_backend_id = None
    cases_with_metrics = 0
    cases_with_judge = 0
    gate_conditions_by_metric = _gate_pass_conditions_by_metric(compiled.gate_policy)
    for case_result in eval_result.eval_case_results:
        judge_payload = {}
        case_metrics: dict[str, Any] = {}
        case_metric_details: dict[str, Any] = {}
        case_backend_id = None
        if case_result.score_rows:
            cases_with_metrics += 1
        for score_row in case_result.score_rows.values():
            for metric_name, metric_result in score_row.metric_results.items():
                if isinstance(metric_result, Mapping):
                    case_metrics[metric_name] = {}
                    if "value" in metric_result:
                        case_metrics[metric_name]["value"] = metric_result["value"]
                    status = _normalize_metric_status(metric_result.get("eval_status"))
                    # Gate pass-conditions, when defined for this metric,
                    # override the scorer-reported status.
                    if metric_name in gate_conditions_by_metric and "value" in case_metrics[metric_name]:
                        status = _gate_metric_status(
                            case_metrics[metric_name]["value"],
                            gate_conditions_by_metric[metric_name],
                        )
                    if status is not None:
                        case_metrics[metric_name]["status"] = status
                    metadata = metric_result.get("metadata") or {}
                    if isinstance(metadata, Mapping) and metadata:
                        is_judge_metric = "_judge_backend" in metadata
                        # Judge metadata is only surfaced once, under "score".
                        if not is_judge_metric or metric_name == "score":
                            case_metric_details[metric_name] = dict(metadata)
                        if case_backend_id is None:
                            case_backend_id = metadata.get("_judge_backend")
                else:
                    case_metrics[metric_name] = {"value": metric_result}
        score_row = case_result.score_rows.get(compiled.suite.suite_id)
        if score_row is not None:
            score_result = score_row.metric_results.get("score")
            score_metadata = score_result.get("metadata") if isinstance(score_result, Mapping) else None
            judge_payload = dict(score_metadata) if isinstance(score_metadata, Mapping) else {}
            # Always strip the internal marker, even after a backend id has
            # already been recorded for the report.
            payload_backend_id = judge_payload.pop("_judge_backend", None)
            report_backend_id = report_backend_id or payload_backend_id
        if judge_payload:
            cases_with_judge += 1
        results.append(
            CaseEvaluationReport(
                case_id=case_result.eval_case_id,
                input=dict(case_result.input.case_data if hasattr(case_result.input, "case_data") else case_result.input),
                metrics=case_metrics,
                judge=judge_payload,
                judge_backend={"backend_id": case_backend_id} if case_backend_id is not None else None,
                state_summary=_build_state_summary(case_result.output),
                artifacts=_build_state_artifacts(case_result.output),
                metadata=_build_state_metadata(case_result.output),
                metric_details=case_metric_details,
                trial=_case_trial_metadata(case_result),
            )
        )

    metrics = _flatten_result_metrics(eval_result.summary)
    for metric_name, conditions in gate_conditions_by_metric.items():
        if metric_name not in metrics:
            continue
        try:
            value = _extract_metric_value(metrics, metric_name)
        except KeyError:
            continue
        metrics[metric_name]["eval_status"] = _gate_metric_status(value, conditions)
    report = EvaluatorReport({
        "report_version": 1,
        "report_format": {
            "id": EVALUATOR_REPORT_FORMAT_ID,
            "version": EVALUATOR_REPORT_FORMAT_VERSION,
        },
        "generated_at": _format_report_timestamp(eval_result.create_time),
        "suite_id": compiled.suite.suite_id,
        "target": dict(compiled.target),
        "summary": eval_result.summary,
        "metrics": metrics,
        "results": results,
        "result_counts": {
            "cases_total": len(results),
            "cases_with_metrics": cases_with_metrics,
            "cases_with_judge": cases_with_judge,
        },
        "approval": {
            "required": bool(gate and gate.status == "needs_approval"),
            "resolved": False,
            "approved": None,
        },
        "suite_metadata": dict(compiled.suite.metadata),
        "trial_policy": compiled.suite.trial_policy.to_dict(),
        "trial_counts": trial_counts,
    })
    judge_schema = compiled.suite.judge_schema.json_schema()
    if judge_schema:
        report["judge_schema"] = judge_schema
    if report_backend_id is not None:
        report["judge_backend"] = {"backend_id": report_backend_id}
    if gate is not None:
        report["gate"] = {
            "status": gate.status,
            "metric_name": gate.metric_name,
            "value": gate.value,
            "matched_conditions": gate.matched_conditions,
            "failed_conditions": gate.failed_conditions,
        }
    return report
files} + visual_files = [item for item in files if _is_image_path(item)] + + if visual_files and not {".html", ".css", ".js", ".ts", ".tsx", ".jsx"} & suffixes: + score = 0.55 + positive.append("Rendered screenshots are available for direct visual review.") + if len(visual_files) >= 3: + score += 0.1 + positive.append("Multiple screens provide broader product coverage.") + else: + improvements.append("Include a few more representative states to improve evaluation coverage.") + if {"readme.md", "README.md"} & names: + score += 0.1 + positive.append("Project metadata or usage notes are present.") + else: + improvements.append("Add brief context so evaluators understand what the screens are showing.") + return min(score, 0.95), positive, improvements + + if ".html" in suffixes: + score += 0.15 + positive.append("HTML entrypoints are present for direct artifact review.") + else: + improvements.append("Add a concrete HTML or UI artifact entrypoint for review.") + + if ".css" in suffixes: + score += 0.15 + positive.append("CSS assets suggest dedicated presentation work instead of raw markup only.") + else: + improvements.append("Add explicit CSS styling rather than relying on unstyled defaults.") + + if {".js", ".ts", ".tsx", ".jsx"} & suffixes: + score += 0.1 + positive.append("Interactive source files are present.") + else: + improvements.append("Add explicit interactive behavior coverage where the experience depends on it.") + + if {"readme.md", "README.md"} & names: + score += 0.1 + positive.append("Project metadata or usage notes are present.") + else: + improvements.append("Document the artifact so evaluators can understand intended behavior quickly.") + + if len(files) >= 3: + score += 0.1 + positive.append("The target contains multiple assets, which usually indicates a more complete deliverable.") + else: + improvements.append("Package the target with its supporting assets rather than a single thin file.") + + if visual_files: + score += 0.1 + 
positive.append("Visual assets are included for richer presentation.") + else: + improvements.append("Include branded or supporting visual assets to improve evaluability.") + + return min(score, 0.95), positive, improvements + + +def _app_evaluator_judge(case_input: dict[str, Any], target: dict[str, Any]) -> dict[str, Any]: + target_path = Path(target["target_path"]) + score, positive, improvements = _artifact_quality_score(target_path) + rank = _rank_for_score(score) + praise = positive[0] if positive else "The artifact is present and can be evaluated." + criticism = improvements[0] if improvements else "The artifact still needs a stronger end-to-end product signal." + advice = " ".join(improvements[:2]) if improvements else "Raise the visual polish and make the main experience more explicit." + return { + "score": round(score, 2), + "rank": rank, + "criticism": criticism, + "praise": praise, + "improvement_advice": advice, + } + + +async def _maybe_await_judge(judge: JudgeCallable, case_input: dict[str, Any], target: dict[str, Any]) -> Mapping[str, Any]: + payload = judge(case_input, target) + if inspect.isawaitable(payload): + return await payload + return payload + + +async def _maybe_await(value: Any) -> Any: + if inspect.isawaitable(value): + return await value + return value + + +def _load_callable(ref: str | None) -> Callable[..., Any]: + if not ref: + raise ValueError("task execution mode requires task_builder_ref") + return load_program_callable(ref) + + +def _load_app_evaluator_skill_prompt() -> str: + skill_path = Path(__file__).resolve().parents[2] / "aworld-skills" / "app_evaluator" / "SKILL.md" + return skill_path.read_text(encoding="utf-8") + + +def _snapshot_text_for_file(path: Path, *, max_chars: int = 1600) -> str | None: + if path.suffix.lower() not in {".html", ".css", ".js", ".ts", ".tsx", ".jsx", ".md", ".json", ".txt"}: + return None + try: + return path.read_text(encoding="utf-8", errors="ignore")[:max_chars] + except Exception: + return 
None + + +def _build_target_snapshot(target: dict[str, Any], *, max_files: int = 6) -> dict[str, Any]: + target_path = Path(target["target_path"]) + files = [target_path] + if target_path.is_dir(): + files = sorted([item for item in target_path.rglob("*") if item.is_file()])[:max_files] + snapshot_files = [] + for item in files: + snapshot_files.append( + { + "path": str(item), + "name": item.name, + "suffix": item.suffix.lower(), + "preview": _snapshot_text_for_file(item), + } + ) + return { + "target_path": str(target_path), + "target_kind": target.get("target_kind", "directory" if target_path.is_dir() else "file"), + "files": snapshot_files, + } + + +def _build_default_judge_prompt(case_input: dict[str, Any], target: dict[str, Any], suite: EvalSuiteDef) -> str: + snapshot = _build_target_snapshot(target) + target_name = Path(target["target_path"]).name + return ( + "Evaluate the following app artifact snapshot.\n" + f"Suite: {suite.suite_id}\n" + f"Target: {target['target_path']}\n" + f"Case input: {json.dumps(case_input, ensure_ascii=False)}\n" + f"Artifact snapshot: {json.dumps(snapshot, ensure_ascii=False)}\n" + "Return a JSON object with a `results` array containing exactly one item for " + f"`{target_name}` and include `score`, `rank`, `criticism`, `praise`, and `improvement_advice`." 
+ ) + + +def _encode_image_as_data_url(path: Path) -> str | None: + mime_type = _IMAGE_SUFFIX_TO_MIME.get(path.suffix.lower()) + if mime_type is None: + return None + try: + encoded = base64.b64encode(path.read_bytes()).decode("utf-8") + except Exception: + return None + return f"data:{mime_type};base64,{encoded}" + + +def _collect_target_image_urls(target: dict[str, Any], *, max_images: int = 4) -> list[str]: + target_path = Path(target["target_path"]) + image_paths: list[Path] = [] + + if target_path.is_file() and _is_image_path(target_path): + image_paths = [target_path] + elif target_path.is_dir(): + image_paths = sorted( + item for item in target_path.rglob("*") if item.is_file() and _is_image_path(item) + )[:max_images] + + image_urls: list[str] = [] + for path in image_paths: + data_url = _encode_image_as_data_url(path) + if data_url is not None: + image_urls.append(data_url) + return image_urls + + +def _build_app_evaluator_judge_prompt( + case_input: dict[str, Any], + target: dict[str, Any], + suite: EvalSuiteDef, +) -> JudgePrompt: + snapshot = _build_target_snapshot(target) + target_name = Path(target["target_path"]).name + image_urls = _collect_target_image_urls(target) + prompt = ( + "Evaluate the following app artifact.\n" + f"Suite: {suite.suite_id}\n" + f"Target: {target['target_path']}\n" + f"Case input: {json.dumps(case_input, ensure_ascii=False)}\n" + f"Artifact snapshot: {json.dumps(snapshot, ensure_ascii=False)}\n" + f"Attached visuals: {len(image_urls)}\n" + "Use attached visuals as the primary evidence when present. Use the artifact snapshot for filenames and implementation context.\n" + "Return a JSON object with a `results` array containing exactly one item for " + f"`{target_name}` and include `score`, `rank`, `criticism`, `praise`, and `improvement_advice`." 
+ ) + if image_urls: + return prompt, image_urls + return prompt + + +def _extract_json_object(text: str) -> dict[str, Any]: + stripped = text.strip() + try: + loaded = json.loads(stripped) + if isinstance(loaded, dict): + return loaded + except json.JSONDecodeError: + pass + + matches = re.findall(r"\{.*\}", stripped, re.DOTALL) + for candidate in matches: + try: + loaded = json.loads(candidate) + if isinstance(loaded, dict): + return loaded + except json.JSONDecodeError: + continue + raise ValueError("judge response does not contain a valid JSON object") + + +def _coerce_judge_payload(response: Mapping[str, Any] | str) -> dict[str, Any]: + if isinstance(response, str): + response = _extract_json_object(response) + else: + response = dict(response) + + if "results" in response: + results = response.get("results") or [] + if not results: + raise ValueError("judge response results array is empty") + return dict(results[0]) + return dict(response) + + +async def _default_agent_judge_executor(prompt: JudgePrompt, system_prompt: str) -> str: + from aworld.agents.llm_agent import Agent + from aworld.config.conf import AgentConfig + from aworld.core.common import Observation + from aworld.core.context.base import Context + from aworld.utils.run_util import exec_agent + + api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") + model_name = os.getenv("LLM_MODEL_NAME") + if not api_key or not model_name: + raise RuntimeError("LLM_MODEL_NAME and LLM_API_KEY/OPENAI_API_KEY are required for agent judge backend") + + prompt_text: str + image_urls: list[str] | None + if isinstance(prompt, tuple): + prompt_text, image_urls = prompt + else: + prompt_text, image_urls = prompt, None + + agent = Agent( + name="evaluation_judge", + conf=AgentConfig( + llm_provider=os.getenv("LLM_PROVIDER", "openai"), + llm_model_name=model_name, + llm_temperature=float(os.getenv("LLM_TEMPERATURE", "0.1")), + llm_base_url=os.getenv("LLM_BASE_URL"), + llm_api_key=api_key, + ), + 
system_prompt=system_prompt, + ) + request: str | Observation = prompt_text + if image_urls: + request = Observation(content=prompt_text, images=image_urls) + response = await exec_agent(request, agent=agent, context=Context()) + return str(response.answer) + + +async def _runtime_adoption_assistant_step(*, user_turn, state, case, target) -> dict[str, Any]: + return { + "answer": "runtime composition resolved the scripted case", + "outcome": {"ticket": {"status": "resolved"}}, + "step_rewards": [ + StepReward( + metric_name="process_quality", + step_index=len(state.turns), + value=1.0, + reason="scripted runtime reached the expected terminal state", + ) + ], + "tool_calls": [{"id": "call-1", "function": {"name": "resolve_ticket", "arguments": "{}"}}], + "usage": {"total_tokens": 8}, + "timing": {"duration_ms": 1}, + } + + +async def _runtime_adoption_judge(case_input: dict[str, Any], target: dict[str, Any]) -> dict[str, Any]: + outcome = ((target.get("artifacts") or {}).get("outcome") or {}) + resolved = ((outcome.get("ticket") or {}).get("status") == "resolved") + return { + "score": 1.0 if resolved else 0.0, + "verdict": "approved" if resolved else "blocked", + } + + +def _get_runtime_composition_adoption_suite() -> EvalSuiteDef: + return EvalSuiteDef( + suite_id="runtime-composition-adoption", + runtime_harness=CallableRuntimeHarness( + simulator=SinglePromptUserSimulator(), + assistant_step=_runtime_adoption_assistant_step, + max_turns=1, + ), + judge_schema=JudgeSchemaDef(output_model=_RuntimeCompositionJudgeOutput), + judge=_runtime_adoption_judge, + outcome_scorers=( + StateCheckGrader( + metric_name="ticket_resolved", + path=("ticket", "status"), + expected="resolved", + ), + ), + reward_metrics=("process_quality",), + standard_metrics=("n_turns", "n_tool_calls", "n_tokens", "duration_ms"), + trajectory_scorers=( + TrajectoryScorerDef(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, threshold=1.0), + ), + gate_policy=GatePolicyDef( + pass_all=( + 
GateMetricCondition(metric_name="score", op=">=", threshold=0.9), + GateMetricCondition(metric_name="ticket_resolved", op="==", threshold=1.0), + GateMetricCondition(metric_name="process_quality", op=">=", threshold=1.0), + GateMetricCondition(metric_name="n_turns", op="==", threshold=2), + GateMetricCondition(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, op="==", threshold=1.0), + ) + ), + metadata={ + "evaluation_purpose": "capability", + "adoption_suite": True, + "runtime_composition": True, + }, + ) + + +def get_builtin_eval_suite(name: str, judge_backend: JudgeBackend | None = None) -> EvalSuiteDef: + if name == "runtime-composition-adoption": + return _get_runtime_composition_adoption_suite() + if name != "app-evaluator": + raise KeyError(name) + + return EvalSuiteDef( + suite_id="app-evaluator", + judge_schema=JudgeSchemaDef( + required_fields=( + "score", + "rank", + "criticism", + "praise", + "improvement_advice", + ) + ), + gate_policy=GatePolicyDef( + metric_name="score", + pass_threshold=0.8, + approval_threshold=0.6, + ), + judge_backend=judge_backend + or FallbackJudgeBackend( + backend_id="app-evaluator-fallback", + backends=( + AgentJudgeBackend( + backend_id="app-evaluator-agent", + system_prompt=_load_app_evaluator_skill_prompt(), + prompt_builder=_build_app_evaluator_judge_prompt, + timeout_seconds=float(os.getenv("AWORLD_EVALUATOR_AGENT_TIMEOUT_SECONDS", "8.0")), + ), + CallableJudgeBackend( + backend_id="app-evaluator-heuristic", + judge=_app_evaluator_judge, + ), + ), + ), + metadata={ + "rubric_source": "aworld-skills/app_evaluator/SKILL.md", + "preferred_backend": "app-evaluator-agent", + }, + ) + + +def _build_eval_suite_case(target_info: dict[str, Any]) -> EvalCaseDef: + return EvalCaseDef( + case_id=Path(target_info["target_path"]).name or "target", + input={ + "target_path": target_info["target_path"], + "target_kind": target_info["target_kind"], + }, + ) + + +def list_matching_eval_suites(target: str | Path | Mapping[str, Any]) -> 
list[str]: + target_info = describe_eval_target(target) + candidates = [registration for registration in _visible_eval_suite_registrations(target_info) if registration.matches(target_info)] + return [registration.suite_id for registration in _sorted_eval_suite_registrations(candidates)] + + +def resolve_eval_suite_selection(name: str | None, target: str | Path | Mapping[str, Any]) -> EvalSuiteSelection: + target_info = describe_eval_target(target) + if name is not None: + candidates = [ + registration + for registration in _visible_eval_suite_registrations(target_info) + if registration.suite_id == name + ] + if not candidates: + raise KeyError(name) + registration = _sorted_eval_suite_registrations(candidates)[0] + if not registration.matches(target_info): + raise ValueError(f"suite '{name}' does not support target kind '{target_info.get('target_kind')}'") + mode = "explicit" + else: + candidates = [ + registration for registration in _visible_eval_suite_registrations(target_info) if registration.matches(target_info) + ] + if not candidates: + raise KeyError(f"no evaluation suite matches target {target_info.get('target_path')}") + registration = _sorted_eval_suite_registrations(candidates)[0] + mode = "auto" + + suite = registration.factory(target_info).with_cases([ + _build_eval_suite_case(target_info), + ]) + return EvalSuiteSelection( + suite_id=suite.suite_id, + suite=suite, + target=target_info, + mode=mode, + ) + + +def resolve_eval_suite(name: str | None, target: str | Path) -> EvalSuiteDef: + selection = resolve_eval_suite_selection(name, target) + return selection.suite + + +register_eval_suite( + "app-evaluator", + lambda target: get_builtin_eval_suite("app-evaluator"), + matcher=lambda target: target.get("target_kind") in {"file", "directory", "image"}, + priority=10, +) +register_eval_suite( + "runtime-composition-adoption", + lambda target: get_builtin_eval_suite("runtime-composition-adoption"), + matcher=lambda target: target.get("target_kind") in 
{"file", "directory", "image", "inline"}, + priority=1, +) diff --git a/aworld/evaluations/trajectory_judge.py b/aworld/evaluations/trajectory_judge.py new file mode 100644 index 000000000..e4f96fae1 --- /dev/null +++ b/aworld/evaluations/trajectory_judge.py @@ -0,0 +1,54 @@ +# coding: utf-8 +from __future__ import annotations + +from typing import Any, Literal, Mapping + +from pydantic import BaseModel + +from aworld.evaluations.substrate import JudgeSchemaDef + + +class TrajectoryEvalJudgeOutput(BaseModel): + score: float + verdict: Literal["Excellent", "Pass", "Marginal", "Fail"] + A1_groundedness: int + A2_completeness: int + A3_relevance: int + A4_readability: int + B1_tool_use: int + B2_efficiency: int + B3_compliance: int + B4_robustness: int + veto_triggered: bool = False + + +def normalize_trajectory_judge_payload(payload: Mapping[str, Any]) -> dict[str, Any]: + if "dimensions" not in payload: + return dict(payload) + flattened = dict(payload) + if "score" not in flattened and "weighted_score" in flattened: + flattened["score"] = flattened["weighted_score"] + dimensions = payload.get("dimensions") or {} + for metric_name in ( + "A1_groundedness", + "A2_completeness", + "A3_relevance", + "A4_readability", + "B1_tool_use", + "B2_efficiency", + "B3_compliance", + "B4_robustness", + ): + metric_payload = dimensions.get(metric_name) if isinstance(dimensions, Mapping) else None + if isinstance(metric_payload, Mapping) and "score" in metric_payload: + flattened[metric_name] = metric_payload["score"] + return flattened + + +class TrajectoryJudgeSchema: + @staticmethod + def default() -> JudgeSchemaDef: + return JudgeSchemaDef( + output_model=TrajectoryEvalJudgeOutput, + normalizer=normalize_trajectory_judge_payload, + ) diff --git a/aworld/runners/evaluate_runner.py b/aworld/runners/evaluate_runner.py index 591029c1f..27e4c5bdc 100644 --- a/aworld/runners/evaluate_runner.py +++ b/aworld/runners/evaluate_runner.py @@ -126,6 +126,8 @@ async def load_dataset(self, 
eval_config: EvaluationConfig) -> EvalDataset: Returns: EvalDataset """ + if getattr(eval_config, "eval_dataset", None) is not None: + return eval_config.eval_dataset if self._is_file_path(eval_config.eval_dataset_id_or_file_path): dataset = Dataset[Dict[str, Any]](name="my_dataset", data=[]) preload_transform = None diff --git a/docs/AWorld CLI/Commands/Evaluator.md b/docs/AWorld CLI/Commands/Evaluator.md new file mode 100644 index 000000000..5d86f73c9 --- /dev/null +++ b/docs/AWorld CLI/Commands/Evaluator.md @@ -0,0 +1,196 @@ +# Evaluator + +## What It Does + +The evaluator command runs suite-backed evaluation flows for local targets and exposes the resulting report as a stable machine-readable contract. + +It is the official CLI entrypoint for the framework substrate in `aworld.evaluations`: the CLI resolves targets, +workspace manifests, output paths, and hooks, while suite semantics, execution-backed state normalization, scoring, and +gate decisions remain framework-owned. + +Use it when you want to: + +- run a built-in evaluator suite such as `app-evaluator` +- load declaration-backed evaluator suites from workspace manifests +- evaluate task JSONL files, existing task+answer JSONL files, or AWorld trajectory logs +- inspect which suites match a target +- export the evaluator report schema +- validate a saved evaluator report in automation + +## Commands + +Top-level CLI usage: + +```bash +aworld-cli evaluator --target ./artifact +aworld-cli evaluator --target ./artifact --suite app-evaluator +aworld-cli evaluator --list-suites +aworld-cli evaluator --list-suites --target ./artifact +aworld-cli evaluator --print-report-schema +aworld-cli evaluator --validate-report ./.aworld/evaluations/artifact.app-evaluator.json +``` + +Source-backed usage: + +```bash +aworld-cli evaluator \ + --input ./tasks.jsonl \ + --kind task \ + --judge-agent ./judge_agents/answer_judge.md \ + --out-dir ./reports + +aworld-cli evaluator \ + --input ./task_answers.jsonl \ + --kind 
answer \ + --judge-agent ./judge_agents/answer_judge.md \ + --out-dir ./reports + +aworld-cli evaluator \ + --input ~/Documents/logs/trajectory.log \ + --kind trajectory \ + --task-id task_20260609193335 \ + --judge-agent ./judge_agents/trajectory_judge.md \ + --out-dir ./reports + +aworld-cli evaluator \ + --input ~/Documents/logs/trajectory.log \ + --kind trajectory \ + --judge-agent ./judge_agents/trajectory_judge.md \ + --out-dir ./reports + +aworld-cli evaluator \ + --input ./tasks.jsonl \ + --kind trajectory \ + --judge-agent ./judge_agents/trajectory_judge.md \ + --out-dir ./reports +``` + +For `task` JSONL inputs, the default fields are `id` and `input`; the evaluator runs each task through the CLI default `Aworld` agent unless `--agent` is supplied. For `trajectory`, passing `--task-id` replays one task from an existing AWorld trajectory log, omitting `--task-id` with a trajectory log replays all tasks in that log, and omitting `--task-id` with task JSONL runs the main agent, extracts the response trajectory, and evaluates that generated trajectory. For `answer` JSONL inputs, the default fields are `id`, `input`, and `answer`. Use `--id-field`, `--task-field`, and `--answer-field` only when the file uses different names. + +Useful options: + +```bash +aworld-cli evaluator --target ./artifact --output ./report.json +aworld-cli evaluator --target ./artifact --interactive-approval +aworld-cli evaluator --input ./tasks.jsonl --kind task --judge-agent ./agent.md --agent Aworld --output ./report.json +aworld-cli evaluator --input ./tasks.jsonl --kind trajectory --judge-agent ./trajectory_agent.md --output ./report.json +aworld-cli evaluator --input ./task_answers.jsonl --kind answer --judge-agent ./agent.md --output ./report.json +``` + +## Declared Suite Manifests + +Evaluator suites can be declared under `.aworld/evaluators/*.json` and are loaded before suite resolution. 
This keeps the runtime on top of AWorld's existing runner and task substrate while letting a workspace expose stricter or context-specific evaluator variants without forking builtin code. + +Current manifest scope is intentionally narrow: + +- `base_suite` must be `app-evaluator` +- `suite_id` is required and becomes the suite name exposed to `aworld-cli evaluator` +- `target_kinds` optionally narrows matching to `file`, `directory`, and/or `image` +- `gate_policy`, `metadata`, and `priority` override selection and gating behavior on top of the builtin suite + +Minimal example: + +```json +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file", "directory"], + "gate_policy": { + "metric_name": "score", + "pass_threshold": 0.92, + "approval_threshold": 0.8 + }, + "metadata": { + "owner": "qa" + }, + "priority": 120 +} +``` + +See [declared_evaluator_suite.example.json](../../../examples/aworld_quick_start/cli/declared_evaluator_suite.example.json) for a complete example. The current manifest schema is exported by `aworld_cli.evaluator_runtime.get_declared_evaluator_suite_schema()`. + +Resolution rules: + +- builtin suites are always available +- declared suites are discovered relative to the evaluation target workspace, not just the current shell cwd +- declared manifests currently extend `app-evaluator`; they are not yet a generic user-defined suite authoring API +- `--list-suites --target ...` and actual evaluator execution use the same target-relative discovery path + +## Plugin Hooks + +`aworld-cli evaluator` is a builtin plugin-backed command with narrow lifecycle hook points intended for CLI assembly concerns, not framework scoring semantics. 
+ +Available hook points: + +- `evaluator.pre_discover`: inspect or annotate target/workspace inputs before suite discovery +- `evaluator.post_discover`: react to resolved suite candidates +- `evaluator.pre_run`: add lightweight CLI metadata before evaluation starts +- `evaluator.post_run`: upload or post-process the completed report +- `evaluator.render_summary`: augment rendered terminal summary text + +Current event payloads: + +- `evaluator.pre_discover`: `target`, `workspace_path` +- `evaluator.post_discover`: `target`, `workspace_path`, `suite_names` +- `evaluator.pre_run` for target mode: `mode`, `target`, `suite`, `workspace_path` +- `evaluator.pre_run` for source mode: `mode`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path` +- `evaluator.post_run` for target mode: `mode`, `report`, `target`, `suite`, `workspace_path` +- `evaluator.post_run` for source mode: `mode`, `report`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path` +- `evaluator.render_summary`: `report`, `workspace_path` + +Hook boundaries: + +- mutable hook state is limited to lightweight CLI assembly metadata +- hooks should not replace suite logic, judge logic, or gate calculation +- suitable side effects include report upload, notifications, and summary augmentation + +## Report Contract + +Evaluator reports are JSON documents with a stable top-level format marker: + +```json +{ + "report_format": { + "id": "aworld.evaluator.report", + "version": 1 + } +} +``` + +Key report sections: + +- `metrics`: normalized aggregate metrics for the resolved suite +- `results`: per-case judge output plus normalized per-case metrics +- `gate`: structured `pass` / `fail` / `needs_approval` decision +- `automation`: exit-code-oriented summary fields for scripts and CI +- `suite_selection`: resolved/defaulted suite selection diagnostics +- `source_selection`: source input diagnostics for source-backed `aworld-cli evaluator --input ...` +- 
`approval`: approval decision metadata when the gate requires human confirmation + +See [evaluator_report.example.json](../../../examples/aworld_quick_start/cli/evaluator_report.example.json) for a minimal example. + +## Typical Workflow + +1. Inspect matching suites with `aworld-cli evaluator --list-suites --target ./artifact`. +2. Run evaluation with `aworld-cli evaluator --target ./artifact`. +3. For task-only inputs, run source-backed execution and evaluation with `aworld-cli evaluator --input <tasks.jsonl> --kind task --judge-agent <judge-agent.md>`. +4. Save or collect the emitted JSON report. +5. Validate persisted reports with `aworld-cli evaluator --validate-report <report.json>`. +6. Export the current JSON Schema with `aworld-cli evaluator --print-report-schema` when integrating with external tooling. + +## Exit Codes + +- `0`: evaluation passed, schema is valid, or metadata command succeeded +- `2`: evaluation gate failed +- `3`: evaluation requires approval and is not approved +- `4`: evaluator report validation failed + +## Notes And Limits + +- `--list-suites --target ...` shows only suites matching the target and prints the deterministic default suite. +- declared suite manifests are discovered from `.aworld/evaluators/*.json` relative to the evaluation target workspace. +- declared suite manifests currently layer on `app-evaluator` only; they are not a generic suite authoring format yet. +- `--print-report-schema` prints the current JSON Schema for `aworld.evaluator.report`. +- `--validate-report` validates an existing JSON report against that schema without re-running evaluation. +- `aworld-cli evaluator --input ...` currently supports `task`, `answer`, and `trajectory`; generic serialized-state sources are intentionally deferred until the framework provides those source kinds. +- the CLI command is an assembly/product layer; reusable evaluator building blocks stay in `aworld/evaluations/**`. 
diff --git a/docs/AWorld CLI/Commands/Overview.md b/docs/AWorld CLI/Commands/Overview.md index 74843c2c7..422f1415c 100644 --- a/docs/AWorld CLI/Commands/Overview.md +++ b/docs/AWorld CLI/Commands/Overview.md @@ -1,3 +1,8 @@ # Commands Use slash commands inside interactive AWorld CLI sessions to inspect workspace state, manage scheduled work, control plugins, and access command-bridge features exposed through gateway channels. + +Available command references: + +- [Evaluator](Evaluator.md): suite-backed evaluation, schema export, and report validation +- [Gateway](Gateway.md): multi-channel gateway lifecycle and command bridge behavior diff --git a/docs/AWorld CLI/Recipes/Mini App Build.md b/docs/AWorld CLI/Recipes/Mini App Build.md index 34d37a530..baeb1e347 100644 --- a/docs/AWorld CLI/Recipes/Mini App Build.md +++ b/docs/AWorld CLI/Recipes/Mini App Build.md @@ -63,3 +63,15 @@ help me create an English word learning app, with a UI quality score over 0.9 The agent will generate the app, evaluate it with the official Evaluator skill, and iterate until the UI quality score meets your target. When done, run or deploy the output as needed. For a reusable workspace setup, keep the same `.env` or interactive CLI configuration across runs. + +## Inspecting The Evaluator Report + +When you want a stable machine-readable artifact for CI or post-processing, use the standalone evaluator command against the generated app or artifact: + +```bash +aworld-cli evaluator --target ./artifact --output ./report.json +aworld-cli evaluator --print-report-schema +aworld-cli evaluator --validate-report ./report.json +``` + +The emitted report includes `report_format`, normalized `metrics`, structured `gate`, and `automation` fields. 
That makes it suitable for quality gates, regression checks, or downstream dashboards without parsing freeform evaluator text. diff --git a/docs/superpowers/plans/2026-06-01-evaluation-substrate.md b/docs/superpowers/plans/2026-06-01-evaluation-substrate.md new file mode 100644 index 000000000..c66b723d7 --- /dev/null +++ b/docs/superpowers/plans/2026-06-01-evaluation-substrate.md @@ -0,0 +1,312 @@ +# Evaluation Substrate Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build AWorld's internal evaluation substrate, add a working `aworld-cli evaluator` flow, and express `app_evaluator` as a suite-backed evaluation without breaking legacy evaluation APIs. + +**Architecture:** Add a small internal definition-and-compilation layer under `aworld.evaluations`, keep execution on top of existing `EvaluateRunner`/`EvalTarget`/`Scorer`, then wire a new CLI top-level command onto that substrate. `app_evaluator` becomes the first built-in suite definition and gateable report flow. 
+ +**Tech Stack:** Python, dataclasses, existing AWorld evaluation runtime, argparse-based CLI, pytest, unittest-style async tests where already used + +--- + +### Task 1: Add substrate contracts and red-path tests + +**Files:** +- Create: `tests/evaluations/test_evaluation_substrate.py` +- Create: `aworld/evaluations/substrate.py` +- Modify: `aworld/config/conf.py` + +- [ ] **Step 1: Write the failing substrate contract tests** + +```python +from aworld.evaluations.substrate import ( + EvalSuiteDef, + EvalCaseDef, + JudgeSchemaDef, + GatePolicyDef, + EvaluationFlowDef, + compile_evaluation_flow, +) + + +def test_compile_evaluation_flow_preserves_legacy_runner_inputs(): + suite = EvalSuiteDef( + suite_id="demo-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge_schema=JudgeSchemaDef(required_fields=("score", "rank")), + gate_policy=GatePolicyDef(metric_name="score", pass_threshold=0.8), + ) + flow = EvaluationFlowDef(target={"kind": "dict", "value": {"answer": "hello"}}, suite=suite) + + compiled = compile_evaluation_flow(flow) + + assert compiled.eval_config.eval_dataset_id_or_file_path is None + assert compiled.dataset.eval_cases[0].case_data["query"] == "hello" + assert compiled.gate_policy.metric_name == "score" +``` + +- [ ] **Step 2: Run the substrate tests to verify they fail** + +Run: `pytest tests/evaluations/test_evaluation_substrate.py -q` +Expected: FAIL with import or missing symbol errors for the new substrate module and compile helpers. 
+ +- [ ] **Step 3: Implement the minimal substrate layer** + +```python +@dataclass +class GatePolicyDef: + metric_name: str + pass_threshold: float + approval_threshold: float | None = None + + +def compile_evaluation_flow(flow: EvaluationFlowDef) -> CompiledEvaluationPlan: + dataset = build_eval_dataset(flow.suite.cases) + eval_config = EvaluationConfig( + eval_criterias=[], + eval_dataset_id_or_file_path=None, + ) + return CompiledEvaluationPlan( + suite=flow.suite, + dataset=dataset, + eval_config=eval_config, + gate_policy=flow.suite.gate_policy, + ) +``` + +- [ ] **Step 4: Re-run the substrate tests** + +Run: `pytest tests/evaluations/test_evaluation_substrate.py -q` +Expected: PASS + +- [ ] **Step 5: Commit the substrate contract slice** + +```bash +git add tests/evaluations/test_evaluation_substrate.py aworld/evaluations/substrate.py aworld/config/conf.py +git commit -m "feat: add evaluation substrate contracts" +``` + +### Task 2: Add schema validation and gate decisions + +**Files:** +- Modify: `tests/evaluations/test_evaluation_substrate.py` +- Modify: `aworld/evaluations/substrate.py` + +- [ ] **Step 1: Write failing tests for schema validation and gate outcomes** + +```python +def test_gate_policy_returns_needs_approval_between_thresholds(): + decision = GatePolicyDef( + metric_name="score", + pass_threshold=0.85, + approval_threshold=0.6, + ).evaluate({"score": 0.7}) + + assert decision.status == "needs_approval" + + +def test_judge_schema_rejects_missing_required_fields(): + schema = JudgeSchemaDef(required_fields=("score", "rank", "criticism")) + + with pytest.raises(ValueError, match="missing required judge fields"): + schema.validate({"score": 0.8, "rank": "Good"}) +``` + +- [ ] **Step 2: Run the tests to verify they fail** + +Run: `pytest tests/evaluations/test_evaluation_substrate.py -q` +Expected: FAIL because schema validation and gate evaluation are not implemented yet. 
+ +- [ ] **Step 3: Implement schema validation and gate decisions** + +```python +def validate(self, payload: Mapping[str, Any]) -> None: + missing = [field for field in self.required_fields if field not in payload] + if missing: + raise ValueError(f"missing required judge fields: {', '.join(missing)}") + + +def evaluate(self, metrics: Mapping[str, Any]) -> GateDecision: + score = float(metrics[self.metric_name]) + if score >= self.pass_threshold: + return GateDecision(status="pass", metric_name=self.metric_name, value=score) + if self.approval_threshold is not None and score >= self.approval_threshold: + return GateDecision(status="needs_approval", metric_name=self.metric_name, value=score) + return GateDecision(status="fail", metric_name=self.metric_name, value=score) +``` + +- [ ] **Step 4: Re-run the substrate tests** + +Run: `pytest tests/evaluations/test_evaluation_substrate.py -q` +Expected: PASS + +- [ ] **Step 5: Commit the schema-and-gate slice** + +```bash +git add tests/evaluations/test_evaluation_substrate.py aworld/evaluations/substrate.py +git commit -m "feat: add evaluation schema and gate decisions" +``` + +### Task 3: Add the CLI evaluator command + +**Files:** +- Create: `tests/core/test_evaluator_top_level_command.py` +- Create: `aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py` +- Create: `aworld-cli/src/aworld_cli/evaluator_runtime.py` +- Modify: `aworld-cli/src/aworld_cli/top_level_commands/__init__.py` +- Modify: `aworld-cli/src/aworld_cli/main.py` + +- [ ] **Step 1: Write the failing evaluator CLI tests** + +```python +def test_registry_registers_builtin_evaluator_command(): + registry = main_module._build_top_level_command_registry() + command = registry.get("evaluator") + assert command is not None + + +def test_maybe_dispatch_top_level_command_runs_evaluator(monkeypatch, tmp_path, capsys): + target = tmp_path / "artifact.txt" + target.write_text("demo", encoding="utf-8") + monkeypatch.setattr( + 
"aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_cli", + lambda **kwargs: {"gate": {"status": "pass"}, "suite_id": "app-evaluator"}, + ) + + handled = main_module._maybe_dispatch_top_level_command( + ["aworld-cli", "evaluator", "--target", str(target)] + ) + + assert handled is True +``` + +- [ ] **Step 2: Run the evaluator CLI tests to verify they fail** + +Run: `pytest tests/core/test_evaluator_top_level_command.py -q` +Expected: FAIL because the evaluator command is not registered yet. + +- [ ] **Step 3: Implement the minimal evaluator command and runtime** + +```python +class EvaluatorTopLevelCommand: + @property + def name(self) -> str: + return "evaluator" + + def register_parser(self, subparsers) -> None: + parser = subparsers.add_parser("evaluator", help=self.description) + parser.add_argument("--target", required=True) + parser.add_argument("--suite") + parser.add_argument("--output") + + def run(self, args, context) -> int: + result = run_evaluator_cli(target=args.target, suite=args.suite, output=args.output) + print(render_evaluator_summary(result)) + return 0 +``` + +- [ ] **Step 4: Re-run the evaluator CLI tests** + +Run: `pytest tests/core/test_evaluator_top_level_command.py -q` +Expected: PASS + +- [ ] **Step 5: Commit the evaluator command slice** + +```bash +git add tests/core/test_evaluator_top_level_command.py aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py aworld-cli/src/aworld_cli/evaluator_runtime.py aworld-cli/src/aworld_cli/top_level_commands/__init__.py aworld-cli/src/aworld_cli/main.py +git commit -m "feat: add evaluator top level command" +``` + +### Task 4: Add the built-in app evaluator suite and end-to-end report wiring + +**Files:** +- Create: `tests/evaluations/test_app_evaluator_suite.py` +- Modify: `aworld/evaluations/substrate.py` +- Modify: `aworld-skills/app_evaluator/SKILL.md` +- Modify: `aworld-cli/src/aworld_cli/evaluator_runtime.py` + +- [ ] **Step 1: Write the failing app evaluator suite tests** + 
+```python +from aworld.evaluations.substrate import get_builtin_eval_suite + + +def test_app_evaluator_suite_requires_expected_judge_fields(): + suite = get_builtin_eval_suite("app-evaluator") + + assert suite.judge_schema.required_fields == ( + "score", + "rank", + "criticism", + "praise", + "improvement_advice", + ) + + +def test_app_evaluator_suite_uses_threshold_gate(): + suite = get_builtin_eval_suite("app-evaluator") + + assert suite.gate_policy.metric_name == "score" +``` + +- [ ] **Step 2: Run the app evaluator suite tests to verify they fail** + +Run: `pytest tests/evaluations/test_app_evaluator_suite.py -q` +Expected: FAIL because the builtin app evaluator suite registry does not exist yet. + +- [ ] **Step 3: Implement the builtin app evaluator suite and CLI result persistence** + +```python +def get_builtin_eval_suite(name: str) -> EvalSuiteDef: + if name != "app-evaluator": + raise KeyError(name) + return EvalSuiteDef( + suite_id="app-evaluator", + judge_schema=JudgeSchemaDef( + required_fields=( + "score", + "rank", + "criticism", + "praise", + "improvement_advice", + ) + ), + gate_policy=GatePolicyDef(metric_name="score", pass_threshold=0.8, approval_threshold=0.6), + ) +``` + +- [ ] **Step 4: Re-run the app evaluator suite tests and the focused CLI/substrate suite** + +Run: `pytest tests/evaluations/test_app_evaluator_suite.py tests/evaluations/test_evaluation_substrate.py tests/core/test_evaluator_top_level_command.py -q` +Expected: PASS + +- [ ] **Step 5: Commit the built-in suite slice** + +```bash +git add tests/evaluations/test_app_evaluator_suite.py aworld/evaluations/substrate.py aworld-skills/app_evaluator/SKILL.md aworld-cli/src/aworld_cli/evaluator_runtime.py +git commit -m "feat: add builtin app evaluator suite" +``` + +### Task 5: Full focused verification + +**Files:** +- Test: `tests/evaluations/test_evaluation_substrate.py` +- Test: `tests/evaluations/test_app_evaluator_suite.py` +- Test: 
`tests/core/test_evaluator_top_level_command.py` +- Test: `tests/evaluations/test_dataset_evaluate.py` + +- [ ] **Step 1: Run the focused verification suite** + +Run: `pytest tests/evaluations/test_evaluation_substrate.py tests/evaluations/test_app_evaluator_suite.py tests/core/test_evaluator_top_level_command.py tests/evaluations/test_dataset_evaluate.py -q` +Expected: PASS + +- [ ] **Step 2: Sanity-check the CLI help output** + +Run: `python -m aworld_cli.main evaluator --help` +Expected: exit 0 and help output showing `--target`, `--suite`, and `--output` + +- [ ] **Step 3: Validate the OpenSpec change remains consistent** + +Run: `openspec validate aworld-evaluation-substrate-2026-06-01` +Expected: `Change 'aworld-evaluation-substrate-2026-06-01' is valid` diff --git a/examples/aworld_quick_start/cli/README.md b/examples/aworld_quick_start/cli/README.md index 4fc9fdd49..b88391844 100644 --- a/examples/aworld_quick_start/cli/README.md +++ b/examples/aworld_quick_start/cli/README.md @@ -32,6 +32,22 @@ aworld-cli --task "Your task" --agent MyAgent - `agents/document_agent.md` - Markdown agent example - `agents/hilp.py` - Human in the loop agent example +## Evaluator Report Example + +The file `evaluator_report.example.json` shows the current stable evaluator report contract, including: + +- `report_format` and `generated_at` +- normalized `metrics` and per-case `results` +- structured `gate`, `approval`, and `automation` sections + +Use it together with `aworld-cli evaluator --print-report-schema` and `aworld-cli evaluator --validate-report <report-path>` when integrating evaluator output into scripts or CI. + +## Declared Evaluator Suite Example + +The file `declared_evaluator_suite.example.json` shows the workspace manifest format loaded from `.aworld/evaluators/*.json`. + +Use it when you want to derive a stricter evaluator from `app-evaluator` while keeping AWorld's builtin runner, suite resolution, and report contract unchanged. 
+ ## Create Your Agent ### Python Agent @@ -58,4 +74,4 @@ Create `agents/my_agent.md`: name: MyAgent description: My agent description --- -``` \ No newline at end of file +``` diff --git a/examples/aworld_quick_start/cli/declared_evaluator_suite.example.json b/examples/aworld_quick_start/cli/declared_evaluator_suite.example.json new file mode 100644 index 000000000..d6e5376e4 --- /dev/null +++ b/examples/aworld_quick_start/cli/declared_evaluator_suite.example.json @@ -0,0 +1,15 @@ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file", "directory"], + "gate_policy": { + "metric_name": "score", + "pass_threshold": 0.92, + "approval_threshold": 0.8 + }, + "metadata": { + "owner": "qa", + "purpose": "release-gate" + }, + "priority": 120 +} diff --git a/examples/aworld_quick_start/cli/evaluator_report.example.json b/examples/aworld_quick_start/cli/evaluator_report.example.json new file mode 100644 index 000000000..401339a82 --- /dev/null +++ b/examples/aworld_quick_start/cli/evaluator_report.example.json @@ -0,0 +1,93 @@ +{ + "report_version": 1, + "report_format": { + "id": "aworld.evaluator.report", + "version": 1 + }, + "generated_at": "2026-06-02T04:00:00Z", + "suite_id": "app-evaluator", + "target": { + "target_path": "/tmp/artifact.txt", + "target_kind": "file" + }, + "summary": { + "app-evaluator": { + "score": { + "mean": 0.91, + "min": 0.91, + "max": 0.91, + "std": 0.0, + "eval_status": "PASSED" + } + } + }, + "metrics": { + "score": { + "mean": 0.91, + "min": 0.91, + "max": 0.91, + "std": 0.0, + "eval_status": "PASSED" + } + }, + "results": [ + { + "case_id": "artifact.txt", + "input": { + "target_path": "/tmp/artifact.txt", + "target_kind": "file" + }, + "metrics": { + "score": { + "value": 0.91, + "status": "PASSED" + } + }, + "judge": { + "score": 0.91, + "rank": "Exemplary", + "criticism": "Minor polish remains.", + "praise": "Strong overall structure.", + "improvement_advice": "Keep the visual hierarchy consistent." 
+ }, + "judge_backend": { + "backend_id": "app-evaluator-agent" + } + } + ], + "result_counts": { + "cases_total": 1, + "cases_with_metrics": 1, + "cases_with_judge": 1 + }, + "gate": { + "status": "pass", + "metric_name": "score", + "value": 0.91 + }, + "approval": { + "required": false, + "resolved": false, + "approved": null + }, + "judge_backend": { + "backend_id": "app-evaluator-agent" + }, + "suite_selection": { + "requested": null, + "resolved": "app-evaluator", + "mode": "auto" + }, + "automation": { + "gate_status": "pass", + "metric_name": "score", + "metric_value": 0.91, + "approval_required": false, + "approval_resolved": false, + "approved": null, + "suggested_exit_code": 0, + "case_count": 1, + "judge_backend": "app-evaluator-agent" + }, + "report_path": "/tmp/report.json" +} diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py new file mode 100644 index 000000000..65ffd1a8c --- /dev/null +++ b/tests/core/test_evaluator_runtime.py @@ -0,0 +1,1039 @@ +from __future__ import annotations + +import base64 +import json +import sys +from pathlib import Path +from types import SimpleNamespace + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) + +import aworld.evaluations.substrate as substrate_module +from aworld.evaluations.manifests import get_declared_eval_suite_schema +from aworld.evaluations.report import EvaluatorReport +from aworld_cli.evaluator_runtime import ( + _CliAgentRuntimeHarness, + _build_source_suite, + _build_source_prompt, + _build_trajectory_prompt, + available_evaluator_suites, + evaluator_exit_code, + get_declared_evaluator_suite_schema, + get_evaluator_report_schema, + run_evaluator_cli, + run_evaluator_source_cli, + validate_evaluator_report, +) +from aworld_cli.evaluator_rendering import render_evaluator_summary + + +@pytest.fixture(autouse=True) +def _reset_eval_registry_state(monkeypatch: pytest.MonkeyPatch) -> None: + 
monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + monkeypatch.setattr(substrate_module, "_LOADED_EVAL_MANIFEST_PATHS", set()) + monkeypatch.setattr(substrate_module, "_DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE", {}) + substrate_module.register_eval_suite( + "app-evaluator", + lambda target: substrate_module.get_builtin_eval_suite("app-evaluator"), + matcher=lambda target: target.get("target_kind") in {"file", "directory", "image"}, + priority=10, + ) + + +def test_run_evaluator_cli_persists_approval_state( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + output = tmp_path / "report.json" + + async def fake_run_evaluation_flow(flow): + return { + "report_version": 1, + "suite_id": "app-evaluator", + "judge_backend": {"backend_id": "stub-agent"}, + "summary": {"app-evaluator": {"score": {"mean": 0.7}}}, + "results": [], + "gate": {"status": "needs_approval", "metric_name": "score", "value": 0.7}, + "approval": {"required": True, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + monkeypatch.setattr("builtins.input", lambda _: "y") + + report = run_evaluator_cli( + target=str(target), + interactive_approval=True, + output=str(output), + ) + + persisted = json.loads(output.read_text(encoding="utf-8")) + + assert report["approval"]["resolved"] is True + assert report["approval"]["approved"] is True + assert persisted["approval"]["approved"] is True + assert persisted["judge_backend"]["backend_id"] == "stub-agent" + + +def test_run_evaluator_source_cli_builds_task_answer_flow_with_default_fields( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "answers.jsonl" + input_path.write_text('{"id":"case-1","input":"question","answer":"existing"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + 
judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + output = tmp_path / "report.json" + captured = {} + + async def fake_run_evaluation_flow(flow): + captured["flow"] = flow + return { + "report_version": 1, + "suite_id": "answer-source-evaluator", + "judge_backend": {"backend_id": "source-agent-md"}, + "summary": {"answer-source-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_source_cli( + input=str(input_path), + kind="answer", + judge_agent=str(judge_agent), + output=str(output), + ) + + flow = captured["flow"] + assert flow.target["target_kind"] == "source" + assert flow.target["source_kind"] == "answer" + assert flow.suite.cases[0].case_id == "case-1" + assert flow.suite.cases[0].input == {"input": "question"} + assert flow.suite.judge_backend.backend_id == "source-agent-md" + assert report["source_selection"]["kind"] == "answer" + assert report["automation"]["source_kind"] == "answer" + assert output.exists() + + +def test_run_evaluator_source_cli_builds_task_flow_with_default_agent( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "tasks.jsonl" + input_path.write_text('{"id":"case-1","input":"question"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + captured = {} + + class FakeHarness: + pass + + def fake_build_cli_agent_runtime_harness(*, agent_name): + captured["agent_name"] = agent_name + return FakeHarness() + + async def fake_run_evaluation_flow(flow): + captured["flow"] = flow + return { + "report_version": 1, + "suite_id": "task-source-evaluator", + "judge_backend": {"backend_id": "source-agent-md"}, + "summary": 
{"task-source-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr( + "aworld_cli.evaluator_runtime._build_cli_agent_runtime_harness", + fake_build_cli_agent_runtime_harness, + ) + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_source_cli( + input=str(input_path), + kind="task", + judge_agent=str(judge_agent), + output=str(tmp_path / "report.json"), + ) + + flow = captured["flow"] + assert captured["agent_name"] == "Aworld" + assert flow.target["source_kind"] == "task" + assert flow.target["agent"] == "Aworld" + assert flow.suite.cases[0].case_id == "case-1" + assert flow.suite.cases[0].input == {"input": "question"} + assert flow.suite.runtime_harness is not None + assert report["source_selection"]["kind"] == "task" + assert report["source_selection"]["agent"] == "Aworld" + assert report["automation"]["source_kind"] == "task" + + +def test_task_source_gate_consumes_answer_veto_signal( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "tasks.jsonl" + input_path.write_text('{"id":"case-1","input":"question"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + + class FakeHarness: + pass + + monkeypatch.setattr( + "aworld_cli.evaluator_runtime._build_cli_agent_runtime_harness", + lambda *, agent_name: FakeHarness(), + ) + + suite = _build_source_suite( + kind="task", + input_path=input_path, + judge_agent_path=judge_agent, + task_id=None, + id_field="id", + task_field="input", + answer_field="answer", + out_dir=str(tmp_path), + ) + + payload = suite.judge_schema.validate_payload( + {"score": 95.0, "verdict": "Excellent", "veto_triggered": True} + ) + assert payload["veto_triggered"] is True + 
pass_conditions = suite.gate_policy.normalized_conditions()[0] + assert any( + condition.metric_name == "veto_triggered" + and condition.op == "==" + and condition.threshold is False + for condition in pass_conditions + ) + decision = suite.gate_policy.evaluate({"score": 95.0, "veto_triggered": True}) + assert decision.status == "fail" + assert any(condition["metric_name"] == "veto_triggered" for condition in decision.failed_conditions) + + +def test_run_evaluator_source_cli_builds_generated_trajectory_flow_with_default_agent( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "tasks.jsonl" + input_path.write_text('{"id":"case-1","input":"question"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + captured = {} + + class FakeHarness: + pass + + def fake_build_cli_agent_runtime_harness(*, agent_name): + captured["agent_name"] = agent_name + return FakeHarness() + + async def fake_run_evaluation_flow(flow): + captured["flow"] = flow + return { + "report_version": 1, + "suite_id": "trajectory-source-evaluator", + "judge_backend": {"backend_id": "trajectory-evaluator-agent-md"}, + "summary": {"trajectory-source-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr( + "aworld_cli.evaluator_runtime._build_cli_agent_runtime_harness", + fake_build_cli_agent_runtime_harness, + ) + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_source_cli( + input=str(input_path), + kind="trajectory", + judge_agent=str(judge_agent), + output=str(tmp_path / "report.json"), + ) + + flow = captured["flow"] + assert captured["agent_name"] == "Aworld" + assert flow.target["source_kind"] == "trajectory" + assert 
flow.target["agent"] == "Aworld" + assert flow.suite.cases[0].case_id == "case-1" + assert flow.suite.cases[0].input == {"input": "question"} + assert report["source_selection"]["kind"] == "trajectory" + assert report["source_selection"]["agent"] == "Aworld" + + +@pytest.mark.asyncio +async def test_cli_agent_runtime_harness_returns_rollout_state( + monkeypatch: pytest.MonkeyPatch, +) -> None: + class FakeExecutor: + async def chat(self, query): + return f"answer for {query}" + + async def fake_load_cli_agent_executor(agent_name): + assert agent_name == "Aworld" + return FakeExecutor() + + monkeypatch.setattr( + "aworld_cli.evaluator_runtime._load_cli_agent_executor", + fake_load_cli_agent_executor, + ) + + case = SimpleNamespace( + case_id="case-1", + input={"input": "question"}, + metadata={ + "source_record": { + "metadata": {"source_kind": "task", "source_path": "tasks.jsonl"}, + }, + }, + ) + state = await _CliAgentRuntimeHarness(agent_name="Aworld").run_rollout( + case=case, + target={"source_kind": "task"}, + ) + + assert state.status == "success" + assert state.answer == "answer for question" + assert state.outcome["has_answer"] is True + assert state.metadata["agent"] == "Aworld" + assert state.metadata["source_kind"] == "task" + assert state.standard_metrics["n_turns"] == 2 + + +@pytest.mark.asyncio +async def test_cli_agent_runtime_harness_prefers_swarm_task_response( + monkeypatch: pytest.MonkeyPatch, +) -> None: + class FakeSwarm: + pass + + class FakeExecutor: + swarm = FakeSwarm() + + async def chat(self, query): + raise AssertionError("chat fallback should not be used for local swarm executors") + + async def fake_load_cli_agent_executor(agent_name): + return FakeExecutor() + + async def fake_run(*, input, swarm): + assert input == "question" + assert isinstance(swarm, FakeSwarm) + return { + "answer": "answer with tools", + "trajectory": [{"tool_calls": [{"name": "search"}]}], + "usage": {"total_tokens": 12}, + } + + monkeypatch.setattr( + 
"aworld_cli.evaluator_runtime._load_cli_agent_executor", + fake_load_cli_agent_executor, + ) + monkeypatch.setattr("aworld_cli.evaluator_runtime.Runners.run", fake_run) + + case = SimpleNamespace(case_id="case-1", input={"input": "question"}, metadata={}) + state = await _CliAgentRuntimeHarness(agent_name="Aworld").run_rollout( + case=case, + target={"source_kind": "task"}, + ) + + assert state.answer == "answer with tools" + assert state.tool_calls == [{"name": "search"}] + assert state.trajectory == [{"tool_calls": [{"name": "search"}]}] + assert state.standard_metrics["n_tool_calls"] == 1 + assert state.standard_metrics["n_tokens"] == 12 + + +def test_source_prompt_uses_zero_to_hundred_score_contract() -> None: + prompt = _build_source_prompt( + {"input": "question"}, + {"answer": "existing"}, + suite=None, + ) + + payload = json.loads(prompt) + assert payload["required_output_schema"]["score"] == "number, weighted score from 0 to 100" + assert payload["required_output_schema"]["veto_triggered"] == "boolean, true only for one-vote veto failures" + + +def test_run_evaluator_source_cli_rejects_unsupported_source_kind(tmp_path: Path) -> None: + input_path = tmp_path / "tasks.jsonl" + input_path.write_text('{"id":"case-1","input":"question"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + + with pytest.raises(ValueError, match="unsupported source kind"): + run_evaluator_source_cli( + input=str(input_path), + kind="task-only", + judge_agent=str(judge_agent), + ) + + +def test_trajectory_source_gate_consumes_veto_signal(tmp_path: Path) -> None: + task_id = "task-with-veto" + trajectory = [ + { + "state": {"input": {"content": "question"}, "messages": []}, + "meta": {"step": 1}, + "action": {"content": "final", "is_agent_finished": "True"}, + } + ] + input_path = tmp_path / "trajectory.log" + input_path.write_text( + repr({"task_id": task_id, "is_sub_task": False, "trajectory": 
json.dumps(trajectory)}) + "\n", + encoding="utf-8", + ) + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + + suite = _build_source_suite( + kind="trajectory", + input_path=input_path, + judge_agent_path=judge_agent, + task_id=task_id, + id_field="id", + task_field="input", + answer_field="answer", + out_dir=str(tmp_path), + ) + + pass_conditions = suite.gate_policy.normalized_conditions()[0] + assert any( + condition.metric_name == "veto_triggered" + and condition.op == "==" + and condition.threshold is False + for condition in pass_conditions + ) + decision = suite.gate_policy.evaluate( + { + "score": 95.0, + "A1_groundedness": 5, + "has_evidence": 1.0, + "agent_finished": 1.0, + "veto_triggered": True, + } + ) + assert decision.status == "fail" + assert any(condition["metric_name"] == "veto_triggered" for condition in decision.failed_conditions) + + +def test_aworld_trajectory_log_without_task_id_builds_task_execution_suite( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "tasks.jsonl" + input_path.write_text('{"id":"case-1","input":"question"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + captured = {} + + class FakeHarness: + pass + + def fake_build_cli_agent_runtime_harness(*, agent_name): + captured["agent_name"] = agent_name + return FakeHarness() + + monkeypatch.setattr( + "aworld_cli.evaluator_runtime._build_cli_agent_runtime_harness", + fake_build_cli_agent_runtime_harness, + ) + + suite = _build_source_suite( + kind="trajectory", + input_path=input_path, + judge_agent_path=judge_agent, + task_id=None, + id_field="id", + task_field="input", + answer_field="answer", + out_dir=str(tmp_path), + ) + + assert captured["agent_name"] == "Aworld" + assert suite.suite_id == "trajectory-source-evaluator" + assert suite.cases[0].case_id == "case-1" + assert 
suite.cases[0].input == {"input": "question"} + assert suite.runtime_harness is not None + assert suite.judge_backend.backend_id == "trajectory-evaluator-agent-md" + pass_conditions = suite.gate_policy.normalized_conditions()[0] + assert any(condition.metric_name == "A1_groundedness" for condition in pass_conditions) + assert any(condition.metric_name == "veto_triggered" for condition in pass_conditions) + + +def test_trajectory_log_without_task_id_builds_replay_suite_for_all_tasks( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "trajectory.log" + trajectory = [ + { + "state": {"input": {"content": "question"}, "messages": []}, + "meta": {"step": 1}, + "action": {"content": "final", "is_agent_finished": "True"}, + } + ] + input_path.write_text( + "\n".join( + [ + repr({"task_id": "task-1", "is_sub_task": False, "trajectory": json.dumps(trajectory)}), + repr({"task_id": "task-2", "is_sub_task": False, "trajectory": json.dumps(trajectory)}), + ] + ) + + "\n", + encoding="utf-8", + ) + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + + def fake_build_cli_agent_runtime_harness(*, agent_name): + raise AssertionError("trajectory log replay must not execute the main agent") + + monkeypatch.setattr( + "aworld_cli.evaluator_runtime._build_cli_agent_runtime_harness", + fake_build_cli_agent_runtime_harness, + ) + + suite = _build_source_suite( + kind="trajectory", + input_path=input_path, + judge_agent_path=judge_agent, + task_id=None, + id_field="id", + task_field="input", + answer_field="answer", + out_dir=str(tmp_path), + ) + + assert suite.suite_id == "trajectory-source-evaluator" + assert [case.case_id for case in suite.cases] == ["task-1", "task-2"] + assert suite.runtime_harness is not None + + +def test_trajectory_prompt_can_use_generated_runtime_trajectory() -> None: + prompt = json.loads( + _build_trajectory_prompt( + {"input": "question", "_case_metadata": 
{}}, + { + "case_id": "case-1", + "answer": "final answer", + "trajectory": [ + { + "state": { + "input": {"content": "question"}, + "messages": [{"role": "tool", "content": "evidence"}], + }, + "meta": {"step": 1, "agent_id": "Aworld"}, + "action": { + "content": "final answer", + "is_agent_finished": "True", + "tool_calls": [{"function": {"name": "search", "arguments": "{}"}}], + }, + } + ], + }, + suite=None, + ) + ) + + extracted = prompt["extracted_trajectory"] + assert extracted["task_id"] == "case-1" + assert extracted["question"] == "question" + assert extracted["final_answer"] == "final answer" + assert extracted["evidence"][0]["content"] == "evidence" + assert extracted["steps"][0]["tool_calls"] == [{"name": "search", "arguments": "{}"}] + + +def test_run_evaluator_source_cli_passes_source_fields_to_hooks( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "answers.jsonl" + input_path.write_text('{"id":"case-1","input":"question","answer":"existing"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + events: list[tuple[str, dict]] = [] + + class CaptureHook: + def __init__(self, hook_point: str): + self.hook_point = hook_point + + async def run(self, *, event, state): + events.append((self.hook_point, dict(event))) + return {"metadata": {"hook_tag": "source-hook"}} + + async def fake_run_evaluation_flow(flow): + assert flow.target["hook_tag"] == "source-hook" + return { + "report_version": 1, + "suite_id": "answer-source-evaluator", + "summary": {"answer-source-evaluator": {"score": {"mean": 0.9}}}, + "metrics": {"score": {"mean": 0.9}}, + "results": [], + "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, + "approval": {"required": False, "resolved": False, "approved": None}, + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + } + + monkeypatch.setattr( + 
"aworld_cli.evaluator_runtime._load_evaluator_hooks", + lambda: { + "evaluator.pre_run": (CaptureHook("pre"),), + "evaluator.post_run": (CaptureHook("post"),), + }, + ) + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + run_evaluator_source_cli( + input=str(input_path), + kind="answer", + judge_agent=str(judge_agent), + task_id="case-1", + output=str(tmp_path / "report.json"), + ) + + assert events[0][0] == "pre" + assert events[0][1]["mode"] == "source" + assert events[0][1]["input"] == str(input_path.resolve()) + assert events[0][1]["kind"] == "answer" + assert events[0][1]["task_id"] == "case-1" + assert events[0][1]["judge_agent"] == str(judge_agent.resolve()) + assert events[1][0] == "post" + assert events[1][1]["mode"] == "source" + assert events[1][1]["report"]["source_selection"]["kind"] == "answer" + + +def test_run_evaluator_source_cli_persists_schema_valid_source_report( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "answers.jsonl" + input_path.write_text('{"id":"case-1","input":"question","answer":"existing"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + + async def fake_run_evaluation_flow(flow): + return { + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": "2026-06-10T00:00:00Z", + "suite_id": "answer-source-evaluator", + "target": flow.target, + "judge_backend": {"backend_id": "source-agent-md"}, + "summary": {"answer-source-evaluator": {"score": {"mean": 88.0}}}, + "metrics": {"score": {"mean": 88.0}}, + "results": [ + { + "case_id": "case-1", + "input": {"input": "question"}, + "metrics": {"score": {"value": 88.0, "status": "PASSED"}}, + "judge": {"score": 88.0, "verdict": "Pass"}, + "judge_backend": {"backend_id": "source-agent-md"}, + "state_summary": {"answer": "existing"}, + } + ], + 
"result_counts": {"cases_total": 1, "cases_with_metrics": 1, "cases_with_judge": 1}, + "gate": {"status": "pass", "metric_name": "score", "value": 88.0}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_source_cli( + input=str(input_path), + kind="answer", + judge_agent=str(judge_agent), + output=str(tmp_path / "report.json"), + ) + + validate_evaluator_report(report) + + +@pytest.mark.asyncio +async def test_framework_run_evaluation_flow_returns_report_object() -> None: + async def fake_judge(case_input, target): + return {"score": 0.9} + + flow = substrate_module.EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=substrate_module.EvalSuiteDef( + suite_id="app-evaluator", + cases=[substrate_module.EvalCaseDef(case_id="case-1", input={"query": "demo"})], + gate_policy=substrate_module.GatePolicyDef(metric_name="score", pass_threshold=0.0), + judge=fake_judge, + ), + ) + + report = await substrate_module.run_evaluation_flow(flow) + + assert isinstance(report, EvaluatorReport) + assert report["suite_id"] == "app-evaluator" + + +def test_run_evaluator_cli_writes_default_report_when_output_is_omitted( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + monkeypatch.chdir(tmp_path) + + async def fake_run_evaluation_flow(flow): + return { + "report_version": 1, + "suite_id": "app-evaluator", + "judge_backend": {"backend_id": "stub-agent"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = 
run_evaluator_cli(target=str(target)) + + report_path = Path(report["report_path"]) + persisted = json.loads(report_path.read_text(encoding="utf-8")) + + assert report_path.exists() + assert report_path.parent == tmp_path / ".aworld" / "evaluations" + assert persisted["suite_id"] == "app-evaluator" + + +def test_available_evaluator_suites_lists_builtin_suite() -> None: + suites = available_evaluator_suites() + + assert "app-evaluator" in suites + + +def test_cli_schema_helpers_delegate_to_framework_sources() -> None: + assert get_declared_evaluator_suite_schema() == get_declared_eval_suite_schema() + assert get_evaluator_report_schema()["title"] == "AWorld Evaluator Report" + + +def test_available_evaluator_suites_filters_by_target( + tmp_path: Path, +) -> None: + target = tmp_path / "artifact.png" + target.write_bytes( + base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+aA1EAAAAASUVORK5CYII=" + ) + ) + + suites = available_evaluator_suites(target=str(target)) + + assert suites == ["app-evaluator"] + + +def test_available_evaluator_suites_loads_declared_suites_from_workspace( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + manifest_dir = tmp_path / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + (manifest_dir / "strict-ui.json").write_text( + """ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file"] +} +""".strip(), + encoding="utf-8", + ) + + monkeypatch.chdir(tmp_path) + + suites = available_evaluator_suites(target=str(target)) + + assert "strict-ui" in suites + + +def test_available_evaluator_suites_uses_target_workspace_not_process_cwd( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + workspace = tmp_path / "project" + manifest_dir = workspace / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + target = workspace / "artifact.txt" + 
target.write_text("artifact", encoding="utf-8") + (manifest_dir / "strict-ui.json").write_text( + """ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file"] +} +""".strip(), + encoding="utf-8", + ) + + monkeypatch.chdir(tmp_path) + + suites = available_evaluator_suites(target=str(target)) + + assert "strict-ui" in suites + + +def test_run_evaluator_cli_marks_image_targets( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + target = tmp_path / "artifact.png" + target.write_bytes( + base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+aA1EAAAAASUVORK5CYII=" + ) + ) + + async def fake_run_evaluation_flow(flow): + assert flow.target["target_kind"] == "image" + return { + "report_version": 1, + "suite_id": "app-evaluator", + "judge_backend": {"backend_id": "stub-agent"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_cli(target=str(target)) + + assert report["suite_id"] == "app-evaluator" + + +def test_run_evaluator_cli_records_suite_selection_metadata( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + + async def fake_run_evaluation_flow(flow): + return { + "report_version": 1, + "suite_id": "app-evaluator", + "judge_backend": {"backend_id": "stub-agent"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = 
run_evaluator_cli(target=str(target)) + + assert report["suite_selection"]["mode"] == "auto" + assert report["suite_selection"]["resolved"] == "app-evaluator" + + +def test_run_evaluator_cli_adds_automation_metadata( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + + async def fake_run_evaluation_flow(flow): + return { + "report_version": 1, + "suite_id": "app-evaluator", + "judge_backend": {"backend_id": "stub-agent"}, + "summary": {"app-evaluator": {"score": {"mean": 0.7}}}, + "metrics": {"score": {"mean": 0.7}}, + "result_counts": {"cases_total": 2, "cases_with_metrics": 2, "cases_with_judge": 2}, + "results": [{}, {}], + "gate": {"status": "needs_approval", "metric_name": "score", "value": 0.7}, + "approval": {"required": True, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_cli(target=str(target)) + + assert report["automation"]["gate_status"] == "needs_approval" + assert report["automation"]["case_count"] == 2 + assert report["automation"]["judge_backend"] == "stub-agent" + assert report["automation"]["suggested_exit_code"] == 3 + + +def test_run_evaluator_cli_rejects_missing_target( + tmp_path: Path, +) -> None: + missing = tmp_path / "missing.txt" + + with pytest.raises(FileNotFoundError, match="does not exist"): + run_evaluator_cli(target=str(missing)) + + +def test_evaluator_exit_code_matches_gate_and_approval() -> None: + assert evaluator_exit_code({"gate": {"status": "pass"}, "approval": {}}) == 0 + assert evaluator_exit_code({"gate": {"status": "fail"}, "approval": {}}) == 2 + assert evaluator_exit_code( + {"gate": {"status": "needs_approval"}, "approval": {"approved": False}} + ) == 3 + + +def test_get_evaluator_report_schema_describes_report_contract() -> None: + schema = get_evaluator_report_schema() + + assert 
schema["$schema"] == "https://json-schema.org/draft/2020-12/schema" + assert schema["title"] == "AWorld Evaluator Report" + assert "report_format" in schema["required"] + assert schema["properties"]["report_format"]["properties"]["id"]["const"] == "aworld.evaluator.report" + assert schema["properties"]["report_format"]["properties"]["version"]["const"] == 1 + assert schema["properties"]["metrics"]["additionalProperties"]["$ref"] == "#/$defs/metricAggregate" + assert ( + schema["properties"]["results"]["items"]["properties"]["metrics"]["additionalProperties"]["$ref"] + == "#/$defs/caseMetric" + ) + assert schema["properties"]["gate"]["$ref"] == "#/$defs/gateDecision" + assert schema["properties"]["automation"]["$ref"] == "#/$defs/automationSummary" + assert schema["$defs"]["gateDecision"]["properties"]["status"]["enum"] == ["pass", "fail", "needs_approval"] + assert schema["$defs"]["automationSummary"]["properties"]["suggested_exit_code"]["enum"] == [0, 2, 3] + assert schema["$defs"]["automationSummary"]["required"] == [ + "gate_status", + "metric_name", + "metric_value", + "approval_required", + "approval_resolved", + "approved", + "suggested_exit_code", + "case_count", + "judge_backend", + ] + + +def test_validate_evaluator_report_accepts_valid_report() -> None: + report = { + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": "2026-06-02T04:00:00Z", + "suite_id": "app-evaluator", + "target": {"target_path": "/tmp/artifact.txt", "target_kind": "file"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "metrics": {"score": {"mean": 0.9, "min": 0.9, "max": 0.9, "std": 0.0, "eval_status": "PASSED"}}, + "results": [ + { + "case_id": "artifact.txt", + "input": {"target_path": "/tmp/artifact.txt"}, + "metrics": {"score": {"value": 0.9, "status": "PASSED"}}, + "judge": {"score": 0.9}, + "judge_backend": {"backend_id": "stub-agent"}, + } + ], + "result_counts": {"cases_total": 1, "cases_with_metrics": 1, 
"cases_with_judge": 1}, + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + "approval": {"required": False, "resolved": False, "approved": None}, + "automation": { + "gate_status": "pass", + "metric_name": "score", + "metric_value": 0.9, + "approval_required": False, + "approval_resolved": False, + "approved": None, + "suggested_exit_code": 0, + "case_count": 1, + "judge_backend": "stub-agent", + }, + } + + validate_evaluator_report(report) + + +def test_validate_and_render_categorical_gate_report() -> None: + report = { + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": "2026-06-02T04:00:00Z", + "suite_id": "categorical-suite", + "target": {"target_path": "/tmp/artifact.txt", "target_kind": "file"}, + "summary": {"categorical-suite": {"verdict": {"value": "approved"}}}, + "metrics": {"verdict": {"value": "approved", "eval_status": "PASSED"}}, + "results": [ + { + "case_id": "artifact.txt", + "input": {"target_path": "/tmp/artifact.txt"}, + "metrics": {"verdict": {"value": "approved", "status": "PASSED"}}, + "judge": {"score": 1.0, "verdict": "approved"}, + } + ], + "result_counts": {"cases_total": 1, "cases_with_metrics": 1, "cases_with_judge": 1}, + "gate": {"status": "pass", "metric_name": "verdict", "value": "approved"}, + "approval": {"required": False, "resolved": False, "approved": None}, + "automation": { + "gate_status": "pass", + "metric_name": "verdict", + "metric_value": "approved", + "approval_required": False, + "approval_resolved": False, + "approved": None, + "suggested_exit_code": 0, + "case_count": 1, + "judge_backend": None, + }, + } + + validate_evaluator_report(report) + + assert "approved" in render_evaluator_summary(report) + + +def test_validate_evaluator_report_rejects_invalid_gate_status() -> None: + report = { + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": "2026-06-02T04:00:00Z", + "suite_id": 
"app-evaluator", + "target": {"target_path": "/tmp/artifact.txt", "target_kind": "file"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "metrics": {"score": {"mean": 0.9}}, + "results": [], + "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, + "gate": {"status": "maybe", "metric_name": "score", "value": 0.9}, + "approval": {"required": False, "resolved": False, "approved": None}, + "automation": { + "gate_status": "maybe", + "metric_name": "score", + "metric_value": 0.9, + "approval_required": False, + "approval_resolved": False, + "approved": None, + "suggested_exit_code": 0, + "case_count": 0, + "judge_backend": None, + }, + } + + with pytest.raises(ValueError, match="status"): + validate_evaluator_report(report) + + +def test_get_declared_evaluator_suite_schema_describes_manifest_contract() -> None: + schema = get_declared_evaluator_suite_schema() + + assert schema["$schema"] == "https://json-schema.org/draft/2020-12/schema" + assert schema["title"] == "AWorld Declared Evaluator Suite" + assert schema["properties"]["base_suite"]["const"] == "app-evaluator" + assert "suite_id" in schema["required"] + assert "target_kinds" in schema["properties"] diff --git a/tests/core/test_evaluator_top_level_command.py b/tests/core/test_evaluator_top_level_command.py new file mode 100644 index 000000000..41af8bbe9 --- /dev/null +++ b/tests/core/test_evaluator_top_level_command.py @@ -0,0 +1,555 @@ +from __future__ import annotations + +import sys +from pathlib import Path +from types import SimpleNamespace + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) + +import aworld.evaluations.substrate as substrate_module +from aworld_cli import main as main_module +from aworld_cli.core.top_level_command_system import TopLevelCommandContext +from aworld_cli.top_level_commands.evaluator_cmd import EvaluatorTopLevelCommand + + +@pytest.fixture(autouse=True) +def 
_reset_eval_registry_state(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + monkeypatch.setattr(substrate_module, "_LOADED_EVAL_MANIFEST_PATHS", set()) + monkeypatch.setattr(substrate_module, "_DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE", {}) + substrate_module.register_eval_suite( + "app-evaluator", + lambda target: substrate_module.get_builtin_eval_suite("app-evaluator"), + matcher=lambda target: target.get("target_kind") in {"file", "directory", "image"}, + priority=10, + ) + + +def test_registry_registers_builtin_evaluator_command() -> None: + registry = main_module._build_top_level_command_registry() + + command = registry.get("evaluator") + + assert command is not None + assert command.name == "evaluator" + + +def test_maybe_dispatch_top_level_command_runs_evaluator_command( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + + def fake_run_evaluator_cli(**kwargs): + assert kwargs["target"] == str(target) + return { + "suite_id": "app-evaluator", + "gate": {"status": "pass"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + } + + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_cli", + fake_run_evaluator_cli, + ) + + handled = main_module._maybe_dispatch_top_level_command( + ["aworld-cli", "evaluator", "--target", str(target)] + ) + output = capsys.readouterr().out + + assert handled is True + assert "app-evaluator" in output + assert "pass" in output + + +def test_maybe_dispatch_top_level_command_runs_source_evaluator_command( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + input_path = tmp_path / "answers.jsonl" + input_path.write_text('{"id":"case-1","input":"question","answer":"answer"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" 
+ judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + calls = {} + + def fake_run_evaluator_source_cli(**kwargs): + calls.update(kwargs) + return { + "suite_id": "answer-source-evaluator", + "gate": {"status": "pass"}, + "summary": {"answer-source-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli", + fake_run_evaluator_source_cli, + ) + + handled = main_module._maybe_dispatch_top_level_command( + [ + "aworld-cli", + "evaluator", + "--input", + str(input_path), + "--kind", + "answer", + "--judge-agent", + str(judge_agent), + "--out-dir", + str(tmp_path / "reports"), + ] + ) + output = capsys.readouterr().out + + assert handled is True + assert calls["input"] == str(input_path) + assert calls["kind"] == "answer" + assert calls["judge_agent"] == str(judge_agent) + assert calls["out_dir"] == str(tmp_path / "reports") + assert calls["id_field"] == "id" + assert calls["task_field"] == "input" + assert calls["answer_field"] == "answer" + assert "answer-source-evaluator" in output + assert "pass" in output + + +def test_maybe_dispatch_top_level_command_runs_task_source_with_default_agent( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + input_path = tmp_path / "tasks.jsonl" + input_path.write_text('{"id":"case-1","input":"question"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + calls = {} + + def fake_run_evaluator_source_cli(**kwargs): + calls.update(kwargs) + return { + "suite_id": "task-source-evaluator", + "gate": {"status": "pass"}, + "summary": {"task-source-evaluator": {"score": {"mean": 0.9}}}, + "results": [{"state_summary": {"answer": "answer"}}], + "approval": {"required": False, "resolved": False, "approved": 
None}, + } + + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli", + fake_run_evaluator_source_cli, + ) + + handled = main_module._maybe_dispatch_top_level_command( + [ + "aworld-cli", + "evaluator", + "--input", + str(input_path), + "--kind", + "task", + "--judge-agent", + str(judge_agent), + "--out-dir", + str(tmp_path / "reports"), + ] + ) + output = capsys.readouterr().out + + assert handled is True + assert calls["kind"] == "task" + assert calls["agent"] is None + assert "task-source-evaluator" in output + assert "pass" in output + + +def test_evaluator_command_returns_nonzero_for_failed_gate( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_cli", + lambda **kwargs: { + "suite_id": "app-evaluator", + "gate": {"status": "fail", "value": 0.3}, + "approval": {"required": False, "resolved": False, "approved": None}, + }, + ) + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target="artifact.txt", + suite=None, + output=None, + interactive_approval=False, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + assert exit_code == 2 + + +def test_evaluator_source_run_rejects_target_mode_arguments( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli", + lambda **kwargs: pytest.fail("source runtime should not be called"), + ) + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target="artifact.txt", + input="answers.jsonl", + kind="answer", + judge_agent="agent.md", + out_dir=None, + output=None, + task_id=None, + agent=None, + id_field="id", + task_field="input", + answer_field="answer", + interactive_approval=False, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + assert exit_code == 1 + assert "--target cannot be used with --input" in output + + 
+@pytest.mark.parametrize( + ("arg_name", "expected"), + [ + ("suite", "--suite cannot be used with --input"), + ("list_suites", "--list-suites cannot be used with --input"), + ("print_report_schema", "--print-report-schema cannot be used with --input"), + ("validate_report", "--validate-report cannot be used with --input"), + ], +) +def test_evaluator_source_run_rejects_other_target_mode_arguments( + arg_name: str, + expected: str, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli", + lambda **kwargs: pytest.fail("source runtime should not be called"), + ) + args = { + "target": None, + "suite": None, + "input": "answers.jsonl", + "kind": "answer", + "judge_agent": "agent.md", + "out_dir": None, + "output": None, + "task_id": None, + "agent": None, + "id_field": "id", + "task_field": "input", + "answer_field": "answer", + "interactive_approval": False, + "list_suites": False, + "print_report_schema": False, + "validate_report": None, + } + args[arg_name] = "value" if arg_name in {"suite", "validate_report"} else True + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace(**args), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + assert exit_code == 1 + assert expected in output + + +def test_evaluator_source_mode_requires_kind_and_judge_agent( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli", + lambda **kwargs: pytest.fail("source runtime should not be called"), + ) + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=None, + suite=None, + input="answers.jsonl", + kind=None, + judge_agent=None, + out_dir=None, + output=None, + task_id=None, + agent=None, + id_field="id", + task_field="input", + answer_field="answer", + 
interactive_approval=False, + list_suites=False, + print_report_schema=False, + validate_report=None, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + assert exit_code == 1 + assert "--kind is required with --input" in output + + +def test_evaluator_command_returns_nonzero_for_unresolved_approval( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_cli", + lambda **kwargs: { + "suite_id": "app-evaluator", + "gate": {"status": "needs_approval", "value": 0.7}, + "approval": {"required": True, "resolved": False, "approved": None}, + }, + ) + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target="artifact.txt", + suite=None, + output=None, + interactive_approval=False, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + assert exit_code == 3 + + +def test_evaluator_command_lists_available_suites( + capsys: pytest.CaptureFixture[str], +) -> None: + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=None, + suite=None, + output=None, + interactive_approval=False, + list_suites=True, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + + assert exit_code == 0 + assert "app-evaluator" in output + + +def test_evaluator_command_lists_target_matching_suites_and_default( + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=str(target), + suite=None, + output=None, + interactive_approval=False, + list_suites=True, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + + assert exit_code == 0 + assert "Available evaluator suites for target:" in output + assert "Default suite: app-evaluator" in output + + +def test_evaluator_command_prints_report_schema( + capsys: 
pytest.CaptureFixture[str], +) -> None: + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=None, + suite=None, + output=None, + interactive_approval=False, + list_suites=False, + print_report_schema=True, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + + assert exit_code == 0 + assert "\"title\": \"AWorld Evaluator Report\"" in output + assert "\"aworld.evaluator.report\"" in output + + +def test_evaluator_command_validates_report_file( + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + report_path = tmp_path / "report.json" + report_path.write_text( + """ +{ + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": "2026-06-02T04:00:00Z", + "suite_id": "app-evaluator", + "target": {"target_path": "/tmp/artifact.txt", "target_kind": "file"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "metrics": {"score": {"mean": 0.9}}, + "results": [], + "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, + "approval": {"required": false, "resolved": false, "approved": null}, + "automation": { + "gate_status": null, + "metric_name": null, + "metric_value": null, + "approval_required": false, + "approval_resolved": false, + "approved": null, + "suggested_exit_code": 0, + "case_count": 0, + "judge_backend": null + } +} +""".strip(), + encoding="utf-8", + ) + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=None, + suite=None, + output=None, + interactive_approval=False, + list_suites=False, + print_report_schema=False, + validate_report=str(report_path), + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + + assert exit_code == 0 + assert "Report is valid" in output + + +def test_evaluator_command_returns_nonzero_for_invalid_report( + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + report_path = tmp_path / "report.json" + 
report_path.write_text( + """ +{ + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": "2026-06-02T04:00:00Z", + "suite_id": "app-evaluator", + "target": {"target_path": "/tmp/artifact.txt", "target_kind": "file"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "metrics": {"score": {"mean": 0.9}}, + "results": [], + "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, + "gate": {"status": "maybe", "metric_name": "score", "value": 0.9}, + "approval": {"required": false, "resolved": false, "approved": null}, + "automation": { + "gate_status": "maybe", + "metric_name": "score", + "metric_value": 0.9, + "approval_required": false, + "approval_resolved": false, + "approved": null, + "suggested_exit_code": 0, + "case_count": 0, + "judge_backend": null + } +} +""".strip(), + encoding="utf-8", + ) + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=None, + suite=None, + output=None, + interactive_approval=False, + list_suites=False, + print_report_schema=False, + validate_report=str(report_path), + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + + assert exit_code == 4 + assert "Report is invalid" in output + + +def test_evaluator_command_returns_usage_error_without_target( + capsys: pytest.CaptureFixture[str], +) -> None: + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=None, + suite=None, + output=None, + interactive_approval=False, + list_suites=False, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + + assert exit_code == 1 + assert "--target is required" in output + + +def test_evaluator_command_returns_nonzero_for_missing_target( + capsys: pytest.CaptureFixture[str], + tmp_path: Path, +) -> None: + missing = tmp_path / "missing.txt" + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=str(missing), + suite=None, + output=None, 
+ interactive_approval=False, + list_suites=False, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + + assert exit_code == 1 + assert "does not exist" in output diff --git a/tests/docs/test_evaluator_report_docs.py b/tests/docs/test_evaluator_report_docs.py new file mode 100644 index 000000000..801aab593 --- /dev/null +++ b/tests/docs/test_evaluator_report_docs.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from pathlib import Path + + +def test_evaluator_report_command_doc_covers_schema_and_validation() -> None: + doc_path = Path("docs/AWorld CLI/Commands/Evaluator.md") + overview_path = Path("docs/AWorld CLI/Commands/Overview.md") + + content = doc_path.read_text(encoding="utf-8") + overview = overview_path.read_text(encoding="utf-8") + + assert "aworld-cli evaluator" in content + assert "--print-report-schema" in content + assert "--validate-report" in content + assert "aworld-cli evaluator --input" in content + assert "--kind task" in content + assert "--kind answer" in content + assert "--kind trajectory" in content + assert "report_format" in content + assert "automation" in content + assert ".aworld/evaluators/*.json" in content + assert "declared_evaluator_suite.example.json" in content + assert "get_declared_evaluator_suite_schema()" in content + assert "Evaluator" in overview + + +def test_evaluator_report_example_includes_stable_contract_fields() -> None: + example_path = Path("examples/aworld_quick_start/cli/evaluator_report.example.json") + manifest_example_path = Path("examples/aworld_quick_start/cli/declared_evaluator_suite.example.json") + recipe_path = Path("docs/AWorld CLI/Recipes/Mini App Build.md") + readme_path = Path("examples/aworld_quick_start/cli/README.md") + + content = example_path.read_text(encoding="utf-8") + manifest_example = manifest_example_path.read_text(encoding="utf-8") + recipe = recipe_path.read_text(encoding="utf-8") + readme = readme_path.read_text(encoding="utf-8") + + assert 
'"report_format"' in content + assert '"generated_at"' in content + assert '"metrics"' in content + assert '"automation"' in content + assert '"suite_id"' in manifest_example + assert '"base_suite": "app-evaluator"' in manifest_example + assert '"target_kinds"' in manifest_example + assert '"gate_policy"' in manifest_example + assert ".aworld/evaluators/*.json" in readme + assert "declared_evaluator_suite.example.json" in readme + assert "--validate-report" in recipe + assert "--print-report-schema" in recipe diff --git a/tests/evaluations/conftest.py b/tests/evaluations/conftest.py new file mode 100644 index 000000000..baa2605bf --- /dev/null +++ b/tests/evaluations/conftest.py @@ -0,0 +1,46 @@ +from __future__ import annotations + + +def pytest_addoption(parser): + group = parser.getgroup("trajectory evaluator") + group.addoption( + "--task-id", + "--task_id", + action="store", + dest="trajectory_task_id", + default=None, + help="Task id to replay from the trajectory log.", + ) + group.addoption( + "--trajectory-log", + "--trajectory_log", + action="store", + dest="trajectory_log", + default=None, + help="Path to the trajectory log used by the manual replay test.", + ) + group.addoption( + "--agent-prompt", + "--agent_prompt", + action="store", + dest="trajectory_agent_prompt", + default=None, + help="Path to the trajectory evaluator agent.md prompt.", + ) + group.addoption( + "--out-dir", + "--out_dir", + action="store", + dest="trajectory_out_dir", + default=None, + help="Directory for extracted trajectory and evaluator report outputs.", + ) + group.addoption( + "--judge-timeout", + "--judge_timeout", + action="store", + dest="trajectory_judge_timeout", + default=None, + type=float, + help="Timeout in seconds for the trajectory evaluator judge agent.", + ) diff --git a/tests/evaluations/test_environment_isolation.py b/tests/evaluations/test_environment_isolation.py new file mode 100644 index 000000000..51f052ff1 --- /dev/null +++ 
b/tests/evaluations/test_environment_isolation.py @@ -0,0 +1,281 @@ +from __future__ import annotations + +import pytest + +from aworld.evaluations.runtime_composition import ( + EnvironmentIsolatedRuntimeHarness, + EnvironmentSnapshot, + RetryRuntimeHarness, + RolloutState, +) +from aworld.evaluations.substrate import ( + EvalCaseDef, + EvalSuiteDef, + EvaluationFlowDef, + TrialPolicyDef, + run_evaluation_flow, +) + + +def test_environment_snapshot_excludes_live_handles(): + snapshot = EnvironmentSnapshot( + environment_id="env-1", + trial_id="case-1::trial-1", + metadata={"workspace": "/tmp/demo", "client": object()}, + ) + + assert snapshot.to_dict() == { + "environment_id": "env-1", + "trial_id": "case-1::trial-1", + "metadata": {"workspace": "/tmp/demo"}, + } + + +@pytest.mark.asyncio +async def test_environment_isolated_harness_resets_and_cleans_up(): + events = [] + + class RecordingFixture: + async def reset(self, *, case, target): + events.append(("reset", case.case_id)) + return EnvironmentSnapshot( + environment_id="env-1", + trial_id=case.input["_trial"]["trial_id"], + metadata={"workspace": "/tmp/demo", "client": object()}, + ) + + async def cleanup(self, *, snapshot, case, target, state): + events.append(("cleanup", snapshot.environment_id, state.status)) + return EnvironmentSnapshot( + environment_id=snapshot.environment_id, + trial_id=snapshot.trial_id, + metadata={"cleaned": True}, + ) + + class InspectingHarness: + async def run_rollout(self, *, case, target): + assert case.input["_environment"]["environment_id"] == "env-1" + assert case.metadata["_environment"]["trial_id"] == "case-1::trial-1" + assert target["_environment"]["metadata"]["workspace"] == "/tmp/demo" + return RolloutState(case_id=case.case_id, status="success", answer="ok") + + harness = EnvironmentIsolatedRuntimeHarness( + base_harness=InspectingHarness(), + fixture=RecordingFixture(), + ) + case = EvalCaseDef( + case_id="case-1::trial-1", + input={"_trial": {"trial_id": 
"case-1::trial-1"}}, + ) + + state = await harness.run_rollout(case=case, target={}) + + assert events == [("reset", "case-1::trial-1"), ("cleanup", "env-1", "success")] + assert state.metadata["environment"]["environment_id"] == "env-1" + assert state.metadata["environment_cleanup"]["metadata"]["cleaned"] is True + + +@pytest.mark.asyncio +async def test_environment_isolation_resets_once_per_trial(): + class CountingFixture: + def __init__(self): + self.resets = [] + self.cleanups = [] + + async def reset(self, *, case, target): + trial_id = case.input["_trial"]["trial_id"] + self.resets.append(trial_id) + return EnvironmentSnapshot( + environment_id=f"env-{len(self.resets)}", + trial_id=trial_id, + metadata={"trial_id": trial_id}, + ) + + async def cleanup(self, *, snapshot, case, target, state): + self.cleanups.append(snapshot.trial_id) + return EnvironmentSnapshot( + environment_id=snapshot.environment_id, + trial_id=snapshot.trial_id, + metadata={"cleaned": True}, + ) + + class EnvironmentAwareHarness: + async def run_rollout(self, *, case, target): + environment_id = case.input["_environment"]["environment_id"] + return RolloutState( + case_id=case.case_id, + status="success", + answer=environment_id, + ) + + async def fake_judge(case_input, target): + return {"score": 1.0} + + fixture = CountingFixture() + suite = EvalSuiteDef( + suite_id="environment-trial-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + runtime_harness=EnvironmentIsolatedRuntimeHarness( + base_harness=EnvironmentAwareHarness(), + fixture=fixture, + ), + judge=fake_judge, + trial_policy=TrialPolicyDef(num_trials=2), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite) + ) + + assert fixture.resets == ["case-1::trial-1", "case-1::trial-2"] + assert fixture.cleanups == ["case-1::trial-1", "case-1::trial-2"] + assert report["results"][0]["metadata"]["environment"]["environment_id"] 
== "env-1" + assert report["results"][1]["metadata"]["environment"]["environment_id"] == "env-2" + + +@pytest.mark.asyncio +async def test_retry_inside_environment_isolation_does_not_increase_reset_count(): + class CountingFixture: + def __init__(self): + self.reset_count = 0 + + async def reset(self, *, case, target): + self.reset_count += 1 + return EnvironmentSnapshot(environment_id=f"env-{self.reset_count}") + + async def cleanup(self, *, snapshot, case, target, state): + return None + + class FlakyHarness: + def __init__(self): + self.calls = 0 + + async def run_rollout(self, *, case, target): + self.calls += 1 + if self.calls % 2 == 1: + return RolloutState(case_id=case.case_id, status="failed", answer="failed-attempt") + return RolloutState(case_id=case.case_id, status="success", answer="passed-trial") + + async def fake_judge(case_input, target): + return {"score": 1.0 if target.get("answer") == "passed-trial" else 0.0} + + fixture = CountingFixture() + suite = EvalSuiteDef( + suite_id="environment-retry-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + runtime_harness=EnvironmentIsolatedRuntimeHarness( + base_harness=RetryRuntimeHarness(base_harness=FlakyHarness(), max_attempts=2), + fixture=fixture, + ), + judge=fake_judge, + trial_policy=TrialPolicyDef(num_trials=2), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite) + ) + + assert fixture.reset_count == 2 + assert len(report["results"][0]["artifacts"]["attempts"]) == 2 + assert len(report["results"][1]["artifacts"]["attempts"]) == 2 + + +@pytest.mark.asyncio +async def test_environment_cleanup_runs_when_rollout_raises(): + events = [] + + class RecordingFixture: + async def reset(self, *, case, target): + events.append("reset") + return EnvironmentSnapshot(environment_id="env-1") + + async def cleanup(self, *, snapshot, case, target, state): + events.append(("cleanup", state.status)) + 
return None + + class RaisingHarness: + async def run_rollout(self, *, case, target): + raise RuntimeError("rollout boom") + + harness = EnvironmentIsolatedRuntimeHarness( + base_harness=RaisingHarness(), + fixture=RecordingFixture(), + ) + + with pytest.raises(RuntimeError, match="rollout boom"): + await harness.run_rollout(case=EvalCaseDef(case_id="case-1", input={}), target={}) + + assert events == ["reset", ("cleanup", "failed")] + + +@pytest.mark.asyncio +async def test_reset_failure_prevents_rollout_execution(): + class FailingResetFixture: + async def reset(self, *, case, target): + raise RuntimeError("reset boom") + + async def cleanup(self, *, snapshot, case, target, state): + raise AssertionError("cleanup should not run when reset fails") + + class UnexpectedHarness: + async def run_rollout(self, *, case, target): + raise AssertionError("rollout should not run when reset fails") + + harness = EnvironmentIsolatedRuntimeHarness( + base_harness=UnexpectedHarness(), + fixture=FailingResetFixture(), + ) + + with pytest.raises(RuntimeError, match="reset boom"): + await harness.run_rollout(case=EvalCaseDef(case_id="case-1", input={}), target={}) + + +@pytest.mark.asyncio +async def test_cleanup_failure_during_rollout_error_preserves_rollout_error(): + class FailingCleanupFixture: + async def reset(self, *, case, target): + return EnvironmentSnapshot(environment_id="env-1") + + async def cleanup(self, *, snapshot, case, target, state): + raise RuntimeError("cleanup boom") + + class RaisingHarness: + async def run_rollout(self, *, case, target): + raise RuntimeError("rollout boom") + + harness = EnvironmentIsolatedRuntimeHarness( + base_harness=RaisingHarness(), + fixture=FailingCleanupFixture(), + ) + + with pytest.raises(RuntimeError, match="rollout boom"): + await harness.run_rollout(case=EvalCaseDef(case_id="case-1", input={}), target={}) + + +@pytest.mark.asyncio +async def test_cleanup_failure_after_success_marks_rollout_failed(): + class 
FailingCleanupFixture: + async def reset(self, *, case, target): + return EnvironmentSnapshot(environment_id="env-1") + + async def cleanup(self, *, snapshot, case, target, state): + raise RuntimeError("cleanup boom") + + class PassingHarness: + async def run_rollout(self, *, case, target): + return RolloutState(case_id=case.case_id, status="success", answer="ok") + + harness = EnvironmentIsolatedRuntimeHarness( + base_harness=PassingHarness(), + fixture=FailingCleanupFixture(), + ) + + state = await harness.run_rollout(case=EvalCaseDef(case_id="case-1", input={}), target={}) + + assert state.status == "failed" + assert state.error == { + "type": "RuntimeError", + "message": "cleanup boom", + "phase": "environment_cleanup", + } + assert state.metadata["environment_cleanup_error"]["message"] == "cleanup boom" diff --git a/tests/evaluations/test_evaluation_input_sources.py b/tests/evaluations/test_evaluation_input_sources.py new file mode 100644 index 000000000..c2cb97690 --- /dev/null +++ b/tests/evaluations/test_evaluation_input_sources.py @@ -0,0 +1,258 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Literal, Mapping + +import pytest +from pydantic import BaseModel + +from aworld.evaluations.sources import ( + AWorldTrajectoryLogSource, + JsonlTaskSource, + JsonlTaskAnswerSource, + create_source_eval_suite, +) +from aworld.evaluations.state_adapters import ( + AnswerStateAdapter, + TrajectoryLogStateAdapter, +) +from aworld.evaluations.substrate import ( + CallableJudgeBackend, + EvalSuiteDef, + EvaluationFlowDef, + GatePolicyDef, + JudgeSchemaDef, + run_evaluation_flow, +) +from aworld.evaluations.trajectory_judge import TrajectoryJudgeSchema + + +class _ScoreJudgeOutput(BaseModel): + score: float + verdict: Literal["pass", "fail"] + + +def test_jsonl_task_answer_source_defaults_fields_and_default_adapter(tmp_path: Path) -> None: + path = tmp_path / "answers.jsonl" + path.write_text( + json.dumps({"id": 
"case-1", "input": "What is 2+2?", "answer": "4"}) + "\n", + encoding="utf-8", + ) + + source = JsonlTaskAnswerSource(path=path) + records = list(source.iter_records()) + cases = source.to_cases() + + assert records[0].case_id == "case-1" + assert records[0].input == {"input": "What is 2+2?"} + assert records[0].answer == "4" + assert isinstance(source.default_adapter(), AnswerStateAdapter) + assert cases[0].case_id == "case-1" + assert cases[0].input == {"input": "What is 2+2?"} + assert cases[0].metadata["source_record"]["answer"] == "4" + + +def test_jsonl_task_source_defaults_fields_without_answer(tmp_path: Path) -> None: + path = tmp_path / "tasks.jsonl" + path.write_text( + json.dumps({"id": "case-1", "input": "What is 2+2?"}) + "\n", + encoding="utf-8", + ) + + source = JsonlTaskSource(path=path) + records = list(source.iter_records()) + cases = source.to_cases() + + assert records[0].case_id == "case-1" + assert records[0].input == {"input": "What is 2+2?"} + assert records[0].answer is None + assert records[0].metadata["source_kind"] == "task" + assert cases[0].case_id == "case-1" + assert cases[0].input == {"input": "What is 2+2?"} + assert "answer" not in cases[0].metadata["source_record"] + + +@pytest.mark.asyncio +async def test_source_eval_suite_replays_task_answer_without_execution(tmp_path: Path) -> None: + path = tmp_path / "answers.jsonl" + path.write_text( + "\n".join( + [ + json.dumps({"id": "case-1", "input": "question", "answer": "existing answer"}), + ] + ), + encoding="utf-8", + ) + captured: dict[str, Any] = {} + + async def judge(case_input: dict[str, Any], target: dict[str, Any]) -> dict[str, Any]: + captured["answer"] = target["answer"] + captured["status"] = target["status"] + return {"score": 1.0, "verdict": "pass"} + + suite = create_source_eval_suite( + suite_id="answer-source", + source=JsonlTaskAnswerSource(path=path), + judge_backend=CallableJudgeBackend(backend_id="judge", judge=judge), + 
judge_schema=JudgeSchemaDef(output_model=_ScoreJudgeOutput), + gate_policy=GatePolicyDef(metric_name="score", pass_threshold=1.0), + ) + + assert isinstance(suite, EvalSuiteDef) + assert suite.runtime_harness is not None + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "source", "target_path": str(path)}, suite=suite) + ) + + assert captured == {"answer": "existing answer", "status": "success"} + assert report["gate"]["status"] == "pass" + assert report["results"][0]["state_summary"]["answer"] == "existing answer" + + +@pytest.mark.asyncio +async def test_trajectory_log_source_replays_rollout_state_with_standard_metrics(tmp_path: Path) -> None: + task_id = "task-1" + trajectory = [ + { + "state": { + "input": {"content": "question"}, + "messages": [{"role": "system", "content": "system prompt"}], + }, + "meta": {"step": 1, "pre_agent": "user", "agent_id": "agent"}, + "action": { + "tool_calls": [ + {"function": {"name": "search", "arguments": "{}"}}, + ], + "is_agent_finished": "False", + }, + }, + { + "state": { + "messages": [ + {"role": "tool", "content": "search result"}, + {"role": "assistant", "content": "final"}, + ], + }, + "meta": {"step": 2, "pre_agent": "agent", "agent_id": "agent"}, + "action": {"content": "final answer", "is_agent_finished": "True"}, + }, + ] + log_path = tmp_path / "trajectory.log" + log_path.write_text( + repr({"task_id": task_id, "is_sub_task": False, "trajectory": json.dumps(trajectory)}) + "\n", + encoding="utf-8", + ) + + source = AWorldTrajectoryLogSource(path=log_path, task_ids=[task_id], extraction_dir=tmp_path) + suite = create_source_eval_suite( + suite_id="trajectory-source", + source=source, + judge_backend=CallableJudgeBackend( + backend_id="judge", + judge=lambda case_input, target: {"score": 1.0, "verdict": "pass"}, + ), + judge_schema=JudgeSchemaDef(output_model=_ScoreJudgeOutput), + gate_policy=GatePolicyDef(metric_name="score", pass_threshold=1.0), + ) + + assert 
isinstance(source.default_adapter(), TrajectoryLogStateAdapter) + assert "raw_payload" not in source.to_cases()[0].metadata["source_record"] + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "source", "target_path": str(log_path)}, suite=suite) + ) + result = report["results"][0] + + assert result["state_summary"]["answer"] == "final answer" + assert result["state_summary"]["tool_call_count"] == 1 + assert result["metadata"]["extracted_path"].endswith(f"extracted_{task_id}.json") + assert result["artifacts"]["outcome"]["evidence_blocks"] == 1 + assert result["metadata"]["standard_metrics"]["n_turns"] == 2 + assert result["metadata"]["standard_metrics"]["n_tool_calls"] == 1 + + +def test_judge_schema_normalizer_runs_before_typed_validation() -> None: + schema = JudgeSchemaDef( + output_model=_ScoreJudgeOutput, + normalizer=lambda payload: { + "score": payload["weighted_score"], + "verdict": payload["final_verdict"], + }, + ) + + payload = schema.validate_payload({"weighted_score": 0.9, "final_verdict": "pass"}) + + assert payload == {"score": 0.9, "verdict": "pass"} + + +def test_trajectory_log_source_reports_missing_task_id(tmp_path: Path) -> None: + path = tmp_path / "trajectory.log" + path.write_text("", encoding="utf-8") + + source = AWorldTrajectoryLogSource(path=path, task_ids=["missing-task"]) + + with pytest.raises(ValueError, match="missing-task"): + list(source.iter_records()) + + +def test_trajectory_log_source_can_iterate_all_tasks(tmp_path: Path) -> None: + path = tmp_path / "trajectory.log" + first = [ + { + "state": {"input": {"content": "first"}, "messages": []}, + "meta": {"step": 1}, + "action": {"content": "first answer", "is_agent_finished": "True"}, + } + ] + second = [ + { + "state": {"input": {"content": "second"}, "messages": []}, + "meta": {"step": 1}, + "action": {"content": "second answer", "is_agent_finished": "True"}, + } + ] + path.write_text( + "\n".join( + [ + repr({"task_id": "task-1", "is_sub_task": False, 
"trajectory": json.dumps(first)}), + repr({"task_id": "task-2", "is_sub_task": False, "trajectory": json.dumps(second)}), + ] + ) + + "\n", + encoding="utf-8", + ) + + records = list(AWorldTrajectoryLogSource(path=path, task_ids=None).iter_records()) + + assert [record.case_id for record in records] == ["task-1", "task-2"] + assert records[0].answer == "first answer" + assert records[1].answer == "second answer" + assert records[0].metadata["source_kind"] == "trajectory" + + +def test_trajectory_judge_schema_normalizes_dimensions_report() -> None: + schema = TrajectoryJudgeSchema.default() + + payload = schema.validate_payload( + { + "weighted_score": 76, + "verdict": "Pass", + "dimensions": { + "A1_groundedness": {"score": 4}, + "A2_completeness": {"score": 3}, + "A3_relevance": {"score": 4}, + "A4_readability": {"score": 5}, + "B1_tool_use": {"score": 4}, + "B2_efficiency": {"score": 2}, + "B3_compliance": {"score": 4}, + "B4_robustness": {"score": 3}, + }, + "veto_triggered": False, + } + ) + + assert payload["score"] == 76 + assert payload["A1_groundedness"] == 4 + assert payload["B2_efficiency"] == 2 diff --git a/tests/evaluations/test_evaluation_substrate.py b/tests/evaluations/test_evaluation_substrate.py new file mode 100644 index 000000000..5c085debc --- /dev/null +++ b/tests/evaluations/test_evaluation_substrate.py @@ -0,0 +1,1413 @@ +from __future__ import annotations + +import base64 +import pytest +from pydantic import BaseModel, Field + +from aworld.evaluations.base import EvaluationConfig +import aworld.evaluations.substrate as substrate_module +from aworld.evaluations.substrate import ( + AgentJudgeBackend, + CallableJudgeBackend, + EvalCaseDef, + EvalHarnessDef, + EvalSuiteDef, + EvaluationFlowDef, + FallbackJudgeBackend, + GateMetricCondition, + GatePolicyDef, + JudgeSchemaDef, + TrajectoryScorerDef, + compile_evaluation_flow, + get_builtin_eval_suite, + list_eval_suites, + list_matching_eval_suites, + load_declared_eval_suites, + 
register_eval_suite, + resolve_eval_harness, + resolve_eval_suite, + resolve_eval_suite_selection, + run_evaluation_flow, +) +from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec +from aworld.evaluations.manifests import validate_declared_eval_suite_manifest +from aworld.evaluations.report import validate_evaluator_report +from aworld.evaluations.types import MetricNames + + +class DemoJudgeOutput(BaseModel): + score: float + verdict: str + + +class AliasJudgeOutput(BaseModel): + final_score: float = Field(alias="score") + verdict: str + + +@pytest.fixture(autouse=True) +def _reset_eval_registry_state(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + monkeypatch.setattr(substrate_module, "_LOADED_EVAL_MANIFEST_PATHS", set()) + monkeypatch.setattr(substrate_module, "_DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE", {}) + substrate_module.register_eval_suite( + "app-evaluator", + lambda target: get_builtin_eval_suite("app-evaluator"), + matcher=lambda target: target.get("target_kind") in {"file", "directory", "image"}, + priority=10, + ) + + +def test_compile_evaluation_flow_builds_inline_dataset_and_gate_config() -> None: + suite = EvalSuiteDef( + suite_id="demo-suite", + cases=[ + EvalCaseDef( + case_id="case-1", + input={"query": "hello world"}, + ) + ], + judge_schema=JudgeSchemaDef(required_fields=("score", "rank")), + gate_policy=GatePolicyDef(metric_name="score", pass_threshold=0.8), + ) + flow = EvaluationFlowDef( + target={"kind": "inline", "value": {"target_path": "demo.txt"}}, + suite=suite, + ) + + compiled = compile_evaluation_flow(flow) + + assert isinstance(compiled.eval_config, EvaluationConfig) + assert compiled.eval_config.eval_dataset is compiled.dataset + assert compiled.dataset.eval_cases[0].case_data["query"] == "hello world" + assert compiled.dataset.eval_cases[0].case_data["_target"]["target_path"] == "demo.txt" + assert 
compiled.dataset.eval_cases[0].case_data["_expected"] is None + assert compiled.gate_policy.metric_name == "score" + + +def test_compile_evaluation_flow_lowers_trajectory_scorers_to_eval_criteria() -> None: + suite = EvalSuiteDef( + suite_id="trajectory-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello world"})], + judge=lambda case_input, target: {"score": 1.0}, + trajectory_scorers=( + TrajectoryScorerDef( + metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, + threshold=1.0, + ), + ), + ) + + compiled = compile_evaluation_flow( + EvaluationFlowDef( + target={"kind": "inline", "value": {"target_path": "demo.txt"}}, + suite=suite, + ) + ) + + metric_names = [criteria["metric_name"] for criteria in compiled.eval_config.eval_criterias] + assert metric_names == ["score", MetricNames.TRAJECTORY_TOOL_CALLS] + + +def test_compile_evaluation_flow_rejects_unknown_trajectory_metric() -> None: + suite = EvalSuiteDef( + suite_id="trajectory-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello world"})], + judge=lambda case_input, target: {"score": 1.0}, + trajectory_scorers=( + TrajectoryScorerDef(metric_name="trajectory_typo"), + ), + ) + + with pytest.raises(ValueError, match="unknown trajectory metric"): + compile_evaluation_flow( + EvaluationFlowDef( + target={"kind": "inline", "value": {"target_path": "demo.txt"}}, + suite=suite, + ) + ) + + +def test_compile_evaluation_flow_rejects_unsupported_trajectory_scorer_params() -> None: + suite = EvalSuiteDef( + suite_id="trajectory-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello world"})], + judge=lambda case_input, target: {"score": 1.0}, + trajectory_scorers=( + TrajectoryScorerDef( + metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, + scorer_params={"minimum_calls": 2}, + ), + ), + ) + + with pytest.raises(ValueError, match="unsupported trajectory scorer_params"): + compile_evaluation_flow( + EvaluationFlowDef( + target={"kind": "inline", "value": {"target_path": 
"demo.txt"}}, + suite=suite, + ) + ) + + +def test_eval_case_def_supports_expected_and_runtime_overrides() -> None: + case = EvalCaseDef( + case_id="case-1", + input={"query": "demo"}, + expected={"answer": "ok"}, + max_turns=3, + timeout_seconds=5.0, + metadata={"toolsets": ["search"]}, + ) + + assert case.expected == {"answer": "ok"} + assert case.max_turns == 3 + assert case.timeout_seconds == 5.0 + assert case.metadata["toolsets"] == ["search"] + + +def test_compile_evaluation_flow_uses_execution_backed_target_when_suite_declares_execution() -> None: + suite = EvalSuiteDef( + suite_id="task-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "demo"})], + execution=EvalExecutionSpec(mode=EvalExecutionMode.TASK, task_builder_ref="tests.helpers:build_demo_task"), + ) + flow = EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo.txt"}}, suite=suite) + + compiled = compile_evaluation_flow(flow) + + assert compiled.eval_config.eval_target.__class__.__name__ == "_ConfiguredTaskEvalTarget" + + +@pytest.mark.asyncio +async def test_task_execution_rejects_path_style_task_builder_ref() -> None: + suite = EvalSuiteDef( + suite_id="task-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "demo"})], + execution=EvalExecutionSpec(mode=EvalExecutionMode.TASK, task_builder_ref="scripts/run.py:build_task"), + ) + compiled = compile_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo.txt"}}, suite=suite) + ) + + with pytest.raises(ValueError, match="importable callable"): + await compiled.eval_config.eval_target.build_task(0, compiled.dataset.eval_cases[0]) + + +def test_compile_evaluation_flow_preserves_live_agent_target_config() -> None: + live_agent = object() + suite = EvalSuiteDef( + suite_id="agent-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "demo"})], + execution=EvalExecutionSpec( + mode=EvalExecutionMode.AGENT, + target_config={"agent": live_agent}, + ), + ) + flow = 
EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo.txt"}}, suite=suite) + + compiled = compile_evaluation_flow(flow) + + assert compiled.eval_config.eval_target.agent is live_agent + + +@pytest.mark.asyncio +async def test_program_execution_receives_normalized_target( + monkeypatch: pytest.MonkeyPatch, +) -> None: + async def demo_program(case, spec, target): + return { + "status": "success", + "answer": target["target_path"], + "metadata": {"target_kind_seen": target["target_kind"]}, + } + + async def fake_judge(case_input, target): + return {"score": 1.0, "answer": target["answer"]} + + monkeypatch.setattr( + "aworld.evaluations.execution_adapters.load_program_callable", + lambda ref: demo_program, + ) + + suite = EvalSuiteDef( + suite_id="program-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "demo"})], + execution=EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case"), + judge=fake_judge, + ) + flow = EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo.txt"}}, suite=suite) + + report = await run_evaluation_flow(flow) + + assert report["results"][0]["judge"]["answer"] == "demo.txt" + assert report["results"][0]["state_summary"]["answer"] == "demo.txt" + + +@pytest.mark.asyncio +async def test_task_execution_uses_adapter_target_config_task( + monkeypatch: pytest.MonkeyPatch, +) -> None: + task = type("Task", (), {"id": "task-1"})() + + async def fake_run_task(*, task): + return {task.id: {"status": "success", "answer": "task-ok"}} + + async def fake_judge(case_input, target): + return {"score": 1.0, "answer": target["answer"]} + + monkeypatch.setattr("aworld.evaluations.execution_adapters.Runners.run_task", fake_run_task) + + suite = EvalSuiteDef( + suite_id="task-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "demo"})], + execution=EvalExecutionSpec(mode=EvalExecutionMode.TASK, target_config={"task": task}), + judge=fake_judge, + ) + + report = await 
run_evaluation_flow( + EvaluationFlowDef(target={"kind": "file", "target_path": "artifact.txt"}, suite=suite) + ) + + assert report["results"][0]["judge"]["answer"] == "task-ok" + + +def test_resolve_eval_harness_lowers_direct_suite_execution() -> None: + suite = EvalSuiteDef( + suite_id="program-suite", + execution=EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case"), + ) + + harness = resolve_eval_harness(suite) + + assert harness.harness_id == "program-suite-execution" + assert harness.execution is suite.execution + assert harness.metadata["lowered_from"] == "suite.execution" + + +def test_resolve_eval_harness_prefers_explicit_harness() -> None: + harness = EvalHarnessDef( + harness_id="shared-program", + execution=EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case"), + ) + suite = EvalSuiteDef(suite_id="program-suite", harness=harness) + + assert resolve_eval_harness(suite) is harness + + +def test_judge_schema_validation_rejects_missing_fields() -> None: + schema = JudgeSchemaDef(required_fields=("score", "rank", "criticism")) + + with pytest.raises(ValueError, match="missing required judge fields"): + schema.validate({"score": 0.8, "rank": "Good"}) + + +def test_typed_judge_model_accepts_valid_payload() -> None: + schema = JudgeSchemaDef(output_model=DemoJudgeOutput) + + payload = schema.validate_payload({"score": 0.8, "verdict": "ok"}) + + assert payload["score"] == 0.8 + assert payload["verdict"] == "ok" + + +def test_typed_judge_model_rejects_invalid_payload() -> None: + schema = JudgeSchemaDef(output_model=DemoJudgeOutput) + + with pytest.raises(ValueError, match="verdict"): + schema.validate_payload({"score": 0.8}) + + +def test_legacy_required_fields_schema_still_returns_payload() -> None: + schema = JudgeSchemaDef(required_fields=("score", "rank")) + + payload = schema.validate_payload({"score": 0.9, "rank": 1}) + + assert payload["rank"] == 1 + + +def 
test_judge_schema_exports_json_schema_for_typed_model() -> None: + schema = JudgeSchemaDef(output_model=DemoJudgeOutput) + + exported = schema.json_schema() + + assert exported["properties"]["score"]["type"] == "number" + assert "verdict" in exported["required"] + + +def test_typed_judge_model_returns_alias_keys_to_match_exported_schema() -> None: + schema = JudgeSchemaDef(output_model=AliasJudgeOutput) + + payload = schema.validate_payload({"score": 0.8, "verdict": "ok"}) + exported = schema.json_schema() + + assert payload["score"] == 0.8 + assert "final_score" not in payload + assert "score" in exported["properties"] + + +def test_gate_policy_uses_pass_and_approval_thresholds() -> None: + gate = GatePolicyDef( + metric_name="score", + pass_threshold=0.85, + approval_threshold=0.6, + ) + + assert gate.evaluate({"score": 0.9}).status == "pass" + assert gate.evaluate({"score": 0.7}).status == "needs_approval" + assert gate.evaluate({"score": 0.5}).status == "fail" + + +def test_composite_gate_returns_pass_when_all_conditions_hold() -> None: + policy = GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=0.9), + GateMetricCondition(metric_name="latency", op="<=", threshold=5.0), + ) + ) + + decision = policy.evaluate({"score": 0.95, "latency": 4.2}) + + assert decision.status == "pass" + assert decision.metric_name is None + assert decision.value is None + assert len(decision.matched_conditions) == 2 + + +def test_composite_gate_returns_needs_approval_when_approval_conditions_hold() -> None: + policy = GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.9),), + approval_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.75),), + ) + + decision = policy.evaluate({"score": 0.8}) + + assert decision.status == "needs_approval" + assert len(decision.failed_conditions) == 1 + assert len(decision.matched_conditions) == 1 + + +def 
test_legacy_threshold_gate_lowers_to_structured_policy() -> None: + policy = GatePolicyDef(metric_name="score", pass_threshold=0.9, approval_threshold=0.8) + + decision = policy.evaluate({"score": 0.85}) + + assert decision.status == "needs_approval" + assert decision.metric_name == "score" + assert decision.value == pytest.approx(0.85) + + +@pytest.mark.parametrize( + ("op", "threshold", "value"), + [ + (">", 0.9, 0.91), + ("<", 0.9, 0.89), + (">=", 0.9, 0.9), + ("<=", 0.9, 0.9), + ("==", "approved", "approved"), + ("!=", "blocked", "approved"), + ], +) +def test_gate_metric_condition_supports_all_declared_operators(op, threshold, value) -> None: + policy = GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="metric", op=op, threshold=threshold),) + ) + + assert policy.evaluate({"metric": value}).status == "pass" + + +def test_gate_policy_reports_missing_metric() -> None: + policy = GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.9),) + ) + + decision = policy.evaluate({}) + + assert decision.status == "fail" + assert decision.failed_conditions == [ + {"metric_name": "score", "op": ">=", "threshold": 0.9, "reason": "missing_metric"} + ] + + +def test_gate_policy_missing_pass_metric_fails_even_when_approval_matches() -> None: + policy = GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="trajectory_tool_calls", op=">=", threshold=1.0),), + approval_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.7),), + ) + + decision = policy.evaluate({"score": 0.8}) + + assert decision.status == "fail" + assert decision.failed_conditions == [ + {"metric_name": "trajectory_tool_calls", "op": ">=", "threshold": 1.0, "reason": "missing_metric"} + ] + + +def test_gate_policy_rejects_unsupported_operator() -> None: + policy = GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="score", op="contains", threshold=0.9),) + ) + + with pytest.raises(ValueError, match="unsupported gate operator"): + 
policy.evaluate({"score": 0.95}) + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_executes_suite_judge_and_returns_gate() -> None: + async def fake_judge(case_input, target): + assert case_input["query"] == "hello" + assert target["target_path"] == "artifact.txt" + return { + "score": 0.7, + "rank": "Good", + "criticism": "Needs stronger hierarchy.", + "praise": "The layout is clear.", + "improvement_advice": "Increase contrast around the hero area.", + } + + suite = EvalSuiteDef( + suite_id="demo-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge_schema=JudgeSchemaDef( + required_fields=( + "score", + "rank", + "criticism", + "praise", + "improvement_advice", + ) + ), + gate_policy=GatePolicyDef( + metric_name="score", + pass_threshold=0.85, + approval_threshold=0.6, + ), + judge=fake_judge, + ) + flow = EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + + report = await run_evaluation_flow(flow) + + assert report["suite_id"] == "demo-suite" + assert report["report_format"]["id"] == "aworld.evaluator.report" + assert report["report_format"]["version"] == 1 + assert report["generated_at"] + assert report["gate"]["status"] == "needs_approval" + assert report["results"][0]["judge"]["rank"] == "Good" + assert report["results"][0]["metrics"]["score"]["value"] == pytest.approx(0.7) + assert report["results"][0]["metrics"]["score"]["status"] == "FAILED" + assert report["metrics"]["score"]["mean"] == pytest.approx(0.7) + assert report["result_counts"]["cases_total"] == 1 + assert report["result_counts"]["cases_with_metrics"] == 1 + assert report["summary"]["demo-suite"]["score"]["mean"] == pytest.approx(0.7) + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_exposes_judge_schema_once() -> None: + async def fake_judge(case_input, target): + return {"score": 0.95, "verdict": "ok"} + + suite = EvalSuiteDef( + suite_id="typed-suite", + cases=[EvalCaseDef(case_id="case-1", 
input={"query": "hello"})], + judge_schema=JudgeSchemaDef(output_model=DemoJudgeOutput), + gate_policy=GatePolicyDef(metric_name="score", pass_threshold=0.8), + judge=fake_judge, + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["judge_schema"]["properties"]["score"]["type"] == "number" + assert "_judge_schema" not in report["results"][0]["judge"] + validate_evaluator_report(report.to_dict()) + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_evaluates_composite_gate_metrics() -> None: + async def fake_judge(case_input, target): + return {"score": 0.95, "latency": 4.2} + + suite = EvalSuiteDef( + suite_id="composite-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=0.9), + GateMetricCondition(metric_name="latency", op="<=", threshold=5.0), + ) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "pass" + assert len(report["gate"]["matched_conditions"]) == 2 + assert report["metrics"]["latency"]["mean"] == pytest.approx(4.2) + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_failed_composite_gate_keeps_metric_status_and_matches() -> None: + async def fake_judge(case_input, target): + return {"score": 0.7, "latency": 4.2} + + suite = EvalSuiteDef( + suite_id="composite-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=0.9), + GateMetricCondition(metric_name="latency", op="<=", threshold=5.0), + ) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": 
"artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "fail" + assert report["results"][0]["metrics"]["score"]["status"] == "FAILED" + assert report["metrics"]["score"]["eval_status"] == "FAILED" + assert report["gate"]["matched_conditions"] == [ + {"metric_name": "latency", "op": "<=", "threshold": 5.0} + ] + assert report["gate"]["failed_conditions"] == [ + {"metric_name": "score", "op": ">=", "threshold": 0.9} + ] + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_missing_gate_metric_fails_closed_and_keeps_results() -> None: + async def fake_judge(case_input, target): + return {"score": 0.95} + + suite = EvalSuiteDef( + suite_id="composite-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="trajectory_tool_calls", op=">=", threshold=1.0),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "fail" + assert report["gate"]["failed_conditions"] == [ + {"metric_name": "trajectory_tool_calls", "op": ">=", "threshold": 1.0, "reason": "missing_metric"} + ] + assert report["results"][0]["case_id"] == "case-1" + assert report["results"][0]["metrics"]["score"]["value"] == pytest.approx(0.95) + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_composite_gate_is_not_condition_order_sensitive() -> None: + async def fake_judge(case_input, target): + return {"score": 0.95, "latency": 4.2} + + suite = EvalSuiteDef( + suite_id="composite-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="latency", op="<=", threshold=5.0), + GateMetricCondition(metric_name="score", op=">=", threshold=0.9), + ) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + 
target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "pass" + assert report["metrics"]["score"]["mean"] == pytest.approx(0.95) + assert report["metrics"]["latency"]["mean"] == pytest.approx(4.2) + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_composite_gate_without_score_condition_still_runs_suite_judge() -> None: + async def fake_judge(case_input, target): + return {"score": 1.0, "latency": 4.2} + + suite = EvalSuiteDef( + suite_id="latency-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="latency", op="<=", threshold=5.0),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "pass" + assert report["metrics"]["latency"]["mean"] == pytest.approx(4.2) + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_legacy_non_score_gate_does_not_set_score_threshold() -> None: + async def fake_judge(case_input, target): + return {"score": 1.0, "latency": 6.0} + + suite = EvalSuiteDef( + suite_id="latency-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef(metric_name="latency", pass_threshold=5.0), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "pass" + assert report["results"][0]["metrics"]["score"]["status"] == "PASSED" + assert report["metrics"]["score"]["eval_status"] == "PASSED" + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_strict_gate_operator_keeps_metric_status_consistent() -> None: + async def fake_judge(case_input, target): + return {"score": 0.9} + + suite = EvalSuiteDef( + suite_id="strict-suite", + 
cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="score", op=">", threshold=0.9),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "fail" + assert report["results"][0]["metrics"]["score"]["status"] == "FAILED" + assert report["metrics"]["score"]["eval_status"] == "FAILED" + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_equality_gate_keeps_metric_status_consistent() -> None: + async def fake_judge(case_input, target): + return {"score": 0.7} + + suite = EvalSuiteDef( + suite_id="equality-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="score", op="==", threshold=0.9),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "fail" + assert report["results"][0]["metrics"]["score"]["status"] == "FAILED" + assert report["metrics"]["score"]["eval_status"] == "FAILED" + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_categorical_gate_metric() -> None: + async def fake_judge(case_input, target): + return {"score": 1.0, "verdict": "approved"} + + suite = EvalSuiteDef( + suite_id="categorical-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="verdict", op="==", threshold="approved"),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "pass" + assert report["gate"]["value"] is None + assert 
report["results"][0]["metrics"]["verdict"]["value"] == "approved" + assert report["metrics"]["verdict"]["value"] == "approved" + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_reports_trajectory_scorer_metrics( + monkeypatch: pytest.MonkeyPatch, +) -> None: + async def demo_program(case, spec, target): + return { + "status": "success", + "answer": "ok", + "trajectory": [ + {"action": {"tool_calls": [{"id": "call-1", "function": {"name": "search", "arguments": "{}"}}]}} + ], + } + + async def fake_judge(case_input, target): + return {"score": 1.0} + + monkeypatch.setattr( + "aworld.evaluations.execution_adapters.load_program_callable", + lambda ref: demo_program, + ) + + suite = EvalSuiteDef( + suite_id="trajectory-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + execution=EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case"), + judge=fake_judge, + trajectory_scorers=( + TrajectoryScorerDef(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, threshold=1.0), + ), + gate_policy=GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=0.9), + GateMetricCondition(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, op="==", threshold=1.0), + ) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "pass" + assert report["results"][0]["metrics"][MetricNames.TRAJECTORY_TOOL_CALLS]["value"] == pytest.approx(1.0) + assert report["metrics"][MetricNames.TRAJECTORY_TOOL_CALLS]["mean"] == pytest.approx(1.0) + + +@pytest.mark.asyncio +async def test_declared_trajectory_metric_takes_precedence_over_judge_payload_collision( + monkeypatch: pytest.MonkeyPatch, +) -> None: + async def demo_program(case, spec, target): + return { + "status": "success", + "answer": "ok", + "trajectory": [ + {"action": {"tool_calls": [{"id": "call-1", "function": {"name": 
"search", "arguments": "{}"}}]}} + ], + } + + async def fake_judge(case_input, target): + return {"score": 1.0, MetricNames.TRAJECTORY_TOOL_CALLS: 0.0} + + monkeypatch.setattr( + "aworld.evaluations.execution_adapters.load_program_callable", + lambda ref: demo_program, + ) + + suite = EvalSuiteDef( + suite_id="trajectory-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + execution=EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case"), + judge=fake_judge, + trajectory_scorers=( + TrajectoryScorerDef(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, threshold=1.0), + ), + gate_policy=GatePolicyDef( + pass_all=(GateMetricCondition(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, op="==", threshold=1.0),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "pass" + assert report["results"][0]["metrics"][MetricNames.TRAJECTORY_TOOL_CALLS]["value"] == pytest.approx(1.0) + assert report["metrics"][MetricNames.TRAJECTORY_TOOL_CALLS]["mean"] == pytest.approx(1.0) + + +@pytest.mark.asyncio +async def test_suite_judge_prefers_state_payload_over_static_case_target() -> None: + async def fake_judge(case_input, target): + return {"score": 1.0, "answer": target["answer"]} + + suite = EvalSuiteDef( + suite_id="demo-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "demo"})], + judge=fake_judge, + ) + from aworld.evaluations.scorers.suite_judge import SuiteJudgeScorer + + scorer = SuiteJudgeScorer(suite=suite) + input_case = type("Case", (), {"case_data": {"query": "demo", "_target": {"path": "legacy"}}})() + output = {"state": {"answer": "from-state", "status": "success"}} + + result = await scorer.score(0, input_case, output) + + assert result.metric_results["score"]["metadata"]["answer"] == "from-state" + + +@pytest.mark.asyncio +async def 
test_report_keeps_full_judge_metadata_only_on_score_metric() -> None: + async def fake_judge(case_input, target): + return { + "score": 0.5, + "verdict": "Fail", + "A1_groundedness": 1, + "veto_triggered": True, + } + + suite = EvalSuiteDef( + suite_id="demo-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "demo"})], + judge=fake_judge, + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "inline", "value": {"target_path": "demo"}}, + suite=suite, + ) + ) + + result = report["results"][0] + assert result["judge"]["A1_groundedness"] == 1 + assert result["metrics"]["verdict"]["value"] == "Fail" + assert set(result["metric_details"]) == {"score"} + assert result["metric_details"]["score"]["veto_triggered"] is True + + +def test_builtin_app_evaluator_suite_has_required_schema_and_score_gate() -> None: + suite = get_builtin_eval_suite("app-evaluator") + + assert suite.suite_id == "app-evaluator" + assert suite.judge_schema.required_fields == ( + "score", + "rank", + "criticism", + "praise", + "improvement_advice", + ) + assert suite.gate_policy.metric_name == "score" + + +def test_eval_suite_registry_resolves_explicit_and_target_defaults( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + + def generic_factory(target): + return EvalSuiteDef(suite_id="generic-review") + + def image_factory(target): + return EvalSuiteDef(suite_id="image-review") + + register_eval_suite( + "generic-review", + generic_factory, + matcher=lambda target: True, + priority=10, + ) + register_eval_suite( + "image-review", + image_factory, + matcher=lambda target: target["target_kind"] == "image", + priority=50, + ) + + image_target = tmp_path / "artifact.png" + image_target.write_bytes(base64.b64decode("iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+aA1EAAAAASUVORK5CYII=")) + text_target = tmp_path / "artifact.txt" + text_target.write_text("artifact", 
encoding="utf-8") + + listed = list_eval_suites() + explicit = resolve_eval_suite("generic-review", image_target) + image_default = resolve_eval_suite(None, image_target) + text_default = resolve_eval_suite(None, text_target) + + assert listed == ["generic-review", "image-review"] + assert explicit.suite_id == "generic-review" + assert image_default.suite_id == "image-review" + assert image_default.cases[0].input["target_kind"] == "image" + assert text_default.suite_id == "generic-review" + assert text_default.cases[0].input["target_kind"] == "file" + + +def test_eval_suite_registry_reports_matching_suites_and_selection_mode( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + + register_eval_suite( + "generic-review", + lambda target: EvalSuiteDef(suite_id="generic-review"), + matcher=lambda target: True, + priority=10, + ) + register_eval_suite( + "image-review", + lambda target: EvalSuiteDef(suite_id="image-review"), + matcher=lambda target: target["target_kind"] == "image", + priority=50, + ) + + image_target = tmp_path / "artifact.png" + image_target.write_bytes(base64.b64decode("iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+aA1EAAAAASUVORK5CYII=")) + + matching = list_matching_eval_suites(image_target) + auto_selection = resolve_eval_suite_selection(None, image_target) + explicit_selection = resolve_eval_suite_selection("generic-review", image_target) + + assert matching == ["image-review", "generic-review"] + assert auto_selection.mode == "auto" + assert auto_selection.suite_id == "image-review" + assert explicit_selection.mode == "explicit" + assert explicit_selection.suite_id == "generic-review" + + +def test_load_declared_eval_suites_registers_manifest_backed_suite( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + manifest_dir = tmp_path / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + (manifest_dir / 
"strict-ui.json").write_text( + """ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file", "directory"], + "gate_policy": { + "metric_name": "score", + "pass_threshold": 0.92, + "approval_threshold": 0.8 + }, + "metadata": { + "owner": "qa" + } +} +""".strip(), + encoding="utf-8", + ) + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + + loaded = load_declared_eval_suites(tmp_path) + listed = list_eval_suites() + + assert loaded == ["strict-ui"] + assert "strict-ui" in listed + + +def test_load_declared_eval_suites_rejects_execution_in_manifest(tmp_path) -> None: + manifest_dir = tmp_path / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + (manifest_dir / "program-suite.json").write_text( + """ +{ + "suite_id": "program-suite", + "base_suite": "app-evaluator", + "execution": { + "mode": "program", + "target_ref": "pkg.module:run_case" + } +} +""".strip(), + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="Additional properties are not allowed"): + load_declared_eval_suites(tmp_path) + + +def test_declared_eval_suite_manifest_schema_rejects_execution_contract() -> None: + with pytest.raises(ValueError, match="Additional properties are not allowed"): + validate_declared_eval_suite_manifest( + { + "suite_id": "program-suite", + "base_suite": "app-evaluator", + "execution": {"mode": "program", "target_ref": "pkg.module:run_case"}, + } + ) + + +def test_declared_eval_suite_can_be_selected_for_matching_target( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + manifest_dir = tmp_path / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + (manifest_dir / "strict-ui.json").write_text( + """ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file"], + "gate_policy": { + "metric_name": "score", + "pass_threshold": 0.92, + "approval_threshold": 0.8 + } +} +""".strip(), + encoding="utf-8", + ) + monkeypatch.setattr(substrate_module, 
"_EVAL_SUITE_REGISTRY", {}) + load_declared_eval_suites(tmp_path) + + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + + selection = resolve_eval_suite_selection("strict-ui", target) + + assert selection.suite_id == "strict-ui" + assert selection.suite.gate_policy.pass_threshold == pytest.approx(0.92) + + +def test_load_declared_eval_suites_refreshes_existing_manifest_changes( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + manifest_dir = tmp_path / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + manifest_path = manifest_dir / "strict-ui.json" + manifest_path.write_text( + """ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file"], + "gate_policy": { + "metric_name": "score", + "pass_threshold": 0.92, + "approval_threshold": 0.8 + } +} +""".strip(), + encoding="utf-8", + ) + + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + + load_declared_eval_suites(tmp_path) + + manifest_path.write_text( + """ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file"], + "gate_policy": { + "metric_name": "score", + "pass_threshold": 0.99, + "approval_threshold": 0.8 + } +} +""".strip(), + encoding="utf-8", + ) + + load_declared_eval_suites(tmp_path) + selection = resolve_eval_suite_selection("strict-ui", tmp_path / "artifact.txt") + + assert selection.suite.gate_policy.pass_threshold == pytest.approx(0.99) + + +def test_load_declared_eval_suites_removes_deleted_manifest_registration( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + manifest_dir = tmp_path / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + manifest_path = manifest_dir / "strict-ui.json" + manifest_path.write_text( + """ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file"] +} +""".strip(), + encoding="utf-8", + ) + + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + + 
load_declared_eval_suites(tmp_path) + manifest_path.unlink() + + load_declared_eval_suites(tmp_path) + + assert "strict-ui" not in list_eval_suites() + + +def test_declared_eval_suites_are_resolved_per_workspace( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + workspace_a = tmp_path / "a" + workspace_b = tmp_path / "b" + for workspace, threshold in ((workspace_a, 0.91), (workspace_b, 0.99)): + manifest_dir = workspace / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + (manifest_dir / "strict-ui.json").write_text( + f""" +{{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file"], + "gate_policy": {{ + "metric_name": "score", + "pass_threshold": {threshold}, + "approval_threshold": 0.8 + }} +}} +""".strip(), + encoding="utf-8", + ) + + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + + load_declared_eval_suites(workspace_a) + load_declared_eval_suites(workspace_b) + + selection_a = resolve_eval_suite_selection("strict-ui", workspace_a / "artifact.txt") + selection_b = resolve_eval_suite_selection("strict-ui", workspace_b / "artifact.txt") + + assert selection_a.suite.gate_policy.pass_threshold == pytest.approx(0.91) + assert selection_b.suite.gate_policy.pass_threshold == pytest.approx(0.99) + + +def test_load_declared_eval_suites_rejects_builtin_suite_id_override( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + manifest_dir = tmp_path / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + (manifest_dir / "override.json").write_text( + """ +{ + "suite_id": "app-evaluator", + "base_suite": "app-evaluator", + "target_kinds": ["file"] +} +""".strip(), + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="reserved suite_id"): + load_declared_eval_suites(tmp_path) + + assert list_eval_suites() == ["app-evaluator"] + + +@pytest.mark.asyncio +async def test_agent_judge_backend_parses_app_evaluator_json_payload() -> None: + async def fake_executor(prompt: str, 
system_prompt: str): + assert "artifact.txt" in prompt + assert "UI review committee" in system_prompt + return { + "results": [ + { + "filename": "artifact.txt", + "score": 0.91, + "rank": "Exemplary", + "criticism": "Almost none.", + "praise": "Strong visual hierarchy.", + "improvement_advice": "Keep the current direction.", + } + ] + } + + backend = AgentJudgeBackend( + backend_id="agent-backend", + system_prompt="You are a UI review committee.", + executor=fake_executor, + ) + + payload = await backend.judge( + case_input={"target_path": "artifact.txt"}, + target={"target_path": "artifact.txt", "target_kind": "file"}, + suite=EvalSuiteDef(suite_id="app-evaluator"), + ) + + assert payload["score"] == pytest.approx(0.91) + assert payload["rank"] == "Exemplary" + + +@pytest.mark.asyncio +async def test_builtin_app_evaluator_can_use_injected_judge_backend() -> None: + class StubBackend: + backend_id = "stub-agent" + + def is_available(self) -> bool: + return True + + async def judge(self, case_input, target, suite): + return { + "score": 0.72, + "rank": "Good", + "criticism": "Needs slightly better spacing.", + "praise": "Solid composition.", + "improvement_advice": "Increase whitespace around the main section.", + } + + suite = get_builtin_eval_suite("app-evaluator", judge_backend=StubBackend()).with_cases( + [EvalCaseDef(case_id="artifact", input={"target_path": "artifact.txt"})] + ) + flow = EvaluationFlowDef( + target={"target_path": "artifact.txt", "target_kind": "file"}, + suite=suite, + ) + + report = await run_evaluation_flow(flow) + + assert report["judge_backend"]["backend_id"] == "stub-agent" + assert report["results"][0]["judge"]["rank"] == "Good" + assert report["report_version"] == 1 + assert report["approval"]["required"] is True + + +@pytest.mark.asyncio +async def test_fallback_judge_backend_uses_next_backend_after_timeout() -> None: + async def slow_executor(prompt: str, system_prompt: str): + await asyncio.sleep(0.05) + return {"results": 
[{"filename": "artifact.txt", "score": 0.99}]} + + fallback = FallbackJudgeBackend( + backend_id="fallback", + backends=( + AgentJudgeBackend( + backend_id="slow-agent", + system_prompt="judge", + executor=slow_executor, + timeout_seconds=0.01, + ), + CallableJudgeBackend( + backend_id="heuristic", + judge=lambda case_input, target: { + "score": 0.61, + "rank": "Good", + "criticism": "Fallback path used.", + "praise": "Fallback stayed responsive.", + "improvement_advice": "Keep timeout budgets explicit.", + }, + ), + ), + ) + + execution = await fallback.execute( + case_input={"target_path": "artifact.txt"}, + target={"target_path": "artifact.txt", "target_kind": "file"}, + suite=EvalSuiteDef(suite_id="app-evaluator"), + ) + + assert execution.backend_id == "heuristic" + assert execution.payload["score"] == pytest.approx(0.61) + + +@pytest.mark.asyncio +async def test_builtin_app_evaluator_passes_visual_target_images_to_agent_backend( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + image_path = tmp_path / "artifact.png" + image_path.write_bytes( + base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+aA1EAAAAASUVORK5CYII=" + ) + ) + + captured = {} + + async def fake_executor(prompt, system_prompt: str): + captured["prompt"] = prompt + return { + "results": [ + { + "filename": image_path.name, + "score": 0.88, + "rank": "Exemplary", + "criticism": "Minor spacing polish remains.", + "praise": "The main visual is clear.", + "improvement_advice": "Tighten secondary detail spacing.", + } + ] + } + + monkeypatch.setenv("LLM_MODEL_NAME", "test-model") + monkeypatch.setenv("OPENAI_API_KEY", "test-key") + monkeypatch.setattr(substrate_module, "_default_agent_judge_executor", fake_executor) + + suite = get_builtin_eval_suite("app-evaluator").with_cases( + [ + EvalCaseDef( + case_id="artifact", + input={"target_path": str(image_path), "target_kind": "image"}, + ) + ] + ) + flow = EvaluationFlowDef( + target={"target_path": 
str(image_path), "target_kind": "image"}, + suite=suite, + ) + + report = await run_evaluation_flow(flow) + + prompt = captured["prompt"] + + assert isinstance(prompt, tuple) + assert prompt[0].startswith("Evaluate the following app artifact") + assert len(prompt[1]) == 1 + assert prompt[1][0].startswith("data:image/png;base64,") + assert report["judge_backend"]["backend_id"] == "app-evaluator-agent" diff --git a/tests/evaluations/test_evaluator_trials.py b/tests/evaluations/test_evaluator_trials.py new file mode 100644 index 000000000..76e4020d0 --- /dev/null +++ b/tests/evaluations/test_evaluator_trials.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +import pytest + +from aworld.evaluations.runtime_composition import RetryRuntimeHarness, RolloutState +from aworld.evaluations.substrate import ( + EvalCaseDef, + EvalSuiteDef, + EvaluationFlowDef, + GateMetricCondition, + GatePolicyDef, + TrialPolicyDef, + compile_evaluation_flow, + run_evaluation_flow, +) +from aworld.evaluations.report import validate_evaluator_report + + +def test_trial_policy_rejects_invalid_k_values(): + with pytest.raises(ValueError, match="k values"): + TrialPolicyDef(num_trials=2, pass_at_k=(3,)).validate() + + +def test_build_eval_dataset_expands_trial_cases(): + suite = EvalSuiteDef( + suite_id="trial-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + trial_policy=TrialPolicyDef(num_trials=3), + ) + + compiled = compile_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite) + ) + + ids = [case.eval_case_id for case in compiled.dataset.eval_cases] + assert ids == ["case-1::trial-1", "case-1::trial-2", "case-1::trial-3"] + assert compiled.dataset.eval_cases[0].case_data["_trial"]["original_case_id"] == "case-1" + assert compiled.dataset.eval_cases[0].case_data["_trial"]["trial_index"] == 1 + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_reports_pass_at_k_and_pass_caret_k(): + async def 
fake_judge(case_input, target): + trial_index = case_input["_trial"]["trial_index"] + return {"score": 1.0 if trial_index == 2 else 0.0} + + suite = EvalSuiteDef( + suite_id="trial-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + trial_policy=TrialPolicyDef( + num_trials=3, + pass_at_k=(2,), + pass_caret_k=(2,), + success_metric="score", + ), + gate_policy=GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="score_pass@2", op=">=", threshold=1.0),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite) + ) + + assert report["metrics"]["score_pass@2"]["mean"] == pytest.approx(1.0) + assert report["metrics"]["score_pass^2"]["mean"] == pytest.approx(0.0) + assert report["gate"]["status"] == "pass" + + +@pytest.mark.asyncio +async def test_trial_success_metric_defaults_from_trial_gate_metric_base_name(): + async def fake_judge(case_input, target): + return {"score": 1.0 if case_input["_trial"]["trial_index"] == 2 else 0.0} + + suite = EvalSuiteDef( + suite_id="trial-default-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + trial_policy=TrialPolicyDef(num_trials=2, pass_at_k=(2,)), + gate_policy=GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="score_pass@2", op=">=", threshold=1.0),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite) + ) + + assert report["metrics"]["score_pass@2"]["mean"] == pytest.approx(1.0) + assert report["gate"]["status"] == "pass" + + +@pytest.mark.asyncio +async def test_retry_attempts_do_not_count_as_trials(): + class RetryInsideTrialHarness: + def __init__(self): + self.calls = 0 + + async def run_rollout(self, *, case, target): + self.calls += 1 + if self.calls % 2 == 1: + return RolloutState(case_id=case.case_id, status="failed", answer="failed-attempt") 
+ return RolloutState(case_id=case.case_id, status="success", answer="passed-trial") + + async def fake_judge(case_input, target): + return {"score": 1.0 if target.get("answer") == "passed-trial" else 0.0} + + suite = EvalSuiteDef( + suite_id="retry-trial-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + runtime_harness=RetryRuntimeHarness( + base_harness=RetryInsideTrialHarness(), + max_attempts=2, + ), + judge=fake_judge, + trial_policy=TrialPolicyDef( + num_trials=2, + pass_at_k=(2,), + success_metric="score", + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite) + ) + + assert report["trial_counts"]["trials_total"] == 2 + assert report["metrics"]["score_pass@2"]["mean"] == pytest.approx(1.0) + assert len(report["results"][0]["artifacts"]["attempts"]) == 2 + + +@pytest.mark.asyncio +async def test_multi_trial_report_exposes_trial_metadata(): + async def fake_judge(case_input, target): + return {"score": 1.0} + + suite = EvalSuiteDef( + suite_id="trial-report-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + trial_policy=TrialPolicyDef(num_trials=2, pass_at_k=(2,), success_metric="score"), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite) + ) + + assert report["trial_policy"]["num_trials"] == 2 + assert report["trial_counts"] == {"original_cases": 1, "trials_total": 2} + assert report["results"][0]["trial"]["original_case_id"] == "case-1" + assert report["results"][0]["trial"]["trial_index"] == 1 + validate_evaluator_report(report.to_dict()) diff --git a/tests/evaluations/test_execution_adapters.py b/tests/evaluations/test_execution_adapters.py new file mode 100644 index 000000000..59c100178 --- /dev/null +++ b/tests/evaluations/test_execution_adapters.py @@ -0,0 +1,165 @@ +from __future__ import annotations + 
+import pytest + +from aworld.core.task import TaskResponse +from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec +from aworld.evaluations.execution_adapters import resolve_execution_adapter +from aworld.evaluations.eval_targets.agent_eval import AworldAgentEvalTarget +from aworld.evaluations.substrate import EvalCaseDef + + +async def _demo_program(case, spec, target): + return { + "status": "success", + "answer": f"ran:{case.input['query']}", + "completion": [{"role": "assistant", "content": "final"}], + "trajectory": [{"role": "assistant", "content": "step"}], + "usage": {"total_tokens": 7}, + } + + +@pytest.mark.asyncio +async def test_program_execution_adapter_normalizes_result(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + "aworld.evaluations.execution_adapters.load_program_callable", + lambda ref: _demo_program, + ) + spec = EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case") + adapter = resolve_execution_adapter(spec) + + state = await adapter.execute( + case=EvalCaseDef(case_id="case-1", input={"query": "demo"}), + target={"target_kind": "directory"}, + spec=spec, + ) + + assert state.case_id == "case-1" + assert state.answer == "ran:demo" + assert state.completion[0]["content"] == "final" + assert state.trajectory[0]["content"] == "step" + assert state.usage["total_tokens"] == 7 + + +@pytest.mark.asyncio +async def test_program_execution_adapter_does_not_copy_case_input_into_state_metadata( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + "aworld.evaluations.execution_adapters.load_program_callable", + lambda ref: _demo_program, + ) + spec = EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case") + + state = await resolve_execution_adapter(spec).execute( + case=EvalCaseDef(case_id="case-1", input={"query": "demo", "large_blob": "x" * 1000}), + target={"target_kind": "directory"}, + spec=spec, + ) + + assert 
state.metadata["_execution_mode"] == "program" + assert state.metadata["_target"] == {"target_kind": "directory"} + assert "query" not in state.metadata + assert "large_blob" not in state.metadata + + +@pytest.mark.asyncio +async def test_task_execution_adapter_does_not_copy_case_input_into_state_metadata( + monkeypatch: pytest.MonkeyPatch, +) -> None: + class DemoTask: + id = "task-1" + + async def fake_run_task(*, task): + return TaskResponse(success=True, answer="ok") + + monkeypatch.setattr("aworld.evaluations.execution_adapters.Runners.run_task", fake_run_task) + spec = EvalExecutionSpec(mode=EvalExecutionMode.TASK, target_config={"task": DemoTask()}) + + state = await resolve_execution_adapter(spec).execute( + case=EvalCaseDef(case_id="case-1", input={"query": "demo", "large_blob": "x" * 1000}), + target={"target_kind": "task"}, + spec=spec, + ) + + assert state.metadata["_target"] == {"target_kind": "task"} + assert "query" not in state.metadata + assert "large_blob" not in state.metadata + + +@pytest.mark.asyncio +async def test_agent_eval_target_reuses_resolved_execution_spec(monkeypatch: pytest.MonkeyPatch) -> None: + seen_specs = [] + + class FakeAdapter: + def __init__(self, expected_spec): + self.expected_spec = expected_spec + + async def execute(self, *, case, target, spec): + assert spec is self.expected_spec + return TaskResponse(success=True, answer="ok") + + def fake_resolve(spec): + seen_specs.append(spec) + return FakeAdapter(spec) + + monkeypatch.setattr("aworld.evaluations.eval_targets.agent_eval.resolve_execution_adapter", fake_resolve) + + result = await AworldAgentEvalTarget(agent=object()).predict(0, {"query": "hello"}) + + assert result["answer"] == "ok" + assert len(seen_specs) == 1 + + +def test_resolve_execution_adapter_rejects_missing_program_ref() -> None: + with pytest.raises(ValueError, match="target_ref"): + resolve_execution_adapter(EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM)) + + +def 
test_resolve_execution_adapter_rejects_command_style_program_ref() -> None: + with pytest.raises(ValueError, match="importable callable"): + resolve_execution_adapter( + EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="python script.py") + ) + + +@pytest.mark.parametrize( + "spec", + [ + EvalExecutionSpec( + mode=EvalExecutionMode.PROGRAM, + target_ref="pkg.module:run_case", + runner_method="shell", + ), + EvalExecutionSpec( + mode=EvalExecutionMode.PROGRAM, + target_ref="pkg.module:run_case", + target_config={"command": "python script.py"}, + ), + EvalExecutionSpec( + mode=EvalExecutionMode.PROGRAM, + target_ref="pkg.module:run_case", + target_config={"workflow": "external"}, + ), + ], +) +def test_resolve_execution_adapter_rejects_unsupported_program_runtime_config(spec: EvalExecutionSpec) -> None: + with pytest.raises(ValueError, match="unsupported program execution configuration"): + resolve_execution_adapter(spec) + + +@pytest.mark.parametrize( + "target_ref", + [ + "script.py", + "./script.py", + "scripts/run.py", + "script.py:main", + "scripts.run.py:main", + ], +) +def test_resolve_execution_adapter_rejects_path_style_program_ref(target_ref: str) -> None: + with pytest.raises(ValueError, match="importable callable"): + resolve_execution_adapter( + EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref=target_ref) + ) diff --git a/tests/evaluations/test_execution_state.py b/tests/evaluations/test_execution_state.py new file mode 100644 index 000000000..0cf5a826d --- /dev/null +++ b/tests/evaluations/test_execution_state.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +import pytest + +from aworld.core.task import TaskResponse +from aworld.evaluations.execution import EvalState, normalize_task_response_to_eval_state +from aworld.evaluations.scorers.state_extractors import ( + get_assistant_messages, + get_completion, + get_tool_calls, +) + + +def test_normalize_task_response_to_eval_state_captures_answer_usage_and_trajectory() -> 
None: + response = TaskResponse( + id="task-1", + answer="done", + usage={"total_tokens": 42}, + trajectory=[{"type": "tool", "name": "search"}], + success=True, + ) + + state = normalize_task_response_to_eval_state(case_id="case-1", response=response) + + assert state.case_id == "case-1" + assert state.answer == "done" + assert state.completion == ["done"] + assert state.usage["total_tokens"] == 42 + assert state.trajectory[0]["name"] == "search" + assert state.status == "success" + + +def test_state_extractors_support_completion_and_tool_queries() -> None: + state = { + "completion": [{"role": "assistant", "content": "final"}], + "trajectory": [ + {"role": "user", "content": "question"}, + {"role": "assistant", "content": "thinking"}, + {"action": {"tool_calls": [{"name": "search"}]}}, + ], + } + + assert get_completion(state)[0]["content"] == "final" + assert get_assistant_messages(state)[0]["content"] == "final" + assert get_tool_calls(state)[0]["name"] == "search" + + +def test_normalize_mapping_response_preserves_completion_and_tool_calls() -> None: + state = normalize_task_response_to_eval_state( + case_id="case-2", + response={ + "status": "success", + "answer": "ok", + "completion": [{"role": "assistant", "content": "ok"}], + "trajectory": [{"tool_calls": [{"name": "search"}]}], + }, + ) + + assert state.completion[0]["content"] == "ok" + assert state.tool_calls[0]["name"] == "search" + + +def test_normalize_eval_state_response_preserves_state_fields() -> None: + state = normalize_task_response_to_eval_state( + case_id="case-3", + response=EvalState( + case_id="source-case", + status="success", + answer="done", + trajectory=[{"role": "assistant", "content": "step"}], + ), + target={"target_kind": "program"}, + ) + + assert state.case_id == "case-3" + assert state.answer == "done" + assert state.trajectory[0]["content"] == "step" + assert state.metadata["_target"]["target_kind"] == "program" + + +def 
test_normalize_eval_state_shaped_mapping_preserves_response_metadata() -> None: + state = normalize_task_response_to_eval_state( + case_id="case-4", + response=EvalState( + case_id="source-case", + status="success", + answer="done", + metadata={"program": "demo"}, + ).to_dict(), + target={"target_kind": "program"}, + metadata={"suite": "demo-suite"}, + ) + + assert state.metadata["program"] == "demo" + assert state.metadata["suite"] == "demo-suite" + assert state.metadata["_target"]["target_kind"] == "program" + + +def test_normalize_mapping_rejects_malformed_list_fields() -> None: + with pytest.raises(ValueError, match="trajectory"): + normalize_task_response_to_eval_state( + case_id="case-5", + response={ + "status": "success", + "answer": "ok", + "trajectory": "bad", + }, + ) + + with pytest.raises(ValueError, match="completion"): + normalize_task_response_to_eval_state( + case_id="case-6", + response={ + "status": "success", + "answer": "ok", + "completion": {"role": "assistant"}, + }, + ) + + with pytest.raises(ValueError, match="tool_calls"): + normalize_task_response_to_eval_state( + case_id="case-7", + response={ + "status": "success", + "answer": "ok", + "tool_calls": "bad", + }, + ) diff --git a/tests/evaluations/test_llm_user_simulator.py b/tests/evaluations/test_llm_user_simulator.py new file mode 100644 index 000000000..60a11ebc5 --- /dev/null +++ b/tests/evaluations/test_llm_user_simulator.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +import pytest + +from aworld.evaluations.runtime_composition import ( + CallableRuntimeHarness, + LLMUserSimulator, + RolloutState, + RolloutTurn, +) +from aworld.evaluations.substrate import EvalCaseDef + + +@pytest.mark.asyncio +async def test_callable_runtime_harness_awaits_async_simulator(): + class AsyncSimulator: + async def next_turn(self, *, case, target, state, last_output=None): + if any(turn.role == "user" for turn in state.turns): + return None + return RolloutTurn(role="user", content="async 
hello") + + async def assistant_step(*, user_turn, state, case, target): + return {"answer": f"ack:{user_turn.content}"} + + harness = CallableRuntimeHarness( + simulator=AsyncSimulator(), + assistant_step=assistant_step, + max_turns=1, + ) + + state = await harness.run_rollout( + case=EvalCaseDef(case_id="case-1", input={}), + target={}, + ) + + assert state.answer == "ack:async hello" + assert state.turns[0].content == "async hello" + + +@pytest.mark.asyncio +async def test_llm_user_simulator_generates_adaptive_turns_from_context(): + calls = [] + + async def turn_generator(*, case, target, state, last_output, turn_index): + calls.append( + { + "case_id": case.case_id, + "goal": target["goal"], + "last_output": last_output, + "turn_index": turn_index, + "turn_count": len(state.turns), + } + ) + if turn_index == 0: + return "start" + if turn_index == 1: + return { + "content": f"clarify after {last_output}", + "metadata": {"intent": "clarify", "client": object()}, + } + return {"stop": True, "metadata": {"reason": "done"}} + + async def assistant_step(*, user_turn, state, case, target): + return {"answer": f"assistant:{user_turn.content}"} + + harness = CallableRuntimeHarness( + simulator=LLMUserSimulator(turn_generator=turn_generator), + assistant_step=assistant_step, + max_turns=3, + ) + + state = await harness.run_rollout( + case=EvalCaseDef(case_id="case-1", input={}), + target={"goal": "resolve ticket"}, + ) + + assert [turn.content for turn in state.turns if turn.role == "user"] == [ + "start", + "clarify after assistant:start", + ] + assert calls[0]["turn_index"] == 0 + assert calls[1]["last_output"] == "assistant:start" + assert calls[2]["turn_count"] == 4 + assert state.turns[2].metadata["intent"] == "clarify" + assert "client" not in state.trajectory[2]["metadata"] + + +def test_llm_user_simulator_accepts_rollout_turn_output(): + simulator = LLMUserSimulator( + turn_generator=lambda **kwargs: RolloutTurn( + role="user", + content="custom", + 
metadata={"safe": True}, + ) + ) + + turn = simulator.next_turn( + case=EvalCaseDef(case_id="case-1", input={}), + target={}, + state=RolloutState(case_id="case-1"), + last_output=None, + ) + + assert turn == RolloutTurn(role="user", content="custom", metadata={"safe": True}) diff --git a/tests/evaluations/test_runtime_composition.py b/tests/evaluations/test_runtime_composition.py new file mode 100644 index 000000000..8db02fe3a --- /dev/null +++ b/tests/evaluations/test_runtime_composition.py @@ -0,0 +1,351 @@ +from __future__ import annotations + +import pytest +from pydantic import BaseModel + +from aworld.evaluations.base import EvalCriteria +from aworld.evaluations.runtime_composition import ( + CallableRuntimeHarness, + RetryRuntimeHarness, + RolloutState, + RolloutTurn, + ScriptedUserSimulator, + SinglePromptUserSimulator, + StateCheckGrader, + StepReward, + aggregate_step_rewards, + derive_standard_metrics, +) +from aworld.evaluations.scorers import scorer_factory +from aworld.evaluations.substrate import ( + EvalCaseDef, + EvalSuiteDef, + EvaluationFlowDef, + GateMetricCondition, + GatePolicyDef, + JudgeSchemaDef, + TrajectoryScorerDef, + get_builtin_eval_suite, + run_evaluation_flow, +) +from aworld.evaluations.types import MetricNames + + +class RuntimeJudgeOutput(BaseModel): + score: float + verdict: str + + +def test_rollout_state_to_eval_state_excludes_live_handles(): + live_agent = object() + state = RolloutState( + case_id="case-1", + status="success", + answer="done", + turns=[RolloutTurn(role="user", content="hello")], + outcome={"artifact_exists": True}, + metadata={"live_agent": live_agent, "safe": "ok"}, + ) + + eval_state = state.to_eval_state(target={"target_kind": "inline"}) + + assert eval_state.case_id == "case-1" + assert eval_state.answer == "done" + assert eval_state.trajectory + assert eval_state.artifacts["outcome"]["artifact_exists"] is True + assert "live_agent" not in eval_state.metadata + assert eval_state.metadata["safe"] == "ok" 
+ + +def test_state_check_grader_emits_outcome_metric(): + state = RolloutState( + case_id="case-1", + status="success", + outcome={"ticket": {"status": "resolved"}}, + ) + grader = StateCheckGrader( + metric_name="ticket_resolved", + path=("ticket", "status"), + expected="resolved", + ) + + result = grader.grade(state=state, case=None, target={}) + + assert result.metric_name == "ticket_resolved" + assert result.value == 1.0 + assert result.passed is True + + +def test_state_check_grader_fails_non_numeric_comparison_without_crashing(): + state = RolloutState( + case_id="case-1", + status="success", + outcome={"latency_ms": "not-a-number"}, + ) + grader = StateCheckGrader( + metric_name="latency_ok", + path=("latency_ms",), + op="<=", + expected=1000, + ) + + result = grader.grade(state=state, case=None, target={}) + + assert result.metric_name == "latency_ok" + assert result.value == 0.0 + assert result.passed is False + assert "not comparable" in result.reason + assert result.metadata["actual"] == "not-a-number" + + +def test_state_check_grader_rejects_unsupported_operator(): + state = RolloutState( + case_id="case-1", + status="success", + outcome={"latency_ms": 10}, + ) + grader = StateCheckGrader( + metric_name="latency_ok", + path=("latency_ms",), + op="between", + expected=1000, + ) + + with pytest.raises(ValueError, match="unsupported state-check operator"): + grader.grade(state=state, case=None, target={}) + + +def test_scripted_user_simulator_emits_turns_in_order(): + simulator = ScriptedUserSimulator() + state = RolloutState(case_id="case-1") + case = EvalCaseDef(case_id="case-1", input={"turns": ["hi", "again"]}) + + first = simulator.next_turn(case=case, target={}, state=state, last_output=None) + state.turns.append(first) + second = simulator.next_turn(case=case, target={}, state=state, last_output="ok") + + assert first.content == "hi" + assert second.content == "again" + + +def test_single_prompt_user_simulator_emits_one_turn(): + simulator = 
SinglePromptUserSimulator() + case = EvalCaseDef(case_id="case-1", input={"query": "hello"}) + state = RolloutState(case_id="case-1") + + first = simulator.next_turn(case=case, target={}, state=state) + state.turns.append(first) + second = simulator.next_turn(case=case, target={}, state=state) + + assert first.content == "hello" + assert second is None + + +@pytest.mark.asyncio +async def test_runtime_harness_executes_multi_turn_rollout(): + async def assistant_step(*, user_turn, state, case, target): + return { + "answer": f"ack:{user_turn.content}", + "tool_calls": [{"id": f"call-{len(state.turns)}"}], + } + + harness = CallableRuntimeHarness( + simulator=ScriptedUserSimulator(), + assistant_step=assistant_step, + max_turns=2, + ) + case = EvalCaseDef(case_id="case-1", input={"turns": ["hi", "again"]}) + + state = await harness.run_rollout(case=case, target={"target_kind": "inline"}) + + assert state.answer == "ack:again" + assert [turn.role for turn in state.turns] == ["user", "assistant", "user", "assistant"] + assert len(state.trajectory) == 4 + assert state.tool_calls == [{"id": "call-1"}, {"id": "call-3"}] + + +def test_step_rewards_aggregate_into_metrics(): + state = RolloutState( + case_id="case-1", + step_rewards=[ + StepReward(metric_name="process_quality", step_index=0, value=1.0, weight=2.0), + StepReward( + metric_name="process_quality", + step_index=1, + value=0.5, + weight=1.0, + partial_credit=True, + ), + ], + ) + + metrics = aggregate_step_rewards(state) + + assert metrics["process_quality"]["value"] == pytest.approx((1.0 * 2.0 + 0.5) / 3.0) + assert metrics["process_quality_total"]["value"] == pytest.approx(1.5) + assert metrics["process_quality_partial_credit_rate"]["value"] == pytest.approx(0.5) + + +@pytest.mark.asyncio +async def test_retry_wrapper_preserves_failed_attempts(): + attempts = [] + + class FlakyHarness: + async def run_rollout(self, *, case, target): + attempts.append(len(attempts) + 1) + if len(attempts) == 1: + return 
RolloutState(case_id=case.case_id, status="failed", answer="bad") + return RolloutState(case_id=case.case_id, status="success", answer="ok") + + wrapper = RetryRuntimeHarness(base_harness=FlakyHarness(), max_attempts=2) + case = EvalCaseDef(case_id="case-1", input={"query": "hello"}) + + state = await wrapper.run_rollout(case=case, target={}) + + assert state.status == "success" + assert state.answer == "ok" + assert [attempt.status for attempt in state.attempts] == ["failed", "success"] + assert "pass@1" not in state.standard_metrics + assert "pass^1" not in state.standard_metrics + + +@pytest.mark.asyncio +async def test_retry_wrapper_attempts_serialize_without_self_recursion(): + class FlakyHarness: + def __init__(self): + self.calls = 0 + + async def run_rollout(self, *, case, target): + self.calls += 1 + return RolloutState( + case_id=case.case_id, + status="success" if self.calls == 2 else "failed", + answer=f"attempt-{self.calls}", + ) + + wrapper = RetryRuntimeHarness(base_harness=FlakyHarness(), max_attempts=2) + case = EvalCaseDef(case_id="case-1", input={"query": "hello"}) + + state = await wrapper.run_rollout(case=case, target={}) + eval_state = state.to_eval_state(target={}) + state_dict = state.to_dict() + + assert [attempt["answer"] for attempt in eval_state.artifacts["attempts"]] == ["attempt-1", "attempt-2"] + assert [attempt["answer"] for attempt in state_dict["attempts"]] == ["attempt-1", "attempt-2"] + + +def test_rollout_standard_metrics_are_derived(): + state = RolloutState( + case_id="case-1", + turns=[ + RolloutTurn(role="user", content="hello"), + RolloutTurn(role="assistant", content="ok"), + ], + tool_calls=[{"id": "call-1"}], + usage={"prompt_tokens": 3, "completion_tokens": 2, "total_tokens": 5}, + timing={"duration_ms": 120}, + ) + + metrics = derive_standard_metrics(state) + + assert metrics == { + "n_turns": 2, + "n_tool_calls": 1, + "n_tokens": 5, + "duration_ms": 120, + } + + +@pytest.mark.asyncio +async def 
test_runtime_composition_adoption_suite_runs_end_to_end(): + async def assistant_step(*, user_turn, state, case, target): + return { + "answer": "ticket resolved", + "outcome": {"ticket": {"status": "resolved"}}, + "step_rewards": [ + StepReward(metric_name="process_quality", step_index=0, value=1.0, reason="direct resolution") + ], + "tool_calls": [{"id": "call-1", "function": {"name": "resolve_ticket", "arguments": "{}"}}], + "usage": {"total_tokens": 7}, + "timing": {"duration_ms": 25}, + } + + async def fake_judge(case_input, target): + assert target["artifacts"]["outcome"]["ticket"]["status"] == "resolved" + return {"score": 1.0, "verdict": "approved"} + + suite = EvalSuiteDef( + suite_id="runtime-adoption", + cases=[EvalCaseDef(case_id="case-1", input={"query": "resolve ticket"})], + runtime_harness=CallableRuntimeHarness( + simulator=SinglePromptUserSimulator(), + assistant_step=assistant_step, + max_turns=1, + ), + judge_schema=JudgeSchemaDef(output_model=RuntimeJudgeOutput), + judge=fake_judge, + outcome_scorers=( + StateCheckGrader( + metric_name="ticket_resolved", + path=("ticket", "status"), + expected="resolved", + ), + ), + reward_metrics=("process_quality",), + standard_metrics=("n_turns", "n_tool_calls", "n_tokens", "duration_ms"), + trajectory_scorers=( + TrajectoryScorerDef(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, threshold=1.0), + ), + gate_policy=GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=0.9), + GateMetricCondition(metric_name="ticket_resolved", op="==", threshold=1.0), + GateMetricCondition(metric_name="process_quality", op=">=", threshold=1.0), + GateMetricCondition(metric_name="n_turns", op="==", threshold=2), + GateMetricCondition(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, op="==", threshold=1.0), + ) + ), + metadata={"evaluation_purpose": "capability"}, + ) + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, 
suite=suite) + ) + + assert report["gate"]["status"] == "pass" + assert report["metrics"]["ticket_resolved"]["mean"] == pytest.approx(1.0) + assert report["metrics"]["process_quality"]["mean"] == pytest.approx(1.0) + assert report["metrics"]["n_turns"]["mean"] == pytest.approx(2.0) + assert report["results"][0]["metric_details"]["ticket_resolved"]["passed"] is True + assert report["results"][0]["artifacts"]["outcome"]["ticket"]["status"] == "resolved" + assert report["suite_metadata"]["evaluation_purpose"] == "capability" + + +def test_builtin_runtime_composition_adoption_suite_is_registered(): + suite = get_builtin_eval_suite("runtime-composition-adoption") + + assert suite.suite_id == "runtime-composition-adoption" + assert suite.runtime_harness is not None + assert suite.judge_schema.output_model is not None + assert suite.outcome_scorers + assert suite.reward_metrics == ("process_quality",) + assert suite.metadata["evaluation_purpose"] == "capability" + + +def test_runtime_scorer_can_be_selected_by_full_class_name_for_dynamic_metric(): + scorers = scorer_factory( + criterias=[ + EvalCriteria( + metric_name="custom_outcome", + scorer_class="aworld.evaluations.scorers.runtime_composition.RuntimeOutcomeScorer", + scorer_params={ + "grader": { + "path": ["ok"], + "expected": True, + } + }, + ) + ] + ) + + assert scorers[0].__class__.__name__ == "RuntimeOutcomeScorer" diff --git a/tests/evaluations/test_trajectory_log_manual_case.py b/tests/evaluations/test_trajectory_log_manual_case.py new file mode 100644 index 000000000..a4253ed0d --- /dev/null +++ b/tests/evaluations/test_trajectory_log_manual_case.py @@ -0,0 +1,522 @@ +from __future__ import annotations + +import json +import os +import sys +from pathlib import Path +from typing import Any, Mapping + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) + +from aworld.evaluations.sources import AWorldTrajectoryLogSource, create_source_eval_suite +from 
aworld.evaluations.substrate import ( + AgentJudgeBackend, + GateMetricCondition, + GatePolicyDef, + StateCheckGrader, + EvaluationFlowDef, + EvalSuiteDef, + load_agent_markdown, + run_evaluation_flow, +) +from aworld.evaluations.report import validate_evaluator_report +from aworld.evaluations.trajectory_judge import TrajectoryJudgeSchema +from aworld_cli.evaluator_runtime import run_evaluator_source_cli + + +DEFAULT_JUDGE_TIMEOUT_SECONDS = 600.0 + + +class _FakePytestConfig: + def __init__(self, values: Mapping[str, Any]): + self._values = values + + def getoption(self, name: str) -> Any: + return self._values.get(name) + + +def _manual_replay_config(pytest_config: Any) -> dict[str, Any]: + required_options = { + "--task-id": pytest_config.getoption("trajectory_task_id"), + "--trajectory-log": pytest_config.getoption("trajectory_log"), + "--agent-prompt": pytest_config.getoption("trajectory_agent_prompt"), + "--out-dir": pytest_config.getoption("trajectory_out_dir"), + } + missing = [name for name, value in required_options.items() if not value] + if missing: + raise pytest.UsageError( + "manual trajectory replay requires explicit pytest options: " + + ", ".join(missing) + ) + + task_id = required_options["--task-id"] + log_path = Path(str(required_options["--trajectory-log"])).expanduser() + agent_prompt_path = Path(str(required_options["--agent-prompt"])) + out_dir = Path(str(required_options["--out-dir"])) + judge_timeout_seconds = pytest_config.getoption("trajectory_judge_timeout") or DEFAULT_JUDGE_TIMEOUT_SECONDS + return { + "task_id": str(task_id), + "log_path": log_path, + "agent_prompt_path": agent_prompt_path, + "out_dir": out_dir, + "judge_timeout_seconds": float(judge_timeout_seconds), + } + + +def test_manual_replay_config_requires_explicit_pytest_options(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("AWORLD_TRAJECTORY_TASK_ID", "task_from_env") + monkeypatch.setenv("AWORLD_TRAJECTORY_LOG", "~/env/trajectory.log") + 
monkeypatch.setenv("AWORLD_TRAJECTORY_AGENT_PROMPT", "env/agent.md") + monkeypatch.setenv("AWORLD_TRAJECTORY_OUT_DIR", "env/reports") + + with pytest.raises(pytest.UsageError, match="--task-id"): + _manual_replay_config(_FakePytestConfig({})) + + config = _manual_replay_config( + _FakePytestConfig( + { + "trajectory_task_id": "task_from_cli", + "trajectory_log": "~/cli/trajectory.log", + "trajectory_agent_prompt": "cli/agent.md", + "trajectory_out_dir": "cli/reports", + "trajectory_judge_timeout": 12.5, + } + ) + ) + + assert config["task_id"] == "task_from_cli" + assert config["log_path"] == Path("~/cli/trajectory.log").expanduser() + assert config["agent_prompt_path"] == Path("cli/agent.md") + assert config["out_dir"] == Path("cli/reports") + assert config["judge_timeout_seconds"] == 12.5 + + +@pytest.mark.asyncio +async def test_agent_markdown_loads_as_aworld_agent_via_existing_skill_loader( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +): + monkeypatch.setenv("LLM_MODEL_NAME", "test-model") + monkeypatch.setenv("LLM_API_KEY", "test-key") + + agent_md = tmp_path / "agent.md" + agent_md.write_text( + "---\n" + "name: custom trajectory judge\n" + "description: Evaluates trajectories\n" + "tools: Bash, Read\n" + "model: opus\n" + "---\n\n" + "# Judge Contract\n" + "Return strict JSON.\n", + encoding="utf-8", + ) + + agent = await load_agent_markdown(agent_md, agent_id="custom-judge") + + assert agent.name() == "custom-judge" + assert agent.desc() == "Evaluates trajectories" + assert agent.mcp_servers == [] + assert "Return strict JSON." 
in agent.system_prompt + + +@pytest.mark.asyncio +async def test_markdown_agent_judge_backend_runs_loaded_agent_with_runners( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +): + monkeypatch.setenv("LLM_MODEL_NAME", "test-model") + monkeypatch.setenv("LLM_API_KEY", "test-key") + + agent_md = tmp_path / "agent.md" + agent_md.write_text( + "---\n" + "name: trajectory judge\n" + "description: Test judge\n" + "---\n\n" + "You are the test judge.\n", + encoding="utf-8", + ) + calls: dict[str, Any] = {} + + class _FakeTaskResponse: + answer = json.dumps( + { + "weighted_score": 88, + "verdict": "Pass", + "dimensions": {"A1_groundedness": {"score": 4}}, + } + ) + + async def fake_run(input: str, agent: Any, **kwargs: Any) -> _FakeTaskResponse: + calls["input"] = input + calls["agent_name"] = agent.name() + calls["system_prompt"] = agent.system_prompt + return _FakeTaskResponse() + + monkeypatch.setattr("aworld.runner.Runners.run", fake_run) + + backend = AgentJudgeBackend.from_agent_markdown( + agent_md, + backend_id="trajectory-evaluator-agent-md", + prompt_builder=lambda case_input, target, suite: "judge this trajectory", + ) + execution = await backend.execute({}, {}, object()) + + assert calls == { + "input": "judge this trajectory", + "agent_name": "trajectory-evaluator-agent-md", + "system_prompt": "You are the test judge.", + } + assert execution.backend_id == "trajectory-evaluator-agent-md" + assert execution.payload["weighted_score"] == 88 + assert execution.payload["dimensions"]["A1_groundedness"]["score"] == 4 + + +@pytest.mark.asyncio +async def test_trajectory_log_source_default_adapter_populates_tool_calls_and_standard_metrics(tmp_path: Path): + task_id = "task_with_tool" + trajectory = [ + { + "state": { + "input": {"content": "question"}, + "messages": [{"role": "system", "content": "system"}], + }, + "meta": {"step": 1, "pre_agent": "user", "agent_id": "agent"}, + "action": { + "tool_calls": [ + {"function": {"name": "search", "arguments": "{}"}}, + 
{"function": {"name": "open", "arguments": "{\"url\":\"https://example.com\"}"}}, + ], + "is_agent_finished": "False", + }, + }, + { + "state": { + "messages": [ + {"role": "tool", "content": "search result"}, + {"role": "tool", "content": "page text"}, + ], + }, + "meta": {"step": 2, "pre_agent": "agent", "agent_id": "agent"}, + "action": {"content": "final", "is_agent_finished": "True"}, + }, + ] + log_path = tmp_path / "trajectory.log" + log_path.write_text( + repr({"task_id": task_id, "is_sub_task": False, "trajectory": json.dumps(trajectory)}) + + "\n", + encoding="utf-8", + ) + source = AWorldTrajectoryLogSource(path=log_path, task_ids=[task_id], extraction_dir=tmp_path) + record = next(iter(source.iter_records())) + case = source.to_cases()[0] + + state = source.default_adapter().adapt(record=record, case=case, target={}) + + assert [call["name"] for call in state.tool_calls] == ["search", "open"] + assert state.usage == {"total_tokens": 0} + assert state.timing == {"duration_ms": 0} + assert state.standard_metrics["n_turns"] == 2 + assert state.standard_metrics["n_tool_calls"] == 2 + + +def test_trajectory_step_assertion_uses_extracted_num_steps(tmp_path: Path): + extracted_path = tmp_path / "extracted_task.json" + extracted_path.write_text(json.dumps({"num_steps": 81}), encoding="utf-8") + result = { + "state_summary": {"trajectory_steps": 81}, + "metadata": {"extracted_path": str(extracted_path)}, + } + + _assert_report_trajectory_steps_match_extracted(result) + + +def test_source_cli_report_assertion_matches_manual_trajectory_goal(tmp_path: Path): + task_id = "task_for_cli_assertion" + log_path = tmp_path / "trajectory.log" + agent_prompt_path = tmp_path / "agent.md" + report_path = tmp_path / "report.json" + extracted_path = tmp_path / f"extracted_{task_id}.json" + log_path.write_text("log", encoding="utf-8") + agent_prompt_path.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + extracted_path.write_text( + json.dumps( + { + "task_id": 
task_id, + "num_steps": 2, + "final_answer": "final", + "evidence": [{"content": "tool result"}], + } + ), + encoding="utf-8", + ) + report = { + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": "2026-06-10T00:00:00Z", + "suite_id": "trajectory-source-evaluator", + "target": {"target_kind": "source", "target_path": str(log_path)}, + "judge_backend": {"backend_id": "trajectory-evaluator-agent-md"}, + "summary": {"trajectory-source-evaluator": {"score": {"mean": 64.0}}}, + "metrics": { + "score": {"mean": 64.0}, + "has_evidence": {"mean": 1.0}, + "agent_finished": {"mean": 1.0}, + }, + "results": [ + { + "case_id": task_id, + "input": {"task_id": task_id, "trajectory_log": str(log_path)}, + "metrics": { + "score": {"value": 64.0, "status": "PASSED"}, + "has_evidence": {"value": True, "status": "PASSED"}, + "agent_finished": {"value": True, "status": "PASSED"}, + }, + "judge": {"score": 64.0, "verdict": "Marginal", "A1_groundedness": 3}, + "judge_backend": {"backend_id": "trajectory-evaluator-agent-md"}, + "state_summary": {"answer": "final", "trajectory_steps": 2}, + "metadata": {"extracted_path": str(extracted_path)}, + } + ], + "result_counts": {"cases_total": 1, "cases_with_metrics": 1, "cases_with_judge": 1}, + "gate": {"status": "fail", "metric_name": None, "value": None}, + "approval": {"required": False, "resolved": False, "approved": None}, + "automation": { + "gate_status": "fail", + "metric_name": None, + "metric_value": None, + "approval_required": False, + "approval_resolved": False, + "approved": None, + "suggested_exit_code": 2, + "case_count": 1, + "judge_backend": "trajectory-evaluator-agent-md", + "source_kind": "trajectory", + "source_input": str(log_path), + "task_id": task_id, + }, + "source_selection": { + "mode": "source", + "input": str(log_path), + "kind": "trajectory", + "task_id": task_id, + "judge_agent": str(agent_prompt_path), + }, + "report_path": str(report_path), + } + 
report_path.write_text(json.dumps(report), encoding="utf-8") + + _assert_source_cli_trajectory_report_matches_manual_goal( + report, + task_id=task_id, + log_path=log_path, + agent_prompt_path=agent_prompt_path, + ) + + +def _assert_report_trajectory_steps_match_extracted(result: Mapping[str, Any]) -> None: + extracted_path = Path(str(result["metadata"]["extracted_path"])) + extracted = json.loads(extracted_path.read_text(encoding="utf-8")) + assert result["state_summary"]["trajectory_steps"] == extracted["num_steps"] + + +def _assert_source_cli_trajectory_report_matches_manual_goal( + report: Mapping[str, Any], + *, + task_id: str, + log_path: Path, + agent_prompt_path: Path, +) -> None: + validate_evaluator_report(dict(report)) + report_path = Path(str(report["report_path"])) + assert report_path.exists() + assert report["suite_id"] == "trajectory-source-evaluator" + assert report["gate"]["status"] in {"pass", "fail", "needs_approval"} + assert report["metrics"]["has_evidence"]["mean"] == 1.0 + assert report["metrics"]["agent_finished"]["mean"] == 1.0 + assert report["judge_backend"]["backend_id"] == "trajectory-evaluator-agent-md" + + source_selection = report["source_selection"] + assert source_selection["mode"] == "source" + assert source_selection["kind"] == "trajectory" + assert source_selection["task_id"] == task_id + assert Path(str(source_selection["input"])).resolve() == log_path.resolve() + assert Path(str(source_selection["judge_agent"])).resolve() == agent_prompt_path.resolve() + + automation = report["automation"] + assert automation["source_kind"] == "trajectory" + assert automation["task_id"] == task_id + assert Path(str(automation["source_input"])).resolve() == log_path.resolve() + + result = report["results"][0] + assert result["case_id"] == task_id + assert result["judge"]["verdict"] in {"Excellent", "Pass", "Marginal", "Fail"} + assert 0 <= result["judge"]["score"] <= 100 + assert result["state_summary"]["answer"] + assert 
Path(result["metadata"]["extracted_path"]).exists() + _assert_report_trajectory_steps_match_extracted(result) + + extracted = json.loads(Path(result["metadata"]["extracted_path"]).read_text(encoding="utf-8")) + assert extracted["task_id"] == task_id + assert extracted["final_answer"] + assert extracted["evidence"] + + +def _trajectory_judge_prompt(case_input: dict[str, Any], target: dict[str, Any], suite: EvalSuiteDef) -> str: + outcome = (target.get("artifacts") or {}).get("outcome") or {} + extracted_path = outcome.get("extracted_path") + extracted_payload: dict[str, Any] = {} + if extracted_path: + extracted_payload = json.loads(Path(str(extracted_path)).read_text(encoding="utf-8")) + + payload = { + "case": { + "task_id": case_input["task_id"], + "trajectory_log": case_input["trajectory_log"], + }, + "extracted_trajectory": extracted_payload, + "required_output_schema": { + "score": "number, weighted score from 0 to 100", + "verdict": "Excellent|Pass|Marginal|Fail", + "A1_groundedness": "integer 1-5", + "A2_completeness": "integer 1-5", + "A3_relevance": "integer 1-5", + "A4_readability": "integer 1-5", + "B1_tool_use": "integer 1-5", + "B2_efficiency": "integer 1-5", + "B3_compliance": "integer 1-5", + "B4_robustness": "integer 1-5", + "veto_triggered": "boolean", + }, + "instruction": ( + "Apply the trajectory-evaluator agent contract to the extracted trajectory. " + "Do not call tools and do not re-read the raw log; all required evidence is in extracted_trajectory. " + "Return exactly one JSON object matching required_output_schema, with no markdown." 
+ ), + } + return json.dumps(payload, ensure_ascii=False, indent=2) + + +@pytest.mark.asyncio +async def test_manual_trajectory_log_case_runs_end_to_end_for_human_replay(request: pytest.FixtureRequest): + try: + config = _manual_replay_config(request.config) + except pytest.UsageError as exc: + pytest.skip(str(exc)) + task_id = config["task_id"] + log_path = config["log_path"] + agent_prompt_path = config["agent_prompt_path"] + out_dir = config["out_dir"] + judge_timeout_seconds = config["judge_timeout_seconds"] + + if not log_path.exists(): + pytest.skip(f"manual trajectory log not found: {log_path}") + if not agent_prompt_path.exists(): + pytest.skip(f"manual trajectory evaluator agent prompt not found: {agent_prompt_path}") + if not os.getenv("LLM_MODEL_NAME") or not (os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY")): + pytest.skip("real trajectory judge requires LLM_MODEL_NAME and LLM_API_KEY/OPENAI_API_KEY") + + suite = create_source_eval_suite( + suite_id="trajectory-log-manual-replay", + source=AWorldTrajectoryLogSource( + path=log_path, + task_ids=[task_id], + extraction_dir=out_dir, + ), + judge_schema=TrajectoryJudgeSchema.default(), + judge_backend=AgentJudgeBackend.from_agent_markdown( + agent_prompt_path, + backend_id="trajectory-evaluator-agent-md", + prompt_builder=_trajectory_judge_prompt, + timeout_seconds=judge_timeout_seconds, + ), + outcome_scorers=( + StateCheckGrader( + metric_name="has_evidence", + source="outcome", + path=("evidence_blocks",), + op=">", + expected=0, + ), + StateCheckGrader( + metric_name="agent_finished", + source="outcome", + path=("is_finished",), + op="==", + expected=True, + ), + ), + gate_policy=GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=70.0), + GateMetricCondition(metric_name="A1_groundedness", op=">=", threshold=3), + GateMetricCondition(metric_name="has_evidence", op="==", threshold=1.0), + GateMetricCondition(metric_name="agent_finished", op="==", 
threshold=1.0), + ) + ), + metadata={ + "manual_replay": True, + "judge_agent_prompt": str(agent_prompt_path), + "trajectory_log": str(log_path), + }, + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "inline", "value": {"target_path": str(log_path), "target_kind": "trajectory_log"}}, + suite=suite, + ) + ) + + report_dict = report.to_dict() + validate_evaluator_report(report_dict) + out_dir.mkdir(parents=True, exist_ok=True) + report_path = out_dir / f"evaluator_report_{task_id}.json" + report_path.write_text(json.dumps(report_dict, ensure_ascii=False, indent=2), encoding="utf-8") + + assert report["gate"]["status"] in {"pass", "fail", "needs_approval"} + assert report["metrics"]["has_evidence"]["mean"] == 1.0 + assert report["metrics"]["agent_finished"]["mean"] == 1.0 + assert report["judge_backend"]["backend_id"] == "trajectory-evaluator-agent-md" + assert report["results"][0]["judge"]["verdict"] in {"Excellent", "Pass", "Marginal", "Fail"} + assert 0 <= report["results"][0]["judge"]["score"] <= 100 + assert report["results"][0]["state_summary"]["answer"] + assert Path(report["results"][0]["metadata"]["extracted_path"]).exists() + _assert_report_trajectory_steps_match_extracted(report["results"][0]) + assert report_path.exists() + + +def test_manual_trajectory_log_case_runs_via_source_cli_for_human_replay(request: pytest.FixtureRequest): + try: + config = _manual_replay_config(request.config) + except pytest.UsageError as exc: + pytest.skip(str(exc)) + task_id = config["task_id"] + log_path = config["log_path"] + agent_prompt_path = config["agent_prompt_path"] + out_dir = config["out_dir"] + + if not log_path.exists(): + pytest.skip(f"manual trajectory log not found: {log_path}") + if not agent_prompt_path.exists(): + pytest.skip(f"manual trajectory evaluator agent prompt not found: {agent_prompt_path}") + if not os.getenv("LLM_MODEL_NAME") or not (os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY")): + pytest.skip("real 
trajectory judge requires LLM_MODEL_NAME and LLM_API_KEY/OPENAI_API_KEY") + + report = run_evaluator_source_cli( + input=str(log_path), + kind="trajectory", + task_id=task_id, + judge_agent=str(agent_prompt_path), + out_dir=str(out_dir), + ) + + _assert_source_cli_trajectory_report_matches_manual_goal( + report, + task_id=task_id, + log_path=log_path, + agent_prompt_path=agent_prompt_path, + ) diff --git a/tests/fixtures/plugins/evaluator_like/.aworld-plugin/plugin.json b/tests/fixtures/plugins/evaluator_like/.aworld-plugin/plugin.json new file mode 100644 index 000000000..c829ba2cc --- /dev/null +++ b/tests/fixtures/plugins/evaluator_like/.aworld-plugin/plugin.json @@ -0,0 +1,33 @@ +{ + "id": "evaluator-like", + "name": "evaluator-like", + "version": "1.0.0", + "entrypoints": { + "hooks": [ + { + "id": "evaluator-pre-run", + "target": "hooks/pre_run.py", + "scope": "workspace", + "metadata": { + "hook_point": "evaluator.pre_run" + } + }, + { + "id": "evaluator-post-run", + "target": "hooks/post_run.py", + "scope": "workspace", + "metadata": { + "hook_point": "evaluator.post_run" + } + }, + { + "id": "evaluator-render-summary", + "target": "hooks/render_summary.py", + "scope": "workspace", + "metadata": { + "hook_point": "evaluator.render_summary" + } + } + ] + } +} diff --git a/tests/fixtures/plugins/evaluator_like/hooks/.gitignore b/tests/fixtures/plugins/evaluator_like/hooks/.gitignore new file mode 100644 index 000000000..7a60b85e1 --- /dev/null +++ b/tests/fixtures/plugins/evaluator_like/hooks/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +*.pyc diff --git a/tests/fixtures/plugins/evaluator_like/hooks/post_run.py b/tests/fixtures/plugins/evaluator_like/hooks/post_run.py new file mode 100644 index 000000000..e76538a7c --- /dev/null +++ b/tests/fixtures/plugins/evaluator_like/hooks/post_run.py @@ -0,0 +1,8 @@ +import json +from pathlib import Path + + +def handle_event(event, state): + output_path = Path(event["workspace_path"]) / "hook-output.json" + 
output_path.write_text(json.dumps(event["report"]), encoding="utf-8") + return {"action": "allow"} diff --git a/tests/fixtures/plugins/evaluator_like/hooks/pre_run.py b/tests/fixtures/plugins/evaluator_like/hooks/pre_run.py new file mode 100644 index 000000000..711be3dfc --- /dev/null +++ b/tests/fixtures/plugins/evaluator_like/hooks/pre_run.py @@ -0,0 +1,2 @@ +def handle_event(event, state): + return {"metadata": {"hook_tag": "from-pre-run"}} diff --git a/tests/fixtures/plugins/evaluator_like/hooks/render_summary.py b/tests/fixtures/plugins/evaluator_like/hooks/render_summary.py new file mode 100644 index 000000000..7f671d2a4 --- /dev/null +++ b/tests/fixtures/plugins/evaluator_like/hooks/render_summary.py @@ -0,0 +1,2 @@ +def handle_event(event, state): + return {"metadata": {"summary_suffix": "hook-rendered"}} diff --git a/tests/plugins/test_plugin_hooks.py b/tests/plugins/test_plugin_hooks.py index c0dd8adb0..219398d74 100644 --- a/tests/plugins/test_plugin_hooks.py +++ b/tests/plugins/test_plugin_hooks.py @@ -8,6 +8,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) from aworld.plugins.discovery import discover_plugins +from aworld_cli.evaluator_runtime import render_evaluator_summary, run_evaluator_cli from aworld_cli.builtin_plugins.memory_cli.common import append_workspace_session_log from aworld_cli.builtin_plugins.memory_cli.hooks import task_completed as task_completed_hook_module from aworld_cli.plugin_capabilities.hooks import PluginHookResult, load_plugin_hooks @@ -36,6 +37,10 @@ def _get_builtin_memory_plugin_root() -> Path: ) +def _get_evaluator_like_plugin_root() -> Path: + return Path("tests/fixtures/plugins/evaluator_like").resolve() + + def test_load_plugin_hook_entrypoints(): plugin_root = Path("tests/fixtures/plugins/ralph_like").resolve() plugin = discover_plugins([plugin_root])[0] @@ -46,6 +51,89 @@ def test_load_plugin_hook_entrypoints(): assert hooks["stop"][0].entrypoint_id == "loop-stop" +def 
test_evaluator_pre_run_hook_can_annotate_runtime_metadata( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + hooks = load_plugin_hooks(discover_plugins([_get_evaluator_like_plugin_root()])) + + async def fake_run_evaluation_flow(flow): + return { + "report_version": 1, + "suite_id": "app-evaluator", + "target": dict(flow.target), + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "metrics": {"score": {"mean": 0.9}}, + "results": [], + "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, + "approval": {"required": False, "resolved": False, "approved": None}, + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime._load_evaluator_hooks", lambda: hooks) + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_cli(target=str(target)) + + assert report["target"]["hook_tag"] == "from-pre-run" + + +def test_evaluator_post_run_hook_can_capture_report_side_effect( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + hooks = load_plugin_hooks(discover_plugins([_get_evaluator_like_plugin_root()])) + + async def fake_run_evaluation_flow(flow): + return { + "report_version": 1, + "suite_id": "app-evaluator", + "target": dict(flow.target), + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "metrics": {"score": {"mean": 0.9}}, + "results": [], + "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, + "approval": {"required": False, "resolved": False, "approved": None}, + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime._load_evaluator_hooks", lambda: hooks) + 
monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + run_evaluator_cli(target=str(target)) + + assert (tmp_path / "hook-output.json").exists() + + +def test_evaluator_render_summary_hook_can_append_suffix( + monkeypatch: pytest.MonkeyPatch, +) -> None: + hooks = load_plugin_hooks(discover_plugins([_get_evaluator_like_plugin_root()])) + monkeypatch.setattr("aworld_cli.evaluator_runtime._load_evaluator_hooks", lambda: hooks) + + summary = render_evaluator_summary( + { + "suite_id": "app-evaluator", + "gate": {"status": "pass", "value": 0.9}, + "target": {"target_path": str(Path.cwd() / "artifact.txt")}, + } + ) + + assert "hook-rendered" in summary + + +def test_evaluator_hook_contract_is_documented_in_runtime_module() -> None: + content = Path("aworld-cli/src/aworld_cli/evaluator_runtime.py").read_text(encoding="utf-8") + + assert "evaluator.pre_run" in content + assert "event payload" in content + assert "allowed side effects" in content + + @pytest.mark.asyncio async def test_stop_hook_can_block_and_continue_session(tmp_path): plugin_root = Path("tests/fixtures/plugins/ralph_like").resolve() diff --git a/tests/test_plugin_cli_entrypoint.py b/tests/test_plugin_cli_entrypoint.py index 39782ce42..8209d55a3 100644 --- a/tests/test_plugin_cli_entrypoint.py +++ b/tests/test_plugin_cli_entrypoint.py @@ -69,6 +69,17 @@ def test_interactive_command_is_registered_via_plugin_registry(): assert command is not None +def test_evaluator_command_is_registered_via_plugin_registry(): + from aworld_cli import main as main_module + from aworld_cli.core.plugin_manager import get_builtin_plugin_roots + + registry = main_module._build_top_level_command_registry() + command = registry.get("evaluator") + + assert command is not None + assert any(path.name == "evaluator_cli" for path in get_builtin_plugin_roots()) + + def test_acp_command_dispatches_via_plugin_registry(capsys): from aworld_cli import main as main_module diff --git 
a/tests/test_slash_commands.py b/tests/test_slash_commands.py index 28ffdb771..a230d89cb 100644 --- a/tests/test_slash_commands.py +++ b/tests/test_slash_commands.py @@ -18,7 +18,7 @@ from aworld_cli.core.command_system import CommandRegistry, CommandContext from aworld.plugins.discovery import discover_plugins from aworld_cli.plugin_capabilities.commands import register_plugin_commands -from aworld_cli.commands import help_cmd, commit, review, diff, cron_cmd, plugins_cmd +from aworld_cli.commands import help_cmd, commit, review, diff, cron_cmd, plugins_cmd, evaluation_cmd from aworld_cli.console import AWorldCLI @@ -27,7 +27,7 @@ class TestCommandRegistration: def test_commands_registered(self): """Verify all commands are registered.""" - expected_commands = ['help', 'commit', 'review', 'diff', 'cron', 'plugins'] + expected_commands = ['help', 'commit', 'review', 'diff', 'cron', 'plugins', 'evaluation'] for cmd_name in expected_commands: cmd = CommandRegistry.get(cmd_name) assert cmd is not None, f"Command /{cmd_name} not registered" @@ -44,7 +44,7 @@ def test_command_types(self): cmd = CommandRegistry.get(cmd_name) assert cmd.command_type == 'prompt', f"/{cmd_name} should be prompt command" - for cmd_name in ['cron', 'plugins']: + for cmd_name in ['cron', 'plugins', 'evaluation']: tool_cmd = CommandRegistry.get(cmd_name) assert tool_cmd.command_type == 'tool' @@ -59,6 +59,7 @@ def test_list_commands(self): assert 'diff' in command_names assert 'cron' in command_names assert 'plugins' in command_names + assert 'evaluation' in command_names class TestHelpCommand: @@ -79,6 +80,7 @@ async def test_help_command_execution(self): assert '/review' in result assert '/diff' in result assert '/plugins' in result + assert '/evaluation' in result @pytest.mark.asyncio async def test_help_command_with_args(self): @@ -550,9 +552,114 @@ async def _drain_notifications(self, job_id=None): assert remaining[0].job_id == "job-2" +class TestEvaluationCommand: + """Test /evaluation 
command direct execution.""" + + @pytest.mark.asyncio + async def test_evaluation_without_args_shows_usage(self): + cmd = CommandRegistry.get("evaluation") + + result = await cmd.execute(CommandContext(cwd=os.getcwd(), user_args="")) + + assert "Usage:" in result + assert "/evaluation --input" in result + assert "--kind trajectory" in result + + @pytest.mark.asyncio + async def test_evaluation_delegates_to_source_runtime(self, monkeypatch, tmp_path): + cmd = CommandRegistry.get("evaluation") + input_path = tmp_path / "trajectory.log" + agent_path = tmp_path / "agent.md" + calls = {} + + def fake_run_evaluator_source_cli(**kwargs): + calls.update(kwargs) + return { + "suite_id": "trajectory-source-evaluator", + "gate": {"status": "pass"}, + "summary": {"trajectory-source-evaluator": {"score": {"mean": 88.0}}}, + "results": [], + "approval": {"required": False, "resolved": False, "approved": None}, + "report_path": str(tmp_path / "report.json"), + } + + monkeypatch.setattr( + "aworld_cli.commands.evaluation_cmd.run_evaluator_source_cli", + fake_run_evaluator_source_cli, + ) + + result = await cmd.execute( + CommandContext( + cwd=os.getcwd(), + user_args=( + f"--input {input_path} --kind trajectory " + f"--task-id task-1 --judge-agent {agent_path} --out-dir {tmp_path}" + ), + ) + ) + + assert calls["input"] == str(input_path) + assert calls["kind"] == "trajectory" + assert calls["task_id"] == "task-1" + assert calls["judge_agent"] == str(agent_path) + assert calls["out_dir"] == str(tmp_path) + assert "trajectory-source-evaluator" in result + assert "Report:" in result + + @pytest.mark.asyncio + async def test_evaluation_runs_source_runtime_without_nested_event_loop(self, monkeypatch, tmp_path): + cmd = CommandRegistry.get("evaluation") + input_path = tmp_path / "answers.jsonl" + input_path.write_text('{"id":"case-1","input":"question","answer":"answer"}\n', encoding="utf-8") + agent_path = tmp_path / "agent.md" + agent_path.write_text("---\nname: 
judge\n---\nJudge.\n", encoding="utf-8") + + async def fake_run_evaluation_flow(flow): + return { + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": "2026-06-10T00:00:00Z", + "suite_id": "answer-source-evaluator", + "target": flow.target, + "judge_backend": {"backend_id": "source-agent-md"}, + "summary": {"answer-source-evaluator": {"score": {"mean": 88.0}}}, + "metrics": {"score": {"mean": 88.0}}, + "results": [], + "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, + "gate": {"status": "pass", "metric_name": "score", "value": 88.0}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime._load_evaluator_hooks", lambda: {}) + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + result = await cmd.execute( + CommandContext( + cwd=os.getcwd(), + user_args=( + f"--input {input_path} --kind answer " + f"--judge-agent {agent_path} --output {tmp_path / 'report.json'}" + ), + ) + ) + + assert "answer-source-evaluator" in result + assert "Report:" in result + + class TestSlashCommandCompletion: """Test slash command completion sources.""" + def test_console_completion_entries_include_evaluation_command(self): + cli = AWorldCLI() + + words, meta = cli._build_completion_entries(agent_names=[]) + + assert "/evaluation" in words + assert "/evaluation --kind answer" in words + assert "/evaluation --kind trajectory" in words + assert meta["/evaluation"] == "Run evaluator flows" + def test_console_completion_entries_include_cron_subcommands(self): """Typing /cron should expose concrete cron subcommands in the completer source.""" cli = AWorldCLI()