From 6f0d7c43723b44818336ddf4d98c84d8b5a5144a Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Mon, 1 Jun 2026 21:47:35 +0800 Subject: [PATCH 01/41] feat: add suite-backed evaluator flow --- .../src/aworld_cli/evaluator_runtime.py | 56 ++ .../aworld_cli/top_level_commands/__init__.py | 7 +- .../top_level_commands/evaluator_cmd.py | 39 ++ aworld/config/conf.py | 2 + aworld/evaluations/scorers/suite_judge.py | 35 ++ aworld/evaluations/substrate.py | 570 ++++++++++++++++++ aworld/runners/evaluate_runner.py | 2 + .../plans/2026-06-01-evaluation-substrate.md | 312 ++++++++++ tests/core/test_evaluator_runtime.py | 47 ++ .../core/test_evaluator_top_level_command.py | 51 ++ .../evaluations/test_evaluation_substrate.py | 187 ++++++ 11 files changed, 1304 insertions(+), 4 deletions(-) create mode 100644 aworld-cli/src/aworld_cli/evaluator_runtime.py create mode 100644 aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py create mode 100644 aworld/evaluations/scorers/suite_judge.py create mode 100644 aworld/evaluations/substrate.py create mode 100644 docs/superpowers/plans/2026-06-01-evaluation-substrate.md create mode 100644 tests/core/test_evaluator_runtime.py create mode 100644 tests/core/test_evaluator_top_level_command.py create mode 100644 tests/evaluations/test_evaluation_substrate.py diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py new file mode 100644 index 000000000..003164370 --- /dev/null +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +import asyncio +import json +from pathlib import Path + +from aworld.evaluations.substrate import EvaluationFlowDef, resolve_eval_suite, run_evaluation_flow + + +def run_evaluator_cli( + *, + target: str, + suite: str | None = None, + output: str | None = None, + interactive_approval: bool = False, +) -> dict: + target_path = Path(target).expanduser().resolve() + suite_def = resolve_eval_suite(suite, 
target_path) + flow = EvaluationFlowDef( + target={ + "target_path": str(target_path), + "target_kind": "directory" if target_path.is_dir() else "file", + }, + suite=suite_def, + interactive_approval=interactive_approval, + output_path=output, + ) + report = asyncio.run(run_evaluation_flow(flow)) + approval = dict(report.get("approval") or {}) + approval.setdefault("required", report.get("gate", {}).get("status") == "needs_approval") + approval.setdefault("resolved", False) + approval.setdefault("approved", None) + if approval["required"] and interactive_approval: + approved = input("Evaluation requires approval. Approve? [y/N]: ").strip().lower() in {"y", "yes"} + approval["resolved"] = True + approval["approved"] = approved + report["approval"] = approval + if output: + output_path = Path(output).expanduser().resolve() + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8") + return report + + +def render_evaluator_summary(report: dict) -> str: + suite_id = report.get("suite_id", "unknown-suite") + gate = report.get("gate", {}) + status = gate.get("status", "unknown") + metric_value = gate.get("value") + summary_line = f"Evaluator suite: {suite_id}\nGate: {status}" + if metric_value is not None: + summary_line += f" ({metric_value:.2f})" + backend = report.get("judge_backend", {}).get("backend_id") + if backend: + summary_line += f"\nJudge backend: {backend}" + return summary_line diff --git a/aworld-cli/src/aworld_cli/top_level_commands/__init__.py b/aworld-cli/src/aworld_cli/top_level_commands/__init__.py index 03f64e245..06638e690 100644 --- a/aworld-cli/src/aworld_cli/top_level_commands/__init__.py +++ b/aworld-cli/src/aworld_cli/top_level_commands/__init__.py @@ -1,7 +1,6 @@ from __future__ import annotations def register_builtin_top_level_commands(registry) -> None: - # Keep the builtin registry hook so kernel-owned top-level commands can be - # added later. 
The `skill` command is now contributed through the framework - # plugin bootstrap path instead of hardcoded registration here. - return None + from .evaluator_cmd import EvaluatorTopLevelCommand + + registry.register(EvaluatorTopLevelCommand()) diff --git a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py new file mode 100644 index 000000000..cad728fc3 --- /dev/null +++ b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from aworld_cli.evaluator_runtime import render_evaluator_summary, run_evaluator_cli + + +class EvaluatorTopLevelCommand: + @property + def name(self) -> str: + return "evaluator" + + @property + def description(self) -> str: + return "Run a suite-backed evaluation flow for a local target." + + @property + def aliases(self) -> tuple[str, ...]: + return tuple() + + def register_parser(self, subparsers) -> None: + parser = subparsers.add_parser( + "evaluator", + help=self.description, + description=self.description, + prog="aworld-cli evaluator", + ) + parser.add_argument("--target", type=str, required=True) + parser.add_argument("--suite", type=str) + parser.add_argument("--output", type=str) + parser.add_argument("--interactive-approval", action="store_true") + + def run(self, args, context) -> int: + report = run_evaluator_cli( + target=args.target, + suite=args.suite, + output=args.output, + interactive_approval=args.interactive_approval, + ) + print(render_evaluator_summary(report)) + return 0 diff --git a/aworld/config/conf.py b/aworld/config/conf.py index 6d9f27219..14db16980 100644 --- a/aworld/config/conf.py +++ b/aworld/config/conf.py @@ -423,6 +423,8 @@ class EvaluationConfig(BaseConfig): eval_target_full_class_name: str = None eval_target_config: dict = None eval_criterias: List[Union[dict]] = None + eval_suite_id: str = None + eval_dataset: Any = None # eval dataset id or file path, file path 
should be a jsonl file eval_dataset_id_or_file_path: str = None eval_dataset_load_config: Optional[DataLoaderConfig] = DataLoaderConfig() diff --git a/aworld/evaluations/scorers/suite_judge.py b/aworld/evaluations/scorers/suite_judge.py new file mode 100644 index 000000000..fb0852d58 --- /dev/null +++ b/aworld/evaluations/scorers/suite_judge.py @@ -0,0 +1,35 @@ +# coding: utf-8 +from __future__ import annotations + +from aworld.evaluations.base import EvalDataCase, MetricResult, ScorerResult +from aworld.evaluations.scorers import scorer_register +from aworld.evaluations.base import Scorer + + +@scorer_register("score") +class SuiteJudgeScorer(Scorer): + def __init__(self, suite=None, name: str = None, **kwargs): + super().__init__(name=name or getattr(suite, "suite_id", None), **kwargs) + self.suite = suite + + async def score(self, index: int, input: EvalDataCase[dict], output: dict) -> ScorerResult: + if self.suite is None: + raise ValueError("suite judge is required for suite-backed evaluation") + + case_input = dict(input.case_data) + target = dict(case_input.get("_target", {})) + execution = await self.suite.resolve_judge_backend().execute(case_input, target, self.suite) + payload = dict(execution.payload) + self.suite.judge_schema.validate(payload) + + metric_result: MetricResult = { + "value": float(payload["score"]), + "metadata": { + **payload, + "_judge_backend": execution.backend_id, + }, + } + return ScorerResult( + scorer_name=self.name, + metric_results={"score": metric_result}, + ) diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py new file mode 100644 index 000000000..5275f24cd --- /dev/null +++ b/aworld/evaluations/substrate.py @@ -0,0 +1,570 @@ +# coding: utf-8 +from __future__ import annotations + +import json +import inspect +import os +import re +import uuid +from dataclasses import dataclass, field, replace +from pathlib import Path +from typing import Any, Awaitable, Callable, ClassVar, Mapping + +from 
aworld.config.conf import EvaluationConfig +from aworld.evaluations.base import EvalDataCase, EvalDataset +from aworld.evaluations.base import NoActionEvalTarget +from aworld.runners.evaluate_runner import EvaluateRunner + + +JudgeCallable = Callable[[dict[str, Any], dict[str, Any]], Mapping[str, Any] | Awaitable[Mapping[str, Any]]] +JudgeExecutor = Callable[[str, str], Mapping[str, Any] | str | Awaitable[Mapping[str, Any] | str]] + + +@dataclass(frozen=True) +class EvalCaseDef: + case_id: str + input: dict[str, Any] + metadata: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class JudgeSchemaDef: + required_fields: tuple[str, ...] = tuple() + + def validate(self, payload: Mapping[str, Any]) -> None: + missing = [field for field in self.required_fields if field not in payload] + if missing: + joined = ", ".join(missing) + raise ValueError(f"missing required judge fields: {joined}") + + +@dataclass(frozen=True) +class GateDecision: + status: str + metric_name: str + value: float + + +@dataclass(frozen=True) +class GatePolicyDef: + metric_name: str + pass_threshold: float + approval_threshold: float | None = None + + def evaluate(self, metrics: Mapping[str, Any]) -> GateDecision: + value = float(metrics[self.metric_name]) + if value >= self.pass_threshold: + return GateDecision(status="pass", metric_name=self.metric_name, value=value) + if self.approval_threshold is not None and value >= self.approval_threshold: + return GateDecision(status="needs_approval", metric_name=self.metric_name, value=value) + return GateDecision(status="fail", metric_name=self.metric_name, value=value) + + +@dataclass(frozen=True) +class JudgeExecution: + backend_id: str + payload: dict[str, Any] + + +class JudgeBackend: + backend_id: ClassVar[str] = "judge-backend" + + def is_available(self) -> bool: + return True + + async def execute(self, case_input: dict[str, Any], target: dict[str, Any], suite: "EvalSuiteDef") -> JudgeExecution: + raise NotImplementedError + 
+ +@dataclass(frozen=True) +class CallableJudgeBackend: + backend_id: str + judge: JudgeCallable + + async def execute(self, case_input: dict[str, Any], target: dict[str, Any], suite: "EvalSuiteDef") -> JudgeExecution: + payload = await _maybe_await_judge(self.judge, case_input, target) + return JudgeExecution(backend_id=self.backend_id, payload=dict(payload)) + + +@dataclass(frozen=True) +class AgentJudgeBackend: + backend_id: str + system_prompt: str + executor: JudgeExecutor | None = None + prompt_builder: Callable[[dict[str, Any], dict[str, Any], "EvalSuiteDef"], str] | None = None + + def is_available(self) -> bool: + if self.executor is not None: + return True + model_name = os.getenv("LLM_MODEL_NAME") + api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") + return bool(model_name and api_key) + + async def execute(self, case_input: dict[str, Any], target: dict[str, Any], suite: "EvalSuiteDef") -> JudgeExecution: + if not self.is_available(): + raise RuntimeError(f"judge backend '{self.backend_id}' is not available") + prompt_builder = self.prompt_builder or _build_default_judge_prompt + prompt = prompt_builder(case_input, target, suite) + executor = self.executor or _default_agent_judge_executor + response = executor(prompt, self.system_prompt) + if inspect.isawaitable(response): + response = await response + payload = _coerce_judge_payload(response) + return JudgeExecution(backend_id=self.backend_id, payload=payload) + + async def judge(self, case_input: dict[str, Any], target: dict[str, Any], suite: "EvalSuiteDef") -> dict[str, Any]: + execution = await self.execute(case_input, target, suite) + return execution.payload + + +@dataclass(frozen=True) +class FallbackJudgeBackend: + backend_id: str + backends: tuple[JudgeBackend, ...] 
+ + def is_available(self) -> bool: + return any(backend.is_available() for backend in self.backends) + + async def execute(self, case_input: dict[str, Any], target: dict[str, Any], suite: "EvalSuiteDef") -> JudgeExecution: + errors: list[str] = [] + for backend in self.backends: + if not backend.is_available(): + errors.append(f"{backend.backend_id}:unavailable") + continue + try: + return await backend.execute(case_input, target, suite) + except Exception as exc: + errors.append(f"{backend.backend_id}:{exc}") + joined = "; ".join(errors) if errors else "no candidate backend" + raise RuntimeError(f"no judge backend succeeded: {joined}") + + +@dataclass(frozen=True) +class _LegacyJudgeBackendAdapter: + backend: Any + + @property + def backend_id(self) -> str: + return getattr(self.backend, "backend_id", "legacy-judge-backend") + + def is_available(self) -> bool: + available = getattr(self.backend, "is_available", None) + if callable(available): + return bool(available()) + return True + + async def execute(self, case_input: dict[str, Any], target: dict[str, Any], suite: "EvalSuiteDef") -> JudgeExecution: + payload = self.backend.judge(case_input, target, suite) + if inspect.isawaitable(payload): + payload = await payload + return JudgeExecution(backend_id=self.backend_id, payload=dict(payload)) + + +@dataclass(frozen=True) +class EvalSuiteDef: + suite_id: str + cases: list[EvalCaseDef] = field(default_factory=list) + judge_schema: JudgeSchemaDef = field(default_factory=JudgeSchemaDef) + gate_policy: GatePolicyDef | None = None + judge: JudgeCallable | None = None + judge_backend: JudgeBackend | None = None + metadata: dict[str, Any] = field(default_factory=dict) + + def with_cases(self, cases: list[EvalCaseDef]) -> "EvalSuiteDef": + return replace(self, cases=cases) + + def resolve_judge_backend(self) -> JudgeBackend: + if self.judge_backend is not None: + if hasattr(self.judge_backend, "execute"): + return self.judge_backend + if hasattr(self.judge_backend, 
"judge"): + return _LegacyJudgeBackendAdapter(self.judge_backend) + return self.judge_backend + if self.judge is not None: + return CallableJudgeBackend( + backend_id=f"{self.suite_id}-callable", + judge=self.judge, + ) + raise ValueError(f"suite '{self.suite_id}' has no judge backend") + + +@dataclass(frozen=True) +class EvaluationFlowDef: + target: dict[str, Any] + suite: EvalSuiteDef + interactive_approval: bool = False + output_path: str | None = None + + +@dataclass(frozen=True) +class CompiledEvaluationPlan: + suite: EvalSuiteDef + target: dict[str, Any] + dataset: EvalDataset + eval_config: EvaluationConfig + gate_policy: GatePolicyDef | None + + +def _normalize_target(target: dict[str, Any]) -> dict[str, Any]: + normalized = dict(target) + value = normalized.pop("value", None) + if isinstance(value, Mapping): + normalized.update(value) + return normalized + + +def build_eval_dataset(cases: list[EvalCaseDef], target: dict[str, Any]) -> EvalDataset: + dataset_id = uuid.uuid4().hex + normalized_target = _normalize_target(target) + eval_cases = [ + EvalDataCase( + eval_case_id=case.case_id, + eval_dataset_id=dataset_id, + case_data={**case.input, "_target": normalized_target, "_case_metadata": dict(case.metadata)}, + ) + for case in cases + ] + return EvalDataset(eval_dataset_id=dataset_id, eval_dataset_name="suite_eval_dataset", eval_cases=eval_cases) + + +def compile_evaluation_flow(flow: EvaluationFlowDef) -> CompiledEvaluationPlan: + normalized_target = _normalize_target(flow.target) + dataset = build_eval_dataset(flow.suite.cases, normalized_target) + gate_policy = flow.suite.gate_policy or GatePolicyDef(metric_name="score", pass_threshold=0.0) + eval_criteria = { + "metric_name": gate_policy.metric_name, + "threshold": gate_policy.pass_threshold, + "scorer_params": { + "suite": flow.suite, + "name": flow.suite.suite_id, + }, + } + eval_config = EvaluationConfig( + eval_suite_id=flow.suite.suite_id, + eval_target=NoActionEvalTarget(), + 
eval_criterias=[eval_criteria], + eval_dataset=dataset, + ) + return CompiledEvaluationPlan( + suite=flow.suite, + target=normalized_target, + dataset=dataset, + eval_config=eval_config, + gate_policy=flow.suite.gate_policy, + ) + + +def _extract_metric_value(summary: Mapping[str, Any], metric_name: str) -> float: + metric_summary = summary.get(metric_name, {}) + if "mean" in metric_summary: + return float(metric_summary["mean"]) + if "true_rate" in metric_summary: + return float(metric_summary["true_rate"]) + if "value" in metric_summary: + return float(metric_summary["value"]) + raise KeyError(f"metric {metric_name} is missing aggregate summary") + + +async def run_evaluation_flow(flow: EvaluationFlowDef) -> dict[str, Any]: + compiled = compile_evaluation_flow(flow) + eval_result = await EvaluateRunner(config=compiled.eval_config).run() + + suite_summary = eval_result.summary.get(compiled.suite.suite_id, {}) + gate_metrics = {} + gate = None + if compiled.gate_policy is not None: + gate_metrics[compiled.gate_policy.metric_name] = _extract_metric_value( + suite_summary, + compiled.gate_policy.metric_name, + ) + gate = compiled.gate_policy.evaluate(gate_metrics) + + results = [] + report_backend_id = None + for case_result in eval_result.eval_case_results: + score_row = case_result.score_rows.get(compiled.suite.suite_id) + judge_payload = {} + if score_row is not None: + metric_result = score_row.metric_results.get(compiled.gate_policy.metric_name if compiled.gate_policy else "score", {}) + judge_payload = dict(metric_result.get("metadata", {})) + report_backend_id = report_backend_id or judge_payload.pop("_judge_backend", None) + results.append( + { + "case_id": case_result.eval_case_id, + "input": dict(case_result.input.case_data if hasattr(case_result.input, "case_data") else case_result.input), + "judge": judge_payload, + } + ) + + report = { + "report_version": 1, + "suite_id": compiled.suite.suite_id, + "target": dict(compiled.target), + "summary": 
eval_result.summary, + "results": results, + "approval": { + "required": bool(gate and gate.status == "needs_approval"), + "resolved": False, + "approved": None, + }, + } + if report_backend_id is not None: + report["judge_backend"] = {"backend_id": report_backend_id} + if gate is not None: + report["gate"] = { + "status": gate.status, + "metric_name": gate.metric_name, + "value": gate.value, + } + return report + + +def _rank_for_score(score: float) -> str: + if score >= 0.8: + return "Exemplary" + if score >= 0.6: + return "Good" + if score >= 0.4: + return "Mediocre" + return "Fail" + + +def _artifact_quality_score(target_path: Path) -> tuple[float, list[str], list[str]]: + positive: list[str] = [] + improvements: list[str] = [] + score = 0.3 + + if target_path.is_dir(): + files = [item for item in target_path.rglob("*") if item.is_file()] + else: + files = [target_path] + + suffixes = {item.suffix.lower() for item in files} + names = {item.name.lower() for item in files} + + if ".html" in suffixes: + score += 0.15 + positive.append("HTML entrypoints are present for direct artifact review.") + else: + improvements.append("Add a concrete HTML or UI artifact entrypoint for review.") + + if ".css" in suffixes: + score += 0.15 + positive.append("CSS assets suggest dedicated presentation work instead of raw markup only.") + else: + improvements.append("Add explicit CSS styling rather than relying on unstyled defaults.") + + if {".js", ".ts", ".tsx", ".jsx"} & suffixes: + score += 0.1 + positive.append("Interactive source files are present.") + else: + improvements.append("Add explicit interactive behavior coverage where the experience depends on it.") + + if {"readme.md", "README.md"} & names: + score += 0.1 + positive.append("Project metadata or usage notes are present.") + else: + improvements.append("Document the artifact so evaluators can understand intended behavior quickly.") + + if len(files) >= 3: + score += 0.1 + positive.append("The target contains multiple 
assets, which usually indicates a more complete deliverable.") + else: + improvements.append("Package the target with its supporting assets rather than a single thin file.") + + if any(item.suffix.lower() in {".png", ".jpg", ".jpeg", ".svg", ".webp"} for item in files): + score += 0.1 + positive.append("Visual assets are included for richer presentation.") + else: + improvements.append("Include branded or supporting visual assets to improve evaluability.") + + return min(score, 0.95), positive, improvements + + +def _app_evaluator_judge(case_input: dict[str, Any], target: dict[str, Any]) -> dict[str, Any]: + target_path = Path(target["target_path"]) + score, positive, improvements = _artifact_quality_score(target_path) + rank = _rank_for_score(score) + praise = positive[0] if positive else "The artifact is present and can be evaluated." + criticism = improvements[0] if improvements else "The artifact still needs a stronger end-to-end product signal." + advice = " ".join(improvements[:2]) if improvements else "Raise the visual polish and make the main experience more explicit." 
+ return { + "score": round(score, 2), + "rank": rank, + "criticism": criticism, + "praise": praise, + "improvement_advice": advice, + } + + +async def _maybe_await_judge(judge: JudgeCallable, case_input: dict[str, Any], target: dict[str, Any]) -> Mapping[str, Any]: + payload = judge(case_input, target) + if inspect.isawaitable(payload): + return await payload + return payload + + +def _load_app_evaluator_skill_prompt() -> str: + skill_path = Path(__file__).resolve().parents[2] / "aworld-skills" / "app_evaluator" / "SKILL.md" + return skill_path.read_text(encoding="utf-8") + + +def _snapshot_text_for_file(path: Path, *, max_chars: int = 1600) -> str | None: + if path.suffix.lower() not in {".html", ".css", ".js", ".ts", ".tsx", ".jsx", ".md", ".json", ".txt"}: + return None + try: + return path.read_text(encoding="utf-8", errors="ignore")[:max_chars] + except Exception: + return None + + +def _build_target_snapshot(target: dict[str, Any], *, max_files: int = 6) -> dict[str, Any]: + target_path = Path(target["target_path"]) + files = [target_path] + if target_path.is_dir(): + files = sorted([item for item in target_path.rglob("*") if item.is_file()])[:max_files] + snapshot_files = [] + for item in files: + snapshot_files.append( + { + "path": str(item), + "name": item.name, + "suffix": item.suffix.lower(), + "preview": _snapshot_text_for_file(item), + } + ) + return { + "target_path": str(target_path), + "target_kind": target.get("target_kind", "directory" if target_path.is_dir() else "file"), + "files": snapshot_files, + } + + +def _build_default_judge_prompt(case_input: dict[str, Any], target: dict[str, Any], suite: EvalSuiteDef) -> str: + snapshot = _build_target_snapshot(target) + target_name = Path(target["target_path"]).name + return ( + "Evaluate the following app artifact snapshot.\n" + f"Suite: {suite.suite_id}\n" + f"Target: {target['target_path']}\n" + f"Case input: {json.dumps(case_input, ensure_ascii=False)}\n" + f"Artifact snapshot: 
{json.dumps(snapshot, ensure_ascii=False)}\n" + "Return a JSON object with a `results` array containing exactly one item for " + f"`{target_name}` and include `score`, `rank`, `criticism`, `praise`, and `improvement_advice`." + ) + + +def _extract_json_object(text: str) -> dict[str, Any]: + stripped = text.strip() + try: + loaded = json.loads(stripped) + if isinstance(loaded, dict): + return loaded + except json.JSONDecodeError: + pass + + matches = re.findall(r"\{.*\}", stripped, re.DOTALL) + for candidate in matches: + try: + loaded = json.loads(candidate) + if isinstance(loaded, dict): + return loaded + except json.JSONDecodeError: + continue + raise ValueError("judge response does not contain a valid JSON object") + + +def _coerce_judge_payload(response: Mapping[str, Any] | str) -> dict[str, Any]: + if isinstance(response, str): + response = _extract_json_object(response) + else: + response = dict(response) + + if "results" in response: + results = response.get("results") or [] + if not results: + raise ValueError("judge response results array is empty") + return dict(results[0]) + return dict(response) + + +async def _default_agent_judge_executor(prompt: str, system_prompt: str) -> str: + from aworld.agents.llm_agent import Agent + from aworld.config.conf import AgentConfig + from aworld.core.context.base import Context + from aworld.utils.run_util import exec_agent + + api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") + model_name = os.getenv("LLM_MODEL_NAME") + if not api_key or not model_name: + raise RuntimeError("LLM_MODEL_NAME and LLM_API_KEY/OPENAI_API_KEY are required for agent judge backend") + + agent = Agent( + name="evaluation_judge", + conf=AgentConfig( + llm_provider=os.getenv("LLM_PROVIDER", "openai"), + llm_model_name=model_name, + llm_temperature=float(os.getenv("LLM_TEMPERATURE", "0.1")), + llm_base_url=os.getenv("LLM_BASE_URL"), + llm_api_key=api_key, + ), + system_prompt=system_prompt, + ) + response = await 
exec_agent(prompt, agent=agent, context=Context()) + return str(response.answer) + + +def get_builtin_eval_suite(name: str, judge_backend: JudgeBackend | None = None) -> EvalSuiteDef: + if name != "app-evaluator": + raise KeyError(name) + + return EvalSuiteDef( + suite_id="app-evaluator", + judge_schema=JudgeSchemaDef( + required_fields=( + "score", + "rank", + "criticism", + "praise", + "improvement_advice", + ) + ), + gate_policy=GatePolicyDef( + metric_name="score", + pass_threshold=0.8, + approval_threshold=0.6, + ), + judge_backend=judge_backend + or FallbackJudgeBackend( + backend_id="app-evaluator-fallback", + backends=( + AgentJudgeBackend( + backend_id="app-evaluator-agent", + system_prompt=_load_app_evaluator_skill_prompt(), + prompt_builder=_build_default_judge_prompt, + ), + CallableJudgeBackend( + backend_id="app-evaluator-heuristic", + judge=_app_evaluator_judge, + ), + ), + ), + metadata={ + "rubric_source": "aworld-skills/app_evaluator/SKILL.md", + "preferred_backend": "app-evaluator-agent", + }, + ) + + +def resolve_eval_suite(name: str | None, target: str | Path) -> EvalSuiteDef: + target_path = Path(target) + suite_name = name or "app-evaluator" + suite = get_builtin_eval_suite(suite_name) + case = EvalCaseDef( + case_id=target_path.name or "target", + input={ + "target_path": str(target_path), + "target_kind": "directory" if target_path.is_dir() else "file", + }, + ) + return suite.with_cases([case]) diff --git a/aworld/runners/evaluate_runner.py b/aworld/runners/evaluate_runner.py index 591029c1f..27e4c5bdc 100644 --- a/aworld/runners/evaluate_runner.py +++ b/aworld/runners/evaluate_runner.py @@ -126,6 +126,8 @@ async def load_dataset(self, eval_config: EvaluationConfig) -> EvalDataset: Returns: EvalDataset """ + if getattr(eval_config, "eval_dataset", None) is not None: + return eval_config.eval_dataset if self._is_file_path(eval_config.eval_dataset_id_or_file_path): dataset = Dataset[Dict[str, Any]](name="my_dataset", data=[]) 
preload_transform = None diff --git a/docs/superpowers/plans/2026-06-01-evaluation-substrate.md b/docs/superpowers/plans/2026-06-01-evaluation-substrate.md new file mode 100644 index 000000000..c66b723d7 --- /dev/null +++ b/docs/superpowers/plans/2026-06-01-evaluation-substrate.md @@ -0,0 +1,312 @@ +# Evaluation Substrate Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build AWorld's internal evaluation substrate, add a working `aworld-cli evaluator` flow, and express `app_evaluator` as a suite-backed evaluation without breaking legacy evaluation APIs. + +**Architecture:** Add a small internal definition-and-compilation layer under `aworld.evaluations`, keep execution on top of existing `EvaluateRunner`/`EvalTarget`/`Scorer`, then wire a new CLI top-level command onto that substrate. `app_evaluator` becomes the first built-in suite definition and gateable report flow. 
+ +**Tech Stack:** Python, dataclasses, existing AWorld evaluation runtime, argparse-based CLI, pytest, unittest-style async tests where already used + +--- + +### Task 1: Add substrate contracts and red-path tests + +**Files:** +- Create: `tests/evaluations/test_evaluation_substrate.py` +- Create: `aworld/evaluations/substrate.py` +- Modify: `aworld/config/conf.py` + +- [ ] **Step 1: Write the failing substrate contract tests** + +```python +from aworld.evaluations.substrate import ( + EvalSuiteDef, + EvalCaseDef, + JudgeSchemaDef, + GatePolicyDef, + EvaluationFlowDef, + compile_evaluation_flow, +) + + +def test_compile_evaluation_flow_preserves_legacy_runner_inputs(): + suite = EvalSuiteDef( + suite_id="demo-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge_schema=JudgeSchemaDef(required_fields=("score", "rank")), + gate_policy=GatePolicyDef(metric_name="score", pass_threshold=0.8), + ) + flow = EvaluationFlowDef(target={"kind": "dict", "value": {"answer": "hello"}}, suite=suite) + + compiled = compile_evaluation_flow(flow) + + assert compiled.eval_config.eval_dataset_id_or_file_path is None + assert compiled.dataset.eval_cases[0].case_data["query"] == "hello" + assert compiled.gate_policy.metric_name == "score" +``` + +- [ ] **Step 2: Run the substrate tests to verify they fail** + +Run: `pytest tests/evaluations/test_evaluation_substrate.py -q` +Expected: FAIL with import or missing symbol errors for the new substrate module and compile helpers. 
+ +- [ ] **Step 3: Implement the minimal substrate layer** + +```python +@dataclass +class GatePolicyDef: + metric_name: str + pass_threshold: float + approval_threshold: float | None = None + + +def compile_evaluation_flow(flow: EvaluationFlowDef) -> CompiledEvaluationPlan: + dataset = build_eval_dataset(flow.suite.cases) + eval_config = EvaluationConfig( + eval_criterias=[], + eval_dataset_id_or_file_path=None, + ) + return CompiledEvaluationPlan( + suite=flow.suite, + dataset=dataset, + eval_config=eval_config, + gate_policy=flow.suite.gate_policy, + ) +``` + +- [ ] **Step 4: Re-run the substrate tests** + +Run: `pytest tests/evaluations/test_evaluation_substrate.py -q` +Expected: PASS + +- [ ] **Step 5: Commit the substrate contract slice** + +```bash +git add tests/evaluations/test_evaluation_substrate.py aworld/evaluations/substrate.py aworld/config/conf.py +git commit -m "feat: add evaluation substrate contracts" +``` + +### Task 2: Add schema validation and gate decisions + +**Files:** +- Modify: `tests/evaluations/test_evaluation_substrate.py` +- Modify: `aworld/evaluations/substrate.py` + +- [ ] **Step 1: Write failing tests for schema validation and gate outcomes** + +```python +def test_gate_policy_returns_needs_approval_between_thresholds(): + decision = GatePolicyDef( + metric_name="score", + pass_threshold=0.85, + approval_threshold=0.6, + ).evaluate({"score": 0.7}) + + assert decision.status == "needs_approval" + + +def test_judge_schema_rejects_missing_required_fields(): + schema = JudgeSchemaDef(required_fields=("score", "rank", "criticism")) + + with pytest.raises(ValueError, match="missing required judge fields"): + schema.validate({"score": 0.8, "rank": "Good"}) +``` + +- [ ] **Step 2: Run the tests to verify they fail** + +Run: `pytest tests/evaluations/test_evaluation_substrate.py -q` +Expected: FAIL because schema validation and gate evaluation are not implemented yet. 
+ +- [ ] **Step 3: Implement schema validation and gate decisions** + +```python +def validate(self, payload: Mapping[str, Any]) -> None: + missing = [field for field in self.required_fields if field not in payload] + if missing: + raise ValueError(f"missing required judge fields: {', '.join(missing)}") + + +def evaluate(self, metrics: Mapping[str, Any]) -> GateDecision: + score = float(metrics[self.metric_name]) + if score >= self.pass_threshold: + return GateDecision(status="pass", metric_name=self.metric_name, value=score) + if self.approval_threshold is not None and score >= self.approval_threshold: + return GateDecision(status="needs_approval", metric_name=self.metric_name, value=score) + return GateDecision(status="fail", metric_name=self.metric_name, value=score) +``` + +- [ ] **Step 4: Re-run the substrate tests** + +Run: `pytest tests/evaluations/test_evaluation_substrate.py -q` +Expected: PASS + +- [ ] **Step 5: Commit the schema-and-gate slice** + +```bash +git add tests/evaluations/test_evaluation_substrate.py aworld/evaluations/substrate.py +git commit -m "feat: add evaluation schema and gate decisions" +``` + +### Task 3: Add the CLI evaluator command + +**Files:** +- Create: `tests/core/test_evaluator_top_level_command.py` +- Create: `aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py` +- Create: `aworld-cli/src/aworld_cli/evaluator_runtime.py` +- Modify: `aworld-cli/src/aworld_cli/top_level_commands/__init__.py` +- Modify: `aworld-cli/src/aworld_cli/main.py` + +- [ ] **Step 1: Write the failing evaluator CLI tests** + +```python +def test_registry_registers_builtin_evaluator_command(): + registry = main_module._build_top_level_command_registry() + command = registry.get("evaluator") + assert command is not None + + +def test_maybe_dispatch_top_level_command_runs_evaluator(monkeypatch, tmp_path, capsys): + target = tmp_path / "artifact.txt" + target.write_text("demo", encoding="utf-8") + monkeypatch.setattr( + 
"aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_cli", + lambda **kwargs: {"gate": {"status": "pass"}, "suite_id": "app-evaluator"}, + ) + + handled = main_module._maybe_dispatch_top_level_command( + ["aworld-cli", "evaluator", "--target", str(target)] + ) + + assert handled is True +``` + +- [ ] **Step 2: Run the evaluator CLI tests to verify they fail** + +Run: `pytest tests/core/test_evaluator_top_level_command.py -q` +Expected: FAIL because the evaluator command is not registered yet. + +- [ ] **Step 3: Implement the minimal evaluator command and runtime** + +```python +class EvaluatorTopLevelCommand: + @property + def name(self) -> str: + return "evaluator" + + def register_parser(self, subparsers) -> None: + parser = subparsers.add_parser("evaluator", help=self.description) + parser.add_argument("--target", required=True) + parser.add_argument("--suite") + parser.add_argument("--output") + + def run(self, args, context) -> int: + result = run_evaluator_cli(target=args.target, suite=args.suite, output=args.output) + print(render_evaluator_summary(result)) + return 0 +``` + +- [ ] **Step 4: Re-run the evaluator CLI tests** + +Run: `pytest tests/core/test_evaluator_top_level_command.py -q` +Expected: PASS + +- [ ] **Step 5: Commit the evaluator command slice** + +```bash +git add tests/core/test_evaluator_top_level_command.py aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py aworld-cli/src/aworld_cli/evaluator_runtime.py aworld-cli/src/aworld_cli/top_level_commands/__init__.py aworld-cli/src/aworld_cli/main.py +git commit -m "feat: add evaluator top level command" +``` + +### Task 4: Add the built-in app evaluator suite and end-to-end report wiring + +**Files:** +- Create: `tests/evaluations/test_app_evaluator_suite.py` +- Modify: `aworld/evaluations/substrate.py` +- Modify: `aworld-skills/app_evaluator/SKILL.md` +- Modify: `aworld-cli/src/aworld_cli/evaluator_runtime.py` + +- [ ] **Step 1: Write the failing app evaluator suite tests** + 
+```python +from aworld.evaluations.substrate import get_builtin_eval_suite + + +def test_app_evaluator_suite_requires_expected_judge_fields(): + suite = get_builtin_eval_suite("app-evaluator") + + assert suite.judge_schema.required_fields == ( + "score", + "rank", + "criticism", + "praise", + "improvement_advice", + ) + + +def test_app_evaluator_suite_uses_threshold_gate(): + suite = get_builtin_eval_suite("app-evaluator") + + assert suite.gate_policy.metric_name == "score" +``` + +- [ ] **Step 2: Run the app evaluator suite tests to verify they fail** + +Run: `pytest tests/evaluations/test_app_evaluator_suite.py -q` +Expected: FAIL because the builtin app evaluator suite registry does not exist yet. + +- [ ] **Step 3: Implement the builtin app evaluator suite and CLI result persistence** + +```python +def get_builtin_eval_suite(name: str) -> EvalSuiteDef: + if name != "app-evaluator": + raise KeyError(name) + return EvalSuiteDef( + suite_id="app-evaluator", + judge_schema=JudgeSchemaDef( + required_fields=( + "score", + "rank", + "criticism", + "praise", + "improvement_advice", + ) + ), + gate_policy=GatePolicyDef(metric_name="score", pass_threshold=0.8, approval_threshold=0.6), + ) +``` + +- [ ] **Step 4: Re-run the app evaluator suite tests and the focused CLI/substrate suite** + +Run: `pytest tests/evaluations/test_app_evaluator_suite.py tests/evaluations/test_evaluation_substrate.py tests/core/test_evaluator_top_level_command.py -q` +Expected: PASS + +- [ ] **Step 5: Commit the built-in suite slice** + +```bash +git add tests/evaluations/test_app_evaluator_suite.py aworld/evaluations/substrate.py aworld-skills/app_evaluator/SKILL.md aworld-cli/src/aworld_cli/evaluator_runtime.py +git commit -m "feat: add builtin app evaluator suite" +``` + +### Task 5: Full focused verification + +**Files:** +- Test: `tests/evaluations/test_evaluation_substrate.py` +- Test: `tests/evaluations/test_app_evaluator_suite.py` +- Test: 
`tests/core/test_evaluator_top_level_command.py` +- Test: `tests/evaluations/test_dataset_evaluate.py` + +- [ ] **Step 1: Run the focused verification suite** + +Run: `pytest tests/evaluations/test_evaluation_substrate.py tests/evaluations/test_app_evaluator_suite.py tests/core/test_evaluator_top_level_command.py tests/evaluations/test_dataset_evaluate.py -q` +Expected: PASS + +- [ ] **Step 2: Sanity-check the CLI help output** + +Run: `python -m aworld_cli.main evaluator --help` +Expected: exit 0 and help output showing `--target`, `--suite`, and `--output` + +- [ ] **Step 3: Validate the OpenSpec change remains consistent** + +Run: `openspec validate aworld-evaluation-substrate-2026-06-01` +Expected: `Change 'aworld-evaluation-substrate-2026-06-01' is valid` diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py new file mode 100644 index 000000000..ab4d8f02c --- /dev/null +++ b/tests/core/test_evaluator_runtime.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +import json +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) + +from aworld_cli.evaluator_runtime import run_evaluator_cli + + +def test_run_evaluator_cli_persists_approval_state( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + output = tmp_path / "report.json" + + async def fake_run_evaluation_flow(flow): + return { + "report_version": 1, + "suite_id": "app-evaluator", + "judge_backend": {"backend_id": "stub-agent"}, + "summary": {"app-evaluator": {"score": {"mean": 0.7}}}, + "results": [], + "gate": {"status": "needs_approval", "metric_name": "score", "value": 0.7}, + "approval": {"required": True, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + 
monkeypatch.setattr("builtins.input", lambda _: "y") + + report = run_evaluator_cli( + target=str(target), + interactive_approval=True, + output=str(output), + ) + + persisted = json.loads(output.read_text(encoding="utf-8")) + + assert report["approval"]["resolved"] is True + assert report["approval"]["approved"] is True + assert persisted["approval"]["approved"] is True + assert persisted["judge_backend"]["backend_id"] == "stub-agent" diff --git a/tests/core/test_evaluator_top_level_command.py b/tests/core/test_evaluator_top_level_command.py new file mode 100644 index 000000000..4d1ccf1cc --- /dev/null +++ b/tests/core/test_evaluator_top_level_command.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) + +from aworld_cli import main as main_module + + +def test_registry_registers_builtin_evaluator_command() -> None: + registry = main_module._build_top_level_command_registry() + + command = registry.get("evaluator") + + assert command is not None + assert command.name == "evaluator" + + +def test_maybe_dispatch_top_level_command_runs_evaluator_command( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + + def fake_run_evaluator_cli(**kwargs): + assert kwargs["target"] == str(target) + return { + "suite_id": "app-evaluator", + "gate": {"status": "pass"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + } + + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_cli", + fake_run_evaluator_cli, + ) + + handled = main_module._maybe_dispatch_top_level_command( + ["aworld-cli", "evaluator", "--target", str(target)] + ) + output = capsys.readouterr().out + + assert handled is True + assert "app-evaluator" in output + assert "pass" in 
output diff --git a/tests/evaluations/test_evaluation_substrate.py b/tests/evaluations/test_evaluation_substrate.py new file mode 100644 index 000000000..5359fbdd5 --- /dev/null +++ b/tests/evaluations/test_evaluation_substrate.py @@ -0,0 +1,187 @@ +from __future__ import annotations + +import pytest + +from aworld.evaluations.base import EvaluationConfig +from aworld.evaluations.substrate import ( + AgentJudgeBackend, + EvalCaseDef, + EvalSuiteDef, + EvaluationFlowDef, + GatePolicyDef, + JudgeSchemaDef, + compile_evaluation_flow, + get_builtin_eval_suite, + run_evaluation_flow, +) + + +def test_compile_evaluation_flow_builds_inline_dataset_and_gate_config() -> None: + suite = EvalSuiteDef( + suite_id="demo-suite", + cases=[ + EvalCaseDef( + case_id="case-1", + input={"query": "hello world"}, + ) + ], + judge_schema=JudgeSchemaDef(required_fields=("score", "rank")), + gate_policy=GatePolicyDef(metric_name="score", pass_threshold=0.8), + ) + flow = EvaluationFlowDef( + target={"kind": "inline", "value": {"target_path": "demo.txt"}}, + suite=suite, + ) + + compiled = compile_evaluation_flow(flow) + + assert isinstance(compiled.eval_config, EvaluationConfig) + assert compiled.eval_config.eval_dataset is compiled.dataset + assert compiled.dataset.eval_cases[0].case_data["query"] == "hello world" + assert compiled.dataset.eval_cases[0].case_data["_target"]["target_path"] == "demo.txt" + assert compiled.gate_policy.metric_name == "score" + + +def test_judge_schema_validation_rejects_missing_fields() -> None: + schema = JudgeSchemaDef(required_fields=("score", "rank", "criticism")) + + with pytest.raises(ValueError, match="missing required judge fields"): + schema.validate({"score": 0.8, "rank": "Good"}) + + +def test_gate_policy_uses_pass_and_approval_thresholds() -> None: + gate = GatePolicyDef( + metric_name="score", + pass_threshold=0.85, + approval_threshold=0.6, + ) + + assert gate.evaluate({"score": 0.9}).status == "pass" + assert gate.evaluate({"score": 
0.7}).status == "needs_approval" + assert gate.evaluate({"score": 0.5}).status == "fail" + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_executes_suite_judge_and_returns_gate() -> None: + async def fake_judge(case_input, target): + assert case_input["query"] == "hello" + assert target["target_path"] == "artifact.txt" + return { + "score": 0.7, + "rank": "Good", + "criticism": "Needs stronger hierarchy.", + "praise": "The layout is clear.", + "improvement_advice": "Increase contrast around the hero area.", + } + + suite = EvalSuiteDef( + suite_id="demo-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge_schema=JudgeSchemaDef( + required_fields=( + "score", + "rank", + "criticism", + "praise", + "improvement_advice", + ) + ), + gate_policy=GatePolicyDef( + metric_name="score", + pass_threshold=0.85, + approval_threshold=0.6, + ), + judge=fake_judge, + ) + flow = EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + + report = await run_evaluation_flow(flow) + + assert report["suite_id"] == "demo-suite" + assert report["gate"]["status"] == "needs_approval" + assert report["results"][0]["judge"]["rank"] == "Good" + assert report["summary"]["demo-suite"]["score"]["mean"] == pytest.approx(0.7) + + +def test_builtin_app_evaluator_suite_has_required_schema_and_score_gate() -> None: + suite = get_builtin_eval_suite("app-evaluator") + + assert suite.suite_id == "app-evaluator" + assert suite.judge_schema.required_fields == ( + "score", + "rank", + "criticism", + "praise", + "improvement_advice", + ) + assert suite.gate_policy.metric_name == "score" + + +@pytest.mark.asyncio +async def test_agent_judge_backend_parses_app_evaluator_json_payload() -> None: + async def fake_executor(prompt: str, system_prompt: str): + assert "artifact.txt" in prompt + assert "UI review committee" in system_prompt + return { + "results": [ + { + "filename": "artifact.txt", + "score": 0.91, + "rank": 
"Exemplary", + "criticism": "Almost none.", + "praise": "Strong visual hierarchy.", + "improvement_advice": "Keep the current direction.", + } + ] + } + + backend = AgentJudgeBackend( + backend_id="agent-backend", + system_prompt="You are a UI review committee.", + executor=fake_executor, + ) + + payload = await backend.judge( + case_input={"target_path": "artifact.txt"}, + target={"target_path": "artifact.txt", "target_kind": "file"}, + suite=EvalSuiteDef(suite_id="app-evaluator"), + ) + + assert payload["score"] == pytest.approx(0.91) + assert payload["rank"] == "Exemplary" + + +@pytest.mark.asyncio +async def test_builtin_app_evaluator_can_use_injected_judge_backend() -> None: + class StubBackend: + backend_id = "stub-agent" + + def is_available(self) -> bool: + return True + + async def judge(self, case_input, target, suite): + return { + "score": 0.72, + "rank": "Good", + "criticism": "Needs slightly better spacing.", + "praise": "Solid composition.", + "improvement_advice": "Increase whitespace around the main section.", + } + + suite = get_builtin_eval_suite("app-evaluator", judge_backend=StubBackend()).with_cases( + [EvalCaseDef(case_id="artifact", input={"target_path": "artifact.txt"})] + ) + flow = EvaluationFlowDef( + target={"target_path": "artifact.txt", "target_kind": "file"}, + suite=suite, + ) + + report = await run_evaluation_flow(flow) + + assert report["judge_backend"]["backend_id"] == "stub-agent" + assert report["results"][0]["judge"]["rank"] == "Good" + assert report["report_version"] == 1 + assert report["approval"]["required"] is True From b4edd9140a2ac8bc47050e8f1667da9de7e19acd Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Mon, 1 Jun 2026 21:56:50 +0800 Subject: [PATCH 02/41] feat: add evaluator judge timeout fallback --- aworld/evaluations/substrate.py | 27 +++++++++++-- .../evaluations/test_evaluation_substrate.py | 40 +++++++++++++++++++ 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/aworld/evaluations/substrate.py 
b/aworld/evaluations/substrate.py index 5275f24cd..823848a6c 100644 --- a/aworld/evaluations/substrate.py +++ b/aworld/evaluations/substrate.py @@ -81,6 +81,9 @@ class CallableJudgeBackend: backend_id: str judge: JudgeCallable + def is_available(self) -> bool: + return True + async def execute(self, case_input: dict[str, Any], target: dict[str, Any], suite: "EvalSuiteDef") -> JudgeExecution: payload = await _maybe_await_judge(self.judge, case_input, target) return JudgeExecution(backend_id=self.backend_id, payload=dict(payload)) @@ -92,6 +95,7 @@ class AgentJudgeBackend: system_prompt: str executor: JudgeExecutor | None = None prompt_builder: Callable[[dict[str, Any], dict[str, Any], "EvalSuiteDef"], str] | None = None + timeout_seconds: float | None = None def is_available(self) -> bool: if self.executor is not None: @@ -106,9 +110,25 @@ async def execute(self, case_input: dict[str, Any], target: dict[str, Any], suit prompt_builder = self.prompt_builder or _build_default_judge_prompt prompt = prompt_builder(case_input, target, suite) executor = self.executor or _default_agent_judge_executor - response = executor(prompt, self.system_prompt) - if inspect.isawaitable(response): - response = await response + async def _run_executor(): + result = executor(prompt, self.system_prompt) + if inspect.isawaitable(result): + return await result + return result + + if self.timeout_seconds is not None: + task = asyncio.create_task(_run_executor()) + try: + response = await asyncio.wait_for(task, timeout=self.timeout_seconds) + except Exception: + task.cancel() + try: + await task + except BaseException: + pass + raise + else: + response = await _run_executor() payload = _coerce_judge_payload(response) return JudgeExecution(backend_id=self.backend_id, payload=payload) @@ -542,6 +562,7 @@ def get_builtin_eval_suite(name: str, judge_backend: JudgeBackend | None = None) backend_id="app-evaluator-agent", system_prompt=_load_app_evaluator_skill_prompt(), 
prompt_builder=_build_default_judge_prompt, + timeout_seconds=float(os.getenv("AWORLD_EVALUATOR_AGENT_TIMEOUT_SECONDS", "8.0")), ), CallableJudgeBackend( backend_id="app-evaluator-heuristic", diff --git a/tests/evaluations/test_evaluation_substrate.py b/tests/evaluations/test_evaluation_substrate.py index 5359fbdd5..dd697db13 100644 --- a/tests/evaluations/test_evaluation_substrate.py +++ b/tests/evaluations/test_evaluation_substrate.py @@ -5,9 +5,11 @@ from aworld.evaluations.base import EvaluationConfig from aworld.evaluations.substrate import ( AgentJudgeBackend, + CallableJudgeBackend, EvalCaseDef, EvalSuiteDef, EvaluationFlowDef, + FallbackJudgeBackend, GatePolicyDef, JudgeSchemaDef, compile_evaluation_flow, @@ -185,3 +187,41 @@ async def judge(self, case_input, target, suite): assert report["results"][0]["judge"]["rank"] == "Good" assert report["report_version"] == 1 assert report["approval"]["required"] is True + + +@pytest.mark.asyncio +async def test_fallback_judge_backend_uses_next_backend_after_timeout() -> None: + async def slow_executor(prompt: str, system_prompt: str): + await asyncio.sleep(0.05) + return {"results": [{"filename": "artifact.txt", "score": 0.99}]} + + fallback = FallbackJudgeBackend( + backend_id="fallback", + backends=( + AgentJudgeBackend( + backend_id="slow-agent", + system_prompt="judge", + executor=slow_executor, + timeout_seconds=0.01, + ), + CallableJudgeBackend( + backend_id="heuristic", + judge=lambda case_input, target: { + "score": 0.61, + "rank": "Good", + "criticism": "Fallback path used.", + "praise": "Fallback stayed responsive.", + "improvement_advice": "Keep timeout budgets explicit.", + }, + ), + ), + ) + + execution = await fallback.execute( + case_input={"target_path": "artifact.txt"}, + target={"target_path": "artifact.txt", "target_kind": "file"}, + suite=EvalSuiteDef(suite_id="app-evaluator"), + ) + + assert execution.backend_id == "heuristic" + assert execution.payload["score"] == pytest.approx(0.61) From 
45cc8e5e2e514c3fe109649959a40ddf03b70dca Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Tue, 2 Jun 2026 09:48:07 +0800 Subject: [PATCH 03/41] feat: improve evaluator cli flow --- .../src/aworld_cli/evaluator_runtime.py | 37 +++++++- .../top_level_commands/evaluator_cmd.py | 25 ++++- tests/core/test_evaluator_runtime.py | 39 +++++++- .../core/test_evaluator_top_level_command.py | 93 +++++++++++++++++++ 4 files changed, 186 insertions(+), 8 deletions(-) diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index 003164370..584133202 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -4,7 +4,27 @@ import json from pathlib import Path -from aworld.evaluations.substrate import EvaluationFlowDef, resolve_eval_suite, run_evaluation_flow +from aworld.evaluations.substrate import EvaluationFlowDef, get_builtin_eval_suite, resolve_eval_suite, run_evaluation_flow + + +def _sanitize_path_token(value: str) -> str: + return "".join(ch if ch.isalnum() or ch in {"-", "_", "."} else "-" for ch in value).strip("-") or "target" + + +def default_evaluator_report_path(*, target_path: Path, suite_id: str, cwd: Path | None = None) -> Path: + root = (cwd or Path.cwd()).expanduser().resolve() + report_dir = root / ".aworld" / "evaluations" + report_dir.mkdir(parents=True, exist_ok=True) + target_token = _sanitize_path_token(target_path.stem or target_path.name) + suite_token = _sanitize_path_token(suite_id) + return report_dir / f"{target_token}.{suite_token}.json" + + +def available_evaluator_suites() -> list[str]: + # Keep this explicit for now so CLI discovery stays stable even before a broader + # suite registry API is introduced. 
+ get_builtin_eval_suite("app-evaluator") + return ["app-evaluator"] def run_evaluator_cli( @@ -35,10 +55,14 @@ def run_evaluator_cli( approval["resolved"] = True approval["approved"] = approved report["approval"] = approval - if output: - output_path = Path(output).expanduser().resolve() - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8") + output_path = ( + Path(output).expanduser().resolve() + if output + else default_evaluator_report_path(target_path=target_path, suite_id=report["suite_id"]) + ) + output_path.parent.mkdir(parents=True, exist_ok=True) + report["report_path"] = str(output_path) + output_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8") return report @@ -53,4 +77,7 @@ def render_evaluator_summary(report: dict) -> str: backend = report.get("judge_backend", {}).get("backend_id") if backend: summary_line += f"\nJudge backend: {backend}" + report_path = report.get("report_path") + if report_path: + summary_line += f"\nReport: {report_path}" return summary_line diff --git a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py index cad728fc3..36309d638 100644 --- a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py +++ b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py @@ -1,6 +1,10 @@ from __future__ import annotations -from aworld_cli.evaluator_runtime import render_evaluator_summary, run_evaluator_cli +from aworld_cli.evaluator_runtime import ( + available_evaluator_suites, + render_evaluator_summary, + run_evaluator_cli, +) class EvaluatorTopLevelCommand: @@ -23,12 +27,23 @@ def register_parser(self, subparsers) -> None: description=self.description, prog="aworld-cli evaluator", ) - parser.add_argument("--target", type=str, required=True) + parser.add_argument("--target", type=str) parser.add_argument("--suite", type=str) 
parser.add_argument("--output", type=str) parser.add_argument("--interactive-approval", action="store_true") + parser.add_argument("--list-suites", action="store_true") def run(self, args, context) -> int: + if getattr(args, "list_suites", False): + print("Available evaluator suites:") + for suite_name in available_evaluator_suites(): + print(f" - {suite_name}") + return 0 + + if not getattr(args, "target", None): + print("❌ --target is required unless --list-suites is used") + return 1 + report = run_evaluator_cli( target=args.target, suite=args.suite, @@ -36,4 +51,10 @@ def run(self, args, context) -> int: interactive_approval=args.interactive_approval, ) print(render_evaluator_summary(report)) + gate_status = report.get("gate", {}).get("status") + approval = report.get("approval") or {} + if gate_status == "fail": + return 2 + if gate_status == "needs_approval" and not approval.get("approved", False): + return 3 return 0 diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index ab4d8f02c..93129d9ce 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -8,7 +8,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) -from aworld_cli.evaluator_runtime import run_evaluator_cli +from aworld_cli.evaluator_runtime import available_evaluator_suites, run_evaluator_cli def test_run_evaluator_cli_persists_approval_state( @@ -45,3 +45,40 @@ async def fake_run_evaluation_flow(flow): assert report["approval"]["approved"] is True assert persisted["approval"]["approved"] is True assert persisted["judge_backend"]["backend_id"] == "stub-agent" + + +def test_run_evaluator_cli_writes_default_report_when_output_is_omitted( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + monkeypatch.chdir(tmp_path) + + async def fake_run_evaluation_flow(flow): + return { + 
"report_version": 1, + "suite_id": "app-evaluator", + "judge_backend": {"backend_id": "stub-agent"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_cli(target=str(target)) + + report_path = Path(report["report_path"]) + persisted = json.loads(report_path.read_text(encoding="utf-8")) + + assert report_path.exists() + assert report_path.parent == tmp_path / ".aworld" / "evaluations" + assert persisted["suite_id"] == "app-evaluator" + + +def test_available_evaluator_suites_lists_builtin_suite() -> None: + suites = available_evaluator_suites() + + assert "app-evaluator" in suites diff --git a/tests/core/test_evaluator_top_level_command.py b/tests/core/test_evaluator_top_level_command.py index 4d1ccf1cc..0934e48da 100644 --- a/tests/core/test_evaluator_top_level_command.py +++ b/tests/core/test_evaluator_top_level_command.py @@ -2,12 +2,15 @@ import sys from pathlib import Path +from types import SimpleNamespace import pytest sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) from aworld_cli import main as main_module +from aworld_cli.core.top_level_command_system import TopLevelCommandContext +from aworld_cli.top_level_commands.evaluator_cmd import EvaluatorTopLevelCommand def test_registry_registers_builtin_evaluator_command() -> None: @@ -49,3 +52,93 @@ def fake_run_evaluator_cli(**kwargs): assert handled is True assert "app-evaluator" in output assert "pass" in output + + +def test_evaluator_command_returns_nonzero_for_failed_gate( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_cli", + lambda **kwargs: { + "suite_id": "app-evaluator", + "gate": {"status": 
"fail", "value": 0.3}, + "approval": {"required": False, "resolved": False, "approved": None}, + }, + ) + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target="artifact.txt", + suite=None, + output=None, + interactive_approval=False, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + assert exit_code == 2 + + +def test_evaluator_command_returns_nonzero_for_unresolved_approval( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_cli", + lambda **kwargs: { + "suite_id": "app-evaluator", + "gate": {"status": "needs_approval", "value": 0.7}, + "approval": {"required": True, "resolved": False, "approved": None}, + }, + ) + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target="artifact.txt", + suite=None, + output=None, + interactive_approval=False, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + assert exit_code == 3 + + +def test_evaluator_command_lists_available_suites( + capsys: pytest.CaptureFixture[str], +) -> None: + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=None, + suite=None, + output=None, + interactive_approval=False, + list_suites=True, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + + assert exit_code == 0 + assert "app-evaluator" in output + + +def test_evaluator_command_returns_usage_error_without_target( + capsys: pytest.CaptureFixture[str], +) -> None: + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=None, + suite=None, + output=None, + interactive_approval=False, + list_suites=False, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + + assert exit_code == 1 + assert "--target is required" in output From 0e7cffc699cdf49cca77f81e335958a6f681422c Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Tue, 2 Jun 2026 10:55:28 +0800 Subject: [PATCH 04/41] feat: add visual-aware evaluator suite resolution --- 
.../src/aworld_cli/evaluator_runtime.py | 19 +- aworld/evaluations/substrate.py | 218 ++++++++++++++++-- tests/core/test_evaluator_runtime.py | 31 +++ .../evaluations/test_evaluation_substrate.py | 105 +++++++++ 4 files changed, 347 insertions(+), 26 deletions(-) diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index 584133202..c871d4404 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -4,7 +4,13 @@ import json from pathlib import Path -from aworld.evaluations.substrate import EvaluationFlowDef, get_builtin_eval_suite, resolve_eval_suite, run_evaluation_flow +from aworld.evaluations.substrate import ( + EvaluationFlowDef, + describe_eval_target, + list_eval_suites, + resolve_eval_suite, + run_evaluation_flow, +) def _sanitize_path_token(value: str) -> str: @@ -21,10 +27,7 @@ def default_evaluator_report_path(*, target_path: Path, suite_id: str, cwd: Path def available_evaluator_suites() -> list[str]: - # Keep this explicit for now so CLI discovery stays stable even before a broader - # suite registry API is introduced. 
- get_builtin_eval_suite("app-evaluator") - return ["app-evaluator"] + return list_eval_suites() def run_evaluator_cli( @@ -36,11 +39,9 @@ def run_evaluator_cli( ) -> dict: target_path = Path(target).expanduser().resolve() suite_def = resolve_eval_suite(suite, target_path) + target_info = describe_eval_target(target_path) flow = EvaluationFlowDef( - target={ - "target_path": str(target_path), - "target_kind": "directory" if target_path.is_dir() else "file", - }, + target=target_info, suite=suite_def, interactive_approval=interactive_approval, output_path=output, diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py index 823848a6c..532dee416 100644 --- a/aworld/evaluations/substrate.py +++ b/aworld/evaluations/substrate.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import annotations +import asyncio +import base64 import json import inspect import os @@ -17,7 +19,20 @@ JudgeCallable = Callable[[dict[str, Any], dict[str, Any]], Mapping[str, Any] | Awaitable[Mapping[str, Any]]] -JudgeExecutor = Callable[[str, str], Mapping[str, Any] | str | Awaitable[Mapping[str, Any] | str]] +JudgePrompt = str | tuple[str, list[str]] +JudgeExecutor = Callable[[JudgePrompt, str], Mapping[str, Any] | str | Awaitable[Mapping[str, Any] | str]] +EvalSuiteFactory = Callable[[dict[str, Any]], "EvalSuiteDef"] +EvalSuiteMatcher = Callable[[dict[str, Any]], bool] + +_IMAGE_SUFFIX_TO_MIME = { + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".png": "image/png", + ".gif": "image/gif", + ".webp": "image/webp", + ".bmp": "image/bmp", + ".svg": "image/svg+xml", +} @dataclass(frozen=True) @@ -94,7 +109,7 @@ class AgentJudgeBackend: backend_id: str system_prompt: str executor: JudgeExecutor | None = None - prompt_builder: Callable[[dict[str, Any], dict[str, Any], "EvalSuiteDef"], str] | None = None + prompt_builder: Callable[[dict[str, Any], dict[str, Any], "EvalSuiteDef"], JudgePrompt] | None = None timeout_seconds: float | None = None def is_available(self) -> bool: 
@@ -225,12 +240,76 @@ class CompiledEvaluationPlan: gate_policy: GatePolicyDef | None +@dataclass(frozen=True) +class EvalSuiteRegistration: + suite_id: str + factory: EvalSuiteFactory + matcher: EvalSuiteMatcher | None = None + priority: int = 0 + + def matches(self, target: dict[str, Any]) -> bool: + if self.matcher is None: + return True + return bool(self.matcher(target)) + + +_EVAL_SUITE_REGISTRY: dict[str, EvalSuiteRegistration] = {} + + +def register_eval_suite( + suite_id: str, + factory: EvalSuiteFactory, + *, + matcher: EvalSuiteMatcher | None = None, + priority: int = 0, +) -> None: + _EVAL_SUITE_REGISTRY[suite_id] = EvalSuiteRegistration( + suite_id=suite_id, + factory=factory, + matcher=matcher, + priority=priority, + ) + + +def list_eval_suites() -> list[str]: + return sorted(_EVAL_SUITE_REGISTRY) + + +def _is_image_path(path: Path) -> bool: + return path.suffix.lower() in _IMAGE_SUFFIX_TO_MIME + + +def _infer_target_kind(path: Path) -> str: + if path.is_dir(): + return "directory" + if _is_image_path(path): + return "image" + return "file" + + +def describe_eval_target(target: str | Path | Mapping[str, Any]) -> dict[str, Any]: + if isinstance(target, Mapping): + normalized = dict(target) + value = normalized.pop("value", None) + if isinstance(value, Mapping): + normalized.update(value) + target_path = normalized.get("target_path") + if target_path is None: + return normalized + path = Path(str(target_path)).expanduser() + normalized["target_path"] = str(path) + normalized["target_kind"] = normalized.get("target_kind") or _infer_target_kind(path) + return normalized + + path = Path(target).expanduser().resolve() + return { + "target_path": str(path), + "target_kind": _infer_target_kind(path), + } + + def _normalize_target(target: dict[str, Any]) -> dict[str, Any]: - normalized = dict(target) - value = normalized.pop("value", None) - if isinstance(value, Mapping): - normalized.update(value) - return normalized + return describe_eval_target(target) def 
build_eval_dataset(cases: list[EvalCaseDef], target: dict[str, Any]) -> EvalDataset: @@ -352,6 +431,12 @@ def _rank_for_score(score: float) -> str: def _artifact_quality_score(target_path: Path) -> tuple[float, list[str], list[str]]: positive: list[str] = [] improvements: list[str] = [] + + if target_path.is_file() and _is_image_path(target_path): + positive.append("A rendered screenshot is present for direct visual review.") + improvements.append("Provide a few more representative screens or brief implementation context for deeper evaluation.") + return 0.65, positive, improvements + score = 0.3 if target_path.is_dir(): @@ -361,6 +446,22 @@ def _artifact_quality_score(target_path: Path) -> tuple[float, list[str], list[s suffixes = {item.suffix.lower() for item in files} names = {item.name.lower() for item in files} + visual_files = [item for item in files if _is_image_path(item)] + + if visual_files and not {".html", ".css", ".js", ".ts", ".tsx", ".jsx"} & suffixes: + score = 0.55 + positive.append("Rendered screenshots are available for direct visual review.") + if len(visual_files) >= 3: + score += 0.1 + positive.append("Multiple screens provide broader product coverage.") + else: + improvements.append("Include a few more representative states to improve evaluation coverage.") + if {"readme.md", "README.md"} & names: + score += 0.1 + positive.append("Project metadata or usage notes are present.") + else: + improvements.append("Add brief context so evaluators understand what the screens are showing.") + return min(score, 0.95), positive, improvements if ".html" in suffixes: score += 0.15 @@ -392,7 +493,7 @@ def _artifact_quality_score(target_path: Path) -> tuple[float, list[str], list[s else: improvements.append("Package the target with its supporting assets rather than a single thin file.") - if any(item.suffix.lower() in {".png", ".jpg", ".jpeg", ".svg", ".webp"} for item in files): + if visual_files: score += 0.1 positive.append("Visual assets are included for 
richer presentation.") else: @@ -474,6 +575,60 @@ def _build_default_judge_prompt(case_input: dict[str, Any], target: dict[str, An ) +def _encode_image_as_data_url(path: Path) -> str | None: + mime_type = _IMAGE_SUFFIX_TO_MIME.get(path.suffix.lower()) + if mime_type is None: + return None + try: + encoded = base64.b64encode(path.read_bytes()).decode("utf-8") + except Exception: + return None + return f"data:{mime_type};base64,{encoded}" + + +def _collect_target_image_urls(target: dict[str, Any], *, max_images: int = 4) -> list[str]: + target_path = Path(target["target_path"]) + image_paths: list[Path] = [] + + if target_path.is_file() and _is_image_path(target_path): + image_paths = [target_path] + elif target_path.is_dir(): + image_paths = sorted( + item for item in target_path.rglob("*") if item.is_file() and _is_image_path(item) + )[:max_images] + + image_urls: list[str] = [] + for path in image_paths: + data_url = _encode_image_as_data_url(path) + if data_url is not None: + image_urls.append(data_url) + return image_urls + + +def _build_app_evaluator_judge_prompt( + case_input: dict[str, Any], + target: dict[str, Any], + suite: EvalSuiteDef, +) -> JudgePrompt: + snapshot = _build_target_snapshot(target) + target_name = Path(target["target_path"]).name + image_urls = _collect_target_image_urls(target) + prompt = ( + "Evaluate the following app artifact.\n" + f"Suite: {suite.suite_id}\n" + f"Target: {target['target_path']}\n" + f"Case input: {json.dumps(case_input, ensure_ascii=False)}\n" + f"Artifact snapshot: {json.dumps(snapshot, ensure_ascii=False)}\n" + f"Attached visuals: {len(image_urls)}\n" + "Use attached visuals as the primary evidence when present. Use the artifact snapshot for filenames and implementation context.\n" + "Return a JSON object with a `results` array containing exactly one item for " + f"`{target_name}` and include `score`, `rank`, `criticism`, `praise`, and `improvement_advice`." 
+ ) + if image_urls: + return prompt, image_urls + return prompt + + def _extract_json_object(text: str) -> dict[str, Any]: stripped = text.strip() try: @@ -508,9 +663,10 @@ def _coerce_judge_payload(response: Mapping[str, Any] | str) -> dict[str, Any]: return dict(response) -async def _default_agent_judge_executor(prompt: str, system_prompt: str) -> str: +async def _default_agent_judge_executor(prompt: JudgePrompt, system_prompt: str) -> str: from aworld.agents.llm_agent import Agent from aworld.config.conf import AgentConfig + from aworld.core.common import Observation from aworld.core.context.base import Context from aworld.utils.run_util import exec_agent @@ -519,6 +675,13 @@ async def _default_agent_judge_executor(prompt: str, system_prompt: str) -> str: if not api_key or not model_name: raise RuntimeError("LLM_MODEL_NAME and LLM_API_KEY/OPENAI_API_KEY are required for agent judge backend") + prompt_text: str + image_urls: list[str] | None + if isinstance(prompt, tuple): + prompt_text, image_urls = prompt + else: + prompt_text, image_urls = prompt, None + agent = Agent( name="evaluation_judge", conf=AgentConfig( @@ -530,7 +693,10 @@ async def _default_agent_judge_executor(prompt: str, system_prompt: str) -> str: ), system_prompt=system_prompt, ) - response = await exec_agent(prompt, agent=agent, context=Context()) + request: str | Observation = prompt_text + if image_urls: + request = Observation(content=prompt_text, images=image_urls) + response = await exec_agent(request, agent=agent, context=Context()) return str(response.answer) @@ -561,7 +727,7 @@ def get_builtin_eval_suite(name: str, judge_backend: JudgeBackend | None = None) AgentJudgeBackend( backend_id="app-evaluator-agent", system_prompt=_load_app_evaluator_skill_prompt(), - prompt_builder=_build_default_judge_prompt, + prompt_builder=_build_app_evaluator_judge_prompt, timeout_seconds=float(os.getenv("AWORLD_EVALUATOR_AGENT_TIMEOUT_SECONDS", "8.0")), ), CallableJudgeBackend( @@ -578,14 +744,32 @@ def 
get_builtin_eval_suite(name: str, judge_backend: JudgeBackend | None = None) def resolve_eval_suite(name: str | None, target: str | Path) -> EvalSuiteDef: - target_path = Path(target) - suite_name = name or "app-evaluator" - suite = get_builtin_eval_suite(suite_name) + target_info = describe_eval_target(target) + + if name is not None: + registration = _EVAL_SUITE_REGISTRY.get(name) + if registration is None: + raise KeyError(name) + else: + candidates = [registration for registration in _EVAL_SUITE_REGISTRY.values() if registration.matches(target_info)] + if not candidates: + raise KeyError(f"no evaluation suite matches target {target_info.get('target_path')}") + registration = sorted(candidates, key=lambda item: (-item.priority, item.suite_id))[0] + + suite = registration.factory(target_info) case = EvalCaseDef( - case_id=target_path.name or "target", + case_id=Path(target_info["target_path"]).name or "target", input={ - "target_path": str(target_path), - "target_kind": "directory" if target_path.is_dir() else "file", + "target_path": target_info["target_path"], + "target_kind": target_info["target_kind"], }, ) return suite.with_cases([case]) + + +register_eval_suite( + "app-evaluator", + lambda target: get_builtin_eval_suite("app-evaluator"), + matcher=lambda target: target.get("target_kind") in {"file", "directory", "image"}, + priority=10, +) diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index 93129d9ce..54a039aab 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -1,5 +1,6 @@ from __future__ import annotations +import base64 import json import sys from pathlib import Path @@ -82,3 +83,33 @@ def test_available_evaluator_suites_lists_builtin_suite() -> None: suites = available_evaluator_suites() assert "app-evaluator" in suites + + +def test_run_evaluator_cli_marks_image_targets( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + target = tmp_path / 
"artifact.png" + target.write_bytes( + base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+aA1EAAAAASUVORK5CYII=" + ) + ) + + async def fake_run_evaluation_flow(flow): + assert flow.target["target_kind"] == "image" + return { + "report_version": 1, + "suite_id": "app-evaluator", + "judge_backend": {"backend_id": "stub-agent"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_cli(target=str(target)) + + assert report["suite_id"] == "app-evaluator" diff --git a/tests/evaluations/test_evaluation_substrate.py b/tests/evaluations/test_evaluation_substrate.py index dd697db13..372c57ee2 100644 --- a/tests/evaluations/test_evaluation_substrate.py +++ b/tests/evaluations/test_evaluation_substrate.py @@ -1,8 +1,10 @@ from __future__ import annotations +import base64 import pytest from aworld.evaluations.base import EvaluationConfig +import aworld.evaluations.substrate as substrate_module from aworld.evaluations.substrate import ( AgentJudgeBackend, CallableJudgeBackend, @@ -14,6 +16,9 @@ JudgeSchemaDef, compile_evaluation_flow, get_builtin_eval_suite, + list_eval_suites, + register_eval_suite, + resolve_eval_suite, run_evaluation_flow, ) @@ -122,6 +127,49 @@ def test_builtin_app_evaluator_suite_has_required_schema_and_score_gate() -> Non assert suite.gate_policy.metric_name == "score" +def test_eval_suite_registry_resolves_explicit_and_target_defaults( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + + def generic_factory(target): + return EvalSuiteDef(suite_id="generic-review") + + def image_factory(target): + return EvalSuiteDef(suite_id="image-review") + + 
register_eval_suite( + "generic-review", + generic_factory, + matcher=lambda target: True, + priority=10, + ) + register_eval_suite( + "image-review", + image_factory, + matcher=lambda target: target["target_kind"] == "image", + priority=50, + ) + + image_target = tmp_path / "artifact.png" + image_target.write_bytes(base64.b64decode("iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+aA1EAAAAASUVORK5CYII=")) + text_target = tmp_path / "artifact.txt" + text_target.write_text("artifact", encoding="utf-8") + + listed = list_eval_suites() + explicit = resolve_eval_suite("generic-review", image_target) + image_default = resolve_eval_suite(None, image_target) + text_default = resolve_eval_suite(None, text_target) + + assert listed == ["generic-review", "image-review"] + assert explicit.suite_id == "generic-review" + assert image_default.suite_id == "image-review" + assert image_default.cases[0].input["target_kind"] == "image" + assert text_default.suite_id == "generic-review" + assert text_default.cases[0].input["target_kind"] == "file" + + @pytest.mark.asyncio async def test_agent_judge_backend_parses_app_evaluator_json_payload() -> None: async def fake_executor(prompt: str, system_prompt: str): @@ -225,3 +273,60 @@ async def slow_executor(prompt: str, system_prompt: str): assert execution.backend_id == "heuristic" assert execution.payload["score"] == pytest.approx(0.61) + + +@pytest.mark.asyncio +async def test_builtin_app_evaluator_passes_visual_target_images_to_agent_backend( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + image_path = tmp_path / "artifact.png" + image_path.write_bytes( + base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+aA1EAAAAASUVORK5CYII=" + ) + ) + + captured = {} + + async def fake_executor(prompt, system_prompt: str): + captured["prompt"] = prompt + return { + "results": [ + { + "filename": image_path.name, + "score": 0.88, + "rank": "Exemplary", + "criticism": "Minor 
spacing polish remains.", + "praise": "The main visual is clear.", + "improvement_advice": "Tighten secondary detail spacing.", + } + ] + } + + monkeypatch.setenv("LLM_MODEL_NAME", "test-model") + monkeypatch.setenv("OPENAI_API_KEY", "test-key") + monkeypatch.setattr(substrate_module, "_default_agent_judge_executor", fake_executor) + + suite = get_builtin_eval_suite("app-evaluator").with_cases( + [ + EvalCaseDef( + case_id="artifact", + input={"target_path": str(image_path), "target_kind": "image"}, + ) + ] + ) + flow = EvaluationFlowDef( + target={"target_path": str(image_path), "target_kind": "image"}, + suite=suite, + ) + + report = await run_evaluation_flow(flow) + + prompt = captured["prompt"] + + assert isinstance(prompt, tuple) + assert prompt[0].startswith("Evaluate the following app artifact") + assert len(prompt[1]) == 1 + assert prompt[1][0].startswith("data:image/png;base64,") + assert report["judge_backend"]["backend_id"] == "app-evaluator-agent" From d0bbade93d4c5b5b84f7b6110942104e02046a5b Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Tue, 2 Jun 2026 11:09:34 +0800 Subject: [PATCH 05/41] feat: expose evaluator suite selection diagnostics --- .../src/aworld_cli/evaluator_runtime.py | 33 +++++++++-- .../top_level_commands/evaluator_cmd.py | 13 ++++- aworld/evaluations/substrate.py | 58 +++++++++++++++---- tests/core/test_evaluator_runtime.py | 41 +++++++++++++ .../core/test_evaluator_top_level_command.py | 25 ++++++++ .../evaluations/test_evaluation_substrate.py | 35 +++++++++++ 6 files changed, 188 insertions(+), 17 deletions(-) diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index c871d4404..cb85fc621 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -8,7 +8,8 @@ EvaluationFlowDef, describe_eval_target, list_eval_suites, - resolve_eval_suite, + list_matching_eval_suites, + resolve_eval_suite_selection, run_evaluation_flow, 
) @@ -26,8 +27,23 @@ def default_evaluator_report_path(*, target_path: Path, suite_id: str, cwd: Path return report_dir / f"{target_token}.{suite_token}.json" -def available_evaluator_suites() -> list[str]: - return list_eval_suites() +def available_evaluator_suites(*, target: str | None = None) -> list[str]: + if target is None: + return list_eval_suites() + return list_matching_eval_suites(target) + + +def get_evaluator_suite_selection( + *, + target: str, + suite: str | None = None, +) -> dict[str, str | None]: + selection = resolve_eval_suite_selection(suite, target) + return { + "requested": suite, + "resolved": selection.suite_id, + "mode": selection.mode, + } def run_evaluator_cli( @@ -38,7 +54,8 @@ def run_evaluator_cli( interactive_approval: bool = False, ) -> dict: target_path = Path(target).expanduser().resolve() - suite_def = resolve_eval_suite(suite, target_path) + selection = resolve_eval_suite_selection(suite, target_path) + suite_def = selection.suite target_info = describe_eval_target(target_path) flow = EvaluationFlowDef( target=target_info, @@ -56,6 +73,11 @@ def run_evaluator_cli( approval["resolved"] = True approval["approved"] = approved report["approval"] = approval + report["suite_selection"] = { + "requested": suite, + "resolved": selection.suite_id, + "mode": selection.mode, + } output_path = ( Path(output).expanduser().resolve() if output @@ -75,6 +97,9 @@ def render_evaluator_summary(report: dict) -> str: summary_line = f"Evaluator suite: {suite_id}\nGate: {status}" if metric_value is not None: summary_line += f" ({metric_value:.2f})" + selection = report.get("suite_selection") or {} + if selection.get("resolved"): + summary_line += f"\nSuite selection: {selection.get('mode', 'unknown')} -> {selection['resolved']}" backend = report.get("judge_backend", {}).get("backend_id") if backend: summary_line += f"\nJudge backend: {backend}" diff --git a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py 
b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py index 36309d638..58c004fbc 100644 --- a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py +++ b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py @@ -2,6 +2,7 @@ from aworld_cli.evaluator_runtime import ( available_evaluator_suites, + get_evaluator_suite_selection, render_evaluator_summary, run_evaluator_cli, ) @@ -35,9 +36,17 @@ def register_parser(self, subparsers) -> None: def run(self, args, context) -> int: if getattr(args, "list_suites", False): - print("Available evaluator suites:") - for suite_name in available_evaluator_suites(): + if getattr(args, "target", None): + print("Available evaluator suites for target:") + suite_names = available_evaluator_suites(target=args.target) + else: + print("Available evaluator suites:") + suite_names = available_evaluator_suites() + for suite_name in suite_names: print(f" - {suite_name}") + if getattr(args, "target", None) and suite_names: + selection = get_evaluator_suite_selection(target=args.target, suite=args.suite) + print(f"Default suite: {selection['resolved']}") return 0 if not getattr(args, "target", None): diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py index 532dee416..9b06fa964 100644 --- a/aworld/evaluations/substrate.py +++ b/aworld/evaluations/substrate.py @@ -253,6 +253,14 @@ def matches(self, target: dict[str, Any]) -> bool: return bool(self.matcher(target)) +@dataclass(frozen=True) +class EvalSuiteSelection: + suite_id: str + suite: EvalSuiteDef + target: dict[str, Any] + mode: str + + _EVAL_SUITE_REGISTRY: dict[str, EvalSuiteRegistration] = {} @@ -275,6 +283,10 @@ def list_eval_suites() -> list[str]: return sorted(_EVAL_SUITE_REGISTRY) +def _sorted_eval_suite_registrations(registrations: list[EvalSuiteRegistration]) -> list[EvalSuiteRegistration]: + return sorted(registrations, key=lambda item: (-item.priority, item.suite_id)) + + def _is_image_path(path: Path) -> bool: return 
path.suffix.lower() in _IMAGE_SUFFIX_TO_MIME @@ -743,28 +755,52 @@ def get_builtin_eval_suite(name: str, judge_backend: JudgeBackend | None = None) ) -def resolve_eval_suite(name: str | None, target: str | Path) -> EvalSuiteDef: +def _build_eval_suite_case(target_info: dict[str, Any]) -> EvalCaseDef: + return EvalCaseDef( + case_id=Path(target_info["target_path"]).name or "target", + input={ + "target_path": target_info["target_path"], + "target_kind": target_info["target_kind"], + }, + ) + + +def list_matching_eval_suites(target: str | Path | Mapping[str, Any]) -> list[str]: target_info = describe_eval_target(target) + candidates = [registration for registration in _EVAL_SUITE_REGISTRY.values() if registration.matches(target_info)] + return [registration.suite_id for registration in _sorted_eval_suite_registrations(candidates)] + +def resolve_eval_suite_selection(name: str | None, target: str | Path | Mapping[str, Any]) -> EvalSuiteSelection: + target_info = describe_eval_target(target) if name is not None: registration = _EVAL_SUITE_REGISTRY.get(name) if registration is None: raise KeyError(name) + if not registration.matches(target_info): + raise ValueError(f"suite '{name}' does not support target kind '{target_info.get('target_kind')}'") + mode = "explicit" else: candidates = [registration for registration in _EVAL_SUITE_REGISTRY.values() if registration.matches(target_info)] if not candidates: raise KeyError(f"no evaluation suite matches target {target_info.get('target_path')}") - registration = sorted(candidates, key=lambda item: (-item.priority, item.suite_id))[0] - - suite = registration.factory(target_info) - case = EvalCaseDef( - case_id=Path(target_info["target_path"]).name or "target", - input={ - "target_path": target_info["target_path"], - "target_kind": target_info["target_kind"], - }, + registration = _sorted_eval_suite_registrations(candidates)[0] + mode = "auto" + + suite = registration.factory(target_info).with_cases([ + 
_build_eval_suite_case(target_info), + ]) + return EvalSuiteSelection( + suite_id=suite.suite_id, + suite=suite, + target=target_info, + mode=mode, ) - return suite.with_cases([case]) + + +def resolve_eval_suite(name: str | None, target: str | Path) -> EvalSuiteDef: + selection = resolve_eval_suite_selection(name, target) + return selection.suite register_eval_suite( diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index 54a039aab..d8b0a39f9 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -85,6 +85,21 @@ def test_available_evaluator_suites_lists_builtin_suite() -> None: assert "app-evaluator" in suites +def test_available_evaluator_suites_filters_by_target( + tmp_path: Path, +) -> None: + target = tmp_path / "artifact.png" + target.write_bytes( + base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+aA1EAAAAASUVORK5CYII=" + ) + ) + + suites = available_evaluator_suites(target=str(target)) + + assert suites == ["app-evaluator"] + + def test_run_evaluator_cli_marks_image_targets( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, @@ -113,3 +128,29 @@ async def fake_run_evaluation_flow(flow): report = run_evaluator_cli(target=str(target)) assert report["suite_id"] == "app-evaluator" + + +def test_run_evaluator_cli_records_suite_selection_metadata( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + + async def fake_run_evaluation_flow(flow): + return { + "report_version": 1, + "suite_id": "app-evaluator", + "judge_backend": {"backend_id": "stub-agent"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", 
fake_run_evaluation_flow) + + report = run_evaluator_cli(target=str(target)) + + assert report["suite_selection"]["mode"] == "auto" + assert report["suite_selection"]["resolved"] == "app-evaluator" diff --git a/tests/core/test_evaluator_top_level_command.py b/tests/core/test_evaluator_top_level_command.py index 0934e48da..df796f4eb 100644 --- a/tests/core/test_evaluator_top_level_command.py +++ b/tests/core/test_evaluator_top_level_command.py @@ -124,6 +124,31 @@ def test_evaluator_command_lists_available_suites( assert "app-evaluator" in output +def test_evaluator_command_lists_target_matching_suites_and_default( + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=str(target), + suite=None, + output=None, + interactive_approval=False, + list_suites=True, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + + assert exit_code == 0 + assert "Available evaluator suites for target:" in output + assert "Default suite: app-evaluator" in output + + def test_evaluator_command_returns_usage_error_without_target( capsys: pytest.CaptureFixture[str], ) -> None: diff --git a/tests/evaluations/test_evaluation_substrate.py b/tests/evaluations/test_evaluation_substrate.py index 372c57ee2..b8adf1490 100644 --- a/tests/evaluations/test_evaluation_substrate.py +++ b/tests/evaluations/test_evaluation_substrate.py @@ -17,8 +17,10 @@ compile_evaluation_flow, get_builtin_eval_suite, list_eval_suites, + list_matching_eval_suites, register_eval_suite, resolve_eval_suite, + resolve_eval_suite_selection, run_evaluation_flow, ) @@ -170,6 +172,39 @@ def image_factory(target): assert text_default.cases[0].input["target_kind"] == "file" +def test_eval_suite_registry_reports_matching_suites_and_selection_mode( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + 
monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + + register_eval_suite( + "generic-review", + lambda target: EvalSuiteDef(suite_id="generic-review"), + matcher=lambda target: True, + priority=10, + ) + register_eval_suite( + "image-review", + lambda target: EvalSuiteDef(suite_id="image-review"), + matcher=lambda target: target["target_kind"] == "image", + priority=50, + ) + + image_target = tmp_path / "artifact.png" + image_target.write_bytes(base64.b64decode("iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+aA1EAAAAASUVORK5CYII=")) + + matching = list_matching_eval_suites(image_target) + auto_selection = resolve_eval_suite_selection(None, image_target) + explicit_selection = resolve_eval_suite_selection("generic-review", image_target) + + assert matching == ["image-review", "generic-review"] + assert auto_selection.mode == "auto" + assert auto_selection.suite_id == "image-review" + assert explicit_selection.mode == "explicit" + assert explicit_selection.suite_id == "generic-review" + + @pytest.mark.asyncio async def test_agent_judge_backend_parses_app_evaluator_json_payload() -> None: async def fake_executor(prompt: str, system_prompt: str): From 5eb15dc63ef58246421173846aa0a293399f7b4c Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Tue, 2 Jun 2026 11:51:49 +0800 Subject: [PATCH 06/41] feat: add structured evaluator report metadata --- .../src/aworld_cli/evaluator_runtime.py | 28 +++++++++++++ .../top_level_commands/evaluator_cmd.py | 9 +---- aworld/evaluations/substrate.py | 35 ++++++++++++++++ tests/core/test_evaluator_runtime.py | 40 ++++++++++++++++++- .../evaluations/test_evaluation_substrate.py | 5 +++ 5 files changed, 109 insertions(+), 8 deletions(-) diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index cb85fc621..a2809c4c7 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -46,6 +46,33 @@ 
def get_evaluator_suite_selection( } +def evaluator_exit_code(report: dict) -> int: + gate_status = report.get("gate", {}).get("status") + approval = report.get("approval") or {} + if gate_status == "fail": + return 2 + if gate_status == "needs_approval" and not approval.get("approved", False): + return 3 + return 0 + + +def _build_automation_summary(report: dict) -> dict[str, object]: + gate = report.get("gate") or {} + approval = report.get("approval") or {} + result_counts = report.get("result_counts") or {} + return { + "gate_status": gate.get("status"), + "metric_name": gate.get("metric_name"), + "metric_value": gate.get("value"), + "approval_required": approval.get("required", False), + "approval_resolved": approval.get("resolved", False), + "approved": approval.get("approved"), + "suggested_exit_code": evaluator_exit_code(report), + "case_count": result_counts.get("cases_total", len(report.get("results") or [])), + "judge_backend": (report.get("judge_backend") or {}).get("backend_id"), + } + + def run_evaluator_cli( *, target: str, @@ -78,6 +105,7 @@ def run_evaluator_cli( "resolved": selection.suite_id, "mode": selection.mode, } + report["automation"] = _build_automation_summary(report) output_path = ( Path(output).expanduser().resolve() if output diff --git a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py index 58c004fbc..addc16491 100644 --- a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py +++ b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py @@ -2,6 +2,7 @@ from aworld_cli.evaluator_runtime import ( available_evaluator_suites, + evaluator_exit_code, get_evaluator_suite_selection, render_evaluator_summary, run_evaluator_cli, @@ -60,10 +61,4 @@ def run(self, args, context) -> int: interactive_approval=args.interactive_approval, ) print(render_evaluator_summary(report)) - gate_status = report.get("gate", {}).get("status") - approval = 
report.get("approval") or {} - if gate_status == "fail": - return 2 - if gate_status == "needs_approval" and not approval.get("approved", False): - return 3 - return 0 + return evaluator_exit_code(report) diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py index 9b06fa964..84e520318 100644 --- a/aworld/evaluations/substrate.py +++ b/aworld/evaluations/substrate.py @@ -376,6 +376,12 @@ def _extract_metric_value(summary: Mapping[str, Any], metric_name: str) -> float raise KeyError(f"metric {metric_name} is missing aggregate summary") +def _normalize_metric_status(status: Any) -> str | None: + if status is None: + return None + return getattr(status, "name", str(status)) + + async def run_evaluation_flow(flow: EvaluationFlowDef) -> dict[str, Any]: compiled = compile_evaluation_flow(flow) eval_result = await EvaluateRunner(config=compiled.eval_config).run() @@ -392,27 +398,56 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> dict[str, Any]: results = [] report_backend_id = None + cases_with_metrics = 0 + cases_with_judge = 0 for case_result in eval_result.eval_case_results: score_row = case_result.score_rows.get(compiled.suite.suite_id) judge_payload = {} + case_metrics: dict[str, Any] = {} + case_backend_id = None if score_row is not None: + cases_with_metrics += 1 + for metric_name, metric_result in score_row.metric_results.items(): + if isinstance(metric_result, Mapping): + case_metrics[metric_name] = {} + if "value" in metric_result: + case_metrics[metric_name]["value"] = metric_result["value"] + status = _normalize_metric_status(metric_result.get("eval_status")) + if status is not None: + case_metrics[metric_name]["status"] = status + metadata = metric_result.get("metadata") or {} + if case_backend_id is None and isinstance(metadata, Mapping): + case_backend_id = metadata.get("_judge_backend") + else: + case_metrics[metric_name] = {"value": metric_result} metric_result = 
score_row.metric_results.get(compiled.gate_policy.metric_name if compiled.gate_policy else "score", {}) judge_payload = dict(metric_result.get("metadata", {})) report_backend_id = report_backend_id or judge_payload.pop("_judge_backend", None) + if judge_payload: + cases_with_judge += 1 results.append( { "case_id": case_result.eval_case_id, "input": dict(case_result.input.case_data if hasattr(case_result.input, "case_data") else case_result.input), + "metrics": case_metrics, "judge": judge_payload, + "judge_backend": {"backend_id": case_backend_id} if case_backend_id is not None else None, } ) + metrics = dict(suite_summary) report = { "report_version": 1, "suite_id": compiled.suite.suite_id, "target": dict(compiled.target), "summary": eval_result.summary, + "metrics": metrics, "results": results, + "result_counts": { + "cases_total": len(results), + "cases_with_metrics": cases_with_metrics, + "cases_with_judge": cases_with_judge, + }, "approval": { "required": bool(gate and gate.status == "needs_approval"), "resolved": False, diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index d8b0a39f9..1e7e1ff9e 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -9,7 +9,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) -from aworld_cli.evaluator_runtime import available_evaluator_suites, run_evaluator_cli +from aworld_cli.evaluator_runtime import available_evaluator_suites, evaluator_exit_code, run_evaluator_cli def test_run_evaluator_cli_persists_approval_state( @@ -154,3 +154,41 @@ async def fake_run_evaluation_flow(flow): assert report["suite_selection"]["mode"] == "auto" assert report["suite_selection"]["resolved"] == "app-evaluator" + + +def test_run_evaluator_cli_adds_automation_metadata( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + + async 
def fake_run_evaluation_flow(flow): + return { + "report_version": 1, + "suite_id": "app-evaluator", + "judge_backend": {"backend_id": "stub-agent"}, + "summary": {"app-evaluator": {"score": {"mean": 0.7}}}, + "metrics": {"score": {"mean": 0.7}}, + "result_counts": {"cases_total": 2, "cases_with_metrics": 2, "cases_with_judge": 2}, + "results": [{}, {}], + "gate": {"status": "needs_approval", "metric_name": "score", "value": 0.7}, + "approval": {"required": True, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_cli(target=str(target)) + + assert report["automation"]["gate_status"] == "needs_approval" + assert report["automation"]["case_count"] == 2 + assert report["automation"]["judge_backend"] == "stub-agent" + assert report["automation"]["suggested_exit_code"] == 3 + + +def test_evaluator_exit_code_matches_gate_and_approval() -> None: + assert evaluator_exit_code({"gate": {"status": "pass"}, "approval": {}}) == 0 + assert evaluator_exit_code({"gate": {"status": "fail"}, "approval": {}}) == 2 + assert evaluator_exit_code( + {"gate": {"status": "needs_approval"}, "approval": {"approved": False}} + ) == 3 diff --git a/tests/evaluations/test_evaluation_substrate.py b/tests/evaluations/test_evaluation_substrate.py index b8adf1490..6f4d8333c 100644 --- a/tests/evaluations/test_evaluation_substrate.py +++ b/tests/evaluations/test_evaluation_substrate.py @@ -112,6 +112,11 @@ async def fake_judge(case_input, target): assert report["suite_id"] == "demo-suite" assert report["gate"]["status"] == "needs_approval" assert report["results"][0]["judge"]["rank"] == "Good" + assert report["results"][0]["metrics"]["score"]["value"] == pytest.approx(0.7) + assert report["results"][0]["metrics"]["score"]["status"] == "FAILED" + assert report["metrics"]["score"]["mean"] == pytest.approx(0.7) + assert report["result_counts"]["cases_total"] == 1 + assert 
report["result_counts"]["cases_with_metrics"] == 1 assert report["summary"]["demo-suite"]["score"]["mean"] == pytest.approx(0.7) From dd421c72968e1c940d7709c65ce2b4c857e49fb5 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Tue, 2 Jun 2026 14:32:14 +0800 Subject: [PATCH 07/41] feat: export evaluator report schema --- .../src/aworld_cli/evaluator_runtime.py | 79 +++++++++++++++++++ .../top_level_commands/evaluator_cmd.py | 8 ++ aworld/evaluations/substrate.py | 13 +++ tests/core/test_evaluator_runtime.py | 17 +++- .../core/test_evaluator_top_level_command.py | 22 ++++++ .../evaluations/test_evaluation_substrate.py | 3 + 6 files changed, 141 insertions(+), 1 deletion(-) diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index a2809c4c7..d37dd4f92 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -5,6 +5,8 @@ from pathlib import Path from aworld.evaluations.substrate import ( + EVALUATOR_REPORT_FORMAT_ID, + EVALUATOR_REPORT_FORMAT_VERSION, EvaluationFlowDef, describe_eval_target, list_eval_suites, @@ -73,6 +75,83 @@ def _build_automation_summary(report: dict) -> dict[str, object]: } +def get_evaluator_report_schema() -> dict[str, object]: + return { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": f"https://schemas.aworld.dev/evaluator/report/v{EVALUATOR_REPORT_FORMAT_VERSION}.json", + "title": "AWorld Evaluator Report", + "type": "object", + "required": [ + "report_version", + "report_format", + "generated_at", + "suite_id", + "target", + "summary", + "metrics", + "results", + "result_counts", + "approval", + ], + "properties": { + "report_version": {"type": "integer", "const": EVALUATOR_REPORT_FORMAT_VERSION}, + "report_format": { + "type": "object", + "required": ["id", "version"], + "properties": { + "id": {"type": "string", "const": EVALUATOR_REPORT_FORMAT_ID}, + "version": {"type": "integer", "const": 
EVALUATOR_REPORT_FORMAT_VERSION}, + }, + "additionalProperties": False, + }, + "generated_at": {"type": "string", "format": "date-time"}, + "suite_id": {"type": "string"}, + "target": {"type": "object"}, + "summary": {"type": "object"}, + "metrics": {"type": "object"}, + "results": { + "type": "array", + "items": { + "type": "object", + "required": ["case_id", "input", "metrics", "judge"], + "properties": { + "case_id": {"type": "string"}, + "input": {"type": "object"}, + "metrics": {"type": "object"}, + "judge": {"type": "object"}, + "judge_backend": { + "type": ["object", "null"], + "properties": { + "backend_id": {"type": "string"}, + }, + "required": ["backend_id"], + "additionalProperties": False, + }, + }, + "additionalProperties": True, + }, + }, + "result_counts": { + "type": "object", + "required": ["cases_total", "cases_with_metrics", "cases_with_judge"], + "properties": { + "cases_total": {"type": "integer", "minimum": 0}, + "cases_with_metrics": {"type": "integer", "minimum": 0}, + "cases_with_judge": {"type": "integer", "minimum": 0}, + }, + "additionalProperties": False, + }, + "gate": {"type": "object"}, + "approval": {"type": "object"}, + "judge_backend": {"type": "object"}, + "suite_selection": {"type": "object"}, + "automation": {"type": "object"}, + "report_path": {"type": "string"}, + }, + "additionalProperties": True, + } + + def run_evaluator_cli( *, target: str, diff --git a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py index addc16491..cc68e940a 100644 --- a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py +++ b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py @@ -1,9 +1,12 @@ from __future__ import annotations +import json + from aworld_cli.evaluator_runtime import ( available_evaluator_suites, evaluator_exit_code, get_evaluator_suite_selection, + get_evaluator_report_schema, render_evaluator_summary, run_evaluator_cli, ) @@ -34,8 
+37,13 @@ def register_parser(self, subparsers) -> None: parser.add_argument("--output", type=str) parser.add_argument("--interactive-approval", action="store_true") parser.add_argument("--list-suites", action="store_true") + parser.add_argument("--print-report-schema", action="store_true") def run(self, args, context) -> int: + if getattr(args, "print_report_schema", False): + print(json.dumps(get_evaluator_report_schema(), ensure_ascii=False, indent=2)) + return 0 + if getattr(args, "list_suites", False): if getattr(args, "target", None): print("Available evaluator suites for target:") diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py index 84e520318..6724638b0 100644 --- a/aworld/evaluations/substrate.py +++ b/aworld/evaluations/substrate.py @@ -9,6 +9,7 @@ import re import uuid from dataclasses import dataclass, field, replace +from datetime import datetime, timezone from pathlib import Path from typing import Any, Awaitable, Callable, ClassVar, Mapping @@ -34,6 +35,9 @@ ".svg": "image/svg+xml", } +EVALUATOR_REPORT_FORMAT_ID = "aworld.evaluator.report" +EVALUATOR_REPORT_FORMAT_VERSION = 1 + @dataclass(frozen=True) class EvalCaseDef: @@ -382,6 +386,10 @@ def _normalize_metric_status(status: Any) -> str | None: return getattr(status, "name", str(status)) +def _format_report_timestamp(timestamp: float) -> str: + return datetime.fromtimestamp(timestamp, tz=timezone.utc).isoformat().replace("+00:00", "Z") + + async def run_evaluation_flow(flow: EvaluationFlowDef) -> dict[str, Any]: compiled = compile_evaluation_flow(flow) eval_result = await EvaluateRunner(config=compiled.eval_config).run() @@ -438,6 +446,11 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> dict[str, Any]: metrics = dict(suite_summary) report = { "report_version": 1, + "report_format": { + "id": EVALUATOR_REPORT_FORMAT_ID, + "version": EVALUATOR_REPORT_FORMAT_VERSION, + }, + "generated_at": _format_report_timestamp(eval_result.create_time), "suite_id": 
compiled.suite.suite_id, "target": dict(compiled.target), "summary": eval_result.summary, diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index 1e7e1ff9e..92b823b0b 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -9,7 +9,12 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) -from aworld_cli.evaluator_runtime import available_evaluator_suites, evaluator_exit_code, run_evaluator_cli +from aworld_cli.evaluator_runtime import ( + available_evaluator_suites, + evaluator_exit_code, + get_evaluator_report_schema, + run_evaluator_cli, +) def test_run_evaluator_cli_persists_approval_state( @@ -192,3 +197,13 @@ def test_evaluator_exit_code_matches_gate_and_approval() -> None: assert evaluator_exit_code( {"gate": {"status": "needs_approval"}, "approval": {"approved": False}} ) == 3 + + +def test_get_evaluator_report_schema_describes_report_contract() -> None: + schema = get_evaluator_report_schema() + + assert schema["$schema"] == "https://json-schema.org/draft/2020-12/schema" + assert schema["title"] == "AWorld Evaluator Report" + assert "report_format" in schema["required"] + assert schema["properties"]["report_format"]["properties"]["id"]["const"] == "aworld.evaluator.report" + assert schema["properties"]["report_format"]["properties"]["version"]["const"] == 1 diff --git a/tests/core/test_evaluator_top_level_command.py b/tests/core/test_evaluator_top_level_command.py index df796f4eb..160553ac6 100644 --- a/tests/core/test_evaluator_top_level_command.py +++ b/tests/core/test_evaluator_top_level_command.py @@ -149,6 +149,28 @@ def test_evaluator_command_lists_target_matching_suites_and_default( assert "Default suite: app-evaluator" in output +def test_evaluator_command_prints_report_schema( + capsys: pytest.CaptureFixture[str], +) -> None: + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=None, + suite=None, + output=None, + 
interactive_approval=False, + list_suites=False, + print_report_schema=True, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + + assert exit_code == 0 + assert "\"title\": \"AWorld Evaluator Report\"" in output + assert "\"aworld.evaluator.report\"" in output + + def test_evaluator_command_returns_usage_error_without_target( capsys: pytest.CaptureFixture[str], ) -> None: diff --git a/tests/evaluations/test_evaluation_substrate.py b/tests/evaluations/test_evaluation_substrate.py index 6f4d8333c..4aeaa274b 100644 --- a/tests/evaluations/test_evaluation_substrate.py +++ b/tests/evaluations/test_evaluation_substrate.py @@ -110,6 +110,9 @@ async def fake_judge(case_input, target): report = await run_evaluation_flow(flow) assert report["suite_id"] == "demo-suite" + assert report["report_format"]["id"] == "aworld.evaluator.report" + assert report["report_format"]["version"] == 1 + assert report["generated_at"] assert report["gate"]["status"] == "needs_approval" assert report["results"][0]["judge"]["rank"] == "Good" assert report["results"][0]["metrics"]["score"]["value"] == pytest.approx(0.7) From 850d7fa9d5a7ea1f80d9d7e1375474651c1763a3 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Tue, 2 Jun 2026 15:54:24 +0800 Subject: [PATCH 08/41] feat: tighten evaluator report schema --- .../src/aworld_cli/evaluator_runtime.py | 98 ++++++++++++++++++- tests/core/test_evaluator_runtime.py | 20 ++++ 2 files changed, 114 insertions(+), 4 deletions(-) diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index d37dd4f92..e5ec287b9 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -81,6 +81,90 @@ def get_evaluator_report_schema() -> dict[str, object]: "$id": f"https://schemas.aworld.dev/evaluator/report/v{EVALUATOR_REPORT_FORMAT_VERSION}.json", "title": "AWorld Evaluator Report", "type": "object", + "$defs": { + "evalStatus": { + 
"type": "string", + "enum": ["PASSED", "FAILED", "NOT_EVALUATED"], + }, + "metricScalar": { + "oneOf": [ + {"type": "number"}, + {"type": "boolean"}, + ] + }, + "metricAggregate": { + "type": "object", + "properties": { + "mean": {"type": "number"}, + "min": {"type": "number"}, + "max": {"type": "number"}, + "std": {"type": "number"}, + "true_count": {"type": "integer", "minimum": 0}, + "true_rate": {"type": "number", "minimum": 0, "maximum": 1}, + "value": {"$ref": "#/$defs/metricScalar"}, + "eval_status": {"$ref": "#/$defs/evalStatus"}, + }, + "additionalProperties": { + "oneOf": [ + {"type": "number"}, + {"type": "boolean"}, + {"type": "string"}, + {"$ref": "#/$defs/metricAggregate"}, + ] + }, + }, + "caseMetric": { + "type": "object", + "properties": { + "value": {"$ref": "#/$defs/metricScalar"}, + "status": {"$ref": "#/$defs/evalStatus"}, + }, + "required": ["value"], + "additionalProperties": False, + }, + "gateDecision": { + "type": "object", + "required": ["status", "metric_name", "value"], + "properties": { + "status": { + "type": "string", + "enum": ["pass", "fail", "needs_approval"], + }, + "metric_name": {"type": "string"}, + "value": {"type": "number"}, + }, + "additionalProperties": False, + }, + "automationSummary": { + "type": "object", + "required": [ + "gate_status", + "metric_name", + "metric_value", + "approval_required", + "approval_resolved", + "approved", + "suggested_exit_code", + "case_count", + "judge_backend", + ], + "properties": { + "gate_status": { + "type": ["string", "null"], + "enum": ["pass", "fail", "needs_approval", None], + }, + "metric_name": {"type": ["string", "null"]}, + "metric_value": {"type": ["number", "null"]}, + "approval_required": {"type": "boolean"}, + "approval_resolved": {"type": "boolean"}, + "approved": {"type": ["boolean", "null"]}, + "suggested_exit_code": {"type": "integer", "enum": [0, 2, 3]}, + "case_count": {"type": "integer", "minimum": 0}, + "judge_backend": {"type": ["string", "null"]}, + }, + 
"additionalProperties": False, + }, + }, "required": [ "report_version", "report_format", @@ -108,7 +192,10 @@ def get_evaluator_report_schema() -> dict[str, object]: "suite_id": {"type": "string"}, "target": {"type": "object"}, "summary": {"type": "object"}, - "metrics": {"type": "object"}, + "metrics": { + "type": "object", + "additionalProperties": {"$ref": "#/$defs/metricAggregate"}, + }, "results": { "type": "array", "items": { @@ -117,7 +204,10 @@ def get_evaluator_report_schema() -> dict[str, object]: "properties": { "case_id": {"type": "string"}, "input": {"type": "object"}, - "metrics": {"type": "object"}, + "metrics": { + "type": "object", + "additionalProperties": {"$ref": "#/$defs/caseMetric"}, + }, "judge": {"type": "object"}, "judge_backend": { "type": ["object", "null"], @@ -141,11 +231,11 @@ def get_evaluator_report_schema() -> dict[str, object]: }, "additionalProperties": False, }, - "gate": {"type": "object"}, + "gate": {"$ref": "#/$defs/gateDecision"}, "approval": {"type": "object"}, "judge_backend": {"type": "object"}, "suite_selection": {"type": "object"}, - "automation": {"type": "object"}, + "automation": {"$ref": "#/$defs/automationSummary"}, "report_path": {"type": "string"}, }, "additionalProperties": True, diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index 92b823b0b..3d48ede3b 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -207,3 +207,23 @@ def test_get_evaluator_report_schema_describes_report_contract() -> None: assert "report_format" in schema["required"] assert schema["properties"]["report_format"]["properties"]["id"]["const"] == "aworld.evaluator.report" assert schema["properties"]["report_format"]["properties"]["version"]["const"] == 1 + assert schema["properties"]["metrics"]["additionalProperties"]["$ref"] == "#/$defs/metricAggregate" + assert ( + 
schema["properties"]["results"]["items"]["properties"]["metrics"]["additionalProperties"]["$ref"] + == "#/$defs/caseMetric" + ) + assert schema["properties"]["gate"]["$ref"] == "#/$defs/gateDecision" + assert schema["properties"]["automation"]["$ref"] == "#/$defs/automationSummary" + assert schema["$defs"]["gateDecision"]["properties"]["status"]["enum"] == ["pass", "fail", "needs_approval"] + assert schema["$defs"]["automationSummary"]["properties"]["suggested_exit_code"]["enum"] == [0, 2, 3] + assert schema["$defs"]["automationSummary"]["required"] == [ + "gate_status", + "metric_name", + "metric_value", + "approval_required", + "approval_resolved", + "approved", + "suggested_exit_code", + "case_count", + "judge_backend", + ] From 57e9d6af0f03a8f2223530e26a4a0b1e40180fd3 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Tue, 2 Jun 2026 16:36:31 +0800 Subject: [PATCH 09/41] feat: add evaluator report validation command --- .../src/aworld_cli/evaluator_runtime.py | 11 ++ .../top_level_commands/evaluator_cmd.py | 14 +++ tests/core/test_evaluator_runtime.py | 68 +++++++++++ .../core/test_evaluator_top_level_command.py | 107 ++++++++++++++++++ 4 files changed, 200 insertions(+) diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index e5ec287b9..2d2ec49ce 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -242,6 +242,17 @@ def get_evaluator_report_schema() -> dict[str, object]: } +def validate_evaluator_report(report: dict) -> None: + import jsonschema + + try: + jsonschema.validate(instance=report, schema=get_evaluator_report_schema()) + except jsonschema.ValidationError as exc: + path = ".".join(str(part) for part in exc.absolute_path) + location = f" at '{path}'" if path else "" + raise ValueError(f"evaluator report validation failed{location}: {exc.message}") from exc + + def run_evaluator_cli( *, target: str, diff --git 
a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py index cc68e940a..84a34c202 100644 --- a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py +++ b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +from pathlib import Path from aworld_cli.evaluator_runtime import ( available_evaluator_suites, @@ -9,6 +10,7 @@ get_evaluator_report_schema, render_evaluator_summary, run_evaluator_cli, + validate_evaluator_report, ) @@ -38,12 +40,24 @@ def register_parser(self, subparsers) -> None: parser.add_argument("--interactive-approval", action="store_true") parser.add_argument("--list-suites", action="store_true") parser.add_argument("--print-report-schema", action="store_true") + parser.add_argument("--validate-report", type=str) def run(self, args, context) -> int: if getattr(args, "print_report_schema", False): print(json.dumps(get_evaluator_report_schema(), ensure_ascii=False, indent=2)) return 0 + if getattr(args, "validate_report", None): + report_path = Path(args.validate_report).expanduser().resolve() + report = json.loads(report_path.read_text(encoding="utf-8")) + try: + validate_evaluator_report(report) + except ValueError as exc: + print(f"Report is invalid: {exc}") + return 4 + print(f"Report is valid: {report_path}") + return 0 + if getattr(args, "list_suites", False): if getattr(args, "target", None): print("Available evaluator suites for target:") diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index 3d48ede3b..105feec46 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -14,6 +14,7 @@ evaluator_exit_code, get_evaluator_report_schema, run_evaluator_cli, + validate_evaluator_report, ) @@ -227,3 +228,70 @@ def test_get_evaluator_report_schema_describes_report_contract() -> None: "case_count", "judge_backend", ] + + +def 
test_validate_evaluator_report_accepts_valid_report() -> None: + report = { + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": "2026-06-02T04:00:00Z", + "suite_id": "app-evaluator", + "target": {"target_path": "/tmp/artifact.txt", "target_kind": "file"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "metrics": {"score": {"mean": 0.9, "min": 0.9, "max": 0.9, "std": 0.0, "eval_status": "PASSED"}}, + "results": [ + { + "case_id": "artifact.txt", + "input": {"target_path": "/tmp/artifact.txt"}, + "metrics": {"score": {"value": 0.9, "status": "PASSED"}}, + "judge": {"score": 0.9}, + "judge_backend": {"backend_id": "stub-agent"}, + } + ], + "result_counts": {"cases_total": 1, "cases_with_metrics": 1, "cases_with_judge": 1}, + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + "approval": {"required": False, "resolved": False, "approved": None}, + "automation": { + "gate_status": "pass", + "metric_name": "score", + "metric_value": 0.9, + "approval_required": False, + "approval_resolved": False, + "approved": None, + "suggested_exit_code": 0, + "case_count": 1, + "judge_backend": "stub-agent", + }, + } + + validate_evaluator_report(report) + + +def test_validate_evaluator_report_rejects_invalid_gate_status() -> None: + report = { + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": "2026-06-02T04:00:00Z", + "suite_id": "app-evaluator", + "target": {"target_path": "/tmp/artifact.txt", "target_kind": "file"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "metrics": {"score": {"mean": 0.9}}, + "results": [], + "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, + "gate": {"status": "maybe", "metric_name": "score", "value": 0.9}, + "approval": {"required": False, "resolved": False, "approved": None}, + "automation": { + "gate_status": "maybe", + "metric_name": "score", + "metric_value": 
0.9, + "approval_required": False, + "approval_resolved": False, + "approved": None, + "suggested_exit_code": 0, + "case_count": 0, + "judge_backend": None, + }, + } + + with pytest.raises(ValueError, match="status"): + validate_evaluator_report(report) diff --git a/tests/core/test_evaluator_top_level_command.py b/tests/core/test_evaluator_top_level_command.py index 160553ac6..ef80e400b 100644 --- a/tests/core/test_evaluator_top_level_command.py +++ b/tests/core/test_evaluator_top_level_command.py @@ -171,6 +171,113 @@ def test_evaluator_command_prints_report_schema( assert "\"aworld.evaluator.report\"" in output +def test_evaluator_command_validates_report_file( + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + report_path = tmp_path / "report.json" + report_path.write_text( + """ +{ + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": "2026-06-02T04:00:00Z", + "suite_id": "app-evaluator", + "target": {"target_path": "/tmp/artifact.txt", "target_kind": "file"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "metrics": {"score": {"mean": 0.9}}, + "results": [], + "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, + "approval": {"required": false, "resolved": false, "approved": null}, + "automation": { + "gate_status": null, + "metric_name": null, + "metric_value": null, + "approval_required": false, + "approval_resolved": false, + "approved": null, + "suggested_exit_code": 0, + "case_count": 0, + "judge_backend": null + } +} +""".strip(), + encoding="utf-8", + ) + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=None, + suite=None, + output=None, + interactive_approval=False, + list_suites=False, + print_report_schema=False, + validate_report=str(report_path), + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + + assert exit_code == 0 + assert "Report is valid" in output + + +def 
test_evaluator_command_returns_nonzero_for_invalid_report( + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + report_path = tmp_path / "report.json" + report_path.write_text( + """ +{ + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": "2026-06-02T04:00:00Z", + "suite_id": "app-evaluator", + "target": {"target_path": "/tmp/artifact.txt", "target_kind": "file"}, + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "metrics": {"score": {"mean": 0.9}}, + "results": [], + "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, + "gate": {"status": "maybe", "metric_name": "score", "value": 0.9}, + "approval": {"required": false, "resolved": false, "approved": null}, + "automation": { + "gate_status": "maybe", + "metric_name": "score", + "metric_value": 0.9, + "approval_required": false, + "approval_resolved": false, + "approved": null, + "suggested_exit_code": 0, + "case_count": 0, + "judge_backend": null + } +} +""".strip(), + encoding="utf-8", + ) + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=None, + suite=None, + output=None, + interactive_approval=False, + list_suites=False, + print_report_schema=False, + validate_report=str(report_path), + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + + assert exit_code == 4 + assert "Report is invalid" in output + + def test_evaluator_command_returns_usage_error_without_target( capsys: pytest.CaptureFixture[str], ) -> None: From 67fbf7c8890912859b7cb2657ecf35b2b80af1bb Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Tue, 2 Jun 2026 19:22:38 +0800 Subject: [PATCH 10/41] docs: add evaluator report contract guides --- docs/AWorld CLI/Commands/Evaluator.md | 75 +++++++++++++++ docs/AWorld CLI/Commands/Overview.md | 5 + docs/AWorld CLI/Recipes/Mini App Build.md | 12 +++ examples/aworld_quick_start/cli/README.md | 12 ++- .../cli/evaluator_report.example.json 
| 93 +++++++++++++++++++ tests/docs/test_evaluator_report_docs.py | 33 +++++++ 6 files changed, 229 insertions(+), 1 deletion(-) create mode 100644 docs/AWorld CLI/Commands/Evaluator.md create mode 100644 examples/aworld_quick_start/cli/evaluator_report.example.json create mode 100644 tests/docs/test_evaluator_report_docs.py diff --git a/docs/AWorld CLI/Commands/Evaluator.md b/docs/AWorld CLI/Commands/Evaluator.md new file mode 100644 index 000000000..b7c2a29e5 --- /dev/null +++ b/docs/AWorld CLI/Commands/Evaluator.md @@ -0,0 +1,75 @@ +# Evaluator + +## What It Does + +The evaluator command runs suite-backed evaluation flows for local targets and exposes the resulting report as a stable machine-readable contract. + +Use it when you want to: + +- run a built-in evaluator suite such as `app-evaluator` +- inspect which suites match a target +- export the evaluator report schema +- validate a saved evaluator report in automation + +## Commands + +Top-level CLI usage: + +```bash +aworld-cli evaluator --target ./artifact +aworld-cli evaluator --target ./artifact --suite app-evaluator +aworld-cli evaluator --list-suites +aworld-cli evaluator --list-suites --target ./artifact +aworld-cli evaluator --print-report-schema +aworld-cli evaluator --validate-report ./.aworld/evaluations/artifact.app-evaluator.json +``` + +Useful options: + +```bash +aworld-cli evaluator --target ./artifact --output ./report.json +aworld-cli evaluator --target ./artifact --interactive-approval +``` + +## Report Contract + +Evaluator reports are JSON documents with a stable top-level format marker: + +```json +{ + "report_format": { + "id": "aworld.evaluator.report", + "version": 1 + } +} +``` + +Key report sections: + +- `metrics`: normalized aggregate metrics for the resolved suite +- `results`: per-case judge output plus normalized per-case metrics +- `gate`: structured `pass` / `fail` / `needs_approval` decision +- `automation`: exit-code-oriented summary fields for scripts and CI + +See 
[evaluator_report.example.json](/Users/wuman/Documents/workspace/aworld-mas/aworld/examples/aworld_quick_start/cli/evaluator_report.example.json) for a minimal example. + +## Typical Workflow + +1. Inspect matching suites with `aworld-cli evaluator --list-suites --target ./artifact`. +2. Run evaluation with `aworld-cli evaluator --target ./artifact`. +3. Save or collect the emitted JSON report. +4. Validate persisted reports with `aworld-cli evaluator --validate-report `. +5. Export the current JSON Schema with `aworld-cli evaluator --print-report-schema` when integrating with external tooling. + +## Exit Codes + +- `0`: evaluation passed, schema is valid, or metadata command succeeded +- `2`: evaluation gate failed +- `3`: evaluation requires approval and is not approved +- `4`: evaluator report validation failed + +## Notes And Limits + +- `--list-suites --target ...` shows only suites matching the target and prints the deterministic default suite. +- `--print-report-schema` prints the current JSON Schema for `aworld.evaluator.report`. +- `--validate-report` validates an existing JSON report against that schema without re-running evaluation. diff --git a/docs/AWorld CLI/Commands/Overview.md b/docs/AWorld CLI/Commands/Overview.md index 74843c2c7..422f1415c 100644 --- a/docs/AWorld CLI/Commands/Overview.md +++ b/docs/AWorld CLI/Commands/Overview.md @@ -1,3 +1,8 @@ # Commands Use slash commands inside interactive AWorld CLI sessions to inspect workspace state, manage scheduled work, control plugins, and access command-bridge features exposed through gateway channels. 
+ +Available command references: + +- [Evaluator](/Users/wuman/Documents/workspace/aworld-mas/aworld/docs/AWorld%20CLI/Commands/Evaluator.md): suite-backed evaluation, schema export, and report validation +- [Gateway](/Users/wuman/Documents/workspace/aworld-mas/aworld/docs/AWorld%20CLI/Commands/Gateway.md): multi-channel gateway lifecycle and command bridge behavior diff --git a/docs/AWorld CLI/Recipes/Mini App Build.md b/docs/AWorld CLI/Recipes/Mini App Build.md index 34d37a530..baeb1e347 100644 --- a/docs/AWorld CLI/Recipes/Mini App Build.md +++ b/docs/AWorld CLI/Recipes/Mini App Build.md @@ -63,3 +63,15 @@ help me create an English word learning app, with a UI quality score over 0.9 The agent will generate the app, evaluate it with the official Evaluator skill, and iterate until the UI quality score meets your target. When done, run or deploy the output as needed. For a reusable workspace setup, keep the same `.env` or interactive CLI configuration across runs. + +## Inspecting The Evaluator Report + +When you want a stable machine-readable artifact for CI or post-processing, use the standalone evaluator command against the generated app or artifact: + +```bash +aworld-cli evaluator --target ./artifact --output ./report.json +aworld-cli evaluator --print-report-schema +aworld-cli evaluator --validate-report ./report.json +``` + +The emitted report includes `report_format`, normalized `metrics`, structured `gate`, and `automation` fields. That makes it suitable for quality gates, regression checks, or downstream dashboards without parsing freeform evaluator text. 
diff --git a/examples/aworld_quick_start/cli/README.md b/examples/aworld_quick_start/cli/README.md index 4fc9fdd49..8ee510972 100644 --- a/examples/aworld_quick_start/cli/README.md +++ b/examples/aworld_quick_start/cli/README.md @@ -32,6 +32,16 @@ aworld-cli --task "Your task" --agent MyAgent - `agents/document_agent.md` - Markdown agent example - `agents/hilp.py` - Human in the loop agent example +## Evaluator Report Example + +The file `evaluator_report.example.json` shows the current stable evaluator report contract, including: + +- `report_format` and `generated_at` +- normalized `metrics` and per-case `results` +- structured `gate`, `approval`, and `automation` sections + +Use it together with `aworld-cli evaluator --print-report-schema` and `aworld-cli evaluator --validate-report ` when integrating evaluator output into scripts or CI. + ## Create Your Agent ### Python Agent @@ -58,4 +68,4 @@ Create `agents/my_agent.md`: name: MyAgent description: My agent description --- -``` \ No newline at end of file +``` diff --git a/examples/aworld_quick_start/cli/evaluator_report.example.json b/examples/aworld_quick_start/cli/evaluator_report.example.json new file mode 100644 index 000000000..401339a82 --- /dev/null +++ b/examples/aworld_quick_start/cli/evaluator_report.example.json @@ -0,0 +1,93 @@ +{ + "report_version": 1, + "report_format": { + "id": "aworld.evaluator.report", + "version": 1 + }, + "generated_at": "2026-06-02T04:00:00Z", + "suite_id": "app-evaluator", + "target": { + "target_path": "/tmp/artifact.txt", + "target_kind": "file" + }, + "summary": { + "app-evaluator": { + "score": { + "mean": 0.91, + "min": 0.91, + "max": 0.91, + "std": 0.0, + "eval_status": "PASSED" + } + } + }, + "metrics": { + "score": { + "mean": 0.91, + "min": 0.91, + "max": 0.91, + "std": 0.0, + "eval_status": "PASSED" + } + }, + "results": [ + { + "case_id": "artifact.txt", + "input": { + "target_path": "/tmp/artifact.txt", + "target_kind": "file" + }, + "metrics": { + "score": { 
+ "value": 0.91, + "status": "PASSED" + } + }, + "judge": { + "score": 0.91, + "rank": "Exemplary", + "criticism": "Minor polish remains.", + "praise": "Strong overall structure.", + "improvement_advice": "Keep the visual hierarchy consistent." + }, + "judge_backend": { + "backend_id": "app-evaluator-agent" + } + } + ], + "result_counts": { + "cases_total": 1, + "cases_with_metrics": 1, + "cases_with_judge": 1 + }, + "gate": { + "status": "pass", + "metric_name": "score", + "value": 0.91 + }, + "approval": { + "required": false, + "resolved": false, + "approved": null + }, + "judge_backend": { + "backend_id": "app-evaluator-agent" + }, + "suite_selection": { + "requested": null, + "resolved": "app-evaluator", + "mode": "auto" + }, + "automation": { + "gate_status": "pass", + "metric_name": "score", + "metric_value": 0.91, + "approval_required": false, + "approval_resolved": false, + "approved": null, + "suggested_exit_code": 0, + "case_count": 1, + "judge_backend": "app-evaluator-agent" + }, + "report_path": "/tmp/report.json" +} diff --git a/tests/docs/test_evaluator_report_docs.py b/tests/docs/test_evaluator_report_docs.py new file mode 100644 index 000000000..0f0b74df4 --- /dev/null +++ b/tests/docs/test_evaluator_report_docs.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from pathlib import Path + + +def test_evaluator_report_command_doc_covers_schema_and_validation() -> None: + doc_path = Path("docs/AWorld CLI/Commands/Evaluator.md") + overview_path = Path("docs/AWorld CLI/Commands/Overview.md") + + content = doc_path.read_text(encoding="utf-8") + overview = overview_path.read_text(encoding="utf-8") + + assert "aworld-cli evaluator" in content + assert "--print-report-schema" in content + assert "--validate-report" in content + assert "report_format" in content + assert "automation" in content + assert "Evaluator" in overview + + +def test_evaluator_report_example_includes_stable_contract_fields() -> None: + example_path = 
Path("examples/aworld_quick_start/cli/evaluator_report.example.json") + recipe_path = Path("docs/AWorld CLI/Recipes/Mini App Build.md") + + content = example_path.read_text(encoding="utf-8") + recipe = recipe_path.read_text(encoding="utf-8") + + assert '"report_format"' in content + assert '"generated_at"' in content + assert '"metrics"' in content + assert '"automation"' in content + assert "--validate-report" in recipe + assert "--print-report-schema" in recipe From 262188b7e6b8f0418a28852268be517ad15a97c9 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Tue, 2 Jun 2026 20:04:33 +0800 Subject: [PATCH 11/41] feat: load declared evaluator suites --- .../src/aworld_cli/evaluator_runtime.py | 4 + aworld/evaluations/substrate.py | 58 ++++++++++++++ tests/core/test_evaluator_runtime.py | 37 +++++++++ .../core/test_evaluator_top_level_command.py | 13 +++ .../evaluations/test_evaluation_substrate.py | 79 +++++++++++++++++++ 5 files changed, 191 insertions(+) diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index 2d2ec49ce..a81f3b589 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -11,6 +11,7 @@ describe_eval_target, list_eval_suites, list_matching_eval_suites, + load_declared_eval_suites, resolve_eval_suite_selection, run_evaluation_flow, ) @@ -30,6 +31,7 @@ def default_evaluator_report_path(*, target_path: Path, suite_id: str, cwd: Path def available_evaluator_suites(*, target: str | None = None) -> list[str]: + load_declared_eval_suites() if target is None: return list_eval_suites() return list_matching_eval_suites(target) @@ -40,6 +42,7 @@ def get_evaluator_suite_selection( target: str, suite: str | None = None, ) -> dict[str, str | None]: + load_declared_eval_suites() selection = resolve_eval_suite_selection(suite, target) return { "requested": suite, @@ -261,6 +264,7 @@ def run_evaluator_cli( interactive_approval: bool = False, ) -> dict: 
target_path = Path(target).expanduser().resolve() + load_declared_eval_suites(target_path.parent if target_path.is_file() else target_path) selection = resolve_eval_suite_selection(suite, target_path) suite_def = selection.suite target_info = describe_eval_target(target_path) diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py index 6724638b0..8bee5a5c9 100644 --- a/aworld/evaluations/substrate.py +++ b/aworld/evaluations/substrate.py @@ -266,6 +266,7 @@ class EvalSuiteSelection: _EVAL_SUITE_REGISTRY: dict[str, EvalSuiteRegistration] = {} +_LOADED_EVAL_MANIFEST_PATHS: set[str] = set() def register_eval_suite( @@ -287,6 +288,63 @@ def list_eval_suites() -> list[str]: return sorted(_EVAL_SUITE_REGISTRY) +def _build_declared_eval_suite(manifest: Mapping[str, Any]) -> EvalSuiteDef: + base_suite = str(manifest.get("base_suite") or "").strip() + if base_suite != "app-evaluator": + raise ValueError(f"unsupported base_suite: {base_suite}") + + suite = get_builtin_eval_suite(base_suite) + suite_id = str(manifest.get("suite_id") or "").strip() + if not suite_id: + raise ValueError("suite_id is required") + + gate_manifest = manifest.get("gate_policy") or {} + if gate_manifest: + suite = replace( + suite, + gate_policy=GatePolicyDef( + metric_name=str(gate_manifest.get("metric_name") or suite.gate_policy.metric_name), + pass_threshold=float(gate_manifest.get("pass_threshold", suite.gate_policy.pass_threshold)), + approval_threshold=( + float(gate_manifest["approval_threshold"]) + if gate_manifest.get("approval_threshold") is not None + else suite.gate_policy.approval_threshold + ), + ), + ) + + metadata = dict(suite.metadata) + metadata.update(dict(manifest.get("metadata") or {})) + metadata["declared_manifest"] = True + metadata["base_suite"] = base_suite + return replace(suite, suite_id=suite_id, metadata=metadata) + + +def load_declared_eval_suites(workspace: str | Path | None = None) -> list[str]: + root = Path(workspace or 
Path.cwd()).expanduser().resolve() + manifest_dir = root / ".aworld" / "evaluators" + if not manifest_dir.exists() or not manifest_dir.is_dir(): + return [] + + loaded: list[str] = [] + for manifest_path in sorted(manifest_dir.glob("*.json")): + manifest_key = str(manifest_path.resolve()) + if manifest_key in _LOADED_EVAL_MANIFEST_PATHS: + continue + manifest = json.loads(manifest_path.read_text(encoding="utf-8")) + suite = _build_declared_eval_suite(manifest) + target_kinds = tuple(str(kind) for kind in (manifest.get("target_kinds") or ["file", "directory", "image"])) + register_eval_suite( + suite.suite_id, + lambda target, _suite=suite: _suite, + matcher=lambda target, _target_kinds=target_kinds: target.get("target_kind") in _target_kinds, + priority=int(manifest.get("priority", 100)), + ) + _LOADED_EVAL_MANIFEST_PATHS.add(manifest_key) + loaded.append(suite.suite_id) + return loaded + + def _sorted_eval_suite_registrations(registrations: list[EvalSuiteRegistration]) -> list[EvalSuiteRegistration]: return sorted(registrations, key=lambda item: (-item.priority, item.suite_id)) diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index 105feec46..a9403ee94 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -9,6 +9,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) +import aworld.evaluations.substrate as substrate_module from aworld_cli.evaluator_runtime import ( available_evaluator_suites, evaluator_exit_code, @@ -18,6 +19,18 @@ ) +@pytest.fixture(autouse=True) +def _reset_eval_registry_state(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + monkeypatch.setattr(substrate_module, "_LOADED_EVAL_MANIFEST_PATHS", set()) + substrate_module.register_eval_suite( + "app-evaluator", + lambda target: substrate_module.get_builtin_eval_suite("app-evaluator"), + matcher=lambda target: 
target.get("target_kind") in {"file", "directory", "image"}, + priority=10, + ) + + def test_run_evaluator_cli_persists_approval_state( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, @@ -106,6 +119,30 @@ def test_available_evaluator_suites_filters_by_target( assert suites == ["app-evaluator"] +def test_available_evaluator_suites_loads_declared_suites_from_workspace( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + manifest_dir = tmp_path / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + (manifest_dir / "strict-ui.json").write_text( + """ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file"] +} +""".strip(), + encoding="utf-8", + ) + + monkeypatch.chdir(tmp_path) + + suites = available_evaluator_suites(target=str(tmp_path / "artifact.txt")) + + assert "strict-ui" in suites + + def test_run_evaluator_cli_marks_image_targets( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, diff --git a/tests/core/test_evaluator_top_level_command.py b/tests/core/test_evaluator_top_level_command.py index ef80e400b..82e143927 100644 --- a/tests/core/test_evaluator_top_level_command.py +++ b/tests/core/test_evaluator_top_level_command.py @@ -8,11 +8,24 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) +import aworld.evaluations.substrate as substrate_module from aworld_cli import main as main_module from aworld_cli.core.top_level_command_system import TopLevelCommandContext from aworld_cli.top_level_commands.evaluator_cmd import EvaluatorTopLevelCommand +@pytest.fixture(autouse=True) +def _reset_eval_registry_state(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + monkeypatch.setattr(substrate_module, "_LOADED_EVAL_MANIFEST_PATHS", set()) + substrate_module.register_eval_suite( + "app-evaluator", + lambda target: substrate_module.get_builtin_eval_suite("app-evaluator"), + matcher=lambda target: target.get("target_kind") 
in {"file", "directory", "image"}, + priority=10, + ) + + def test_registry_registers_builtin_evaluator_command() -> None: registry = main_module._build_top_level_command_registry() diff --git a/tests/evaluations/test_evaluation_substrate.py b/tests/evaluations/test_evaluation_substrate.py index 4aeaa274b..89c48fb14 100644 --- a/tests/evaluations/test_evaluation_substrate.py +++ b/tests/evaluations/test_evaluation_substrate.py @@ -18,6 +18,7 @@ get_builtin_eval_suite, list_eval_suites, list_matching_eval_suites, + load_declared_eval_suites, register_eval_suite, resolve_eval_suite, resolve_eval_suite_selection, @@ -25,6 +26,18 @@ ) +@pytest.fixture(autouse=True) +def _reset_eval_registry_state(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + monkeypatch.setattr(substrate_module, "_LOADED_EVAL_MANIFEST_PATHS", set()) + substrate_module.register_eval_suite( + "app-evaluator", + lambda target: get_builtin_eval_suite("app-evaluator"), + matcher=lambda target: target.get("target_kind") in {"file", "directory", "image"}, + priority=10, + ) + + def test_compile_evaluation_flow_builds_inline_dataset_and_gate_config() -> None: suite = EvalSuiteDef( suite_id="demo-suite", @@ -213,6 +226,72 @@ def test_eval_suite_registry_reports_matching_suites_and_selection_mode( assert explicit_selection.suite_id == "generic-review" +def test_load_declared_eval_suites_registers_manifest_backed_suite( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + manifest_dir = tmp_path / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + (manifest_dir / "strict-ui.json").write_text( + """ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file", "directory"], + "gate_policy": { + "metric_name": "score", + "pass_threshold": 0.92, + "approval_threshold": 0.8 + }, + "metadata": { + "owner": "qa" + } +} +""".strip(), + encoding="utf-8", + ) + monkeypatch.setattr(substrate_module, 
"_EVAL_SUITE_REGISTRY", {}) + + loaded = load_declared_eval_suites(tmp_path) + listed = list_eval_suites() + + assert loaded == ["strict-ui"] + assert "strict-ui" in listed + + +def test_declared_eval_suite_can_be_selected_for_matching_target( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + manifest_dir = tmp_path / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + (manifest_dir / "strict-ui.json").write_text( + """ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file"], + "gate_policy": { + "metric_name": "score", + "pass_threshold": 0.92, + "approval_threshold": 0.8 + } +} +""".strip(), + encoding="utf-8", + ) + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + load_declared_eval_suites(tmp_path) + + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + + selection = resolve_eval_suite_selection("strict-ui", target) + + assert selection.suite_id == "strict-ui" + assert selection.suite.gate_policy.pass_threshold == pytest.approx(0.92) + + @pytest.mark.asyncio async def test_agent_judge_backend_parses_app_evaluator_json_payload() -> None: async def fake_executor(prompt: str, system_prompt: str): From 1f94675794832804eba34cc3efc17fbca52e7e13 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 3 Jun 2026 10:57:26 +0800 Subject: [PATCH 12/41] docs: add declared evaluator suite contract --- .../src/aworld_cli/evaluator_runtime.py | 51 +++++++++++++++++++ docs/AWorld CLI/Commands/Evaluator.md | 35 +++++++++++++ examples/aworld_quick_start/cli/README.md | 6 +++ .../cli/declared_evaluator_suite.example.json | 15 ++++++ tests/core/test_evaluator_runtime.py | 11 ++++ tests/docs/test_evaluator_report_docs.py | 13 +++++ 6 files changed, 131 insertions(+) create mode 100644 examples/aworld_quick_start/cli/declared_evaluator_suite.example.json diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index 
a81f3b589..196e08984 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -78,6 +78,57 @@ def _build_automation_summary(report: dict) -> dict[str, object]: } +def get_declared_evaluator_suite_schema() -> dict[str, object]: + return { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://schemas.aworld.dev/evaluator/declared-suite/v1.json", + "title": "AWorld Declared Evaluator Suite", + "type": "object", + "required": ["suite_id", "base_suite"], + "properties": { + "suite_id": { + "type": "string", + "minLength": 1, + "description": "Unique suite identifier exposed through aworld-cli evaluator.", + }, + "base_suite": { + "type": "string", + "const": "app-evaluator", + "description": "Builtin evaluator suite used as the declaration base.", + }, + "target_kinds": { + "type": "array", + "items": { + "type": "string", + "enum": ["file", "directory", "image"], + }, + "minItems": 1, + "uniqueItems": True, + "description": "Optional target kinds matched by this declared suite.", + }, + "gate_policy": { + "type": "object", + "properties": { + "metric_name": {"type": "string"}, + "pass_threshold": {"type": "number"}, + "approval_threshold": {"type": ["number", "null"]}, + }, + "additionalProperties": False, + "description": "Optional gate override layered on top of the base suite defaults.", + }, + "metadata": { + "type": "object", + "description": "Optional suite metadata copied into the resolved suite definition.", + }, + "priority": { + "type": "integer", + "description": "Optional suite selection priority. 
Larger values win automatic selection.", + }, + }, + "additionalProperties": False, + } + + def get_evaluator_report_schema() -> dict[str, object]: return { "$schema": "https://json-schema.org/draft/2020-12/schema", diff --git a/docs/AWorld CLI/Commands/Evaluator.md b/docs/AWorld CLI/Commands/Evaluator.md index b7c2a29e5..5350c870a 100644 --- a/docs/AWorld CLI/Commands/Evaluator.md +++ b/docs/AWorld CLI/Commands/Evaluator.md @@ -7,6 +7,7 @@ The evaluator command runs suite-backed evaluation flows for local targets and e Use it when you want to: - run a built-in evaluator suite such as `app-evaluator` +- load declaration-backed evaluator suites from workspace manifests - inspect which suites match a target - export the evaluator report schema - validate a saved evaluator report in automation @@ -31,6 +32,38 @@ aworld-cli evaluator --target ./artifact --output ./report.json aworld-cli evaluator --target ./artifact --interactive-approval ``` +## Declared Suite Manifests + +Evaluator suites can be declared under `.aworld/evaluators/*.json` and are loaded before suite resolution. This keeps the runtime on top of AWorld's existing runner and task substrate while letting a workspace expose stricter or context-specific evaluator variants without forking builtin code. 
+ +Current manifest scope is intentionally narrow: + +- `base_suite` must be `app-evaluator` +- `suite_id` is required and becomes the suite name exposed to `aworld-cli evaluator` +- `target_kinds` optionally narrows matching to `file`, `directory`, and/or `image` +- `gate_policy`, `metadata`, and `priority` override selection and gating behavior on top of the builtin suite + +Minimal example: + +```json +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file", "directory"], + "gate_policy": { + "metric_name": "score", + "pass_threshold": 0.92, + "approval_threshold": 0.8 + }, + "metadata": { + "owner": "qa" + }, + "priority": 120 +} +``` + +See [declared_evaluator_suite.example.json](/Users/wuman/Documents/workspace/aworld-mas/aworld/examples/aworld_quick_start/cli/declared_evaluator_suite.example.json) for a complete example. The current manifest schema is exported by `aworld_cli.evaluator_runtime.get_declared_evaluator_suite_schema()`. + ## Report Contract Evaluator reports are JSON documents with a stable top-level format marker: @@ -71,5 +104,7 @@ See [evaluator_report.example.json](/Users/wuman/Documents/workspace/aworld-mas/ ## Notes And Limits - `--list-suites --target ...` shows only suites matching the target and prints the deterministic default suite. +- declared suite manifests are discovered from `.aworld/evaluators/*.json` relative to the evaluation target workspace. +- declared suite manifests currently layer on `app-evaluator` only; they are not a generic suite authoring format yet. - `--print-report-schema` prints the current JSON Schema for `aworld.evaluator.report`. - `--validate-report` validates an existing JSON report against that schema without re-running evaluation. 
diff --git a/examples/aworld_quick_start/cli/README.md b/examples/aworld_quick_start/cli/README.md index 8ee510972..b88391844 100644 --- a/examples/aworld_quick_start/cli/README.md +++ b/examples/aworld_quick_start/cli/README.md @@ -42,6 +42,12 @@ The file `evaluator_report.example.json` shows the current stable evaluator repo Use it together with `aworld-cli evaluator --print-report-schema` and `aworld-cli evaluator --validate-report ` when integrating evaluator output into scripts or CI. +## Declared Evaluator Suite Example + +The file `declared_evaluator_suite.example.json` shows the workspace manifest format loaded from `.aworld/evaluators/*.json`. + +Use it when you want to derive a stricter evaluator from `app-evaluator` while keeping AWorld's builtin runner, suite resolution, and report contract unchanged. + ## Create Your Agent ### Python Agent diff --git a/examples/aworld_quick_start/cli/declared_evaluator_suite.example.json b/examples/aworld_quick_start/cli/declared_evaluator_suite.example.json new file mode 100644 index 000000000..d6e5376e4 --- /dev/null +++ b/examples/aworld_quick_start/cli/declared_evaluator_suite.example.json @@ -0,0 +1,15 @@ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file", "directory"], + "gate_policy": { + "metric_name": "score", + "pass_threshold": 0.92, + "approval_threshold": 0.8 + }, + "metadata": { + "owner": "qa", + "purpose": "release-gate" + }, + "priority": 120 +} diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index a9403ee94..58d9a88b3 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -13,6 +13,7 @@ from aworld_cli.evaluator_runtime import ( available_evaluator_suites, evaluator_exit_code, + get_declared_evaluator_suite_schema, get_evaluator_report_schema, run_evaluator_cli, validate_evaluator_report, @@ -332,3 +333,13 @@ def test_validate_evaluator_report_rejects_invalid_gate_status() -> None: 
with pytest.raises(ValueError, match="status"): validate_evaluator_report(report) + + +def test_get_declared_evaluator_suite_schema_describes_manifest_contract() -> None: + schema = get_declared_evaluator_suite_schema() + + assert schema["$schema"] == "https://json-schema.org/draft/2020-12/schema" + assert schema["title"] == "AWorld Declared Evaluator Suite" + assert schema["properties"]["base_suite"]["const"] == "app-evaluator" + assert "suite_id" in schema["required"] + assert "target_kinds" in schema["properties"] diff --git a/tests/docs/test_evaluator_report_docs.py b/tests/docs/test_evaluator_report_docs.py index 0f0b74df4..ce9920087 100644 --- a/tests/docs/test_evaluator_report_docs.py +++ b/tests/docs/test_evaluator_report_docs.py @@ -15,19 +15,32 @@ def test_evaluator_report_command_doc_covers_schema_and_validation() -> None: assert "--validate-report" in content assert "report_format" in content assert "automation" in content + assert ".aworld/evaluators/*.json" in content + assert "declared_evaluator_suite.example.json" in content + assert "get_declared_evaluator_suite_schema()" in content assert "Evaluator" in overview def test_evaluator_report_example_includes_stable_contract_fields() -> None: example_path = Path("examples/aworld_quick_start/cli/evaluator_report.example.json") + manifest_example_path = Path("examples/aworld_quick_start/cli/declared_evaluator_suite.example.json") recipe_path = Path("docs/AWorld CLI/Recipes/Mini App Build.md") + readme_path = Path("examples/aworld_quick_start/cli/README.md") content = example_path.read_text(encoding="utf-8") + manifest_example = manifest_example_path.read_text(encoding="utf-8") recipe = recipe_path.read_text(encoding="utf-8") + readme = readme_path.read_text(encoding="utf-8") assert '"report_format"' in content assert '"generated_at"' in content assert '"metrics"' in content assert '"automation"' in content + assert '"suite_id"' in manifest_example + assert '"base_suite": "app-evaluator"' in 
manifest_example + assert '"target_kinds"' in manifest_example + assert '"gate_policy"' in manifest_example + assert ".aworld/evaluators/*.json" in readme + assert "declared_evaluator_suite.example.json" in readme assert "--validate-report" in recipe assert "--print-report-schema" in recipe From 339a89e3539483217065125ed7b8e54dbba51fef Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Mon, 8 Jun 2026 17:07:36 +0800 Subject: [PATCH 13/41] fix: refresh declared evaluator suite discovery --- .../src/aworld_cli/evaluator_runtime.py | 9 ++- aworld/evaluations/substrate.py | 13 +++- tests/core/test_evaluator_runtime.py | 31 +++++++- .../evaluations/test_evaluation_substrate.py | 77 +++++++++++++++++++ 4 files changed, 124 insertions(+), 6 deletions(-) diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index 196e08984..628680cdf 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -31,9 +31,11 @@ def default_evaluator_report_path(*, target_path: Path, suite_id: str, cwd: Path def available_evaluator_suites(*, target: str | None = None) -> list[str]: - load_declared_eval_suites() if target is None: + load_declared_eval_suites() return list_eval_suites() + target_path = Path(target).expanduser().resolve() + load_declared_eval_suites(target_path.parent if target_path.is_file() else target_path) return list_matching_eval_suites(target) @@ -42,8 +44,9 @@ def get_evaluator_suite_selection( target: str, suite: str | None = None, ) -> dict[str, str | None]: - load_declared_eval_suites() - selection = resolve_eval_suite_selection(suite, target) + target_path = Path(target).expanduser().resolve() + load_declared_eval_suites(target_path.parent if target_path.is_file() else target_path) + selection = resolve_eval_suite_selection(suite, target_path) return { "requested": suite, "resolved": selection.suite_id, diff --git a/aworld/evaluations/substrate.py 
b/aworld/evaluations/substrate.py index 8bee5a5c9..4d8af1935 100644 --- a/aworld/evaluations/substrate.py +++ b/aworld/evaluations/substrate.py @@ -267,6 +267,7 @@ class EvalSuiteSelection: _EVAL_SUITE_REGISTRY: dict[str, EvalSuiteRegistration] = {} _LOADED_EVAL_MANIFEST_PATHS: set[str] = set() +_DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE: dict[str, set[str]] = {} def register_eval_suite( @@ -323,14 +324,18 @@ def _build_declared_eval_suite(manifest: Mapping[str, Any]) -> EvalSuiteDef: def load_declared_eval_suites(workspace: str | Path | None = None) -> list[str]: root = Path(workspace or Path.cwd()).expanduser().resolve() manifest_dir = root / ".aworld" / "evaluators" + workspace_key = str(root) + previous_suite_ids = _DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE.get(workspace_key, set()) if not manifest_dir.exists() or not manifest_dir.is_dir(): + for suite_id in previous_suite_ids: + _EVAL_SUITE_REGISTRY.pop(suite_id, None) + _DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE.pop(workspace_key, None) return [] loaded: list[str] = [] + current_suite_ids: set[str] = set() for manifest_path in sorted(manifest_dir.glob("*.json")): manifest_key = str(manifest_path.resolve()) - if manifest_key in _LOADED_EVAL_MANIFEST_PATHS: - continue manifest = json.loads(manifest_path.read_text(encoding="utf-8")) suite = _build_declared_eval_suite(manifest) target_kinds = tuple(str(kind) for kind in (manifest.get("target_kinds") or ["file", "directory", "image"])) @@ -341,7 +346,11 @@ def load_declared_eval_suites(workspace: str | Path | None = None) -> list[str]: priority=int(manifest.get("priority", 100)), ) _LOADED_EVAL_MANIFEST_PATHS.add(manifest_key) + current_suite_ids.add(suite.suite_id) loaded.append(suite.suite_id) + for removed_suite_id in previous_suite_ids - current_suite_ids: + _EVAL_SUITE_REGISTRY.pop(removed_suite_id, None) + _DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE[workspace_key] = current_suite_ids return loaded diff --git a/tests/core/test_evaluator_runtime.py 
b/tests/core/test_evaluator_runtime.py index 58d9a88b3..3b16f7367 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -126,6 +126,35 @@ def test_available_evaluator_suites_loads_declared_suites_from_workspace( ) -> None: manifest_dir = tmp_path / ".aworld" / "evaluators" manifest_dir.mkdir(parents=True) + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + (manifest_dir / "strict-ui.json").write_text( + """ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file"] +} +""".strip(), + encoding="utf-8", + ) + + monkeypatch.chdir(tmp_path) + + suites = available_evaluator_suites(target=str(target)) + + assert "strict-ui" in suites + + +def test_available_evaluator_suites_uses_target_workspace_not_process_cwd( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + workspace = tmp_path / "project" + manifest_dir = workspace / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + target = workspace / "artifact.txt" + target.write_text("artifact", encoding="utf-8") (manifest_dir / "strict-ui.json").write_text( """ { @@ -139,7 +168,7 @@ def test_available_evaluator_suites_loads_declared_suites_from_workspace( monkeypatch.chdir(tmp_path) - suites = available_evaluator_suites(target=str(tmp_path / "artifact.txt")) + suites = available_evaluator_suites(target=str(target)) assert "strict-ui" in suites diff --git a/tests/evaluations/test_evaluation_substrate.py b/tests/evaluations/test_evaluation_substrate.py index 89c48fb14..8b484d292 100644 --- a/tests/evaluations/test_evaluation_substrate.py +++ b/tests/evaluations/test_evaluation_substrate.py @@ -292,6 +292,83 @@ def test_declared_eval_suite_can_be_selected_for_matching_target( assert selection.suite.gate_policy.pass_threshold == pytest.approx(0.92) +def test_load_declared_eval_suites_refreshes_existing_manifest_changes( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + manifest_dir = 
tmp_path / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + manifest_path = manifest_dir / "strict-ui.json" + manifest_path.write_text( + """ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file"], + "gate_policy": { + "metric_name": "score", + "pass_threshold": 0.92, + "approval_threshold": 0.8 + } +} +""".strip(), + encoding="utf-8", + ) + + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + + load_declared_eval_suites(tmp_path) + + manifest_path.write_text( + """ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file"], + "gate_policy": { + "metric_name": "score", + "pass_threshold": 0.99, + "approval_threshold": 0.8 + } +} +""".strip(), + encoding="utf-8", + ) + + load_declared_eval_suites(tmp_path) + selection = resolve_eval_suite_selection("strict-ui", tmp_path / "artifact.txt") + + assert selection.suite.gate_policy.pass_threshold == pytest.approx(0.99) + + +def test_load_declared_eval_suites_removes_deleted_manifest_registration( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + manifest_dir = tmp_path / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + manifest_path = manifest_dir / "strict-ui.json" + manifest_path.write_text( + """ +{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file"] +} +""".strip(), + encoding="utf-8", + ) + + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + + load_declared_eval_suites(tmp_path) + manifest_path.unlink() + + load_declared_eval_suites(tmp_path) + + assert "strict-ui" not in list_eval_suites() + + @pytest.mark.asyncio async def test_agent_judge_backend_parses_app_evaluator_json_payload() -> None: async def fake_executor(prompt: str, system_prompt: str): From 859c77e67f9e29fc881d6e2460471fa5be0fc859 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Mon, 8 Jun 2026 20:55:10 +0800 Subject: [PATCH 14/41] fix: isolate declared evaluator suites by workspace 
--- .../src/aworld_cli/evaluator_runtime.py | 15 +++-- .../top_level_commands/evaluator_cmd.py | 42 ++++++++----- aworld/evaluations/substrate.py | 63 ++++++++++++++++--- tests/core/test_evaluator_runtime.py | 10 +++ .../core/test_evaluator_top_level_command.py | 24 +++++++ .../evaluations/test_evaluation_substrate.py | 61 ++++++++++++++++++ 6 files changed, 184 insertions(+), 31 deletions(-) diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index 628680cdf..7725e3d66 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -21,6 +21,13 @@ def _sanitize_path_token(value: str) -> str: return "".join(ch if ch.isalnum() or ch in {"-", "_", "."} else "-" for ch in value).strip("-") or "target" +def _resolve_cli_target_path(target: str) -> Path: + target_path = Path(target).expanduser().resolve() + if not target_path.exists(): + raise FileNotFoundError(f"evaluation target does not exist: {target_path}") + return target_path + + def default_evaluator_report_path(*, target_path: Path, suite_id: str, cwd: Path | None = None) -> Path: root = (cwd or Path.cwd()).expanduser().resolve() report_dir = root / ".aworld" / "evaluations" @@ -34,9 +41,9 @@ def available_evaluator_suites(*, target: str | None = None) -> list[str]: if target is None: load_declared_eval_suites() return list_eval_suites() - target_path = Path(target).expanduser().resolve() + target_path = _resolve_cli_target_path(target) load_declared_eval_suites(target_path.parent if target_path.is_file() else target_path) - return list_matching_eval_suites(target) + return list_matching_eval_suites(target_path) def get_evaluator_suite_selection( @@ -44,7 +51,7 @@ def get_evaluator_suite_selection( target: str, suite: str | None = None, ) -> dict[str, str | None]: - target_path = Path(target).expanduser().resolve() + target_path = _resolve_cli_target_path(target) 
load_declared_eval_suites(target_path.parent if target_path.is_file() else target_path) selection = resolve_eval_suite_selection(suite, target_path) return { @@ -317,7 +324,7 @@ def run_evaluator_cli( output: str | None = None, interactive_approval: bool = False, ) -> dict: - target_path = Path(target).expanduser().resolve() + target_path = _resolve_cli_target_path(target) load_declared_eval_suites(target_path.parent if target_path.is_file() else target_path) selection = resolve_eval_suite_selection(suite, target_path) suite_def = selection.suite diff --git a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py index 84a34c202..77a0b03a3 100644 --- a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py +++ b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py @@ -59,28 +59,36 @@ def run(self, args, context) -> int: return 0 if getattr(args, "list_suites", False): - if getattr(args, "target", None): - print("Available evaluator suites for target:") - suite_names = available_evaluator_suites(target=args.target) - else: - print("Available evaluator suites:") - suite_names = available_evaluator_suites() - for suite_name in suite_names: - print(f" - {suite_name}") - if getattr(args, "target", None) and suite_names: - selection = get_evaluator_suite_selection(target=args.target, suite=args.suite) - print(f"Default suite: {selection['resolved']}") + try: + if getattr(args, "target", None): + print("Available evaluator suites for target:") + suite_names = available_evaluator_suites(target=args.target) + else: + print("Available evaluator suites:") + suite_names = available_evaluator_suites() + for suite_name in suite_names: + print(f" - {suite_name}") + if getattr(args, "target", None) and suite_names: + selection = get_evaluator_suite_selection(target=args.target, suite=args.suite) + print(f"Default suite: {selection['resolved']}") + except (FileNotFoundError, ValueError, KeyError) 
as exc: + print(f"Evaluator error: {exc}") + return 1 return 0 if not getattr(args, "target", None): print("❌ --target is required unless --list-suites is used") return 1 - report = run_evaluator_cli( - target=args.target, - suite=args.suite, - output=args.output, - interactive_approval=args.interactive_approval, - ) + try: + report = run_evaluator_cli( + target=args.target, + suite=args.suite, + output=args.output, + interactive_approval=args.interactive_approval, + ) + except (FileNotFoundError, ValueError, KeyError) as exc: + print(f"Evaluator error: {exc}") + return 1 print(render_evaluator_summary(report)) return evaluator_exit_code(report) diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py index 4d8af1935..63c2a7052 100644 --- a/aworld/evaluations/substrate.py +++ b/aworld/evaluations/substrate.py @@ -250,6 +250,7 @@ class EvalSuiteRegistration: factory: EvalSuiteFactory matcher: EvalSuiteMatcher | None = None priority: int = 0 + workspace_root: str | None = None def matches(self, target: dict[str, Any]) -> bool: if self.matcher is None: @@ -265,9 +266,35 @@ class EvalSuiteSelection: mode: str -_EVAL_SUITE_REGISTRY: dict[str, EvalSuiteRegistration] = {} +_EVAL_SUITE_REGISTRY: dict[tuple[str | None, str], EvalSuiteRegistration] = {} _LOADED_EVAL_MANIFEST_PATHS: set[str] = set() -_DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE: dict[str, set[str]] = {} +_DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE: dict[str, set[tuple[str | None, str]]] = {} +_BUILTIN_EVAL_SUITE_IDS = {"app-evaluator"} + + +def _eval_suite_registry_key(suite_id: str, workspace_root: str | None = None) -> tuple[str | None, str]: + return workspace_root, suite_id + + +def _target_workspace_root(target: Mapping[str, Any]) -> str | None: + target_path = target.get("target_path") + if target_path is None: + return None + path = Path(str(target_path)).expanduser().resolve() + target_kind = target.get("target_kind") + if target_kind in {"file", "image"}: + return str(path.parent) + return 
str(path) + + +def _visible_eval_suite_registrations(target: Mapping[str, Any]) -> list[EvalSuiteRegistration]: + workspace_root = _target_workspace_root(target) + visible: list[EvalSuiteRegistration] = [] + for registration in _EVAL_SUITE_REGISTRY.values(): + if registration.workspace_root is not None and registration.workspace_root != workspace_root: + continue + visible.append(registration) + return visible def register_eval_suite( @@ -276,17 +303,19 @@ def register_eval_suite( *, matcher: EvalSuiteMatcher | None = None, priority: int = 0, + workspace_root: str | None = None, ) -> None: - _EVAL_SUITE_REGISTRY[suite_id] = EvalSuiteRegistration( + _EVAL_SUITE_REGISTRY[_eval_suite_registry_key(suite_id, workspace_root)] = EvalSuiteRegistration( suite_id=suite_id, factory=factory, matcher=matcher, priority=priority, + workspace_root=workspace_root, ) def list_eval_suites() -> list[str]: - return sorted(_EVAL_SUITE_REGISTRY) + return sorted({registration.suite_id for registration in _EVAL_SUITE_REGISTRY.values()}) def _build_declared_eval_suite(manifest: Mapping[str, Any]) -> EvalSuiteDef: @@ -298,6 +327,8 @@ def _build_declared_eval_suite(manifest: Mapping[str, Any]) -> EvalSuiteDef: suite_id = str(manifest.get("suite_id") or "").strip() if not suite_id: raise ValueError("suite_id is required") + if suite_id in _BUILTIN_EVAL_SUITE_IDS: + raise ValueError(f"reserved suite_id: {suite_id}") gate_manifest = manifest.get("gate_policy") or {} if gate_manifest: @@ -333,20 +364,25 @@ def load_declared_eval_suites(workspace: str | Path | None = None) -> list[str]: return [] loaded: list[str] = [] - current_suite_ids: set[str] = set() + current_suite_ids: set[tuple[str | None, str]] = set() + seen_suite_ids: set[str] = set() for manifest_path in sorted(manifest_dir.glob("*.json")): manifest_key = str(manifest_path.resolve()) manifest = json.loads(manifest_path.read_text(encoding="utf-8")) suite = _build_declared_eval_suite(manifest) + if suite.suite_id in seen_suite_ids: + 
raise ValueError(f"duplicate suite_id in workspace manifests: {suite.suite_id}") + seen_suite_ids.add(suite.suite_id) target_kinds = tuple(str(kind) for kind in (manifest.get("target_kinds") or ["file", "directory", "image"])) register_eval_suite( suite.suite_id, lambda target, _suite=suite: _suite, matcher=lambda target, _target_kinds=target_kinds: target.get("target_kind") in _target_kinds, priority=int(manifest.get("priority", 100)), + workspace_root=workspace_key, ) _LOADED_EVAL_MANIFEST_PATHS.add(manifest_key) - current_suite_ids.add(suite.suite_id) + current_suite_ids.add(_eval_suite_registry_key(suite.suite_id, workspace_key)) loaded.append(suite.suite_id) for removed_suite_id in previous_suite_ids - current_suite_ids: _EVAL_SUITE_REGISTRY.pop(removed_suite_id, None) @@ -882,21 +918,28 @@ def _build_eval_suite_case(target_info: dict[str, Any]) -> EvalCaseDef: def list_matching_eval_suites(target: str | Path | Mapping[str, Any]) -> list[str]: target_info = describe_eval_target(target) - candidates = [registration for registration in _EVAL_SUITE_REGISTRY.values() if registration.matches(target_info)] + candidates = [registration for registration in _visible_eval_suite_registrations(target_info) if registration.matches(target_info)] return [registration.suite_id for registration in _sorted_eval_suite_registrations(candidates)] def resolve_eval_suite_selection(name: str | None, target: str | Path | Mapping[str, Any]) -> EvalSuiteSelection: target_info = describe_eval_target(target) if name is not None: - registration = _EVAL_SUITE_REGISTRY.get(name) - if registration is None: + candidates = [ + registration + for registration in _visible_eval_suite_registrations(target_info) + if registration.suite_id == name + ] + if not candidates: raise KeyError(name) + registration = _sorted_eval_suite_registrations(candidates)[0] if not registration.matches(target_info): raise ValueError(f"suite '{name}' does not support target kind '{target_info.get('target_kind')}'") mode 
= "explicit" else: - candidates = [registration for registration in _EVAL_SUITE_REGISTRY.values() if registration.matches(target_info)] + candidates = [ + registration for registration in _visible_eval_suite_registrations(target_info) if registration.matches(target_info) + ] if not candidates: raise KeyError(f"no evaluation suite matches target {target_info.get('target_path')}") registration = _sorted_eval_suite_registrations(candidates)[0] diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index 3b16f7367..a24268fb0 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -24,6 +24,7 @@ def _reset_eval_registry_state(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) monkeypatch.setattr(substrate_module, "_LOADED_EVAL_MANIFEST_PATHS", set()) + monkeypatch.setattr(substrate_module, "_DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE", {}) substrate_module.register_eval_suite( "app-evaluator", lambda target: substrate_module.get_builtin_eval_suite("app-evaluator"), @@ -259,6 +260,15 @@ async def fake_run_evaluation_flow(flow): assert report["automation"]["suggested_exit_code"] == 3 +def test_run_evaluator_cli_rejects_missing_target( + tmp_path: Path, +) -> None: + missing = tmp_path / "missing.txt" + + with pytest.raises(FileNotFoundError, match="does not exist"): + run_evaluator_cli(target=str(missing)) + + def test_evaluator_exit_code_matches_gate_and_approval() -> None: assert evaluator_exit_code({"gate": {"status": "pass"}, "approval": {}}) == 0 assert evaluator_exit_code({"gate": {"status": "fail"}, "approval": {}}) == 2 diff --git a/tests/core/test_evaluator_top_level_command.py b/tests/core/test_evaluator_top_level_command.py index 82e143927..d7704fef5 100644 --- a/tests/core/test_evaluator_top_level_command.py +++ b/tests/core/test_evaluator_top_level_command.py @@ -18,6 +18,7 @@ def _reset_eval_registry_state(monkeypatch: 
pytest.MonkeyPatch) -> None: monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) monkeypatch.setattr(substrate_module, "_LOADED_EVAL_MANIFEST_PATHS", set()) + monkeypatch.setattr(substrate_module, "_DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE", {}) substrate_module.register_eval_suite( "app-evaluator", lambda target: substrate_module.get_builtin_eval_suite("app-evaluator"), @@ -309,3 +310,26 @@ def test_evaluator_command_returns_usage_error_without_target( assert exit_code == 1 assert "--target is required" in output + + +def test_evaluator_command_returns_nonzero_for_missing_target( + capsys: pytest.CaptureFixture[str], + tmp_path: Path, +) -> None: + missing = tmp_path / "missing.txt" + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=str(missing), + suite=None, + output=None, + interactive_approval=False, + list_suites=False, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + + assert exit_code == 1 + assert "does not exist" in output diff --git a/tests/evaluations/test_evaluation_substrate.py b/tests/evaluations/test_evaluation_substrate.py index 8b484d292..7660ef15d 100644 --- a/tests/evaluations/test_evaluation_substrate.py +++ b/tests/evaluations/test_evaluation_substrate.py @@ -30,6 +30,7 @@ def _reset_eval_registry_state(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) monkeypatch.setattr(substrate_module, "_LOADED_EVAL_MANIFEST_PATHS", set()) + monkeypatch.setattr(substrate_module, "_DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE", {}) substrate_module.register_eval_suite( "app-evaluator", lambda target: get_builtin_eval_suite("app-evaluator"), @@ -369,6 +370,66 @@ def test_load_declared_eval_suites_removes_deleted_manifest_registration( assert "strict-ui" not in list_eval_suites() +def test_declared_eval_suites_are_resolved_per_workspace( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + workspace_a = tmp_path / "a" + workspace_b 
= tmp_path / "b" + for workspace, threshold in ((workspace_a, 0.91), (workspace_b, 0.99)): + manifest_dir = workspace / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + (manifest_dir / "strict-ui.json").write_text( + f""" +{{ + "suite_id": "strict-ui", + "base_suite": "app-evaluator", + "target_kinds": ["file"], + "gate_policy": {{ + "metric_name": "score", + "pass_threshold": {threshold}, + "approval_threshold": 0.8 + }} +}} +""".strip(), + encoding="utf-8", + ) + + monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) + + load_declared_eval_suites(workspace_a) + load_declared_eval_suites(workspace_b) + + selection_a = resolve_eval_suite_selection("strict-ui", workspace_a / "artifact.txt") + selection_b = resolve_eval_suite_selection("strict-ui", workspace_b / "artifact.txt") + + assert selection_a.suite.gate_policy.pass_threshold == pytest.approx(0.91) + assert selection_b.suite.gate_policy.pass_threshold == pytest.approx(0.99) + + +def test_load_declared_eval_suites_rejects_builtin_suite_id_override( + monkeypatch: pytest.MonkeyPatch, + tmp_path, +) -> None: + manifest_dir = tmp_path / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + (manifest_dir / "override.json").write_text( + """ +{ + "suite_id": "app-evaluator", + "base_suite": "app-evaluator", + "target_kinds": ["file"] +} +""".strip(), + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="reserved suite_id"): + load_declared_eval_suites(tmp_path) + + assert list_eval_suites() == ["app-evaluator"] + + @pytest.mark.asyncio async def test_agent_judge_backend_parses_app_evaluator_json_payload() -> None: async def fake_executor(prompt: str, system_prompt: str): From 834efc13049ef2f8645d3013d811750b39d16877 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Tue, 9 Jun 2026 13:58:12 +0800 Subject: [PATCH 15/41] feat: add execution-backed evaluator substrate --- .../evaluator_cli/.aworld-plugin/plugin.json | 19 + .../builtin_plugins/evaluator_cli/__init__.py | 1 + 
.../evaluator_cli/cli_commands/evaluator.py | 5 + .../src/aworld_cli/evaluator_rendering.py | 23 ++ .../src/aworld_cli/evaluator_runtime.py | 384 ++++++------------ .../src/aworld_cli/evaluator_workspace.py | 41 ++ .../aworld_cli/top_level_commands/__init__.py | 4 +- aworld/evaluations/eval_targets/agent_eval.py | 20 +- aworld/evaluations/execution.py | 119 ++++++ aworld/evaluations/manifests.py | 66 +++ aworld/evaluations/report.py | 218 ++++++++++ .../evaluations/scorers/state_extractors.py | 62 +++ aworld/evaluations/scorers/suite_judge.py | 4 + .../scorers/trajectory_validators.py | 4 + aworld/evaluations/substrate.py | 117 +++++- tests/core/test_evaluator_runtime.py | 28 ++ .../evaluations/test_evaluation_substrate.py | 52 +++ tests/evaluations/test_execution_state.py | 43 ++ tests/plugins/test_plugin_hooks.py | 88 ++++ tests/test_plugin_cli_entrypoint.py | 11 + 20 files changed, 1016 insertions(+), 293 deletions(-) create mode 100644 aworld-cli/src/aworld_cli/builtin_plugins/evaluator_cli/.aworld-plugin/plugin.json create mode 100644 aworld-cli/src/aworld_cli/builtin_plugins/evaluator_cli/__init__.py create mode 100644 aworld-cli/src/aworld_cli/builtin_plugins/evaluator_cli/cli_commands/evaluator.py create mode 100644 aworld-cli/src/aworld_cli/evaluator_rendering.py create mode 100644 aworld-cli/src/aworld_cli/evaluator_workspace.py create mode 100644 aworld/evaluations/execution.py create mode 100644 aworld/evaluations/manifests.py create mode 100644 aworld/evaluations/report.py create mode 100644 aworld/evaluations/scorers/state_extractors.py create mode 100644 tests/evaluations/test_execution_state.py diff --git a/aworld-cli/src/aworld_cli/builtin_plugins/evaluator_cli/.aworld-plugin/plugin.json b/aworld-cli/src/aworld_cli/builtin_plugins/evaluator_cli/.aworld-plugin/plugin.json new file mode 100644 index 000000000..905f9022b --- /dev/null +++ b/aworld-cli/src/aworld_cli/builtin_plugins/evaluator_cli/.aworld-plugin/plugin.json @@ -0,0 +1,19 @@ +{ + "id": 
"aworld-evaluator-cli", + "name": "aworld-evaluator-cli", + "version": "1.0.0", + "entrypoints": { + "cli_commands": [ + { + "id": "evaluator", + "name": "evaluator", + "target": "cli_commands/evaluator.py", + "scope": "workspace", + "visibility": "public", + "metadata": { + "factory": "build_command" + } + } + ] + } +} diff --git a/aworld-cli/src/aworld_cli/builtin_plugins/evaluator_cli/__init__.py b/aworld-cli/src/aworld_cli/builtin_plugins/evaluator_cli/__init__.py new file mode 100644 index 000000000..19af4f24a --- /dev/null +++ b/aworld-cli/src/aworld_cli/builtin_plugins/evaluator_cli/__init__.py @@ -0,0 +1 @@ +"""Built-in framework plugin providing the `evaluator` top-level CLI command.""" diff --git a/aworld-cli/src/aworld_cli/builtin_plugins/evaluator_cli/cli_commands/evaluator.py b/aworld-cli/src/aworld_cli/builtin_plugins/evaluator_cli/cli_commands/evaluator.py new file mode 100644 index 000000000..5ae07a71f --- /dev/null +++ b/aworld-cli/src/aworld_cli/builtin_plugins/evaluator_cli/cli_commands/evaluator.py @@ -0,0 +1,5 @@ +from aworld_cli.top_level_commands.evaluator_cmd import EvaluatorTopLevelCommand + + +def build_command(): + return EvaluatorTopLevelCommand() diff --git a/aworld-cli/src/aworld_cli/evaluator_rendering.py b/aworld-cli/src/aworld_cli/evaluator_rendering.py new file mode 100644 index 000000000..a44176241 --- /dev/null +++ b/aworld-cli/src/aworld_cli/evaluator_rendering.py @@ -0,0 +1,23 @@ +from __future__ import annotations + + +def render_evaluator_summary(report: dict, *, summary_suffix: str | None = None) -> str: + suite_id = report.get("suite_id", "unknown-suite") + gate = report.get("gate", {}) + status = gate.get("status", "unknown") + metric_value = gate.get("value") + summary_line = f"Evaluator suite: {suite_id}\nGate: {status}" + if metric_value is not None: + summary_line += f" ({metric_value:.2f})" + selection = report.get("suite_selection") or {} + if selection.get("resolved"): + summary_line += f"\nSuite selection: 
{selection.get('mode', 'unknown')} -> {selection['resolved']}" + backend = report.get("judge_backend", {}).get("backend_id") + if backend: + summary_line += f"\nJudge backend: {backend}" + report_path = report.get("report_path") + if report_path: + summary_line += f"\nReport: {report_path}" + if summary_suffix: + summary_line += f"\n{summary_suffix}" + return summary_line diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index 7725e3d66..57bf92acf 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -4,30 +4,35 @@ import json from pathlib import Path -from aworld.evaluations.substrate import ( +from aworld.plugins.discovery import discover_plugins +from aworld.evaluations.manifests import ( + get_declared_eval_suite_schema as _get_declared_eval_suite_schema, +) +from aworld.evaluations.report import ( EVALUATOR_REPORT_FORMAT_ID, EVALUATOR_REPORT_FORMAT_VERSION, + get_evaluator_report_schema as _get_evaluator_report_schema, + validate_evaluator_report as _validate_evaluator_report, +) +from aworld.evaluations.substrate import ( EvaluationFlowDef, describe_eval_target, - list_eval_suites, - list_matching_eval_suites, - load_declared_eval_suites, - resolve_eval_suite_selection, run_evaluation_flow, ) +from aworld_cli.core.plugin_manager import PluginManager, get_builtin_plugin_roots +from aworld_cli.evaluator_rendering import render_evaluator_summary as _render_evaluator_summary +from aworld_cli.evaluator_workspace import ( + discover_workspace_suites, + resolve_cli_target_path, + resolve_workspace_suite_selection, +) +from aworld_cli.plugin_capabilities.hooks import PluginHookResult, load_plugin_hooks def _sanitize_path_token(value: str) -> str: return "".join(ch if ch.isalnum() or ch in {"-", "_", "."} else "-" for ch in value).strip("-") or "target" -def _resolve_cli_target_path(target: str) -> Path: - target_path = 
Path(target).expanduser().resolve() - if not target_path.exists(): - raise FileNotFoundError(f"evaluation target does not exist: {target_path}") - return target_path - - def default_evaluator_report_path(*, target_path: Path, suite_id: str, cwd: Path | None = None) -> Path: root = (cwd or Path.cwd()).expanduser().resolve() report_dir = root / ".aworld" / "evaluations" @@ -38,12 +43,26 @@ def default_evaluator_report_path(*, target_path: Path, suite_id: str, cwd: Path def available_evaluator_suites(*, target: str | None = None) -> list[str]: - if target is None: - load_declared_eval_suites() - return list_eval_suites() - target_path = _resolve_cli_target_path(target) - load_declared_eval_suites(target_path.parent if target_path.is_file() else target_path) - return list_matching_eval_suites(target_path) + hooks = _load_evaluator_hooks() + target_path = resolve_cli_target_path(target) if target is not None else None + workspace_path = str((target_path.parent if target_path and target_path.is_file() else target_path) or Path.cwd()) + hook_state = _run_evaluator_hooks( + hooks, + "evaluator.pre_discover", + event={"target": target, "workspace_path": workspace_path}, + state={"target": target, "workspace_path": workspace_path}, + ) + suites = discover_workspace_suites(target=target) + hook_state = _run_evaluator_hooks( + hooks, + "evaluator.post_discover", + event={"target": target, "workspace_path": workspace_path, "suite_names": suites}, + state={**hook_state, "suite_names": suites}, + ) + overridden = hook_state.get("suite_names") + if isinstance(overridden, list): + return [str(item) for item in overridden] + return suites def get_evaluator_suite_selection( @@ -51,14 +70,7 @@ def get_evaluator_suite_selection( target: str, suite: str | None = None, ) -> dict[str, str | None]: - target_path = _resolve_cli_target_path(target) - load_declared_eval_suites(target_path.parent if target_path.is_file() else target_path) - selection = resolve_eval_suite_selection(suite, 
target_path) - return { - "requested": suite, - "resolved": selection.suite_id, - "mode": selection.mode, - } + return resolve_workspace_suite_selection(target=target, suite=suite) def evaluator_exit_code(report: dict) -> int: @@ -89,232 +101,52 @@ def _build_automation_summary(report: dict) -> dict[str, object]: def get_declared_evaluator_suite_schema() -> dict[str, object]: - return { - "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://schemas.aworld.dev/evaluator/declared-suite/v1.json", - "title": "AWorld Declared Evaluator Suite", - "type": "object", - "required": ["suite_id", "base_suite"], - "properties": { - "suite_id": { - "type": "string", - "minLength": 1, - "description": "Unique suite identifier exposed through aworld-cli evaluator.", - }, - "base_suite": { - "type": "string", - "const": "app-evaluator", - "description": "Builtin evaluator suite used as the declaration base.", - }, - "target_kinds": { - "type": "array", - "items": { - "type": "string", - "enum": ["file", "directory", "image"], - }, - "minItems": 1, - "uniqueItems": True, - "description": "Optional target kinds matched by this declared suite.", - }, - "gate_policy": { - "type": "object", - "properties": { - "metric_name": {"type": "string"}, - "pass_threshold": {"type": "number"}, - "approval_threshold": {"type": ["number", "null"]}, - }, - "additionalProperties": False, - "description": "Optional gate override layered on top of the base suite defaults.", - }, - "metadata": { - "type": "object", - "description": "Optional suite metadata copied into the resolved suite definition.", - }, - "priority": { - "type": "integer", - "description": "Optional suite selection priority. 
Larger values win automatic selection.", - }, - }, - "additionalProperties": False, - } + return _get_declared_eval_suite_schema() def get_evaluator_report_schema() -> dict[str, object]: - return { - "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": f"https://schemas.aworld.dev/evaluator/report/v{EVALUATOR_REPORT_FORMAT_VERSION}.json", - "title": "AWorld Evaluator Report", - "type": "object", - "$defs": { - "evalStatus": { - "type": "string", - "enum": ["PASSED", "FAILED", "NOT_EVALUATED"], - }, - "metricScalar": { - "oneOf": [ - {"type": "number"}, - {"type": "boolean"}, - ] - }, - "metricAggregate": { - "type": "object", - "properties": { - "mean": {"type": "number"}, - "min": {"type": "number"}, - "max": {"type": "number"}, - "std": {"type": "number"}, - "true_count": {"type": "integer", "minimum": 0}, - "true_rate": {"type": "number", "minimum": 0, "maximum": 1}, - "value": {"$ref": "#/$defs/metricScalar"}, - "eval_status": {"$ref": "#/$defs/evalStatus"}, - }, - "additionalProperties": { - "oneOf": [ - {"type": "number"}, - {"type": "boolean"}, - {"type": "string"}, - {"$ref": "#/$defs/metricAggregate"}, - ] - }, - }, - "caseMetric": { - "type": "object", - "properties": { - "value": {"$ref": "#/$defs/metricScalar"}, - "status": {"$ref": "#/$defs/evalStatus"}, - }, - "required": ["value"], - "additionalProperties": False, - }, - "gateDecision": { - "type": "object", - "required": ["status", "metric_name", "value"], - "properties": { - "status": { - "type": "string", - "enum": ["pass", "fail", "needs_approval"], - }, - "metric_name": {"type": "string"}, - "value": {"type": "number"}, - }, - "additionalProperties": False, - }, - "automationSummary": { - "type": "object", - "required": [ - "gate_status", - "metric_name", - "metric_value", - "approval_required", - "approval_resolved", - "approved", - "suggested_exit_code", - "case_count", - "judge_backend", - ], - "properties": { - "gate_status": { - "type": ["string", "null"], - "enum": ["pass", 
"fail", "needs_approval", None], - }, - "metric_name": {"type": ["string", "null"]}, - "metric_value": {"type": ["number", "null"]}, - "approval_required": {"type": "boolean"}, - "approval_resolved": {"type": "boolean"}, - "approved": {"type": ["boolean", "null"]}, - "suggested_exit_code": {"type": "integer", "enum": [0, 2, 3]}, - "case_count": {"type": "integer", "minimum": 0}, - "judge_backend": {"type": ["string", "null"]}, - }, - "additionalProperties": False, - }, - }, - "required": [ - "report_version", - "report_format", - "generated_at", - "suite_id", - "target", - "summary", - "metrics", - "results", - "result_counts", - "approval", - ], - "properties": { - "report_version": {"type": "integer", "const": EVALUATOR_REPORT_FORMAT_VERSION}, - "report_format": { - "type": "object", - "required": ["id", "version"], - "properties": { - "id": {"type": "string", "const": EVALUATOR_REPORT_FORMAT_ID}, - "version": {"type": "integer", "const": EVALUATOR_REPORT_FORMAT_VERSION}, - }, - "additionalProperties": False, - }, - "generated_at": {"type": "string", "format": "date-time"}, - "suite_id": {"type": "string"}, - "target": {"type": "object"}, - "summary": {"type": "object"}, - "metrics": { - "type": "object", - "additionalProperties": {"$ref": "#/$defs/metricAggregate"}, - }, - "results": { - "type": "array", - "items": { - "type": "object", - "required": ["case_id", "input", "metrics", "judge"], - "properties": { - "case_id": {"type": "string"}, - "input": {"type": "object"}, - "metrics": { - "type": "object", - "additionalProperties": {"$ref": "#/$defs/caseMetric"}, - }, - "judge": {"type": "object"}, - "judge_backend": { - "type": ["object", "null"], - "properties": { - "backend_id": {"type": "string"}, - }, - "required": ["backend_id"], - "additionalProperties": False, - }, - }, - "additionalProperties": True, - }, - }, - "result_counts": { - "type": "object", - "required": ["cases_total", "cases_with_metrics", "cases_with_judge"], - "properties": { - 
"cases_total": {"type": "integer", "minimum": 0}, - "cases_with_metrics": {"type": "integer", "minimum": 0}, - "cases_with_judge": {"type": "integer", "minimum": 0}, - }, - "additionalProperties": False, - }, - "gate": {"$ref": "#/$defs/gateDecision"}, - "approval": {"type": "object"}, - "judge_backend": {"type": "object"}, - "suite_selection": {"type": "object"}, - "automation": {"$ref": "#/$defs/automationSummary"}, - "report_path": {"type": "string"}, - }, - "additionalProperties": True, - } + return _get_evaluator_report_schema() def validate_evaluator_report(report: dict) -> None: - import jsonschema + _validate_evaluator_report(report) + + +def _load_evaluator_hooks() -> dict[str, tuple[object, ...]]: + builtin_plugin_roots = tuple(Path(root).resolve() for root in get_builtin_plugin_roots()) + plugin_manager = PluginManager() + if hasattr(plugin_manager, "get_runtime_plugin_roots"): + plugin_roots = [Path(root).resolve() for root in plugin_manager.get_runtime_plugin_roots()] + else: + plugin_roots = list(builtin_plugin_roots) + return load_plugin_hooks(discover_plugins(plugin_roots)) + - try: - jsonschema.validate(instance=report, schema=get_evaluator_report_schema()) - except jsonschema.ValidationError as exc: - path = ".".join(str(part) for part in exc.absolute_path) - location = f" at '{path}'" if path else "" - raise ValueError(f"evaluator report validation failed{location}: {exc.message}") from exc +def _run_evaluator_hooks( + hooks: dict[str, tuple[object, ...]], + hook_point: str, + *, + event: dict[str, object], + state: dict[str, object], +) -> dict[str, object]: + """ + Evaluator hook contract: + - `evaluator.pre_discover` event payload: `target`, `workspace_path` + - `evaluator.post_discover` event payload: `target`, `workspace_path`, `suite_names` + - `evaluator.pre_run` event payload: `target`, `suite`, `workspace_path` + - `evaluator.post_run` event payload: `report`, `target`, `suite`, `workspace_path` + - `evaluator.render_summary` event 
payload: `report`, `workspace_path` + - mutable state: lightweight CLI assembly metadata only + - allowed side effects: report upload, notifications, summary augmentation + - hooks do not redefine framework execution, scoring, or gate semantics + """ + merged = dict(state) + for hook in hooks.get((hook_point or "").strip().lower(), ()): + result = asyncio.run(hook.run(event=event, state=merged)) + hook_result = result if isinstance(result, PluginHookResult) else PluginHookResult.from_payload(result) + if hook_result.metadata: + merged.update(dict(hook_result.metadata)) + return merged def run_evaluator_cli( @@ -324,11 +156,24 @@ def run_evaluator_cli( output: str | None = None, interactive_approval: bool = False, ) -> dict: - target_path = _resolve_cli_target_path(target) - load_declared_eval_suites(target_path.parent if target_path.is_file() else target_path) + hooks = _load_evaluator_hooks() + target_path = resolve_cli_target_path(target) + workspace_path = str(target_path.parent if target_path.is_file() else target_path) + suite_selection = resolve_workspace_suite_selection(target=target, suite=suite) + from aworld.evaluations.substrate import resolve_eval_suite_selection + selection = resolve_eval_suite_selection(suite, target_path) suite_def = selection.suite + hook_state = _run_evaluator_hooks( + hooks, + "evaluator.pre_run", + event={"target": str(target_path), "suite": suite_selection["resolved"], "workspace_path": workspace_path}, + state={"target": str(target_path), "suite": suite, "interactive_approval": interactive_approval}, + ) target_info = describe_eval_target(target_path) + for key, value in hook_state.items(): + if key not in {"target", "suite", "interactive_approval", "summary_suffix", "suite_names"}: + target_info[key] = value flow = EvaluationFlowDef( target=target_info, suite=suite_def, @@ -336,6 +181,8 @@ def run_evaluator_cli( output_path=output, ) report = asyncio.run(run_evaluation_flow(flow)) + if hasattr(report, "to_dict"): + report = 
report.to_dict() approval = dict(report.get("approval") or {}) approval.setdefault("required", report.get("gate", {}).get("status") == "needs_approval") approval.setdefault("resolved", False) @@ -345,11 +192,7 @@ def run_evaluator_cli( approval["resolved"] = True approval["approved"] = approved report["approval"] = approval - report["suite_selection"] = { - "requested": suite, - "resolved": selection.suite_id, - "mode": selection.mode, - } + report["suite_selection"] = suite_selection report["automation"] = _build_automation_summary(report) output_path = ( Path(output).expanduser().resolve() @@ -358,25 +201,28 @@ def run_evaluator_cli( ) output_path.parent.mkdir(parents=True, exist_ok=True) report["report_path"] = str(output_path) + _run_evaluator_hooks( + hooks, + "evaluator.post_run", + event={ + "report": report, + "target": str(target_path), + "suite": suite_selection["resolved"], + "workspace_path": workspace_path, + }, + state=hook_state, + ) output_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8") return report def render_evaluator_summary(report: dict) -> str: - suite_id = report.get("suite_id", "unknown-suite") - gate = report.get("gate", {}) - status = gate.get("status", "unknown") - metric_value = gate.get("value") - summary_line = f"Evaluator suite: {suite_id}\nGate: {status}" - if metric_value is not None: - summary_line += f" ({metric_value:.2f})" - selection = report.get("suite_selection") or {} - if selection.get("resolved"): - summary_line += f"\nSuite selection: {selection.get('mode', 'unknown')} -> {selection['resolved']}" - backend = report.get("judge_backend", {}).get("backend_id") - if backend: - summary_line += f"\nJudge backend: {backend}" - report_path = report.get("report_path") - if report_path: - summary_line += f"\nReport: {report_path}" - return summary_line + hooks = _load_evaluator_hooks() + workspace_path = str(Path(report.get("report_path", report.get("target", {}).get("target_path", 
Path.cwd()))).resolve().parent) + hook_state = _run_evaluator_hooks( + hooks, + "evaluator.render_summary", + event={"report": report, "workspace_path": workspace_path}, + state={"summary_suffix": None}, + ) + return _render_evaluator_summary(report, summary_suffix=hook_state.get("summary_suffix")) diff --git a/aworld-cli/src/aworld_cli/evaluator_workspace.py b/aworld-cli/src/aworld_cli/evaluator_workspace.py new file mode 100644 index 000000000..f9cb1551e --- /dev/null +++ b/aworld-cli/src/aworld_cli/evaluator_workspace.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from pathlib import Path + +from aworld.evaluations.substrate import ( + list_eval_suites, + list_matching_eval_suites, + load_declared_eval_suites, + resolve_eval_suite_selection, +) + + +def resolve_cli_target_path(target: str) -> Path: + target_path = Path(target).expanduser().resolve() + if not target_path.exists(): + raise FileNotFoundError(f"evaluation target does not exist: {target_path}") + return target_path + + +def discover_workspace_suites(target: str | None = None) -> list[str]: + if target is None: + load_declared_eval_suites() + return list_eval_suites() + target_path = resolve_cli_target_path(target) + load_declared_eval_suites(target_path.parent if target_path.is_file() else target_path) + return list_matching_eval_suites(target_path) + + +def resolve_workspace_suite_selection( + *, + target: str, + suite: str | None = None, +) -> dict[str, str | None]: + target_path = resolve_cli_target_path(target) + load_declared_eval_suites(target_path.parent if target_path.is_file() else target_path) + selection = resolve_eval_suite_selection(suite, target_path) + return { + "requested": suite, + "resolved": selection.suite_id, + "mode": selection.mode, + } diff --git a/aworld-cli/src/aworld_cli/top_level_commands/__init__.py b/aworld-cli/src/aworld_cli/top_level_commands/__init__.py index 06638e690..19cb4350f 100644 --- a/aworld-cli/src/aworld_cli/top_level_commands/__init__.py +++ 
b/aworld-cli/src/aworld_cli/top_level_commands/__init__.py @@ -1,6 +1,4 @@ from __future__ import annotations def register_builtin_top_level_commands(registry) -> None: - from .evaluator_cmd import EvaluatorTopLevelCommand - - registry.register(EvaluatorTopLevelCommand()) + return None diff --git a/aworld/evaluations/eval_targets/agent_eval.py b/aworld/evaluations/eval_targets/agent_eval.py index 8176f326a..4077a270b 100644 --- a/aworld/evaluations/eval_targets/agent_eval.py +++ b/aworld/evaluations/eval_targets/agent_eval.py @@ -2,6 +2,7 @@ from typing import Optional, Union from aworld.evaluations.base import EvalTarget, EvalDataCase +from aworld.evaluations.execution import normalize_task_response_to_eval_state from aworld.agents.llm_agent import Agent from aworld.config.conf import AgentConfig from aworld.runner import Runners @@ -73,7 +74,12 @@ async def predict(self, index: int, input: Union[EvalDataCase[dict], dict]) -> d query_column = self.eval_config.eval_dataset_query_column or self.query_column case_data = input.case_data if isinstance(input, EvalDataCase) else input response = await Runners.run(case_data[query_column], agent=self.agent) - return {"answer": response.answer} + state = normalize_task_response_to_eval_state( + case_id=getattr(input, "eval_case_id", str(index)), + response=response, + metadata=case_data, + ) + return {"answer": response.answer, "state": state.to_dict()} class AworldTaskEvalTarget(EvalTarget[dict]): @@ -96,8 +102,14 @@ async def predict(self, index: int, input: EvalDataCase[dict]) -> dict: task = await self.build_task(index, input) result = await Runners.run_task(task=task) if isinstance(result, TaskResponse): - return {"answer": result.answer} + payload = result if isinstance(result, dict): - return {"answer": result[task.id].answer} + payload = result[task.id] else: - return {"answer": result} + payload = result + state = normalize_task_response_to_eval_state( + case_id=getattr(input, "eval_case_id", str(index)), + 
response=payload, + metadata=input.case_data if isinstance(input, EvalDataCase) else {}, + ) + return {"answer": state.answer, "state": state.to_dict()} diff --git a/aworld/evaluations/execution.py b/aworld/evaluations/execution.py new file mode 100644 index 000000000..eb44b9e26 --- /dev/null +++ b/aworld/evaluations/execution.py @@ -0,0 +1,119 @@ +# coding: utf-8 +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Mapping + +from aworld.core.task import TaskResponse + + +class EvalExecutionMode(str, Enum): + STATIC = "static" + AGENT = "agent" + TASK = "task" + + +@dataclass(frozen=True) +class EvalExecutionSpec: + mode: EvalExecutionMode = EvalExecutionMode.STATIC + target_ref: str | None = None + target_config: dict[str, Any] = field(default_factory=dict) + query_column: str | None = None + task_builder_ref: str | None = None + runner_method: str | None = None + timeout_seconds: float | None = None + metadata: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class EvalState: + case_id: str + status: str + answer: Any | None = None + completion: list[Any] = field(default_factory=list) + artifacts: dict[str, Any] = field(default_factory=dict) + trajectory: list[dict[str, Any]] = field(default_factory=list) + tool_calls: list[dict[str, Any]] = field(default_factory=list) + usage: dict[str, Any] = field(default_factory=dict) + timing: dict[str, Any] = field(default_factory=dict) + error: dict[str, Any] | None = None + raw_response: dict[str, Any] = field(default_factory=dict) + metadata: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return { + "case_id": self.case_id, + "status": self.status, + "answer": self.answer, + "completion": self.completion, + "artifacts": self.artifacts, + "trajectory": self.trajectory, + "tool_calls": self.tool_calls, + "usage": self.usage, + "timing": self.timing, + "error": self.error, + 
"raw_response": self.raw_response, + "metadata": self.metadata, + } + + +def _extract_tool_calls_from_trajectory(trajectory: list[dict[str, Any]]) -> list[dict[str, Any]]: + calls: list[dict[str, Any]] = [] + for step in trajectory: + if not isinstance(step, Mapping): + continue + if isinstance(step.get("tool_calls"), list): + calls.extend([dict(call) for call in step["tool_calls"] if isinstance(call, Mapping)]) + action = step.get("action") + if isinstance(action, Mapping) and isinstance(action.get("tool_calls"), list): + calls.extend([dict(call) for call in action["tool_calls"] if isinstance(call, Mapping)]) + return calls + + +def normalize_task_response_to_eval_state( + *, + case_id: str, + response: Any, + target: Mapping[str, Any] | None = None, + metadata: Mapping[str, Any] | None = None, +) -> EvalState: + if isinstance(response, TaskResponse): + trajectory = list(response.trajectory or []) + return EvalState( + case_id=case_id, + status="success" if response.success else "failed", + answer=response.answer, + completion=[] if response.answer is None else [response.answer], + trajectory=trajectory, + tool_calls=_extract_tool_calls_from_trajectory(trajectory), + usage=dict(response.usage or {}), + timing={"time_cost": response.time_cost}, + raw_response=response.to_dict(), + metadata={**dict(metadata or {}), "_target": dict(target or {})}, + ) + + if isinstance(response, Mapping): + trajectory = list(response.get("trajectory") or []) + return EvalState( + case_id=case_id, + status=str(response.get("status", "success")), + answer=response.get("answer"), + completion=list(response.get("completion") or ([] if response.get("answer") is None else [response.get("answer")])), + artifacts=dict(response.get("artifacts") or {}), + trajectory=trajectory, + tool_calls=list(response.get("tool_calls") or _extract_tool_calls_from_trajectory(trajectory)), + usage=dict(response.get("usage") or {}), + timing=dict(response.get("timing") or {}), + 
error=dict(response.get("error")) if isinstance(response.get("error"), Mapping) else response.get("error"), + raw_response=dict(response), + metadata={**dict(metadata or {}), "_target": dict(target or {})}, + ) + + return EvalState( + case_id=case_id, + status="success", + answer=response, + completion=[] if response is None else [response], + metadata={**dict(metadata or {}), "_target": dict(target or {})}, + ) diff --git a/aworld/evaluations/manifests.py b/aworld/evaluations/manifests.py new file mode 100644 index 000000000..119f639a8 --- /dev/null +++ b/aworld/evaluations/manifests.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import annotations + +from typing import Any + + +def get_declared_eval_suite_schema() -> dict[str, object]: + return { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://schemas.aworld.dev/evaluator/declared-suite/v1.json", + "title": "AWorld Declared Evaluator Suite", + "type": "object", + "required": ["suite_id", "base_suite"], + "properties": { + "suite_id": { + "type": "string", + "minLength": 1, + "description": "Unique suite identifier exposed through aworld-cli evaluator.", + }, + "base_suite": { + "type": "string", + "const": "app-evaluator", + "description": "Builtin evaluator suite used as the declaration base.", + }, + "target_kinds": { + "type": "array", + "items": { + "type": "string", + "enum": ["file", "directory", "image"], + }, + "minItems": 1, + "uniqueItems": True, + "description": "Optional target kinds matched by this declared suite.", + }, + "gate_policy": { + "type": "object", + "properties": { + "metric_name": {"type": "string"}, + "pass_threshold": {"type": "number"}, + "approval_threshold": {"type": ["number", "null"]}, + }, + "additionalProperties": False, + "description": "Optional gate override layered on top of the base suite defaults.", + }, + "metadata": { + "type": "object", + "description": "Optional suite metadata copied into the resolved suite definition.", + }, + 
"priority": { + "type": "integer", + "description": "Optional suite selection priority. Larger values win automatic selection.", + }, + }, + "additionalProperties": False, + } + + +def validate_declared_eval_suite_manifest(payload: dict[str, Any]) -> None: + import jsonschema + + try: + jsonschema.validate(instance=payload, schema=get_declared_eval_suite_schema()) + except jsonschema.ValidationError as exc: + path = ".".join(str(part) for part in exc.absolute_path) + location = f" at '{path}'" if path else "" + raise ValueError(f"declared evaluator suite validation failed{location}: {exc.message}") from exc diff --git a/aworld/evaluations/report.py b/aworld/evaluations/report.py new file mode 100644 index 000000000..4beb2d413 --- /dev/null +++ b/aworld/evaluations/report.py @@ -0,0 +1,218 @@ +# coding: utf-8 +from __future__ import annotations + +from typing import Any + + +EVALUATOR_REPORT_FORMAT_ID = "aworld.evaluator.report" +EVALUATOR_REPORT_FORMAT_VERSION = 1 + + +class CaseEvaluationReport(dict): + def __init__( + self, + *, + case_id: str, + input: dict[str, Any], + metrics: dict[str, Any], + judge: dict[str, Any], + judge_backend: dict[str, Any] | None = None, + state_summary: dict[str, Any] | None = None, + ) -> None: + payload = { + "case_id": case_id, + "input": input, + "metrics": metrics, + "judge": judge, + "judge_backend": judge_backend, + "state_summary": state_summary or {}, + } + super().__init__(payload) + + def to_dict(self) -> dict[str, Any]: + return dict(self) + + +class EvaluatorReport(dict): + def to_dict(self) -> dict[str, Any]: + payload = dict(self) + results = payload.get("results") or [] + payload["results"] = [item.to_dict() if hasattr(item, "to_dict") else dict(item) for item in results] + return payload + + +def get_evaluator_report_schema() -> dict[str, object]: + return { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": f"https://schemas.aworld.dev/evaluator/report/v{EVALUATOR_REPORT_FORMAT_VERSION}.json", + 
"title": "AWorld Evaluator Report", + "type": "object", + "$defs": { + "evalStatus": { + "type": "string", + "enum": ["PASSED", "FAILED", "NOT_EVALUATED"], + }, + "metricScalar": { + "oneOf": [ + {"type": "number"}, + {"type": "boolean"}, + ] + }, + "metricAggregate": { + "type": "object", + "properties": { + "mean": {"type": "number"}, + "min": {"type": "number"}, + "max": {"type": "number"}, + "std": {"type": "number"}, + "true_count": {"type": "integer", "minimum": 0}, + "true_rate": {"type": "number", "minimum": 0, "maximum": 1}, + "value": {"$ref": "#/$defs/metricScalar"}, + "eval_status": {"$ref": "#/$defs/evalStatus"}, + }, + "additionalProperties": { + "oneOf": [ + {"type": "number"}, + {"type": "boolean"}, + {"type": "string"}, + {"$ref": "#/$defs/metricAggregate"}, + ] + }, + }, + "caseMetric": { + "type": "object", + "properties": { + "value": {"$ref": "#/$defs/metricScalar"}, + "status": {"$ref": "#/$defs/evalStatus"}, + }, + "required": ["value"], + "additionalProperties": False, + }, + "gateDecision": { + "type": "object", + "required": ["status", "metric_name", "value"], + "properties": { + "status": { + "type": "string", + "enum": ["pass", "fail", "needs_approval"], + }, + "metric_name": {"type": "string"}, + "value": {"type": "number"}, + }, + "additionalProperties": False, + }, + "automationSummary": { + "type": "object", + "required": [ + "gate_status", + "metric_name", + "metric_value", + "approval_required", + "approval_resolved", + "approved", + "suggested_exit_code", + "case_count", + "judge_backend", + ], + "properties": { + "gate_status": { + "type": ["string", "null"], + "enum": ["pass", "fail", "needs_approval", None], + }, + "metric_name": {"type": ["string", "null"]}, + "metric_value": {"type": ["number", "null"]}, + "approval_required": {"type": "boolean"}, + "approval_resolved": {"type": "boolean"}, + "approved": {"type": ["boolean", "null"]}, + "suggested_exit_code": {"type": "integer", "enum": [0, 2, 3]}, + "case_count": {"type": 
"integer", "minimum": 0}, + "judge_backend": {"type": ["string", "null"]}, + }, + "additionalProperties": False, + }, + }, + "required": [ + "report_version", + "report_format", + "generated_at", + "suite_id", + "target", + "summary", + "metrics", + "results", + "result_counts", + "approval", + ], + "properties": { + "report_version": {"type": "integer", "const": EVALUATOR_REPORT_FORMAT_VERSION}, + "report_format": { + "type": "object", + "required": ["id", "version"], + "properties": { + "id": {"type": "string", "const": EVALUATOR_REPORT_FORMAT_ID}, + "version": {"type": "integer", "const": EVALUATOR_REPORT_FORMAT_VERSION}, + }, + "additionalProperties": False, + }, + "generated_at": {"type": "string", "format": "date-time"}, + "suite_id": {"type": "string"}, + "target": {"type": "object"}, + "summary": {"type": "object"}, + "metrics": { + "type": "object", + "additionalProperties": {"$ref": "#/$defs/metricAggregate"}, + }, + "results": { + "type": "array", + "items": { + "type": "object", + "required": ["case_id", "input", "metrics", "judge"], + "properties": { + "case_id": {"type": "string"}, + "input": {"type": "object"}, + "metrics": { + "type": "object", + "additionalProperties": {"$ref": "#/$defs/caseMetric"}, + }, + "judge": {"type": "object"}, + "judge_backend": { + "type": ["object", "null"], + "properties": {"backend_id": {"type": "string"}}, + "required": ["backend_id"], + "additionalProperties": False, + }, + "state_summary": {"type": "object"}, + }, + "additionalProperties": True, + }, + }, + "result_counts": { + "type": "object", + "required": ["cases_total", "cases_with_metrics", "cases_with_judge"], + "properties": { + "cases_total": {"type": "integer", "minimum": 0}, + "cases_with_metrics": {"type": "integer", "minimum": 0}, + "cases_with_judge": {"type": "integer", "minimum": 0}, + }, + "additionalProperties": False, + }, + "gate": {"$ref": "#/$defs/gateDecision"}, + "approval": {"type": "object"}, + "judge_backend": {"type": "object"}, + 
"suite_selection": {"type": "object"}, + "automation": {"$ref": "#/$defs/automationSummary"}, + "report_path": {"type": "string"}, + }, + "additionalProperties": True, + } + + +def validate_evaluator_report(report: dict[str, Any]) -> None: + import jsonschema + + try: + jsonschema.validate(instance=report, schema=get_evaluator_report_schema()) + except jsonschema.ValidationError as exc: + path = ".".join(str(part) for part in exc.absolute_path) + location = f" at '{path}'" if path else "" + raise ValueError(f"evaluator report validation failed{location}: {exc.message}") from exc diff --git a/aworld/evaluations/scorers/state_extractors.py b/aworld/evaluations/scorers/state_extractors.py new file mode 100644 index 000000000..98aaca170 --- /dev/null +++ b/aworld/evaluations/scorers/state_extractors.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import annotations + +from typing import Any, Mapping + + +def get_eval_state(output: Any) -> dict[str, Any]: + if isinstance(output, Mapping) and isinstance(output.get("state"), Mapping): + return dict(output["state"]) + if isinstance(output, Mapping): + return dict(output) + return {} + + +def get_answer(output: Any) -> Any: + state = get_eval_state(output) + if "answer" in state: + return state["answer"] + return None + + +def get_completion(output: Any) -> list[Any]: + state = get_eval_state(output) + return list(state.get("completion") or []) + + +def get_trajectory(output: Any) -> list[dict[str, Any]]: + state = get_eval_state(output) + if "trajectory" in state: + return list(state.get("trajectory") or []) + return [] + + +def get_messages_by_role(output: Any, role: str) -> list[dict[str, Any]]: + return [ + dict(message) + for message in get_trajectory(output) + if isinstance(message, Mapping) and message.get("role") == role + ] + + +def get_assistant_messages(output: Any) -> list[dict[str, Any]]: + completion = get_completion(output) + if completion and all(isinstance(item, Mapping) for item in completion): + 
return [dict(item) for item in completion] + return get_messages_by_role(output, "assistant") + + +def get_tool_calls(output: Any) -> list[dict[str, Any]]: + tool_calls: list[dict[str, Any]] = [] + for message in get_trajectory(output): + if not isinstance(message, Mapping): + continue + for call in message.get("tool_calls") or []: + if isinstance(call, Mapping): + tool_calls.append(dict(call)) + action = message.get("action") + if isinstance(action, Mapping): + for call in action.get("tool_calls") or []: + if isinstance(call, Mapping): + tool_calls.append(dict(call)) + return tool_calls diff --git a/aworld/evaluations/scorers/suite_judge.py b/aworld/evaluations/scorers/suite_judge.py index fb0852d58..78fabcd4c 100644 --- a/aworld/evaluations/scorers/suite_judge.py +++ b/aworld/evaluations/scorers/suite_judge.py @@ -4,6 +4,7 @@ from aworld.evaluations.base import EvalDataCase, MetricResult, ScorerResult from aworld.evaluations.scorers import scorer_register from aworld.evaluations.base import Scorer +from aworld.evaluations.scorers.state_extractors import get_eval_state @scorer_register("score") @@ -18,6 +19,9 @@ async def score(self, index: int, input: EvalDataCase[dict], output: dict) -> Sc case_input = dict(input.case_data) target = dict(case_input.get("_target", {})) + state = get_eval_state(output) + if state: + target = {**target, **state} execution = await self.suite.resolve_judge_backend().execute(case_input, target, self.suite) payload = dict(execution.payload) self.suite.judge_schema.validate(payload) diff --git a/aworld/evaluations/scorers/trajectory_validators.py b/aworld/evaluations/scorers/trajectory_validators.py index 1feda12b1..cc5b13a80 100644 --- a/aworld/evaluations/scorers/trajectory_validators.py +++ b/aworld/evaluations/scorers/trajectory_validators.py @@ -7,6 +7,7 @@ from aworld.evaluations.base import Scorer, ScorerResult, EvalStatus, MetricResult, EvalDataCase from aworld.evaluations.scorers import scorer_register from 
aworld.evaluations.scorers.base_validator import RuleScorer, LLMAsJudgeScorer +from aworld.evaluations.scorers.state_extractors import get_trajectory from aworld.evaluations.types import MetricNames from aworld.logs.util import logger @@ -70,6 +71,9 @@ class TrajectoryValidator(RuleScorer): def _parse_trajectory(self, output: Any) -> Dict: + trajectory = get_trajectory(output) + if trajectory: + return trajectory if isinstance(output, dict): if "trajectory" in output: output = output["trajectory"] diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py index 63c2a7052..e84569f04 100644 --- a/aworld/evaluations/substrate.py +++ b/aworld/evaluations/substrate.py @@ -3,6 +3,7 @@ import asyncio import base64 +import importlib import json import inspect import os @@ -16,6 +17,14 @@ from aworld.config.conf import EvaluationConfig from aworld.evaluations.base import EvalDataCase, EvalDataset from aworld.evaluations.base import NoActionEvalTarget +from aworld.evaluations.eval_targets.agent_eval import AworldAgentEvalTarget, AworldTaskEvalTarget +from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec +from aworld.evaluations.report import ( + CaseEvaluationReport, + EVALUATOR_REPORT_FORMAT_ID, + EVALUATOR_REPORT_FORMAT_VERSION, + EvaluatorReport, +) from aworld.runners.evaluate_runner import EvaluateRunner @@ -35,14 +44,13 @@ ".svg": "image/svg+xml", } -EVALUATOR_REPORT_FORMAT_ID = "aworld.evaluator.report" -EVALUATOR_REPORT_FORMAT_VERSION = 1 - - @dataclass(frozen=True) class EvalCaseDef: case_id: str input: dict[str, Any] + expected: Any | None = None + max_turns: int | None = None + timeout_seconds: float | None = None metadata: dict[str, Any] = field(default_factory=dict) @@ -203,8 +211,10 @@ async def execute(self, case_input: dict[str, Any], target: dict[str, Any], suit class EvalSuiteDef: suite_id: str cases: list[EvalCaseDef] = field(default_factory=list) + toolsets: tuple[str, ...] 
= tuple() judge_schema: JudgeSchemaDef = field(default_factory=JudgeSchemaDef) gate_policy: GatePolicyDef | None = None + execution: EvalExecutionSpec | None = None judge: JudgeCallable | None = None judge_backend: JudgeBackend | None = None metadata: dict[str, Any] = field(default_factory=dict) @@ -438,13 +448,46 @@ def build_eval_dataset(cases: list[EvalCaseDef], target: dict[str, Any]) -> Eval EvalDataCase( eval_case_id=case.case_id, eval_dataset_id=dataset_id, - case_data={**case.input, "_target": normalized_target, "_case_metadata": dict(case.metadata)}, + case_data={ + **case.input, + "_target": normalized_target, + "_case_metadata": dict(case.metadata), + "_expected": case.expected, + "_max_turns": case.max_turns, + "_timeout_seconds": case.timeout_seconds, + }, ) for case in cases ] return EvalDataset(eval_dataset_id=dataset_id, eval_dataset_name="suite_eval_dataset", eval_cases=eval_cases) +class _ConfiguredTaskEvalTarget(AworldTaskEvalTarget): + def __init__(self, *, target: dict[str, Any], execution: EvalExecutionSpec): + super().__init__() + self._target = dict(target) + self._execution = execution + + async def build_task(self, index: int, input: EvalDataCase[dict]): + builder = _load_callable(self._execution.task_builder_ref) + task = builder(index=index, input=input, target=self._target, execution=self._execution) + return await _maybe_await(task) + + +def _build_eval_target(flow: EvaluationFlowDef): + execution = flow.suite.execution + if execution is None or execution.mode == EvalExecutionMode.STATIC: + return NoActionEvalTarget() + if execution.mode == EvalExecutionMode.AGENT: + return AworldAgentEvalTarget( + agent_config=execution.target_config, + query_column=execution.query_column or "query", + ) + if execution.mode == EvalExecutionMode.TASK: + return _ConfiguredTaskEvalTarget(target=flow.target, execution=execution) + raise ValueError(f"unsupported execution mode: {execution.mode}") + + def compile_evaluation_flow(flow: EvaluationFlowDef) -> 
CompiledEvaluationPlan: normalized_target = _normalize_target(flow.target) dataset = build_eval_dataset(flow.suite.cases, normalized_target) @@ -459,7 +502,7 @@ def compile_evaluation_flow(flow: EvaluationFlowDef) -> CompiledEvaluationPlan: } eval_config = EvaluationConfig( eval_suite_id=flow.suite.suite_id, - eval_target=NoActionEvalTarget(), + eval_target=_build_eval_target(flow), eval_criterias=[eval_criteria], eval_dataset=dataset, ) @@ -493,7 +536,24 @@ def _format_report_timestamp(timestamp: float) -> str: return datetime.fromtimestamp(timestamp, tz=timezone.utc).isoformat().replace("+00:00", "Z") -async def run_evaluation_flow(flow: EvaluationFlowDef) -> dict[str, Any]: +def _build_state_summary(output: Mapping[str, Any] | Any) -> dict[str, Any]: + if not isinstance(output, Mapping): + return {} + state = output.get("state") if isinstance(output.get("state"), Mapping) else output + trajectory = state.get("trajectory") if isinstance(state, Mapping) else None + completion = state.get("completion") if isinstance(state, Mapping) else None + return { + "answer": state.get("answer") if isinstance(state, Mapping) else None, + "completion_count": len(completion or []) if isinstance(completion, list) else 0, + "trajectory_steps": len(trajectory or []) if isinstance(trajectory, list) else 0, + "tool_call_count": len(state.get("tool_calls") or []) if isinstance(state, Mapping) else 0, + "usage": dict(state.get("usage") or {}) if isinstance(state, Mapping) else {}, + "timing": dict(state.get("timing") or {}) if isinstance(state, Mapping) else {}, + "error": state.get("error") if isinstance(state, Mapping) else None, + } + + +async def run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport: compiled = compile_evaluation_flow(flow) eval_result = await EvaluateRunner(config=compiled.eval_config).run() @@ -507,7 +567,7 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> dict[str, Any]: ) gate = compiled.gate_policy.evaluate(gate_metrics) - results = [] + 
results: list[CaseEvaluationReport] = [] report_backend_id = None cases_with_metrics = 0 cases_with_judge = 0 @@ -537,17 +597,18 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> dict[str, Any]: if judge_payload: cases_with_judge += 1 results.append( - { - "case_id": case_result.eval_case_id, - "input": dict(case_result.input.case_data if hasattr(case_result.input, "case_data") else case_result.input), - "metrics": case_metrics, - "judge": judge_payload, - "judge_backend": {"backend_id": case_backend_id} if case_backend_id is not None else None, - } + CaseEvaluationReport( + case_id=case_result.eval_case_id, + input=dict(case_result.input.case_data if hasattr(case_result.input, "case_data") else case_result.input), + metrics=case_metrics, + judge=judge_payload, + judge_backend={"backend_id": case_backend_id} if case_backend_id is not None else None, + state_summary=_build_state_summary(case_result.output), + ) ) metrics = dict(suite_summary) - report = { + report = EvaluatorReport({ "report_version": 1, "report_format": { "id": EVALUATOR_REPORT_FORMAT_ID, @@ -569,7 +630,7 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> dict[str, Any]: "resolved": False, "approved": None, }, - } + }) if report_backend_id is not None: report["judge_backend"] = {"backend_id": report_backend_id} if gate is not None: @@ -688,6 +749,28 @@ async def _maybe_await_judge(judge: JudgeCallable, case_input: dict[str, Any], t return payload +async def _maybe_await(value: Any) -> Any: + if inspect.isawaitable(value): + return await value + return value + + +def _load_callable(ref: str | None) -> Callable[..., Any]: + if not ref: + raise ValueError("task execution mode requires task_builder_ref") + if ":" in ref: + module_path, attr_name = ref.split(":", 1) + elif "." 
in ref: + module_path, attr_name = ref.rsplit(".", 1) + else: + raise ValueError(f"invalid callable reference: {ref}") + module = importlib.import_module(module_path) + candidate = getattr(module, attr_name) + if not callable(candidate): + raise ValueError(f"callable reference is not callable: {ref}") + return candidate + + def _load_app_evaluator_skill_prompt() -> str: skill_path = Path(__file__).resolve().parents[2] / "aworld-skills" / "app_evaluator" / "SKILL.md" return skill_path.read_text(encoding="utf-8") diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index a24268fb0..c265676ad 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -10,6 +10,8 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) import aworld.evaluations.substrate as substrate_module +from aworld.evaluations.manifests import get_declared_eval_suite_schema +from aworld.evaluations.report import EvaluatorReport from aworld_cli.evaluator_runtime import ( available_evaluator_suites, evaluator_exit_code, @@ -69,6 +71,27 @@ async def fake_run_evaluation_flow(flow): assert persisted["judge_backend"]["backend_id"] == "stub-agent" +@pytest.mark.asyncio +async def test_framework_run_evaluation_flow_returns_report_object() -> None: + async def fake_judge(case_input, target): + return {"score": 0.9} + + flow = substrate_module.EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=substrate_module.EvalSuiteDef( + suite_id="app-evaluator", + cases=[substrate_module.EvalCaseDef(case_id="case-1", input={"query": "demo"})], + gate_policy=substrate_module.GatePolicyDef(metric_name="score", pass_threshold=0.0), + judge=fake_judge, + ), + ) + + report = await substrate_module.run_evaluation_flow(flow) + + assert isinstance(report, EvaluatorReport) + assert report["suite_id"] == "app-evaluator" + + def 
test_run_evaluator_cli_writes_default_report_when_output_is_omitted( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, @@ -106,6 +129,11 @@ def test_available_evaluator_suites_lists_builtin_suite() -> None: assert "app-evaluator" in suites +def test_cli_schema_helpers_delegate_to_framework_sources() -> None: + assert get_declared_evaluator_suite_schema() == get_declared_eval_suite_schema() + assert get_evaluator_report_schema()["title"] == "AWorld Evaluator Report" + + def test_available_evaluator_suites_filters_by_target( tmp_path: Path, ) -> None: diff --git a/tests/evaluations/test_evaluation_substrate.py b/tests/evaluations/test_evaluation_substrate.py index 7660ef15d..3400021ac 100644 --- a/tests/evaluations/test_evaluation_substrate.py +++ b/tests/evaluations/test_evaluation_substrate.py @@ -24,6 +24,7 @@ resolve_eval_suite_selection, run_evaluation_flow, ) +from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec @pytest.fixture(autouse=True) @@ -62,9 +63,39 @@ def test_compile_evaluation_flow_builds_inline_dataset_and_gate_config() -> None assert compiled.eval_config.eval_dataset is compiled.dataset assert compiled.dataset.eval_cases[0].case_data["query"] == "hello world" assert compiled.dataset.eval_cases[0].case_data["_target"]["target_path"] == "demo.txt" + assert compiled.dataset.eval_cases[0].case_data["_expected"] is None assert compiled.gate_policy.metric_name == "score" +def test_eval_case_def_supports_expected_and_runtime_overrides() -> None: + case = EvalCaseDef( + case_id="case-1", + input={"query": "demo"}, + expected={"answer": "ok"}, + max_turns=3, + timeout_seconds=5.0, + metadata={"toolsets": ["search"]}, + ) + + assert case.expected == {"answer": "ok"} + assert case.max_turns == 3 + assert case.timeout_seconds == 5.0 + assert case.metadata["toolsets"] == ["search"] + + +def test_compile_evaluation_flow_uses_execution_backed_target_when_suite_declares_execution() -> None: + suite = EvalSuiteDef( + suite_id="task-suite", + 
cases=[EvalCaseDef(case_id="case-1", input={"query": "demo"})], + execution=EvalExecutionSpec(mode=EvalExecutionMode.TASK, task_builder_ref="tests.helpers:build_demo_task"), + ) + flow = EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo.txt"}}, suite=suite) + + compiled = compile_evaluation_flow(flow) + + assert compiled.eval_config.eval_target.__class__.__name__ == "_ConfiguredTaskEvalTarget" + + def test_judge_schema_validation_rejects_missing_fields() -> None: schema = JudgeSchemaDef(required_fields=("score", "rank", "criticism")) @@ -137,6 +168,27 @@ async def fake_judge(case_input, target): assert report["summary"]["demo-suite"]["score"]["mean"] == pytest.approx(0.7) +@pytest.mark.asyncio +async def test_suite_judge_prefers_state_payload_over_static_case_target() -> None: + async def fake_judge(case_input, target): + return {"score": 1.0, "answer": target["answer"]} + + suite = EvalSuiteDef( + suite_id="demo-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "demo"})], + judge=fake_judge, + ) + from aworld.evaluations.scorers.suite_judge import SuiteJudgeScorer + + scorer = SuiteJudgeScorer(suite=suite) + input_case = type("Case", (), {"case_data": {"query": "demo", "_target": {"path": "legacy"}}})() + output = {"state": {"answer": "from-state", "status": "success"}} + + result = await scorer.score(0, input_case, output) + + assert result.metric_results["score"]["metadata"]["answer"] == "from-state" + + def test_builtin_app_evaluator_suite_has_required_schema_and_score_gate() -> None: suite = get_builtin_eval_suite("app-evaluator") diff --git a/tests/evaluations/test_execution_state.py b/tests/evaluations/test_execution_state.py new file mode 100644 index 000000000..30b0ae7ad --- /dev/null +++ b/tests/evaluations/test_execution_state.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from aworld.core.task import TaskResponse +from aworld.evaluations.execution import normalize_task_response_to_eval_state +from 
aworld.evaluations.scorers.state_extractors import ( + get_assistant_messages, + get_completion, + get_tool_calls, +) + + +def test_normalize_task_response_to_eval_state_captures_answer_usage_and_trajectory() -> None: + response = TaskResponse( + id="task-1", + answer="done", + usage={"total_tokens": 42}, + trajectory=[{"type": "tool", "name": "search"}], + success=True, + ) + + state = normalize_task_response_to_eval_state(case_id="case-1", response=response) + + assert state.case_id == "case-1" + assert state.answer == "done" + assert state.completion == ["done"] + assert state.usage["total_tokens"] == 42 + assert state.trajectory[0]["name"] == "search" + assert state.status == "success" + + +def test_state_extractors_support_completion_and_tool_queries() -> None: + state = { + "completion": [{"role": "assistant", "content": "final"}], + "trajectory": [ + {"role": "user", "content": "question"}, + {"role": "assistant", "content": "thinking"}, + {"action": {"tool_calls": [{"name": "search"}]}}, + ], + } + + assert get_completion(state)[0]["content"] == "final" + assert get_assistant_messages(state)[0]["content"] == "final" + assert get_tool_calls(state)[0]["name"] == "search" diff --git a/tests/plugins/test_plugin_hooks.py b/tests/plugins/test_plugin_hooks.py index c0dd8adb0..219398d74 100644 --- a/tests/plugins/test_plugin_hooks.py +++ b/tests/plugins/test_plugin_hooks.py @@ -8,6 +8,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) from aworld.plugins.discovery import discover_plugins +from aworld_cli.evaluator_runtime import render_evaluator_summary, run_evaluator_cli from aworld_cli.builtin_plugins.memory_cli.common import append_workspace_session_log from aworld_cli.builtin_plugins.memory_cli.hooks import task_completed as task_completed_hook_module from aworld_cli.plugin_capabilities.hooks import PluginHookResult, load_plugin_hooks @@ -36,6 +37,10 @@ def _get_builtin_memory_plugin_root() -> Path: ) +def 
_get_evaluator_like_plugin_root() -> Path: + return Path("tests/fixtures/plugins/evaluator_like").resolve() + + def test_load_plugin_hook_entrypoints(): plugin_root = Path("tests/fixtures/plugins/ralph_like").resolve() plugin = discover_plugins([plugin_root])[0] @@ -46,6 +51,89 @@ def test_load_plugin_hook_entrypoints(): assert hooks["stop"][0].entrypoint_id == "loop-stop" +def test_evaluator_pre_run_hook_can_annotate_runtime_metadata( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + hooks = load_plugin_hooks(discover_plugins([_get_evaluator_like_plugin_root()])) + + async def fake_run_evaluation_flow(flow): + return { + "report_version": 1, + "suite_id": "app-evaluator", + "target": dict(flow.target), + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "metrics": {"score": {"mean": 0.9}}, + "results": [], + "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, + "approval": {"required": False, "resolved": False, "approved": None}, + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime._load_evaluator_hooks", lambda: hooks) + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_cli(target=str(target)) + + assert report["target"]["hook_tag"] == "from-pre-run" + + +def test_evaluator_post_run_hook_can_capture_report_side_effect( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + target = tmp_path / "artifact.txt" + target.write_text("artifact", encoding="utf-8") + hooks = load_plugin_hooks(discover_plugins([_get_evaluator_like_plugin_root()])) + + async def fake_run_evaluation_flow(flow): + return { + "report_version": 1, + "suite_id": "app-evaluator", + "target": dict(flow.target), + "summary": {"app-evaluator": {"score": {"mean": 0.9}}}, + "metrics": {"score": 
{"mean": 0.9}}, + "results": [], + "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, + "approval": {"required": False, "resolved": False, "approved": None}, + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime._load_evaluator_hooks", lambda: hooks) + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + run_evaluator_cli(target=str(target)) + + assert (tmp_path / "hook-output.json").exists() + + +def test_evaluator_render_summary_hook_can_append_suffix( + monkeypatch: pytest.MonkeyPatch, +) -> None: + hooks = load_plugin_hooks(discover_plugins([_get_evaluator_like_plugin_root()])) + monkeypatch.setattr("aworld_cli.evaluator_runtime._load_evaluator_hooks", lambda: hooks) + + summary = render_evaluator_summary( + { + "suite_id": "app-evaluator", + "gate": {"status": "pass", "value": 0.9}, + "target": {"target_path": str(Path.cwd() / "artifact.txt")}, + } + ) + + assert "hook-rendered" in summary + + +def test_evaluator_hook_contract_is_documented_in_runtime_module() -> None: + content = Path("aworld-cli/src/aworld_cli/evaluator_runtime.py").read_text(encoding="utf-8") + + assert "evaluator.pre_run" in content + assert "event payload" in content + assert "allowed side effects" in content + + @pytest.mark.asyncio async def test_stop_hook_can_block_and_continue_session(tmp_path): plugin_root = Path("tests/fixtures/plugins/ralph_like").resolve() diff --git a/tests/test_plugin_cli_entrypoint.py b/tests/test_plugin_cli_entrypoint.py index 39782ce42..8209d55a3 100644 --- a/tests/test_plugin_cli_entrypoint.py +++ b/tests/test_plugin_cli_entrypoint.py @@ -69,6 +69,17 @@ def test_interactive_command_is_registered_via_plugin_registry(): assert command is not None +def test_evaluator_command_is_registered_via_plugin_registry(): + from aworld_cli import main as main_module + from aworld_cli.core.plugin_manager import 
get_builtin_plugin_roots + + registry = main_module._build_top_level_command_registry() + command = registry.get("evaluator") + + assert command is not None + assert any(path.name == "evaluator_cli" for path in get_builtin_plugin_roots()) + + def test_acp_command_dispatches_via_plugin_registry(capsys): from aworld_cli import main as main_module From 2cc7c3078b8608451347dc5379614a2ed6772f02 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Tue, 9 Jun 2026 17:16:55 +0800 Subject: [PATCH 16/41] docs: align evaluator substrate docs --- aworld/evaluations/README.md | 108 +++++++++++++++++- docs/AWorld CLI/Commands/Evaluator.md | 40 +++++++ .../evaluator_like/.aworld-plugin/plugin.json | 33 ++++++ .../plugins/evaluator_like/hooks/.gitignore | 2 + .../plugins/evaluator_like/hooks/post_run.py | 8 ++ .../plugins/evaluator_like/hooks/pre_run.py | 2 + .../evaluator_like/hooks/render_summary.py | 2 + 7 files changed, 189 insertions(+), 6 deletions(-) create mode 100644 tests/fixtures/plugins/evaluator_like/.aworld-plugin/plugin.json create mode 100644 tests/fixtures/plugins/evaluator_like/hooks/.gitignore create mode 100644 tests/fixtures/plugins/evaluator_like/hooks/post_run.py create mode 100644 tests/fixtures/plugins/evaluator_like/hooks/pre_run.py create mode 100644 tests/fixtures/plugins/evaluator_like/hooks/render_summary.py diff --git a/aworld/evaluations/README.md b/aworld/evaluations/README.md index 5199bb5d1..8c9f2bfdb 100644 --- a/aworld/evaluations/README.md +++ b/aworld/evaluations/README.md @@ -1,8 +1,8 @@ # AWorld Evaluations Module -The `aworld.evaluations` module provides a comprehensive framework for evaluating the performance of AI agents, language -models, and tasks within the AWorld ecosystem. It offers flexible evaluation criteria, diverse scoring mechanisms, and a -robust runtime system to conduct structured assessments. +The `aworld.evaluations` module is the framework-owned evaluation substrate for AWorld. 
It supports both legacy +`EvaluationConfig`-driven flows and newer suite-backed evaluator flows that can execute an agent or task first, then +score both final outcomes and trajectory/process quality from a normalized execution state. ## Table of Contents @@ -80,6 +80,30 @@ with input data. `EvalResult` captures the outcomes of an evaluation run, including individual case results and summary statistics. +### Suite-Backed Evaluation Definitions + +Suite-backed evaluation adds a definition layer on top of the existing runtime skeleton: + +- `EvalSuiteDef`: suite identity, cases, judge schema, gate policy, toolset hints, execution spec +- `EvalCaseDef`: input plus optional expected output and per-case runtime hints +- `EvalExecutionSpec`: runtime execution mode and target/task configuration +- `EvalState`: normalized execution result containing final answer, completion view, trajectory, usage, timing, and errors + +These live under `aworld/evaluations/**`, not in `aworld-cli`, so the same substrate can be reused by framework callers, +official CLI flows, and custom evaluation agents. 
+ +Ownership is explicit: + +- suite and case definitions own evaluation intent: input, expected outcome, task-domain tool hints, tags, and judge/gate semantics +- execution specs own runtime behavior: whether execution is static, agent-backed, or task-backed, plus task/runner configuration +- `aworld-cli` only assembles workspace inputs into these framework objects; it does not redefine evaluator semantics + +`EvalState` intentionally separates: + +- `answer`: the final deliverable or normalized terminal answer +- `completion`: completion-oriented view used by outcome scorers that only care about the final assistant output +- `trajectory`: full execution history used by process, tool-use, and efficiency scorers + ## Scorers ### Scorer Registry @@ -119,12 +143,25 @@ The module includes several pre-built scorers for common evaluation tasks: - **SummarizeQuality**: Assesses the quality of generated summaries - And more... +### Execution-State Helpers + +`aworld.evaluations.scorers.state_extractors` provides reusable helpers for execution-backed scoring: + +- `get_eval_state(output)` +- `get_completion(output)` +- `get_assistant_messages(output)` +- `get_messages_by_role(output, role)` +- `get_tool_calls(output)` +- `get_trajectory(output)` + +Use these helpers instead of hand-parsing raw trajectory payloads in every scorer. + ## Evaluation Targets ### AworldAgentEvalTarget `AworldAgentEvalTarget` enables evaluating AWorld agents by running them on evaluation datasets and capturing their -responses. +responses. In execution-backed flows it returns both the final answer and a normalized `state` payload. ```python class AworldAgentEvalTarget(EvalTarget[dict]): @@ -140,7 +177,39 @@ class AworldAgentEvalTarget(EvalTarget[dict]): ### AworldTaskEvalTarget `AworldTaskEvalTarget` provides a framework for evaluating task-based systems by building and running tasks for each -evaluation case. +evaluation case. 
In execution-backed flows it normalizes `TaskResponse` output into `EvalState`. + +## Execution-Backed Suite Evaluation + +Execution-backed suite flows reuse the existing AWorld runtime instead of introducing a parallel evaluator stack: + +- suite/case definitions specify what is being evaluated +- `EvalExecutionSpec` specifies how runtime execution happens +- `EvalTarget -> Evaluator -> EvaluateRunner` remains the core orchestration skeleton +- scorers read normalized execution state for outcome and trajectory evaluation + +The current execution modes are: + +- `static`: judge-only evaluation with no runtime execution +- `agent`: execute through `AworldAgentEvalTarget` +- `task`: execute through `AworldTaskEvalTarget` + +This gives AWorld a framework-native evaluator path that can assess final artifacts, structured outputs, and trajectory +quality through one substrate. + +## Suite, Case, and Execution Mapping + +The evaluator v2 path is intentionally close to AWorld's existing runner model: + +- suite -> describes the evaluation contract and default gate/judge behavior +- case -> provides per-row input, optional references, and case-local execution hints +- execution spec -> describes how a case becomes a runnable AWorld execution +- eval target -> adapts the execution spec into an existing target implementation +- evaluator / runner -> executes cases and produces normalized outputs +- scorers -> read final answer, completion, and trajectory from `EvalState` + +In practice this means outcome evaluation and trajectory evaluation share one execution pipeline. A suite can score only +the final artifact, only the trajectory, or both. ## Recorder @@ -165,6 +234,33 @@ class EvaluateRunner(Runner): # Evaluation orchestration logic ``` +`EvaluateRunner` remains the orchestration layer. Suite-backed evaluation compiles into it rather than replacing it. 
+ +## Framework vs CLI + +`aworld.evaluations` owns evaluation semantics: + +- suite definitions +- execution-backed compilation +- normalized execution state +- scoring helpers +- report and declared-suite schemas + +`aworld-cli` owns product entrypoints: + +- `aworld-cli evaluator` +- workspace suite discovery +- report file writing +- evaluator lifecycle hooks for peripheral CLI customization + +`aworld-cli evaluator` is now plugin-backed, but the reusable evaluator substrate remains framework-owned. + +The intended layering is: + +- build evaluator capabilities in `aworld/evaluations/**` +- expose a convenient official entrypoint in `aworld-cli` +- allow other agents or products to reuse the framework substrate without depending on the CLI command shape + ## Usage Examples ### Basic Evaluation @@ -245,4 +341,4 @@ Evaluation behavior can be customized through the `EvaluationConfig` class, whic - Evaluation targets and their configurations - Datasets and loading parameters - Evaluation criteria and metrics -- Execution parameters like parallelism and repetition count \ No newline at end of file +- Execution parameters like parallelism and repetition count diff --git a/docs/AWorld CLI/Commands/Evaluator.md b/docs/AWorld CLI/Commands/Evaluator.md index 5350c870a..aa55d4208 100644 --- a/docs/AWorld CLI/Commands/Evaluator.md +++ b/docs/AWorld CLI/Commands/Evaluator.md @@ -4,6 +4,10 @@ The evaluator command runs suite-backed evaluation flows for local targets and exposes the resulting report as a stable machine-readable contract. +It is the official CLI entrypoint for the framework substrate in `aworld.evaluations`: the CLI resolves targets, +workspace manifests, output paths, and hooks, while suite semantics, execution-backed state normalization, scoring, and +gate decisions remain framework-owned. 
+ Use it when you want to: - run a built-in evaluator suite such as `app-evaluator` @@ -64,6 +68,39 @@ Minimal example: See [declared_evaluator_suite.example.json](/Users/wuman/Documents/workspace/aworld-mas/aworld/examples/aworld_quick_start/cli/declared_evaluator_suite.example.json) for a complete example. The current manifest schema is exported by `aworld_cli.evaluator_runtime.get_declared_evaluator_suite_schema()`. +Resolution rules: + +- builtin suites are always available +- declared suites are discovered relative to the evaluation target workspace, not just the current shell cwd +- declared manifests currently extend `app-evaluator`; they are not yet a generic user-defined suite authoring API +- `--list-suites --target ...` and actual evaluator execution use the same target-relative discovery path + +## Plugin Hooks + +`aworld-cli evaluator` is a builtin plugin-backed command with narrow lifecycle hook points intended for CLI assembly concerns, not framework scoring semantics. + +Available hook points: + +- `evaluator.pre_discover`: inspect or annotate target/workspace inputs before suite discovery +- `evaluator.post_discover`: react to resolved suite candidates +- `evaluator.pre_run`: add lightweight CLI metadata before evaluation starts +- `evaluator.post_run`: upload or post-process the completed report +- `evaluator.render_summary`: augment rendered terminal summary text + +Current event payloads: + +- `evaluator.pre_discover`: `target`, `workspace_path` +- `evaluator.post_discover`: `target`, `workspace_path`, `suite_names` +- `evaluator.pre_run`: `target`, `suite`, `workspace_path` +- `evaluator.post_run`: `report`, `target`, `suite`, `workspace_path` +- `evaluator.render_summary`: `report`, `workspace_path` + +Hook boundaries: + +- mutable hook state is limited to lightweight CLI assembly metadata +- hooks should not replace suite logic, judge logic, or gate calculation +- suitable side effects include report upload, notifications, and summary 
augmentation + ## Report Contract Evaluator reports are JSON documents with a stable top-level format marker: @@ -83,6 +120,8 @@ Key report sections: - `results`: per-case judge output plus normalized per-case metrics - `gate`: structured `pass` / `fail` / `needs_approval` decision - `automation`: exit-code-oriented summary fields for scripts and CI +- `suite_selection`: resolved/defaulted suite selection diagnostics +- `approval`: approval decision metadata when the gate requires human confirmation See [evaluator_report.example.json](/Users/wuman/Documents/workspace/aworld-mas/aworld/examples/aworld_quick_start/cli/evaluator_report.example.json) for a minimal example. @@ -108,3 +147,4 @@ See [evaluator_report.example.json](/Users/wuman/Documents/workspace/aworld-mas/ - declared suite manifests currently layer on `app-evaluator` only; they are not a generic suite authoring format yet. - `--print-report-schema` prints the current JSON Schema for `aworld.evaluator.report`. - `--validate-report` validates an existing JSON report against that schema without re-running evaluation. +- the CLI command is an assembly/product layer; reusable evaluator building blocks stay in `aworld/evaluations/**`. 
diff --git a/tests/fixtures/plugins/evaluator_like/.aworld-plugin/plugin.json b/tests/fixtures/plugins/evaluator_like/.aworld-plugin/plugin.json new file mode 100644 index 000000000..c829ba2cc --- /dev/null +++ b/tests/fixtures/plugins/evaluator_like/.aworld-plugin/plugin.json @@ -0,0 +1,33 @@ +{ + "id": "evaluator-like", + "name": "evaluator-like", + "version": "1.0.0", + "entrypoints": { + "hooks": [ + { + "id": "evaluator-pre-run", + "target": "hooks/pre_run.py", + "scope": "workspace", + "metadata": { + "hook_point": "evaluator.pre_run" + } + }, + { + "id": "evaluator-post-run", + "target": "hooks/post_run.py", + "scope": "workspace", + "metadata": { + "hook_point": "evaluator.post_run" + } + }, + { + "id": "evaluator-render-summary", + "target": "hooks/render_summary.py", + "scope": "workspace", + "metadata": { + "hook_point": "evaluator.render_summary" + } + } + ] + } +} diff --git a/tests/fixtures/plugins/evaluator_like/hooks/.gitignore b/tests/fixtures/plugins/evaluator_like/hooks/.gitignore new file mode 100644 index 000000000..7a60b85e1 --- /dev/null +++ b/tests/fixtures/plugins/evaluator_like/hooks/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +*.pyc diff --git a/tests/fixtures/plugins/evaluator_like/hooks/post_run.py b/tests/fixtures/plugins/evaluator_like/hooks/post_run.py new file mode 100644 index 000000000..e76538a7c --- /dev/null +++ b/tests/fixtures/plugins/evaluator_like/hooks/post_run.py @@ -0,0 +1,8 @@ +import json +from pathlib import Path + + +def handle_event(event, state): + output_path = Path(event["workspace_path"]) / "hook-output.json" + output_path.write_text(json.dumps(event["report"]), encoding="utf-8") + return {"action": "allow"} diff --git a/tests/fixtures/plugins/evaluator_like/hooks/pre_run.py b/tests/fixtures/plugins/evaluator_like/hooks/pre_run.py new file mode 100644 index 000000000..711be3dfc --- /dev/null +++ b/tests/fixtures/plugins/evaluator_like/hooks/pre_run.py @@ -0,0 +1,2 @@ +def handle_event(event, state): + return 
{"metadata": {"hook_tag": "from-pre-run"}} diff --git a/tests/fixtures/plugins/evaluator_like/hooks/render_summary.py b/tests/fixtures/plugins/evaluator_like/hooks/render_summary.py new file mode 100644 index 000000000..7f671d2a4 --- /dev/null +++ b/tests/fixtures/plugins/evaluator_like/hooks/render_summary.py @@ -0,0 +1,2 @@ +def handle_event(event, state): + return {"metadata": {"summary_suffix": "hook-rendered"}} From 8dea37d7e44b8b86c505696fd0a88e9e4f9004d2 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Tue, 9 Jun 2026 21:22:13 +0800 Subject: [PATCH 17/41] feat: extend evaluator v2 substrate --- .../src/aworld_cli/evaluator_rendering.py | 5 +- aworld/evaluations/README.md | 12 +- aworld/evaluations/base.py | 4 +- aworld/evaluations/eval_targets/agent_eval.py | 63 +- aworld/evaluations/execution.py | 74 ++- aworld/evaluations/execution_adapters.py | 115 ++++ aworld/evaluations/report.py | 10 +- aworld/evaluations/scorers/suite_judge.py | 30 +- aworld/evaluations/substrate.py | 351 +++++++++- tests/core/test_evaluator_runtime.py | 39 ++ .../evaluations/test_evaluation_substrate.py | 603 ++++++++++++++++++ tests/evaluations/test_execution_adapters.py | 93 +++ tests/evaluations/test_execution_state.py | 87 ++- 13 files changed, 1417 insertions(+), 69 deletions(-) create mode 100644 aworld/evaluations/execution_adapters.py create mode 100644 tests/evaluations/test_execution_adapters.py diff --git a/aworld-cli/src/aworld_cli/evaluator_rendering.py b/aworld-cli/src/aworld_cli/evaluator_rendering.py index a44176241..21f140df0 100644 --- a/aworld-cli/src/aworld_cli/evaluator_rendering.py +++ b/aworld-cli/src/aworld_cli/evaluator_rendering.py @@ -8,7 +8,10 @@ def render_evaluator_summary(report: dict, *, summary_suffix: str | None = None) metric_value = gate.get("value") summary_line = f"Evaluator suite: {suite_id}\nGate: {status}" if metric_value is not None: - summary_line += f" ({metric_value:.2f})" + if isinstance(metric_value, (int, float)): + summary_line += f" 
({metric_value:.2f})" + else: + summary_line += f" ({metric_value})" selection = report.get("suite_selection") or {} if selection.get("resolved"): summary_line += f"\nSuite selection: {selection.get('mode', 'unknown')} -> {selection['resolved']}" diff --git a/aworld/evaluations/README.md b/aworld/evaluations/README.md index 8c9f2bfdb..60226726c 100644 --- a/aworld/evaluations/README.md +++ b/aworld/evaluations/README.md @@ -84,8 +84,9 @@ with input data. Suite-backed evaluation adds a definition layer on top of the existing runtime skeleton: -- `EvalSuiteDef`: suite identity, cases, judge schema, gate policy, toolset hints, execution spec +- `EvalSuiteDef`: suite identity, cases, judge schema, gate policy, trajectory scorers, toolset hints, execution spec - `EvalCaseDef`: input plus optional expected output and per-case runtime hints +- `EvalHarnessDef`: reusable execution defaults for suite-backed flows - `EvalExecutionSpec`: runtime execution mode and target/task configuration - `EvalState`: normalized execution result containing final answer, completion view, trajectory, usage, timing, and errors @@ -95,7 +96,7 @@ official CLI flows, and custom evaluation agents. 
Ownership is explicit: - suite and case definitions own evaluation intent: input, expected outcome, task-domain tool hints, tags, and judge/gate semantics -- execution specs own runtime behavior: whether execution is static, agent-backed, or task-backed, plus task/runner configuration +- harnesses and execution specs own runtime behavior: whether execution is static, agent-backed, task-backed, or program-backed, plus task/runner configuration - `aworld-cli` only assembles workspace inputs into these framework objects; it does not redefine evaluator semantics `EvalState` intentionally separates: @@ -193,10 +194,17 @@ The current execution modes are: - `static`: judge-only evaluation with no runtime execution - `agent`: execute through `AworldAgentEvalTarget` - `task`: execute through `AworldTaskEvalTarget` +- `program`: execute an importable callable through the evaluator adapter layer and normalize the result into `EvalState` This gives AWorld a framework-native evaluator path that can assess final artifacts, structured outputs, and trajectory quality through one substrate. 
+Suite-backed evaluation also supports: + +- typed judge schemas: Pydantic-backed validation with JSON schema export and required-field compatibility +- composite gates: structured metric conditions with `pass`, `fail`, and `needs_approval` outcomes +- trajectory scorers: suite-declared process metrics that lower into normal evaluator criteria and reports + ## Suite, Case, and Execution Mapping The evaluator v2 path is intentionally close to AWorld's existing runner model: diff --git a/aworld/evaluations/base.py b/aworld/evaluations/base.py index c54ebe147..5ff609717 100644 --- a/aworld/evaluations/base.py +++ b/aworld/evaluations/base.py @@ -29,7 +29,7 @@ class EvalStatus(Enum): NOT_EVALUATED = 3 -MetricValueType = Union[int, float, bool] +MetricValueType = Union[int, float, bool, str] @dataclass @@ -209,6 +209,8 @@ def _do_summarize(self, scores: list[Any]) -> dict: score_dict['std'] = statistics.stdev(scores) else: score_dict['std'] = 0.0 + elif isinstance(score, str): + score_dict['value'] = score if all(item == score for item in scores) else "mixed" elif isinstance(score, dict): all_keys = list( dict.fromkeys([k for score in scores if isinstance(score, dict) for k in score.keys()]) diff --git a/aworld/evaluations/eval_targets/agent_eval.py b/aworld/evaluations/eval_targets/agent_eval.py index 4077a270b..0f4744a0f 100644 --- a/aworld/evaluations/eval_targets/agent_eval.py +++ b/aworld/evaluations/eval_targets/agent_eval.py @@ -2,11 +2,11 @@ from typing import Optional, Union from aworld.evaluations.base import EvalTarget, EvalDataCase -from aworld.evaluations.execution import normalize_task_response_to_eval_state +from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec +from aworld.evaluations.execution_adapters import resolve_execution_adapter from aworld.agents.llm_agent import Agent from aworld.config.conf import AgentConfig -from aworld.runner import Runners -from aworld.core.task import Task, TaskResponse +from aworld.core.task import 
Task import os @@ -73,13 +73,30 @@ def _create_agent_from_config(self, agent_config): async def predict(self, index: int, input: Union[EvalDataCase[dict], dict]) -> dict: query_column = self.eval_config.eval_dataset_query_column or self.query_column case_data = input.case_data if isinstance(input, EvalDataCase) else input - response = await Runners.run(case_data[query_column], agent=self.agent) - state = normalize_task_response_to_eval_state( - case_id=getattr(input, "eval_case_id", str(index)), - response=response, - metadata=case_data, + case = type( + "AdapterCase", + (), + { + "case_id": getattr(input, "eval_case_id", str(index)), + "input": dict(case_data), + }, + )() + state = await resolve_execution_adapter( + EvalExecutionSpec( + mode=EvalExecutionMode.AGENT, + target_config={"agent": self.agent}, + query_column=query_column, + ) + ).execute( + case=case, + target=dict(case_data.get("_target", {})), + spec=EvalExecutionSpec( + mode=EvalExecutionMode.AGENT, + target_config={"agent": self.agent}, + query_column=query_column, + ), ) - return {"answer": response.answer, "state": state.to_dict()} + return {"answer": state.answer, "state": state.to_dict()} class AworldTaskEvalTarget(EvalTarget[dict]): @@ -100,16 +117,22 @@ async def build_task(self, index: int, input: EvalDataCase[dict]) -> Task: async def predict(self, index: int, input: EvalDataCase[dict]) -> dict: task = await self.build_task(index, input) - result = await Runners.run_task(task=task) - if isinstance(result, TaskResponse): - payload = result - if isinstance(result, dict): - payload = result[task.id] - else: - payload = result - state = normalize_task_response_to_eval_state( - case_id=getattr(input, "eval_case_id", str(index)), - response=payload, - metadata=input.case_data if isinstance(input, EvalDataCase) else {}, + case_data = input.case_data if isinstance(input, EvalDataCase) else {} + spec = EvalExecutionSpec( + mode=EvalExecutionMode.TASK, + target_config={"task": task}, + ) + case = 
type( + "AdapterCase", + (), + { + "case_id": getattr(input, "eval_case_id", str(index)), + "input": dict(case_data), + }, + )() + state = await resolve_execution_adapter(spec).execute( + case=case, + target=dict(case_data.get("_target", {})), + spec=spec, ) return {"answer": state.answer, "state": state.to_dict()} diff --git a/aworld/evaluations/execution.py b/aworld/evaluations/execution.py index eb44b9e26..633fd34a9 100644 --- a/aworld/evaluations/execution.py +++ b/aworld/evaluations/execution.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import annotations +import importlib from dataclasses import dataclass, field from enum import Enum from typing import Any, Mapping @@ -12,6 +13,7 @@ class EvalExecutionMode(str, Enum): STATIC = "static" AGENT = "agent" TASK = "task" + PROGRAM = "program" @dataclass(frozen=True) @@ -71,6 +73,26 @@ def _extract_tool_calls_from_trajectory(trajectory: list[dict[str, Any]]) -> lis return calls +def _merge_eval_metadata( + response_metadata: Any, + invocation_metadata: Mapping[str, Any] | None, + target: Mapping[str, Any] | None, +) -> dict[str, Any]: + base = dict(response_metadata) if isinstance(response_metadata, Mapping) else {} + base.update(dict(invocation_metadata or {})) + base["_target"] = dict(target or {}) + return base + + +def _list_field_from_response(response: Mapping[str, Any], field_name: str, default: list[Any]) -> list[Any]: + value = response.get(field_name) + if value is None: + return default + if not isinstance(value, list): + raise ValueError(f"{field_name} must be a list") + return list(value) + + def normalize_task_response_to_eval_state( *, case_id: str, @@ -78,6 +100,16 @@ def normalize_task_response_to_eval_state( target: Mapping[str, Any] | None = None, metadata: Mapping[str, Any] | None = None, ) -> EvalState: + if isinstance(response, EvalState): + state = response.to_dict() + state["case_id"] = case_id + state["metadata"] = { + **dict(response.metadata or {}), + **dict(metadata or {}), + 
"_target": dict(target or {}), + } + return EvalState(**state) + if isinstance(response, TaskResponse): trajectory = list(response.trajectory or []) return EvalState( @@ -90,24 +122,29 @@ def normalize_task_response_to_eval_state( usage=dict(response.usage or {}), timing={"time_cost": response.time_cost}, raw_response=response.to_dict(), - metadata={**dict(metadata or {}), "_target": dict(target or {})}, + metadata=_merge_eval_metadata(getattr(response, "metadata", {}), metadata, target), ) if isinstance(response, Mapping): - trajectory = list(response.get("trajectory") or []) + trajectory = _list_field_from_response(response, "trajectory", []) + answer = response.get("answer") return EvalState( case_id=case_id, status=str(response.get("status", "success")), - answer=response.get("answer"), - completion=list(response.get("completion") or ([] if response.get("answer") is None else [response.get("answer")])), + answer=answer, + completion=_list_field_from_response(response, "completion", [] if answer is None else [answer]), artifacts=dict(response.get("artifacts") or {}), trajectory=trajectory, - tool_calls=list(response.get("tool_calls") or _extract_tool_calls_from_trajectory(trajectory)), + tool_calls=_list_field_from_response( + response, + "tool_calls", + _extract_tool_calls_from_trajectory(trajectory), + ), usage=dict(response.get("usage") or {}), timing=dict(response.get("timing") or {}), error=dict(response.get("error")) if isinstance(response.get("error"), Mapping) else response.get("error"), raw_response=dict(response), - metadata={**dict(metadata or {}), "_target": dict(target or {})}, + metadata=_merge_eval_metadata(response.get("metadata"), metadata, target), ) return EvalState( @@ -115,5 +152,28 @@ def normalize_task_response_to_eval_state( status="success", answer=response, completion=[] if response is None else [response], - metadata={**dict(metadata or {}), "_target": dict(target or {})}, + metadata=_merge_eval_metadata({}, metadata, target), ) + + 
+def _validate_importable_callable_ref(ref: str) -> tuple[str, str]: + if not ref or any(char.isspace() for char in ref) or "/" in ref or "\\" in ref: + raise ValueError("program execution requires an importable callable reference") + if ":" in ref: + module_name, attr_name = ref.split(":", 1) + elif "." in ref: + module_name, attr_name = ref.rsplit(".", 1) + else: + raise ValueError("program execution requires an importable callable reference") + module_parts = module_name.split(".") + if not module_name or not attr_name or attr_name == "py" or "py" in module_parts: + raise ValueError("program execution requires an importable callable reference") + return module_name, attr_name + + +def load_program_callable(ref: str): + module_name, attr_name = _validate_importable_callable_ref(ref) + candidate = getattr(importlib.import_module(module_name), attr_name) + if not callable(candidate): + raise ValueError(f"program reference is not callable: {ref}") + return candidate diff --git a/aworld/evaluations/execution_adapters.py b/aworld/evaluations/execution_adapters.py new file mode 100644 index 000000000..2929f3381 --- /dev/null +++ b/aworld/evaluations/execution_adapters.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import annotations + +import inspect +from dataclasses import dataclass +from typing import Any, Protocol + +from aworld.core.task import TaskResponse +from aworld.evaluations.execution import ( + EvalExecutionMode, + EvalExecutionSpec, + EvalState, + _validate_importable_callable_ref, + load_program_callable, + normalize_task_response_to_eval_state, +) +from aworld.runner import Runners + + +class ExecutionAdapter(Protocol): + async def execute(self, *, case: Any, target: dict[str, Any], spec: EvalExecutionSpec) -> EvalState: + raise NotImplementedError + + +@dataclass(frozen=True) +class StaticExecutionAdapter: + async def execute(self, *, case: Any, target: dict[str, Any], spec: EvalExecutionSpec) -> EvalState: + return EvalState( + 
case_id=case.case_id, + status="not_evaluated", + metadata={"_target": dict(target)}, + ) + + +@dataclass(frozen=True) +class AgentExecutionAdapter: + async def execute(self, *, case: Any, target: dict[str, Any], spec: EvalExecutionSpec) -> EvalState: + query_column = spec.query_column or "query" + query = case.input[query_column] + if "agent" not in spec.target_config: + raise ValueError("agent execution requires target_config['agent']") + response = await Runners.run(query, agent=spec.target_config["agent"]) + return normalize_task_response_to_eval_state( + case_id=case.case_id, + response=response, + target=target, + metadata=case.input, + ) + + +@dataclass(frozen=True) +class TaskExecutionAdapter: + async def execute(self, *, case: Any, target: dict[str, Any], spec: EvalExecutionSpec) -> EvalState: + task = spec.target_config.get("task") + if task is None: + if not spec.task_builder_ref: + raise ValueError("task execution requires task_builder_ref") + builder = load_program_callable(spec.task_builder_ref) + task = builder(case=case, target=target, spec=spec) + if inspect.isawaitable(task): + task = await task + + result = await Runners.run_task(task=task) + if isinstance(result, dict) and getattr(task, "id", None) in result: + result = result[task.id] + elif isinstance(result, dict) and len(result) == 1 and not {"status", "answer", "completion"} & result.keys(): + result = next(iter(result.values())) + elif isinstance(result, TaskResponse): + result = result + + return normalize_task_response_to_eval_state( + case_id=case.case_id, + response=result, + target=target, + metadata=case.input, + ) + + +@dataclass(frozen=True) +class ProgramExecutionAdapter: + async def execute(self, *, case: Any, target: dict[str, Any], spec: EvalExecutionSpec) -> EvalState: + if not spec.target_ref: + raise ValueError("program execution requires target_ref") + program = load_program_callable(spec.target_ref) + result = program(case, spec, target) + if inspect.isawaitable(result): + 
result = await result + return normalize_task_response_to_eval_state( + case_id=case.case_id, + response=result, + target=target, + metadata={**case.input, "_execution_mode": spec.mode.value}, + ) + + +def _validate_program_execution_spec(spec: EvalExecutionSpec) -> None: + if not spec.target_ref: + raise ValueError("program execution requires target_ref") + _validate_importable_callable_ref(spec.target_ref) + unsupported_config_keys = {"command", "commands", "workflow", "workflow_engine", "sandbox"} + if spec.runner_method is not None or unsupported_config_keys & set(spec.target_config): + raise ValueError("unsupported program execution configuration") + + +def resolve_execution_adapter(spec: EvalExecutionSpec) -> ExecutionAdapter: + if spec.mode == EvalExecutionMode.STATIC: + return StaticExecutionAdapter() + if spec.mode == EvalExecutionMode.AGENT: + return AgentExecutionAdapter() + if spec.mode == EvalExecutionMode.TASK: + return TaskExecutionAdapter() + if spec.mode == EvalExecutionMode.PROGRAM: + _validate_program_execution_spec(spec) + return ProgramExecutionAdapter() + raise ValueError(f"unsupported execution mode: {spec.mode}") diff --git a/aworld/evaluations/report.py b/aworld/evaluations/report.py index 4beb2d413..44b2a1381 100644 --- a/aworld/evaluations/report.py +++ b/aworld/evaluations/report.py @@ -56,6 +56,7 @@ def get_evaluator_report_schema() -> dict[str, object]: "oneOf": [ {"type": "number"}, {"type": "boolean"}, + {"type": "string"}, ] }, "metricAggregate": { @@ -96,8 +97,10 @@ def get_evaluator_report_schema() -> dict[str, object]: "type": "string", "enum": ["pass", "fail", "needs_approval"], }, - "metric_name": {"type": "string"}, - "value": {"type": "number"}, + "metric_name": {"type": ["string", "null"]}, + "value": {"type": ["number", "string", "boolean", "null"]}, + "matched_conditions": {"type": "array"}, + "failed_conditions": {"type": "array"}, }, "additionalProperties": False, }, @@ -120,7 +123,7 @@ def get_evaluator_report_schema() 
-> dict[str, object]: "enum": ["pass", "fail", "needs_approval", None], }, "metric_name": {"type": ["string", "null"]}, - "metric_value": {"type": ["number", "null"]}, + "metric_value": {"type": ["number", "string", "boolean", "null"]}, "approval_required": {"type": "boolean"}, "approval_resolved": {"type": "boolean"}, "approved": {"type": ["boolean", "null"]}, @@ -202,6 +205,7 @@ def get_evaluator_report_schema() -> dict[str, object]: "suite_selection": {"type": "object"}, "automation": {"$ref": "#/$defs/automationSummary"}, "report_path": {"type": "string"}, + "judge_schema": {"type": "object"}, }, "additionalProperties": True, } diff --git a/aworld/evaluations/scorers/suite_judge.py b/aworld/evaluations/scorers/suite_judge.py index 78fabcd4c..0a5ad8adc 100644 --- a/aworld/evaluations/scorers/suite_judge.py +++ b/aworld/evaluations/scorers/suite_judge.py @@ -23,17 +23,33 @@ async def score(self, index: int, input: EvalDataCase[dict], output: dict) -> Sc if state: target = {**target, **state} execution = await self.suite.resolve_judge_backend().execute(case_input, target, self.suite) - payload = dict(execution.payload) - self.suite.judge_schema.validate(payload) + payload = self.suite.judge_schema.validate_payload(dict(execution.payload)) + metadata = { + **payload, + "_judge_backend": execution.backend_id, + } metric_result: MetricResult = { "value": float(payload["score"]), - "metadata": { - **payload, - "_judge_backend": execution.backend_id, - }, + "metadata": metadata, + } + metric_results = {"score": metric_result} + declared_trajectory_metrics = { + scorer.metric_name + for scorer in getattr(self.suite, "trajectory_scorers", tuple()) } + for metric_name, value in payload.items(): + if ( + metric_name == "score" + or metric_name in declared_trajectory_metrics + or not isinstance(value, (int, float, bool, str)) + ): + continue + metric_results[metric_name] = { + "value": value, + "metadata": metadata, + } return ScorerResult( scorer_name=self.name, - 
metric_results={"score": metric_result}, + metric_results=metric_results, ) diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py index e84569f04..9767550a9 100644 --- a/aworld/evaluations/substrate.py +++ b/aworld/evaluations/substrate.py @@ -5,6 +5,7 @@ import base64 import importlib import json +import math import inspect import os import re @@ -14,11 +15,14 @@ from pathlib import Path from typing import Any, Awaitable, Callable, ClassVar, Mapping +from pydantic import BaseModel, ValidationError + from aworld.config.conf import EvaluationConfig -from aworld.evaluations.base import EvalDataCase, EvalDataset +from aworld.evaluations.base import EvalDataCase, EvalDataset, EvalTarget from aworld.evaluations.base import NoActionEvalTarget from aworld.evaluations.eval_targets.agent_eval import AworldAgentEvalTarget, AworldTaskEvalTarget from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec +from aworld.evaluations.execution_adapters import resolve_execution_adapter from aworld.evaluations.report import ( CaseEvaluationReport, EVALUATOR_REPORT_FORMAT_ID, @@ -54,37 +58,168 @@ class EvalCaseDef: metadata: dict[str, Any] = field(default_factory=dict) +@dataclass(frozen=True) +class EvalHarnessDef: + harness_id: str + execution: EvalExecutionSpec = field(default_factory=EvalExecutionSpec) + metadata: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class TrajectoryScorerDef: + metric_name: str + scorer_class: str | None = None + threshold: float = 0.0 + scorer_params: dict[str, Any] = field(default_factory=dict) + + @dataclass(frozen=True) class JudgeSchemaDef: required_fields: tuple[str, ...] 
= tuple() + output_model: type[BaseModel] | None = None def validate(self, payload: Mapping[str, Any]) -> None: + self.validate_payload(payload) + + def validate_payload(self, payload: Mapping[str, Any]) -> dict[str, Any]: + if self.output_model is not None: + try: + model = self.output_model.model_validate(dict(payload)) + except ValidationError as exc: + raise ValueError(str(exc)) from exc + return model.model_dump(mode="json", by_alias=True) + missing = [field for field in self.required_fields if field not in payload] if missing: joined = ", ".join(missing) raise ValueError(f"missing required judge fields: {joined}") + return dict(payload) + + def json_schema(self) -> dict[str, Any]: + if self.output_model is not None: + return self.output_model.model_json_schema() + if self.required_fields: + return { + "type": "object", + "required": list(self.required_fields), + "properties": {field: {} for field in self.required_fields}, + } + return {} @dataclass(frozen=True) class GateDecision: status: str + metric_name: str | None + value: float | int | str | bool | None + matched_conditions: list[dict[str, Any]] = field(default_factory=list) + failed_conditions: list[dict[str, Any]] = field(default_factory=list) + + +@dataclass(frozen=True) +class GateMetricCondition: metric_name: str - value: float + op: str + threshold: float | int | str | bool + + def to_dict(self) -> dict[str, Any]: + return { + "metric_name": self.metric_name, + "op": self.op, + "threshold": self.threshold, + } + + def matches(self, metrics: Mapping[str, Any]) -> bool: + if self.metric_name not in metrics: + raise KeyError(f"metric {self.metric_name} is missing") + value = metrics[self.metric_name] + if self.op == ">=": + return float(value) >= float(self.threshold) + if self.op == "<=": + return float(value) <= float(self.threshold) + if self.op == ">": + return float(value) > float(self.threshold) + if self.op == "<": + return float(value) < float(self.threshold) + if self.op == "==": + return 
value == self.threshold + if self.op == "!=": + return value != self.threshold + raise ValueError(f"unsupported gate operator: {self.op}") @dataclass(frozen=True) class GatePolicyDef: - metric_name: str - pass_threshold: float + metric_name: str | None = None + pass_threshold: float | None = None approval_threshold: float | None = None + pass_all: tuple[GateMetricCondition, ...] = tuple() + approval_all: tuple[GateMetricCondition, ...] = tuple() + + def normalized_conditions(self) -> tuple[tuple[GateMetricCondition, ...], tuple[GateMetricCondition, ...]]: + pass_all = self.pass_all + approval_all = self.approval_all + if not pass_all and self.metric_name is not None and self.pass_threshold is not None: + pass_all = (GateMetricCondition(metric_name=self.metric_name, op=">=", threshold=self.pass_threshold),) + if not approval_all and self.metric_name is not None and self.approval_threshold is not None: + approval_all = (GateMetricCondition(metric_name=self.metric_name, op=">=", threshold=self.approval_threshold),) + return pass_all, approval_all + + def primary_metric_name(self) -> str: + if self.metric_name is not None: + return self.metric_name + pass_all, approval_all = self.normalized_conditions() + for condition in (*pass_all, *approval_all): + if condition.metric_name == "score": + return condition.metric_name + for condition in (*pass_all, *approval_all): + return condition.metric_name + return "score" def evaluate(self, metrics: Mapping[str, Any]) -> GateDecision: - value = float(metrics[self.metric_name]) - if value >= self.pass_threshold: - return GateDecision(status="pass", metric_name=self.metric_name, value=value) - if self.approval_threshold is not None and value >= self.approval_threshold: - return GateDecision(status="needs_approval", metric_name=self.metric_name, value=value) - return GateDecision(status="fail", metric_name=self.metric_name, value=value) + pass_all, approval_all = self.normalized_conditions() + matched_pass: list[dict[str, Any]] = [] 
+ failed_pass: list[dict[str, Any]] = [] + for condition in pass_all: + if condition.matches(metrics): + matched_pass.append(condition.to_dict()) + else: + failed_pass.append(condition.to_dict()) + + metric_name = self.metric_name + value = metrics.get(metric_name) if metric_name is not None else None + if pass_all and not failed_pass: + return GateDecision( + status="pass", + metric_name=metric_name, + value=value, + matched_conditions=matched_pass, + failed_conditions=[], + ) + + matched_approval: list[dict[str, Any]] = [] + failed_approval: list[dict[str, Any]] = [] + for condition in approval_all: + if condition.matches(metrics): + matched_approval.append(condition.to_dict()) + else: + failed_approval.append(condition.to_dict()) + + if approval_all and not failed_approval: + return GateDecision( + status="needs_approval", + metric_name=metric_name, + value=value, + matched_conditions=[*matched_pass, *matched_approval], + failed_conditions=failed_pass, + ) + return GateDecision( + status="fail", + metric_name=metric_name, + value=value, + matched_conditions=[*matched_pass, *matched_approval], + failed_conditions=[*failed_pass, *failed_approval], + ) @dataclass(frozen=True) @@ -215,6 +350,8 @@ class EvalSuiteDef: judge_schema: JudgeSchemaDef = field(default_factory=JudgeSchemaDef) gate_policy: GatePolicyDef | None = None execution: EvalExecutionSpec | None = None + harness: EvalHarnessDef | None = None + trajectory_scorers: tuple[TrajectoryScorerDef, ...] 
= tuple() judge: JudgeCallable | None = None judge_backend: JudgeBackend | None = None metadata: dict[str, Any] = field(default_factory=dict) @@ -252,6 +389,7 @@ class CompiledEvaluationPlan: dataset: EvalDataset eval_config: EvaluationConfig gate_policy: GatePolicyDef | None + harness: EvalHarnessDef | None = None @dataclass(frozen=True) @@ -462,6 +600,18 @@ def build_eval_dataset(cases: list[EvalCaseDef], target: dict[str, Any]) -> Eval return EvalDataset(eval_dataset_id=dataset_id, eval_dataset_name="suite_eval_dataset", eval_cases=eval_cases) +def resolve_eval_harness(suite: EvalSuiteDef) -> EvalHarnessDef: + if suite.harness is not None: + return suite.harness + if suite.execution is not None: + return EvalHarnessDef( + harness_id=f"{suite.suite_id}-execution", + execution=suite.execution, + metadata={"lowered_from": "suite.execution"}, + ) + return EvalHarnessDef(harness_id=f"{suite.suite_id}-static") + + class _ConfiguredTaskEvalTarget(AworldTaskEvalTarget): def __init__(self, *, target: dict[str, Any], execution: EvalExecutionSpec): super().__init__() @@ -474,27 +624,69 @@ async def build_task(self, index: int, input: EvalDataCase[dict]): return await _maybe_await(task) -def _build_eval_target(flow: EvaluationFlowDef): - execution = flow.suite.execution +class _AdapterExecutionEvalTarget(EvalTarget[dict]): + def __init__(self, *, target: dict[str, Any], harness: EvalHarnessDef): + super().__init__() + self._target = dict(target) + self._harness = harness + self._adapter = resolve_execution_adapter(harness.execution) + + async def predict(self, index: int, input: EvalDataCase[dict]) -> dict: + case = EvalCaseDef( + case_id=getattr(input, "eval_case_id", str(index)), + input=dict(input.case_data if isinstance(input, EvalDataCase) else input), + ) + state = await self._adapter.execute(case=case, target=self._target, spec=self._harness.execution) + return {"answer": state.answer, "state": state.to_dict()} + + +def _build_eval_target(flow: EvaluationFlowDef, 
target: dict[str, Any]): + harness = resolve_eval_harness(flow.suite) + execution = harness.execution if execution is None or execution.mode == EvalExecutionMode.STATIC: return NoActionEvalTarget() if execution.mode == EvalExecutionMode.AGENT: + if "agent" in execution.target_config: + return AworldAgentEvalTarget( + agent=execution.target_config["agent"], + query_column=execution.query_column or "query", + ) return AworldAgentEvalTarget( agent_config=execution.target_config, query_column=execution.query_column or "query", ) if execution.mode == EvalExecutionMode.TASK: - return _ConfiguredTaskEvalTarget(target=flow.target, execution=execution) + if "task" in execution.target_config: + return _AdapterExecutionEvalTarget(target=target, harness=harness) + return _ConfiguredTaskEvalTarget(target=target, execution=execution) + if execution.mode == EvalExecutionMode.PROGRAM: + return _AdapterExecutionEvalTarget(target=target, harness=harness) raise ValueError(f"unsupported execution mode: {execution.mode}") +def _trajectory_eval_criteria(suite: EvalSuiteDef) -> list[dict[str, Any]]: + criteria: list[dict[str, Any]] = [] + for scorer in suite.trajectory_scorers: + item: dict[str, Any] = { + "metric_name": scorer.metric_name, + "threshold": scorer.threshold, + "scorer_params": dict(scorer.scorer_params), + } + if scorer.scorer_class is not None: + item["scorer_class"] = scorer.scorer_class + criteria.append(item) + return criteria + + def compile_evaluation_flow(flow: EvaluationFlowDef) -> CompiledEvaluationPlan: normalized_target = _normalize_target(flow.target) dataset = build_eval_dataset(flow.suite.cases, normalized_target) + harness = resolve_eval_harness(flow.suite) gate_policy = flow.suite.gate_policy or GatePolicyDef(metric_name="score", pass_threshold=0.0) + score_bounds = _gate_metric_eval_bounds(gate_policy, "score") eval_criteria = { - "metric_name": gate_policy.metric_name, - "threshold": gate_policy.pass_threshold, + "metric_name": "score", + **score_bounds, 
"scorer_params": { "suite": flow.suite, "name": flow.suite.suite_id, @@ -502,8 +694,8 @@ def compile_evaluation_flow(flow: EvaluationFlowDef) -> CompiledEvaluationPlan: } eval_config = EvaluationConfig( eval_suite_id=flow.suite.suite_id, - eval_target=_build_eval_target(flow), - eval_criterias=[eval_criteria], + eval_target=_build_eval_target(flow, normalized_target), + eval_criterias=[eval_criteria, *_trajectory_eval_criteria(flow.suite)], eval_dataset=dataset, ) return CompiledEvaluationPlan( @@ -512,20 +704,102 @@ def compile_evaluation_flow(flow: EvaluationFlowDef) -> CompiledEvaluationPlan: dataset=dataset, eval_config=eval_config, gate_policy=flow.suite.gate_policy, + harness=harness, ) -def _extract_metric_value(summary: Mapping[str, Any], metric_name: str) -> float: +def _extract_metric_value(summary: Mapping[str, Any], metric_name: str) -> Any: metric_summary = summary.get(metric_name, {}) if "mean" in metric_summary: return float(metric_summary["mean"]) if "true_rate" in metric_summary: return float(metric_summary["true_rate"]) if "value" in metric_summary: - return float(metric_summary["value"]) + return metric_summary["value"] + raise KeyError(f"metric {metric_name} is missing aggregate summary") + + +def _extract_metric_value_from_result_summary(summary: Mapping[str, Any], metric_name: str) -> float: + try: + return _extract_metric_value(summary, metric_name) + except KeyError: + pass + for scorer_summary in summary.values(): + if not isinstance(scorer_summary, Mapping): + continue + try: + return _extract_metric_value(scorer_summary, metric_name) + except KeyError: + continue raise KeyError(f"metric {metric_name} is missing aggregate summary") +def _flatten_result_metrics(summary: Mapping[str, Any]) -> dict[str, Any]: + metrics: dict[str, Any] = {} + for scorer_summary in summary.values(): + if not isinstance(scorer_summary, Mapping): + continue + for metric_name, metric_summary in scorer_summary.items(): + if isinstance(metric_summary, Mapping): + 
metrics[metric_name] = dict(metric_summary) + return metrics + + +def _gate_pass_conditions_by_metric(policy: GatePolicyDef | None) -> dict[str, tuple[GateMetricCondition, ...]]: + if policy is None: + return {} + pass_all, _ = policy.normalized_conditions() + by_metric: dict[str, list[GateMetricCondition]] = {} + for condition in pass_all: + by_metric.setdefault(condition.metric_name, []).append(condition) + return {metric_name: tuple(conditions) for metric_name, conditions in by_metric.items()} + + +def _gate_metric_status(value: Any, conditions: tuple[GateMetricCondition, ...]) -> str: + for condition in conditions: + if not condition.matches({condition.metric_name: value}): + return "FAILED" + return "PASSED" + + +def _gate_policy_conditions(policy: GatePolicyDef) -> tuple[GateMetricCondition, ...]: + pass_all, approval_all = policy.normalized_conditions() + seen: set[str] = set() + conditions: list[GateMetricCondition] = [] + for condition in (*pass_all, *approval_all): + key = f"{condition.metric_name}:{condition.op}:{condition.threshold}" + if key in seen: + continue + seen.add(key) + conditions.append(condition) + return tuple(conditions) + + +def _gate_metric_eval_bounds(policy: GatePolicyDef, metric_name: str) -> dict[str, float]: + bounds: dict[str, float] = {} + pass_all, _ = policy.normalized_conditions() + for condition in pass_all: + if condition.metric_name != metric_name: + continue + if condition.op == ">=": + bounds["threshold"] = float(condition.threshold) + elif condition.op == ">": + bounds["threshold"] = math.nextafter(float(condition.threshold), math.inf) + elif condition.op == "<=": + bounds["threshold"] = float("-inf") + bounds["max_value"] = float(condition.threshold) + elif condition.op == "<": + bounds["threshold"] = float("-inf") + bounds["max_value"] = math.nextafter(float(condition.threshold), -math.inf) + break + if "threshold" not in bounds: + if policy.metric_name == metric_name and policy.pass_threshold is not None: + 
bounds["threshold"] = float(policy.pass_threshold) + else: + bounds["threshold"] = 0.0 + return bounds + + def _normalize_metric_status(status: Any) -> str | None: if status is None: return None @@ -561,29 +835,37 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport: gate_metrics = {} gate = None if compiled.gate_policy is not None: - gate_metrics[compiled.gate_policy.metric_name] = _extract_metric_value( - suite_summary, - compiled.gate_policy.metric_name, - ) + for condition in _gate_policy_conditions(compiled.gate_policy): + if condition.metric_name not in gate_metrics: + gate_metrics[condition.metric_name] = _extract_metric_value_from_result_summary( + eval_result.summary, + condition.metric_name, + ) gate = compiled.gate_policy.evaluate(gate_metrics) results: list[CaseEvaluationReport] = [] report_backend_id = None cases_with_metrics = 0 cases_with_judge = 0 + gate_conditions_by_metric = _gate_pass_conditions_by_metric(compiled.gate_policy) for case_result in eval_result.eval_case_results: - score_row = case_result.score_rows.get(compiled.suite.suite_id) judge_payload = {} case_metrics: dict[str, Any] = {} case_backend_id = None - if score_row is not None: + if case_result.score_rows: cases_with_metrics += 1 + for score_row in case_result.score_rows.values(): for metric_name, metric_result in score_row.metric_results.items(): if isinstance(metric_result, Mapping): case_metrics[metric_name] = {} if "value" in metric_result: case_metrics[metric_name]["value"] = metric_result["value"] status = _normalize_metric_status(metric_result.get("eval_status")) + if metric_name in gate_conditions_by_metric and "value" in case_metrics[metric_name]: + status = _gate_metric_status( + case_metrics[metric_name]["value"], + gate_conditions_by_metric[metric_name], + ) if status is not None: case_metrics[metric_name]["status"] = status metadata = metric_result.get("metadata") or {} @@ -591,7 +873,9 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) 
-> EvaluatorReport: case_backend_id = metadata.get("_judge_backend") else: case_metrics[metric_name] = {"value": metric_result} - metric_result = score_row.metric_results.get(compiled.gate_policy.metric_name if compiled.gate_policy else "score", {}) + score_row = case_result.score_rows.get(compiled.suite.suite_id) + if score_row is not None: + metric_result = score_row.metric_results.get("score", {}) judge_payload = dict(metric_result.get("metadata", {})) report_backend_id = report_backend_id or judge_payload.pop("_judge_backend", None) if judge_payload: @@ -607,7 +891,15 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport: ) ) - metrics = dict(suite_summary) + metrics = _flatten_result_metrics(eval_result.summary) + for metric_name, conditions in gate_conditions_by_metric.items(): + if metric_name not in metrics: + continue + try: + value = _extract_metric_value(metrics, metric_name) + except KeyError: + continue + metrics[metric_name]["eval_status"] = _gate_metric_status(value, conditions) report = EvaluatorReport({ "report_version": 1, "report_format": { @@ -631,6 +923,9 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport: "approved": None, }, }) + judge_schema = compiled.suite.judge_schema.json_schema() + if judge_schema: + report["judge_schema"] = judge_schema if report_backend_id is not None: report["judge_backend"] = {"backend_id": report_backend_id} if gate is not None: @@ -638,6 +933,8 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport: "status": gate.status, "metric_name": gate.metric_name, "value": gate.value, + "matched_conditions": gate.matched_conditions, + "failed_conditions": gate.failed_conditions, } return report diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index c265676ad..caea0b751 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -20,6 +20,7 @@ run_evaluator_cli, 
validate_evaluator_report, ) +from aworld_cli.evaluator_rendering import render_evaluator_summary @pytest.fixture(autouse=True) @@ -372,6 +373,44 @@ def test_validate_evaluator_report_accepts_valid_report() -> None: validate_evaluator_report(report) +def test_validate_and_render_categorical_gate_report() -> None: + report = { + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": "2026-06-02T04:00:00Z", + "suite_id": "categorical-suite", + "target": {"target_path": "/tmp/artifact.txt", "target_kind": "file"}, + "summary": {"categorical-suite": {"verdict": {"value": "approved"}}}, + "metrics": {"verdict": {"value": "approved", "eval_status": "PASSED"}}, + "results": [ + { + "case_id": "artifact.txt", + "input": {"target_path": "/tmp/artifact.txt"}, + "metrics": {"verdict": {"value": "approved", "status": "PASSED"}}, + "judge": {"score": 1.0, "verdict": "approved"}, + } + ], + "result_counts": {"cases_total": 1, "cases_with_metrics": 1, "cases_with_judge": 1}, + "gate": {"status": "pass", "metric_name": "verdict", "value": "approved"}, + "approval": {"required": False, "resolved": False, "approved": None}, + "automation": { + "gate_status": "pass", + "metric_name": "verdict", + "metric_value": "approved", + "approval_required": False, + "approval_resolved": False, + "approved": None, + "suggested_exit_code": 0, + "case_count": 1, + "judge_backend": None, + }, + } + + validate_evaluator_report(report) + + assert "approved" in render_evaluator_summary(report) + + def test_validate_evaluator_report_rejects_invalid_gate_status() -> None: report = { "report_version": 1, diff --git a/tests/evaluations/test_evaluation_substrate.py b/tests/evaluations/test_evaluation_substrate.py index 3400021ac..3d6721c19 100644 --- a/tests/evaluations/test_evaluation_substrate.py +++ b/tests/evaluations/test_evaluation_substrate.py @@ -2,6 +2,7 @@ import base64 import pytest +from pydantic import BaseModel, Field from 
aworld.evaluations.base import EvaluationConfig import aworld.evaluations.substrate as substrate_module @@ -9,22 +10,38 @@ AgentJudgeBackend, CallableJudgeBackend, EvalCaseDef, + EvalHarnessDef, EvalSuiteDef, EvaluationFlowDef, FallbackJudgeBackend, + GateMetricCondition, GatePolicyDef, JudgeSchemaDef, + TrajectoryScorerDef, compile_evaluation_flow, get_builtin_eval_suite, list_eval_suites, list_matching_eval_suites, load_declared_eval_suites, register_eval_suite, + resolve_eval_harness, resolve_eval_suite, resolve_eval_suite_selection, run_evaluation_flow, ) from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec +from aworld.evaluations.report import validate_evaluator_report +from aworld.evaluations.types import MetricNames + + +class DemoJudgeOutput(BaseModel): + score: float + verdict: str + + +class AliasJudgeOutput(BaseModel): + final_score: float = Field(alias="score") + verdict: str @pytest.fixture(autouse=True) @@ -67,6 +84,30 @@ def test_compile_evaluation_flow_builds_inline_dataset_and_gate_config() -> None assert compiled.gate_policy.metric_name == "score" +def test_compile_evaluation_flow_lowers_trajectory_scorers_to_eval_criteria() -> None: + suite = EvalSuiteDef( + suite_id="trajectory-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello world"})], + judge=lambda case_input, target: {"score": 1.0}, + trajectory_scorers=( + TrajectoryScorerDef( + metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, + threshold=1.0, + ), + ), + ) + + compiled = compile_evaluation_flow( + EvaluationFlowDef( + target={"kind": "inline", "value": {"target_path": "demo.txt"}}, + suite=suite, + ) + ) + + metric_names = [criteria["metric_name"] for criteria in compiled.eval_config.eval_criterias] + assert metric_names == ["score", MetricNames.TRAJECTORY_TOOL_CALLS] + + def test_eval_case_def_supports_expected_and_runtime_overrides() -> None: case = EvalCaseDef( case_id="case-1", @@ -96,6 +137,107 @@ def 
test_compile_evaluation_flow_uses_execution_backed_target_when_suite_declare assert compiled.eval_config.eval_target.__class__.__name__ == "_ConfiguredTaskEvalTarget" +def test_compile_evaluation_flow_preserves_live_agent_target_config() -> None: + live_agent = object() + suite = EvalSuiteDef( + suite_id="agent-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "demo"})], + execution=EvalExecutionSpec( + mode=EvalExecutionMode.AGENT, + target_config={"agent": live_agent}, + ), + ) + flow = EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo.txt"}}, suite=suite) + + compiled = compile_evaluation_flow(flow) + + assert compiled.eval_config.eval_target.agent is live_agent + + +@pytest.mark.asyncio +async def test_program_execution_receives_normalized_target( + monkeypatch: pytest.MonkeyPatch, +) -> None: + async def demo_program(case, spec, target): + return { + "status": "success", + "answer": target["target_path"], + "metadata": {"target_kind_seen": target["target_kind"]}, + } + + async def fake_judge(case_input, target): + return {"score": 1.0, "answer": target["answer"]} + + monkeypatch.setattr( + "aworld.evaluations.execution_adapters.load_program_callable", + lambda ref: demo_program, + ) + + suite = EvalSuiteDef( + suite_id="program-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "demo"})], + execution=EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case"), + judge=fake_judge, + ) + flow = EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo.txt"}}, suite=suite) + + report = await run_evaluation_flow(flow) + + assert report["results"][0]["judge"]["answer"] == "demo.txt" + assert report["results"][0]["state_summary"]["answer"] == "demo.txt" + + +@pytest.mark.asyncio +async def test_task_execution_uses_adapter_target_config_task( + monkeypatch: pytest.MonkeyPatch, +) -> None: + task = type("Task", (), {"id": "task-1"})() + + async def fake_run_task(*, task): + 
return {task.id: {"status": "success", "answer": "task-ok"}} + + async def fake_judge(case_input, target): + return {"score": 1.0, "answer": target["answer"]} + + monkeypatch.setattr("aworld.evaluations.execution_adapters.Runners.run_task", fake_run_task) + + suite = EvalSuiteDef( + suite_id="task-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "demo"})], + execution=EvalExecutionSpec(mode=EvalExecutionMode.TASK, target_config={"task": task}), + judge=fake_judge, + ) + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "file", "target_path": "artifact.txt"}, suite=suite) + ) + + assert report["results"][0]["judge"]["answer"] == "task-ok" + + +def test_resolve_eval_harness_lowers_direct_suite_execution() -> None: + suite = EvalSuiteDef( + suite_id="program-suite", + execution=EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case"), + ) + + harness = resolve_eval_harness(suite) + + assert harness.harness_id == "program-suite-execution" + assert harness.execution is suite.execution + assert harness.metadata["lowered_from"] == "suite.execution" + + +def test_resolve_eval_harness_prefers_explicit_harness() -> None: + harness = EvalHarnessDef( + harness_id="shared-program", + execution=EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case"), + ) + suite = EvalSuiteDef(suite_id="program-suite", harness=harness) + + assert resolve_eval_harness(suite) is harness + + def test_judge_schema_validation_rejects_missing_fields() -> None: schema = JudgeSchemaDef(required_fields=("score", "rank", "criticism")) @@ -103,6 +245,50 @@ def test_judge_schema_validation_rejects_missing_fields() -> None: schema.validate({"score": 0.8, "rank": "Good"}) +def test_typed_judge_model_accepts_valid_payload() -> None: + schema = JudgeSchemaDef(output_model=DemoJudgeOutput) + + payload = schema.validate_payload({"score": 0.8, "verdict": "ok"}) + + assert payload["score"] == 0.8 + assert payload["verdict"] 
== "ok" + + +def test_typed_judge_model_rejects_invalid_payload() -> None: + schema = JudgeSchemaDef(output_model=DemoJudgeOutput) + + with pytest.raises(ValueError, match="verdict"): + schema.validate_payload({"score": 0.8}) + + +def test_legacy_required_fields_schema_still_returns_payload() -> None: + schema = JudgeSchemaDef(required_fields=("score", "rank")) + + payload = schema.validate_payload({"score": 0.9, "rank": 1}) + + assert payload["rank"] == 1 + + +def test_judge_schema_exports_json_schema_for_typed_model() -> None: + schema = JudgeSchemaDef(output_model=DemoJudgeOutput) + + exported = schema.json_schema() + + assert exported["properties"]["score"]["type"] == "number" + assert "verdict" in exported["required"] + + +def test_typed_judge_model_returns_alias_keys_to_match_exported_schema() -> None: + schema = JudgeSchemaDef(output_model=AliasJudgeOutput) + + payload = schema.validate_payload({"score": 0.8, "verdict": "ok"}) + exported = schema.json_schema() + + assert payload["score"] == 0.8 + assert "final_score" not in payload + assert "score" in exported["properties"] + + def test_gate_policy_uses_pass_and_approval_thresholds() -> None: gate = GatePolicyDef( metric_name="score", @@ -115,6 +301,82 @@ def test_gate_policy_uses_pass_and_approval_thresholds() -> None: assert gate.evaluate({"score": 0.5}).status == "fail" +def test_composite_gate_returns_pass_when_all_conditions_hold() -> None: + policy = GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=0.9), + GateMetricCondition(metric_name="latency", op="<=", threshold=5.0), + ) + ) + + decision = policy.evaluate({"score": 0.95, "latency": 4.2}) + + assert decision.status == "pass" + assert decision.metric_name is None + assert decision.value is None + assert len(decision.matched_conditions) == 2 + + +def test_composite_gate_returns_needs_approval_when_approval_conditions_hold() -> None: + policy = GatePolicyDef( + 
pass_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.9),), + approval_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.75),), + ) + + decision = policy.evaluate({"score": 0.8}) + + assert decision.status == "needs_approval" + assert len(decision.failed_conditions) == 1 + assert len(decision.matched_conditions) == 1 + + +def test_legacy_threshold_gate_lowers_to_structured_policy() -> None: + policy = GatePolicyDef(metric_name="score", pass_threshold=0.9, approval_threshold=0.8) + + decision = policy.evaluate({"score": 0.85}) + + assert decision.status == "needs_approval" + assert decision.metric_name == "score" + assert decision.value == pytest.approx(0.85) + + +@pytest.mark.parametrize( + ("op", "threshold", "value"), + [ + (">", 0.9, 0.91), + ("<", 0.9, 0.89), + (">=", 0.9, 0.9), + ("<=", 0.9, 0.9), + ("==", "approved", "approved"), + ("!=", "blocked", "approved"), + ], +) +def test_gate_metric_condition_supports_all_declared_operators(op, threshold, value) -> None: + policy = GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="metric", op=op, threshold=threshold),) + ) + + assert policy.evaluate({"metric": value}).status == "pass" + + +def test_gate_policy_reports_missing_metric() -> None: + policy = GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.9),) + ) + + with pytest.raises(KeyError, match="score"): + policy.evaluate({}) + + +def test_gate_policy_rejects_unsupported_operator() -> None: + policy = GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="score", op="contains", threshold=0.9),) + ) + + with pytest.raises(ValueError, match="unsupported gate operator"): + policy.evaluate({"score": 0.95}) + + @pytest.mark.asyncio async def test_run_evaluation_flow_executes_suite_judge_and_returns_gate() -> None: async def fake_judge(case_input, target): @@ -168,6 +430,347 @@ async def fake_judge(case_input, target): assert report["summary"]["demo-suite"]["score"]["mean"] == 
pytest.approx(0.7) +@pytest.mark.asyncio +async def test_run_evaluation_flow_exposes_judge_schema_once() -> None: + async def fake_judge(case_input, target): + return {"score": 0.95, "verdict": "ok"} + + suite = EvalSuiteDef( + suite_id="typed-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge_schema=JudgeSchemaDef(output_model=DemoJudgeOutput), + gate_policy=GatePolicyDef(metric_name="score", pass_threshold=0.8), + judge=fake_judge, + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["judge_schema"]["properties"]["score"]["type"] == "number" + assert "_judge_schema" not in report["results"][0]["judge"] + validate_evaluator_report(report.to_dict()) + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_evaluates_composite_gate_metrics() -> None: + async def fake_judge(case_input, target): + return {"score": 0.95, "latency": 4.2} + + suite = EvalSuiteDef( + suite_id="composite-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=0.9), + GateMetricCondition(metric_name="latency", op="<=", threshold=5.0), + ) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "pass" + assert len(report["gate"]["matched_conditions"]) == 2 + assert report["metrics"]["latency"]["mean"] == pytest.approx(4.2) + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_failed_composite_gate_keeps_metric_status_and_matches() -> None: + async def fake_judge(case_input, target): + return {"score": 0.7, "latency": 4.2} + + suite = EvalSuiteDef( + suite_id="composite-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + 
gate_policy=GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=0.9), + GateMetricCondition(metric_name="latency", op="<=", threshold=5.0), + ) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "fail" + assert report["results"][0]["metrics"]["score"]["status"] == "FAILED" + assert report["metrics"]["score"]["eval_status"] == "FAILED" + assert report["gate"]["matched_conditions"] == [ + {"metric_name": "latency", "op": "<=", "threshold": 5.0} + ] + assert report["gate"]["failed_conditions"] == [ + {"metric_name": "score", "op": ">=", "threshold": 0.9} + ] + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_composite_gate_is_not_condition_order_sensitive() -> None: + async def fake_judge(case_input, target): + return {"score": 0.95, "latency": 4.2} + + suite = EvalSuiteDef( + suite_id="composite-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="latency", op="<=", threshold=5.0), + GateMetricCondition(metric_name="score", op=">=", threshold=0.9), + ) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "pass" + assert report["metrics"]["score"]["mean"] == pytest.approx(0.95) + assert report["metrics"]["latency"]["mean"] == pytest.approx(4.2) + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_composite_gate_without_score_condition_still_runs_suite_judge() -> None: + async def fake_judge(case_input, target): + return {"score": 1.0, "latency": 4.2} + + suite = EvalSuiteDef( + suite_id="latency-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef( + 
pass_all=(GateMetricCondition(metric_name="latency", op="<=", threshold=5.0),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "pass" + assert report["metrics"]["latency"]["mean"] == pytest.approx(4.2) + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_legacy_non_score_gate_does_not_set_score_threshold() -> None: + async def fake_judge(case_input, target): + return {"score": 1.0, "latency": 6.0} + + suite = EvalSuiteDef( + suite_id="latency-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef(metric_name="latency", pass_threshold=5.0), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "pass" + assert report["results"][0]["metrics"]["score"]["status"] == "PASSED" + assert report["metrics"]["score"]["eval_status"] == "PASSED" + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_strict_gate_operator_keeps_metric_status_consistent() -> None: + async def fake_judge(case_input, target): + return {"score": 0.9} + + suite = EvalSuiteDef( + suite_id="strict-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="score", op=">", threshold=0.9),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "fail" + assert report["results"][0]["metrics"]["score"]["status"] == "FAILED" + assert report["metrics"]["score"]["eval_status"] == "FAILED" + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_equality_gate_keeps_metric_status_consistent() -> None: + async def 
fake_judge(case_input, target): + return {"score": 0.7} + + suite = EvalSuiteDef( + suite_id="equality-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="score", op="==", threshold=0.9),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "fail" + assert report["results"][0]["metrics"]["score"]["status"] == "FAILED" + assert report["metrics"]["score"]["eval_status"] == "FAILED" + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_categorical_gate_metric() -> None: + async def fake_judge(case_input, target): + return {"score": 1.0, "verdict": "approved"} + + suite = EvalSuiteDef( + suite_id="categorical-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="verdict", op="==", threshold="approved"),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "pass" + assert report["gate"]["value"] is None + assert report["results"][0]["metrics"]["verdict"]["value"] == "approved" + assert report["metrics"]["verdict"]["value"] == "approved" + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_reports_trajectory_scorer_metrics( + monkeypatch: pytest.MonkeyPatch, +) -> None: + async def demo_program(case, spec, target): + return { + "status": "success", + "answer": "ok", + "trajectory": [ + {"action": {"tool_calls": [{"id": "call-1", "function": {"name": "search", "arguments": "{}"}}]}} + ], + } + + async def fake_judge(case_input, target): + return {"score": 1.0} + + monkeypatch.setattr( + "aworld.evaluations.execution_adapters.load_program_callable", + lambda 
ref: demo_program, + ) + + suite = EvalSuiteDef( + suite_id="trajectory-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + execution=EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case"), + judge=fake_judge, + trajectory_scorers=( + TrajectoryScorerDef(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, threshold=1.0), + ), + gate_policy=GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=0.9), + GateMetricCondition(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, op="==", threshold=1.0), + ) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "pass" + assert report["results"][0]["metrics"][MetricNames.TRAJECTORY_TOOL_CALLS]["value"] == pytest.approx(1.0) + assert report["metrics"][MetricNames.TRAJECTORY_TOOL_CALLS]["mean"] == pytest.approx(1.0) + + +@pytest.mark.asyncio +async def test_declared_trajectory_metric_takes_precedence_over_judge_payload_collision( + monkeypatch: pytest.MonkeyPatch, +) -> None: + async def demo_program(case, spec, target): + return { + "status": "success", + "answer": "ok", + "trajectory": [ + {"action": {"tool_calls": [{"id": "call-1", "function": {"name": "search", "arguments": "{}"}}]}} + ], + } + + async def fake_judge(case_input, target): + return {"score": 1.0, MetricNames.TRAJECTORY_TOOL_CALLS: 0.0} + + monkeypatch.setattr( + "aworld.evaluations.execution_adapters.load_program_callable", + lambda ref: demo_program, + ) + + suite = EvalSuiteDef( + suite_id="trajectory-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + execution=EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case"), + judge=fake_judge, + trajectory_scorers=( + TrajectoryScorerDef(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, threshold=1.0), + ), + gate_policy=GatePolicyDef( + 
pass_all=(GateMetricCondition(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, op="==", threshold=1.0),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "pass" + assert report["results"][0]["metrics"][MetricNames.TRAJECTORY_TOOL_CALLS]["value"] == pytest.approx(1.0) + assert report["metrics"][MetricNames.TRAJECTORY_TOOL_CALLS]["mean"] == pytest.approx(1.0) + + @pytest.mark.asyncio async def test_suite_judge_prefers_state_payload_over_static_case_target() -> None: async def fake_judge(case_input, target): diff --git a/tests/evaluations/test_execution_adapters.py b/tests/evaluations/test_execution_adapters.py new file mode 100644 index 000000000..4dc188bba --- /dev/null +++ b/tests/evaluations/test_execution_adapters.py @@ -0,0 +1,93 @@ +from __future__ import annotations + +import pytest + +from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec +from aworld.evaluations.execution_adapters import resolve_execution_adapter +from aworld.evaluations.substrate import EvalCaseDef + + +async def _demo_program(case, spec, target): + return { + "status": "success", + "answer": f"ran:{case.input['query']}", + "completion": [{"role": "assistant", "content": "final"}], + "trajectory": [{"role": "assistant", "content": "step"}], + "usage": {"total_tokens": 7}, + } + + +@pytest.mark.asyncio +async def test_program_execution_adapter_normalizes_result(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + "aworld.evaluations.execution_adapters.load_program_callable", + lambda ref: _demo_program, + ) + spec = EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case") + adapter = resolve_execution_adapter(spec) + + state = await adapter.execute( + case=EvalCaseDef(case_id="case-1", input={"query": "demo"}), + target={"target_kind": "directory"}, + spec=spec, + ) + + assert state.case_id 
== "case-1" + assert state.answer == "ran:demo" + assert state.completion[0]["content"] == "final" + assert state.trajectory[0]["content"] == "step" + assert state.usage["total_tokens"] == 7 + + +def test_resolve_execution_adapter_rejects_missing_program_ref() -> None: + with pytest.raises(ValueError, match="target_ref"): + resolve_execution_adapter(EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM)) + + +def test_resolve_execution_adapter_rejects_command_style_program_ref() -> None: + with pytest.raises(ValueError, match="importable callable"): + resolve_execution_adapter( + EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="python script.py") + ) + + +@pytest.mark.parametrize( + "spec", + [ + EvalExecutionSpec( + mode=EvalExecutionMode.PROGRAM, + target_ref="pkg.module:run_case", + runner_method="shell", + ), + EvalExecutionSpec( + mode=EvalExecutionMode.PROGRAM, + target_ref="pkg.module:run_case", + target_config={"command": "python script.py"}, + ), + EvalExecutionSpec( + mode=EvalExecutionMode.PROGRAM, + target_ref="pkg.module:run_case", + target_config={"workflow": "external"}, + ), + ], +) +def test_resolve_execution_adapter_rejects_unsupported_program_runtime_config(spec: EvalExecutionSpec) -> None: + with pytest.raises(ValueError, match="unsupported program execution configuration"): + resolve_execution_adapter(spec) + + +@pytest.mark.parametrize( + "target_ref", + [ + "script.py", + "./script.py", + "scripts/run.py", + "script.py:main", + "scripts.run.py:main", + ], +) +def test_resolve_execution_adapter_rejects_path_style_program_ref(target_ref: str) -> None: + with pytest.raises(ValueError, match="importable callable"): + resolve_execution_adapter( + EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref=target_ref) + ) diff --git a/tests/evaluations/test_execution_state.py b/tests/evaluations/test_execution_state.py index 30b0ae7ad..0cf5a826d 100644 --- a/tests/evaluations/test_execution_state.py +++ 
b/tests/evaluations/test_execution_state.py @@ -1,7 +1,9 @@ from __future__ import annotations +import pytest + from aworld.core.task import TaskResponse -from aworld.evaluations.execution import normalize_task_response_to_eval_state +from aworld.evaluations.execution import EvalState, normalize_task_response_to_eval_state from aworld.evaluations.scorers.state_extractors import ( get_assistant_messages, get_completion, @@ -41,3 +43,86 @@ def test_state_extractors_support_completion_and_tool_queries() -> None: assert get_completion(state)[0]["content"] == "final" assert get_assistant_messages(state)[0]["content"] == "final" assert get_tool_calls(state)[0]["name"] == "search" + + +def test_normalize_mapping_response_preserves_completion_and_tool_calls() -> None: + state = normalize_task_response_to_eval_state( + case_id="case-2", + response={ + "status": "success", + "answer": "ok", + "completion": [{"role": "assistant", "content": "ok"}], + "trajectory": [{"tool_calls": [{"name": "search"}]}], + }, + ) + + assert state.completion[0]["content"] == "ok" + assert state.tool_calls[0]["name"] == "search" + + +def test_normalize_eval_state_response_preserves_state_fields() -> None: + state = normalize_task_response_to_eval_state( + case_id="case-3", + response=EvalState( + case_id="source-case", + status="success", + answer="done", + trajectory=[{"role": "assistant", "content": "step"}], + ), + target={"target_kind": "program"}, + ) + + assert state.case_id == "case-3" + assert state.answer == "done" + assert state.trajectory[0]["content"] == "step" + assert state.metadata["_target"]["target_kind"] == "program" + + +def test_normalize_eval_state_shaped_mapping_preserves_response_metadata() -> None: + state = normalize_task_response_to_eval_state( + case_id="case-4", + response=EvalState( + case_id="source-case", + status="success", + answer="done", + metadata={"program": "demo"}, + ).to_dict(), + target={"target_kind": "program"}, + metadata={"suite": "demo-suite"}, + ) + 
+ assert state.metadata["program"] == "demo" + assert state.metadata["suite"] == "demo-suite" + assert state.metadata["_target"]["target_kind"] == "program" + + +def test_normalize_mapping_rejects_malformed_list_fields() -> None: + with pytest.raises(ValueError, match="trajectory"): + normalize_task_response_to_eval_state( + case_id="case-5", + response={ + "status": "success", + "answer": "ok", + "trajectory": "bad", + }, + ) + + with pytest.raises(ValueError, match="completion"): + normalize_task_response_to_eval_state( + case_id="case-6", + response={ + "status": "success", + "answer": "ok", + "completion": {"role": "assistant"}, + }, + ) + + with pytest.raises(ValueError, match="tool_calls"): + normalize_task_response_to_eval_state( + case_id="case-7", + response={ + "status": "success", + "answer": "ok", + "tool_calls": "bad", + }, + ) From 99c92018dfbf3403dd8ae90dca2d13d3e93d125f Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Tue, 9 Jun 2026 21:52:13 +0800 Subject: [PATCH 18/41] fix: harden evaluator v2 contracts --- aworld/evaluations/README.md | 28 ++++-- aworld/evaluations/manifests.py | 3 +- aworld/evaluations/substrate.py | 50 ++++++---- .../evaluations/test_evaluation_substrate.py | 99 ++++++++++++++++++- 4 files changed, 150 insertions(+), 30 deletions(-) diff --git a/aworld/evaluations/README.md b/aworld/evaluations/README.md index 60226726c..7419026bb 100644 --- a/aworld/evaluations/README.md +++ b/aworld/evaluations/README.md @@ -1,8 +1,9 @@ # AWorld Evaluations Module The `aworld.evaluations` module is the framework-owned evaluation substrate for AWorld. It supports both legacy -`EvaluationConfig`-driven flows and newer suite-backed evaluator flows that can execute an agent or task first, then -score both final outcomes and trajectory/process quality from a normalized execution state. 
+`EvaluationConfig`-driven flows and newer suite-backed evaluator flows that can execute an agent, task, or trusted +program callable first, then score final outcomes and normalized trajectory/process quality from a single execution +state. ## Table of Contents @@ -99,11 +100,16 @@ Ownership is explicit: - harnesses and execution specs own runtime behavior: whether execution is static, agent-backed, task-backed, or program-backed, plus task/runner configuration - `aworld-cli` only assembles workspace inputs into these framework objects; it does not redefine evaluator semantics +Declarative JSON manifests are intentionally narrower than in-memory framework APIs. They do not accept `execution`, +`target_ref`, `task_builder_ref`, live agent/task objects, or program callables. In-memory callers may still pass live +AWorld agent/task instances through `EvalExecutionSpec.target_config` for compatibility, but that is not a persisted +suite contract. + `EvalState` intentionally separates: - `answer`: the final deliverable or normalized terminal answer - `completion`: completion-oriented view used by outcome scorers that only care about the final assistant output -- `trajectory`: full execution history used by process, tool-use, and efficiency scorers +- `trajectory`: captured execution history used by process, tool-use, and efficiency scorers ## Scorers @@ -194,10 +200,15 @@ The current execution modes are: - `static`: judge-only evaluation with no runtime execution - `agent`: execute through `AworldAgentEvalTarget` - `task`: execute through `AworldTaskEvalTarget` -- `program`: execute an importable callable through the evaluator adapter layer and normalize the result into `EvalState` +- `program`: execute a trusted importable callable through the evaluator adapter layer and normalize the result into `EvalState` + +This gives AWorld a framework-native evaluator path that can assess final artifacts, structured outputs, and captured +trajectory quality through one substrate. 
It is a single-shot evaluator flow: rollout-owning harnesses, user simulators, +lifecycle hooks, child-state composition, and step-level training rewards are separate runtime-composition work. -This gives AWorld a framework-native evaluator path that can assess final artifacts, structured outputs, and trajectory -quality through one substrate. +`program` and TASK builder references are trusted in-process extension points. Importing a module can execute top-level +code, so these references should only point at evaluator code controlled by the runner or workspace owner. They are not +sandboxed and are not exposed through declared JSON manifests. Suite-backed evaluation also supports: @@ -211,13 +222,14 @@ The evaluator v2 path is intentionally close to AWorld's existing runner model: - suite -> describes the evaluation contract and default gate/judge behavior - case -> provides per-row input, optional references, and case-local execution hints -- execution spec -> describes how a case becomes a runnable AWorld execution +- lightweight harness / execution spec -> describes how a case becomes a runnable AWorld execution - eval target -> adapts the execution spec into an existing target implementation - evaluator / runner -> executes cases and produces normalized outputs - scorers -> read final answer, completion, and trajectory from `EvalState` In practice this means outcome evaluation and trajectory evaluation share one execution pipeline. A suite can score only -the final artifact, only the trajectory, or both. +the final artifact, only the captured trajectory, or both. The lightweight harness boundary selects execution defaults +and adapters; it does not own multi-turn rollout lifecycle in this version. 
## Recorder diff --git a/aworld/evaluations/manifests.py b/aworld/evaluations/manifests.py index 119f639a8..2dd8f8d45 100644 --- a/aworld/evaluations/manifests.py +++ b/aworld/evaluations/manifests.py @@ -40,7 +40,7 @@ def get_declared_eval_suite_schema() -> dict[str, object]: "approval_threshold": {"type": ["number", "null"]}, }, "additionalProperties": False, - "description": "Optional gate override layered on top of the base suite defaults.", + "description": "Optional simple gate override layered on top of the base suite defaults.", }, "metadata": { "type": "object", @@ -52,6 +52,7 @@ def get_declared_eval_suite_schema() -> dict[str, object]: }, }, "additionalProperties": False, + "description": "Declared evaluator suites are metadata-only overlays; executable refs and runtime handles are not accepted.", } diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py index 9767550a9..dbd8b9cac 100644 --- a/aworld/evaluations/substrate.py +++ b/aworld/evaluations/substrate.py @@ -3,7 +3,6 @@ import asyncio import base64 -import importlib import json import math import inspect @@ -21,7 +20,8 @@ from aworld.evaluations.base import EvalDataCase, EvalDataset, EvalTarget from aworld.evaluations.base import NoActionEvalTarget from aworld.evaluations.eval_targets.agent_eval import AworldAgentEvalTarget, AworldTaskEvalTarget -from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec +from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec, load_program_callable +from aworld.evaluations.manifests import validate_declared_eval_suite_manifest from aworld.evaluations.execution_adapters import resolve_execution_adapter from aworld.evaluations.report import ( CaseEvaluationReport, @@ -181,7 +181,12 @@ def evaluate(self, metrics: Mapping[str, Any]) -> GateDecision: matched_pass: list[dict[str, Any]] = [] failed_pass: list[dict[str, Any]] = [] for condition in pass_all: - if condition.matches(metrics): + try: + matched 
= condition.matches(metrics) + except KeyError: + failed_pass.append({**condition.to_dict(), "reason": "missing_metric"}) + continue + if matched: matched_pass.append(condition.to_dict()) else: failed_pass.append(condition.to_dict()) @@ -196,11 +201,24 @@ def evaluate(self, metrics: Mapping[str, Any]) -> GateDecision: matched_conditions=matched_pass, failed_conditions=[], ) + if any(condition.get("reason") == "missing_metric" for condition in failed_pass): + return GateDecision( + status="fail", + metric_name=metric_name, + value=value, + matched_conditions=matched_pass, + failed_conditions=failed_pass, + ) matched_approval: list[dict[str, Any]] = [] failed_approval: list[dict[str, Any]] = [] for condition in approval_all: - if condition.matches(metrics): + try: + matched = condition.matches(metrics) + except KeyError: + failed_approval.append({**condition.to_dict(), "reason": "missing_metric"}) + continue + if matched: matched_approval.append(condition.to_dict()) else: failed_approval.append(condition.to_dict()) @@ -517,6 +535,7 @@ def load_declared_eval_suites(workspace: str | Path | None = None) -> list[str]: for manifest_path in sorted(manifest_dir.glob("*.json")): manifest_key = str(manifest_path.resolve()) manifest = json.loads(manifest_path.read_text(encoding="utf-8")) + validate_declared_eval_suite_manifest(manifest) suite = _build_declared_eval_suite(manifest) if suite.suite_id in seen_suite_ids: raise ValueError(f"duplicate suite_id in workspace manifests: {suite.suite_id}") @@ -837,10 +856,13 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport: if compiled.gate_policy is not None: for condition in _gate_policy_conditions(compiled.gate_policy): if condition.metric_name not in gate_metrics: - gate_metrics[condition.metric_name] = _extract_metric_value_from_result_summary( - eval_result.summary, - condition.metric_name, - ) + try: + gate_metrics[condition.metric_name] = _extract_metric_value_from_result_summary( + 
eval_result.summary, + condition.metric_name, + ) + except KeyError: + continue gate = compiled.gate_policy.evaluate(gate_metrics) results: list[CaseEvaluationReport] = [] @@ -1055,17 +1077,7 @@ async def _maybe_await(value: Any) -> Any: def _load_callable(ref: str | None) -> Callable[..., Any]: if not ref: raise ValueError("task execution mode requires task_builder_ref") - if ":" in ref: - module_path, attr_name = ref.split(":", 1) - elif "." in ref: - module_path, attr_name = ref.rsplit(".", 1) - else: - raise ValueError(f"invalid callable reference: {ref}") - module = importlib.import_module(module_path) - candidate = getattr(module, attr_name) - if not callable(candidate): - raise ValueError(f"callable reference is not callable: {ref}") - return candidate + return load_program_callable(ref) def _load_app_evaluator_skill_prompt() -> str: diff --git a/tests/evaluations/test_evaluation_substrate.py b/tests/evaluations/test_evaluation_substrate.py index 3d6721c19..fe86972a7 100644 --- a/tests/evaluations/test_evaluation_substrate.py +++ b/tests/evaluations/test_evaluation_substrate.py @@ -30,6 +30,7 @@ run_evaluation_flow, ) from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec +from aworld.evaluations.manifests import validate_declared_eval_suite_manifest from aworld.evaluations.report import validate_evaluator_report from aworld.evaluations.types import MetricNames @@ -137,6 +138,21 @@ def test_compile_evaluation_flow_uses_execution_backed_target_when_suite_declare assert compiled.eval_config.eval_target.__class__.__name__ == "_ConfiguredTaskEvalTarget" +@pytest.mark.asyncio +async def test_task_execution_rejects_path_style_task_builder_ref() -> None: + suite = EvalSuiteDef( + suite_id="task-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "demo"})], + execution=EvalExecutionSpec(mode=EvalExecutionMode.TASK, task_builder_ref="scripts/run.py:build_task"), + ) + compiled = compile_evaluation_flow( + 
EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo.txt"}}, suite=suite) + ) + + with pytest.raises(ValueError, match="importable callable"): + await compiled.eval_config.eval_target.build_task(0, compiled.dataset.eval_cases[0]) + + def test_compile_evaluation_flow_preserves_live_agent_target_config() -> None: live_agent = object() suite = EvalSuiteDef( @@ -364,8 +380,26 @@ def test_gate_policy_reports_missing_metric() -> None: pass_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.9),) ) - with pytest.raises(KeyError, match="score"): - policy.evaluate({}) + decision = policy.evaluate({}) + + assert decision.status == "fail" + assert decision.failed_conditions == [ + {"metric_name": "score", "op": ">=", "threshold": 0.9, "reason": "missing_metric"} + ] + + +def test_gate_policy_missing_pass_metric_fails_even_when_approval_matches() -> None: + policy = GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="trajectory_tool_calls", op=">=", threshold=1.0),), + approval_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.7),), + ) + + decision = policy.evaluate({"score": 0.8}) + + assert decision.status == "fail" + assert decision.failed_conditions == [ + {"metric_name": "trajectory_tool_calls", "op": ">=", "threshold": 1.0, "reason": "missing_metric"} + ] def test_gate_policy_rejects_unsupported_operator() -> None: @@ -519,6 +553,35 @@ async def fake_judge(case_input, target): ] +@pytest.mark.asyncio +async def test_run_evaluation_flow_missing_gate_metric_fails_closed_and_keeps_results() -> None: + async def fake_judge(case_input, target): + return {"score": 0.95} + + suite = EvalSuiteDef( + suite_id="composite-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + gate_policy=GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="trajectory_tool_calls", op=">=", threshold=1.0),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + 
target={"kind": "file", "target_path": "artifact.txt"}, + suite=suite, + ) + ) + + assert report["gate"]["status"] == "fail" + assert report["gate"]["failed_conditions"] == [ + {"metric_name": "trajectory_tool_calls", "op": ">=", "threshold": 1.0, "reason": "missing_metric"} + ] + assert report["results"][0]["case_id"] == "case-1" + assert report["results"][0]["metrics"]["score"]["value"] == pytest.approx(0.95) + + @pytest.mark.asyncio async def test_run_evaluation_flow_composite_gate_is_not_condition_order_sensitive() -> None: async def fake_judge(case_input, target): @@ -915,6 +978,38 @@ def test_load_declared_eval_suites_registers_manifest_backed_suite( assert "strict-ui" in listed +def test_load_declared_eval_suites_rejects_execution_in_manifest(tmp_path) -> None: + manifest_dir = tmp_path / ".aworld" / "evaluators" + manifest_dir.mkdir(parents=True) + (manifest_dir / "program-suite.json").write_text( + """ +{ + "suite_id": "program-suite", + "base_suite": "app-evaluator", + "execution": { + "mode": "program", + "target_ref": "pkg.module:run_case" + } +} +""".strip(), + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="Additional properties are not allowed"): + load_declared_eval_suites(tmp_path) + + +def test_declared_eval_suite_manifest_schema_rejects_execution_contract() -> None: + with pytest.raises(ValueError, match="Additional properties are not allowed"): + validate_declared_eval_suite_manifest( + { + "suite_id": "program-suite", + "base_suite": "app-evaluator", + "execution": {"mode": "program", "target_ref": "pkg.module:run_case"}, + } + ) + + def test_declared_eval_suite_can_be_selected_for_matching_target( monkeypatch: pytest.MonkeyPatch, tmp_path, From fb7a2c42939e8673472aa1640ee63e20c49ac74f Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Tue, 9 Jun 2026 21:54:58 +0800 Subject: [PATCH 19/41] docs: add evaluator v2 openspec change --- .../design.md | 183 +++++ .../implementation-plan.md | 681 ++++++++++++++++++ .../proposal.md | 34 + 
.../specs/evaluation-substrate/spec.md | 101 +++ .../tasks.md | 37 + 5 files changed, 1036 insertions(+) create mode 100644 openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/design.md create mode 100644 openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/implementation-plan.md create mode 100644 openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/proposal.md create mode 100644 openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/specs/evaluation-substrate/spec.md create mode 100644 openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/tasks.md diff --git a/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/design.md b/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/design.md new file mode 100644 index 000000000..9daed18b4 --- /dev/null +++ b/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/design.md @@ -0,0 +1,183 @@ +## Context + +Evaluator v1 deliberately optimized for shipping a working framework substrate and official CLI flow first. The result is structurally sound, but its abstraction ceiling is still low in four places: + +- execution can only target AWorld-native `agent` and `task` paths plus judge-only `static` +- eval targets still know too much about runner entrypoints +- judge contracts are validated structurally but not typed as first-class models +- gate policies are inspectable but too narrow for multi-metric release decisions + +The goal of this follow-up is to raise that ceiling without discarding the v1 substrate. The evaluator remains an AWorld framework capability under `aworld/evaluations/`, and `aworld-cli` remains only an official consumer and assembly layer. This is a v1 extensibility increment, not a verifiers-parity runtime: rollout-owning harnesses, user simulators, lifecycle hooks, child-state composition, and training reward semantics are deferred. 
+ +## Goals / Non-Goals + +**Goals:** + +- Add a program-backed execution path that fits the existing suite/case/execution/state model. +- Add a lightweight harness definition so reusable execution behavior is explicit without adopting verifiers' broader object model. +- Isolate execution mechanics behind a framework-owned adapter boundary so runtime-specific invocation does not leak across evaluator code. +- Make typed judge-output models the primary contract for suite-backed evaluation. +- Support structured composite gate policies for multi-metric pass, fail, and approval decisions. +- Support suite-declared trajectory scoring alongside final-result scoring. +- Preserve compatibility for existing v1 suite-backed flows, reports, and CLI evaluator behavior. + +**Non-Goals:** + +- Replacing the existing `EvalTarget -> Evaluator -> EvaluateRunner` orchestration skeleton. +- Creating a public external evaluator API v2 in this change. +- Reworking the `aworld-cli evaluator` command shape beyond compatibility adjustments required by framework changes. +- Shipping baseline history, trend analysis, or evaluator comparison workflows. +- Converging AWorld onto verifiers' public API terminology or object model. +- Adding external harness package registries, lifecycle decorators, training reward semantics, sandbox command execution, or child-state composition. 
+ +## Ownership Model + +| Concept | Owns | Must not own | +| --- | --- | --- | +| `EvalSuiteDef` / `EvalCaseDef` | Domain inputs, case metadata, judge contract, declared scorers, gates | Runtime handles in declarative or persisted suite definitions | +| `EvalHarnessDef` | Reusable execution selection and execution defaults for a suite flow | Scoring, judge validation, report assembly | +| `EvalExecutionSpec` | Typed execution configuration for one harness or suite execution path | Arbitrary workflow engines or command execution | +| `ExecutionAdapter` | Invocation and normalization into `EvalState` | Orchestration, score calculation, gate decisions | +| `Evaluator` / `EvaluateRunner` | Existing dataset, target, scorer orchestration | Suite-specific execution semantics | + +Cases remain serializable input data. `EvalState` remains serializable rollout output containing answer, completion, artifacts, trajectory, usage, timing, errors, raw response, and metadata. Runtime clients, runners, sandboxes, program objects, and other live handles may be used transiently by adapters but must not be stored in `EvalState`. + +In-memory framework callers may still pass live AWorld agent/task objects through `EvalExecutionSpec.target_config` for compatibility with existing agent/task evaluation APIs. That path is not a declarative or JSON-serializable suite contract. Declared JSON manifests intentionally do not accept `execution`, `target_ref`, `task_builder_ref`, or live runtime handles; they only layer safe suite metadata and simple gate overrides on supported builtin suites. + +## Decisions + +### 1. Add `PROGRAM` execution as an extension of the current execution model + +`EvalExecutionMode` should gain a `PROGRAM` mode that lets a suite execute an importable callable without pretending every evaluation target is an AWorld agent or task. 
+ +`PROGRAM` is for evaluation targets that do not use AWorld's agent or task runtime, such as a third-party API client, local library evaluator, or custom callable harness. It is not for customizing AWorld agent behavior, preprocessing case inputs, replacing judge/scorer logic, command execution, sandbox placement, or general workflow engines. + +The callable reference must be an import string (`module:attribute` or `module.attribute`) that resolves to a callable. `EvalExecutionSpec` validation should reject `PROGRAM` specs without `target_ref` and reject unsupported command or workflow forms in this change. TASK builder references use the same importable-callable validation. + +Program callables receive `(case, spec, target)` and may be sync or async. They must return one of: + +1. an `EvalState` +2. a mapping matching `EvalState` fields, including optional `status`, `answer`, `completion`, `trajectory`, `tool_calls`, `usage`, `timing`, `error`, and `metadata` +3. a `TaskResponse` +4. a bare value, treated as the final answer with success status + +If custom normalization is needed, the program should return a mapping with all relevant `EvalState` fields set explicitly and document the mapping in suite metadata. Exceptions from the program should propagate as execution failures rather than being silently converted into judge payloads. + +The program-backed path should still compile into the same evaluator substrate: + +- case definitions still provide the task-level inputs +- execution specs still describe runtime wiring +- execution output must still normalize into `EvalState` +- scorers and gate policies remain agnostic to how execution happened + +`PROGRAM` is a framework extensibility mechanism, not a new CLI product mode. + +Importable callable execution is a trusted in-process extension point. 
Importing a module can execute module top-level code, so `PROGRAM` and TASK builder refs must only be used for evaluator code controlled by the runner or workspace owner. This change does not sandbox imported code, provide an allowlist, sanitize third-party program payloads, or make untrusted suite manifests executable. + +### 2. Add a lightweight harness boundary over execution specs + +AWorld should not adopt verifiers' `Taskset` / `Harness` / `Env` object model, but it should make the missing execution reuse boundary explicit. + +`EvalHarnessDef` should be a small framework-owned dataclass that can be attached to a suite or flow: + +- `harness_id`: stable reusable identifier +- `execution`: `EvalExecutionSpec` +- `metadata`: optional serializable harness metadata + +Suites may continue to set `execution` directly for v1 compatibility. At compile time, direct `suite.execution` lowers into an equivalent harness so the substrate has one execution boundary. Harnesses own execution defaults and adapter selection; suites still own cases, judges, scorers, and gates. + +This is intentionally not a BYO harness plugin system and not equivalent to verifiers' rollout-owning harness. External package loading, lifecycle decorators, retry/fallback composition, multi-turn rollout ownership, and runtime handle borrowing are deferred. + +### 3. Route execution through adapters instead of hardcoded runner calls + +The follow-up should introduce an internal adapter boundary in `aworld/evaluations/`, for example an `ExecutionAdapter` protocol plus concrete adapters for: + +- static/judge-only execution +- AWorld agent execution +- AWorld task execution +- program-backed execution + +This keeps runner coupling local. If runner invocation details change later, the evaluator substrate should only need adapter updates instead of cross-cutting target rewrites. 
+ +Adapters are a hard internal boundary: they must not replace the current `EvalTarget -> Evaluator -> EvaluateRunner` orchestration skeleton. They only execute one case through the configured runtime and normalize the result into `EvalState`. + +### 4. Make typed judge models the primary schema contract + +Judge output validation should move from required-field checks toward typed models using Pydantic, which already exists across the codebase. + +The primary suite contract should become: + +- a typed judge-output model for validation and documentation +- JSON schema derivation from that model for report and tooling integration +- a compatibility bridge so current `JudgeSchemaDef(required_fields=...)` style suites continue to work during migration + +This change is about stronger framework contracts, not about forcing every existing scorer to migrate in one pass. + +Legacy required-field definitions should lower through the same `JudgeSchemaDef` validation and schema-export API used by typed models. They should not create a parallel scoring path. + +Judge schema metadata should be surfaced once at the top level of the evaluator report, not copied into every case result. Per-case judge metadata should continue to include judge payload fields and backend id. + +### 5. Use structured composite gate conditions instead of a string DSL + +The follow-up should expand gate expressiveness, but it should avoid introducing a loose string expression DSL as the first step. 
+ +The preferred direction is a structured gate model such as: + +- condition objects over named metrics and comparison operators +- explicit combinators like `all` / `any` +- optional approval-stage conditions separate from pass/fail conditions +- compatibility lowering from the current single-threshold policy into the new structure + +Supported operators should include `>=`, `<=`, `>`, `<`, `==`, and `!=` from the first implementation so adding strict bounds or categorical metrics later does not require an API break. + +This keeps gate logic inspectable, serializable, and consistent with AWorld's existing preference for explicit typed configuration objects. + +Legacy threshold gates should lower into structured conditions at substrate boundaries. They should not keep a separate gate evaluation path. + +### 6. Add suite-declared trajectory scoring + +The substrate already preserves trajectory in `EvalState`, and existing scorer extractors can inspect it. This change should make trajectory evaluation explicit in suite definitions so final-result scoring and process scoring can be configured together. + +`EvalSuiteDef` should gain a `trajectory_scorers` tuple of structured scorer definitions. The first implementation should lower these definitions into normal `EvalCriteria` entries for existing trajectory scorer classes, preserving the current scorer registry and report metric shapes. + +Trajectory scorers evaluate `EvalState.trajectory` and related state fields produced by the current single-shot execution flow. They should not mutate state, replace the judge layer, introduce step-level reward semantics, run multi-turn user simulation, or introduce a separate report format. + +### 7. Keep CLI changes additive and framework-driven + +The official `aworld-cli evaluator` command should inherit these improvements through framework compilation and execution, not through CLI-owned evaluator semantics. 
+ +That means: + +- no second evaluator stack inside `aworld-cli` +- no CLI-only gate language +- no CLI-only program execution abstraction + +If follow-up CLI work becomes necessary later, it should be a separate product-focused change. + +## Risks / Trade-offs + +- [Program execution shape too generic] -> Mitigation: keep `PROGRAM` scoped to trusted importable callable refs plus normalized `EvalState` output, not arbitrary workflow engines or untrusted manifest execution. +- [Typed judge model migration friction] -> Mitigation: provide compatibility bridging from current `JudgeSchemaDef`; builtin typed-model migration may be staged after the substrate lands. +- [Composite gate policies become overdesigned] -> Mitigation: prefer structured operators and combinators over a general-purpose DSL. +- [Adapter layer duplicates existing target abstractions] -> Mitigation: keep adapters narrowly focused on execution invocation and normalization, not on replacing the orchestration skeleton. +- [Harness concept expands into a second framework] -> Mitigation: keep `EvalHarnessDef` as a lightweight typed holder for execution specs and defer lifecycle/composition/package features. + +## Migration Plan + +1. Add the harness, execution, and adapter abstractions behind compatibility paths so current suites still resolve. +2. Introduce typed judge models and bridge legacy schema definitions. +3. Expand gate evaluation logic while preserving current threshold-style gate definitions. +4. Add suite-declared trajectory scorer lowering. +5. Keep builtin suites compatible and exercise the richer substrate through focused tests; migrate builtin suites to typed models only when their public output contract is ready to change. +6. Keep CLI evaluator behavior stable while letting it consume the new framework-owned capabilities. 
+ +Rollback strategy: + +- retain current `static` / `agent` / `task` suite behavior through compatibility lowering +- keep legacy threshold gate definitions and lightweight judge schemas valid until follow-up migrations are complete + +## Deferred Questions + +- Rich harness lifecycle hooks, retry/fallback composition, and child-state borrowing should wait for a later runtime-composition change. +- Command-backed or sandbox-backed program execution should wait for a dedicated execution-runtime change. +- Manifest exposure for every structured gate and trajectory scorer field may be staged after the core substrate supports the model. diff --git a/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/implementation-plan.md b/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/implementation-plan.md new file mode 100644 index 000000000..7c6215e5a --- /dev/null +++ b/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/implementation-plan.md @@ -0,0 +1,681 @@ +# AWorld Evaluator V2 Extensibility Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +> **Implementation status note:** This file records the original execution plan and is no longer the authoritative description of shipped behavior. The authoritative status is `tasks.md`, the delta spec, and the code/tests. Final implementation deliberately keeps declared JSON manifests metadata-only, defers builtin typed-model migration, treats `judge_schema` as an optional report-level object, and defines trajectory evaluation as single-shot `EvalState` inspection rather than verifiers-style rollout ownership. 
+ +**Goal:** Extend the framework-owned evaluator substrate with lightweight harness reuse, bounded program-backed execution, adapter-isolated runtime invocation, typed judge-output contracts, structured composite gate policies, and suite-declared trajectory scoring while keeping v1 evaluator flows compatible. + +**Architecture:** Keep the current `EvalTarget -> Evaluator -> EvaluateRunner` skeleton, but move execution dispatch behind framework-owned adapters under `aworld/evaluations/`. Evolve suite-backed contracts additively: direct `suite.execution` lowers into a lightweight `EvalHarnessDef`, `EvalExecutionSpec` gains bounded import-callable `PROGRAM`, judge schemas gain typed-model support with a legacy bridge, gate policies gain structured composite conditions with lowering from the current threshold form, and trajectory scorer declarations lower into existing scorer criteria. + +**Tech Stack:** Python, AWorld evaluation substrate under `aworld/evaluations/`, Pydantic v2 models already used in the repo, pytest, OpenSpec. + +--- + +## File Structure + +- `aworld/evaluations/execution.py` + Extend execution mode definitions and shared normalization helpers used by all adapter paths. +- `aworld/evaluations/execution_adapters.py` + New internal adapter boundary for static, agent, task, and program-backed execution. +- `aworld/evaluations/substrate.py` + Compile suites onto harnesses/adapters, typed judge contracts, trajectory scorer criteria, and richer gate models while preserving compatibility. +- `aworld/evaluations/eval_targets/agent_eval.py` + Reduce direct runtime coupling so existing eval targets align with adapter-backed execution. +- `aworld/evaluations/report.py` + Surface typed judge schema metadata once at report level and structured gate outputs in the report contract where needed. +- `tests/evaluations/test_execution_state.py` + Extend execution-state tests to cover program-backed normalization. 
+- `tests/evaluations/test_execution_adapters.py` + New focused coverage for adapter selection and execution. +- `tests/evaluations/test_evaluation_substrate.py` + Add substrate-level coverage for harness lowering, typed judge schemas, composite gates, trajectory scorers, and backward compatibility. +- `tests/core/test_evaluator_runtime.py` + Guard that CLI-facing runtime assembly still works on top of the evolved framework substrate. +- `aworld/evaluations/README.md` + Document the new framework-owned extension points once implementation settles. + +### Task 1: Add harness lowering, execution adapters, and `PROGRAM` execution mode + +**Files:** +- Modify: `aworld/evaluations/execution.py` +- Create: `aworld/evaluations/execution_adapters.py` +- Modify: `aworld/evaluations/substrate.py` +- Modify: `aworld/evaluations/eval_targets/agent_eval.py` +- Test: `tests/evaluations/test_execution_state.py` +- Test: `tests/evaluations/test_execution_adapters.py` +- Test: `tests/evaluations/test_evaluation_substrate.py` + +- [ ] **Step 1: Write the failing harness, adapter, and program-execution tests** + +```python +# tests/evaluations/test_execution_adapters.py +from __future__ import annotations + +import pytest + +from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec +from aworld.evaluations.execution_adapters import resolve_execution_adapter +from aworld.evaluations.substrate import EvalCaseDef + + +async def _demo_program(case, spec, target): + return { + "status": "success", + "answer": f"ran:{case.input['query']}", + "completion": [{"role": "assistant", "content": "final"}], + "trajectory": [{"role": "assistant", "content": "step"}], + "usage": {"total_tokens": 7}, + } + + +@pytest.mark.asyncio +async def test_program_execution_adapter_normalizes_result(monkeypatch): + monkeypatch.setattr( + "aworld.evaluations.execution_adapters.load_program_callable", + lambda ref: _demo_program, + ) + adapter = resolve_execution_adapter( + 
EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case") + ) + state = await adapter.execute( + case=EvalCaseDef(case_id="case-1", input={"query": "demo"}), + target={"target_kind": "directory"}, + spec=EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case"), + ) + + assert state.case_id == "case-1" + assert state.answer == "ran:demo" + assert state.completion[0]["content"] == "final" + assert state.trajectory[0]["content"] == "step" + assert state.usage["total_tokens"] == 7 + + +def test_resolve_execution_adapter_rejects_missing_program_ref(): + with pytest.raises(ValueError, match="target_ref"): + resolve_execution_adapter(EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM)) + + +def test_resolve_execution_adapter_rejects_command_style_program_ref(): + with pytest.raises(ValueError, match="importable callable"): + resolve_execution_adapter( + EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="python script.py") + ) +``` + +```python +# tests/evaluations/test_execution_state.py +from aworld.evaluations.execution import normalize_task_response_to_eval_state + + +def test_normalize_mapping_response_preserves_completion_and_tool_calls(): + state = normalize_task_response_to_eval_state( + case_id="case-2", + response={ + "status": "success", + "answer": "ok", + "completion": [{"role": "assistant", "content": "ok"}], + "trajectory": [{"tool_calls": [{"name": "search"}]}], + }, + ) + + assert state.completion[0]["content"] == "ok" + assert state.tool_calls[0]["name"] == "search" +``` + +- [ ] **Step 2: Run the targeted tests and confirm they fail** + +Run: `pytest tests/evaluations/test_execution_state.py tests/evaluations/test_execution_adapters.py -q` +Expected: FAIL because `EvalExecutionMode.PROGRAM`, `execution_adapters.py`, harness lowering, and adapter resolution do not exist yet. 
+ +- [ ] **Step 3: Add `PROGRAM` to execution definitions and create adapter implementations** + +```python +# aworld/evaluations/execution.py +class EvalExecutionMode(str, Enum): + STATIC = "static" + AGENT = "agent" + TASK = "task" + PROGRAM = "program" + + +def load_program_callable(ref: str): + if ":" in ref: + module_name, attr_name = ref.split(":", 1) + elif "." in ref: + module_name, attr_name = ref.rsplit(".", 1) + else: + raise ValueError(f"invalid program ref: {ref}") + module = importlib.import_module(module_name) + return getattr(module, attr_name) +``` + +```python +# aworld/evaluations/execution_adapters.py +from __future__ import annotations + +import inspect +from dataclasses import dataclass +from typing import Protocol + +from aworld.evaluations.execution import ( + EvalExecutionMode, + EvalExecutionSpec, + EvalState, + load_program_callable, + normalize_task_response_to_eval_state, +) +from aworld.runner import Runners + + +class ExecutionAdapter(Protocol): + async def execute(self, *, case, target: dict, spec: EvalExecutionSpec) -> EvalState: + pass + + +@dataclass(frozen=True) +class StaticExecutionAdapter: + async def execute(self, *, case, target: dict, spec: EvalExecutionSpec) -> EvalState: + return EvalState(case_id=case.case_id, status="not_evaluated", metadata={"_target": dict(target)}) + + +@dataclass(frozen=True) +class AgentExecutionAdapter: + async def execute(self, *, case, target: dict, spec: EvalExecutionSpec) -> EvalState: + query = case.input[spec.query_column or "query"] + response = await Runners.run(query, agent=spec.target_config["agent"]) + return normalize_task_response_to_eval_state(case_id=case.case_id, response=response, target=target) + + +@dataclass(frozen=True) +class TaskExecutionAdapter: + async def execute(self, *, case, target: dict, spec: EvalExecutionSpec) -> EvalState: + builder = load_program_callable(spec.task_builder_ref) + task = builder(case=case, target=target, spec=spec) + if inspect.isawaitable(task): + 
task = await task + response = await Runners.run_task(task=task) + return normalize_task_response_to_eval_state(case_id=case.case_id, response=response, target=target) + + +@dataclass(frozen=True) +class ProgramExecutionAdapter: + async def execute(self, *, case, target: dict, spec: EvalExecutionSpec) -> EvalState: + if not spec.target_ref: + raise ValueError("program execution requires target_ref") + program = load_program_callable(spec.target_ref) + result = program(case, spec, target) + if inspect.isawaitable(result): + result = await result + return normalize_task_response_to_eval_state( + case_id=case.case_id, + response=result, + target=target, + metadata={"_execution_mode": spec.mode.value}, + ) + + +def resolve_execution_adapter(spec: EvalExecutionSpec) -> ExecutionAdapter: + if spec.mode == EvalExecutionMode.STATIC: + return StaticExecutionAdapter() + if spec.mode == EvalExecutionMode.AGENT: + return AgentExecutionAdapter() + if spec.mode == EvalExecutionMode.TASK: + return TaskExecutionAdapter() + if spec.mode == EvalExecutionMode.PROGRAM: + if not spec.target_ref: + raise ValueError("program execution requires target_ref") + return ProgramExecutionAdapter() + raise ValueError(f"unsupported execution mode: {spec.mode}") +``` + +- [ ] **Step 4: Compile suite execution through lightweight harnesses and adapters in the substrate** + +```python +# aworld/evaluations/substrate.py +from aworld.evaluations.execution_adapters import resolve_execution_adapter + + +@dataclass(frozen=True) +class EvalHarnessDef: + harness_id: str + execution: EvalExecutionSpec = field(default_factory=EvalExecutionSpec) + metadata: dict[str, Any] = field(default_factory=dict) + + +def resolve_eval_harness(suite: EvalSuiteDef) -> EvalHarnessDef: + if suite.harness is not None: + return suite.harness + if suite.execution is not None: + return EvalHarnessDef( + harness_id=f"{suite.suite_id}-execution", + execution=suite.execution, + metadata={"lowered_from": "suite.execution"}, + ) + 
return EvalHarnessDef(harness_id=f"{suite.suite_id}-static") +``` + +```python +# aworld/evaluations/substrate.py +class _AdapterExecutionEvalTarget(EvalTarget[dict]): + async def predict(self, index: int, input: EvalDataCase[dict]) -> dict: + case = EvalCaseDef(case_id=input.eval_case_id, input=dict(input.case_data)) + state = await self._adapter.execute(case=case, target=self._target, spec=self._harness.execution) + return {"answer": state.answer, "state": state.to_dict()} +``` + +Adapters must not replace `EvalTarget -> Evaluator -> EvaluateRunner`; they only localize per-case invocation and normalization. + +```python +# aworld/evaluations/eval_targets/agent_eval.py +class AworldTaskEvalTarget(EvalTarget[dict]): + async def run_task_response(self, task: Task) -> TaskResponse | dict | object: + return await Runners.run_task(task=task) +``` + +- [ ] **Step 5: Run adapter and substrate tests until green** + +Run: `pytest tests/evaluations/test_execution_state.py tests/evaluations/test_execution_adapters.py tests/evaluations/test_evaluation_substrate.py -q` +Expected: PASS, including coverage for harness lowering, adapter-backed `PROGRAM` execution, rejected invalid program refs, and unchanged `static`/`agent`/`task` compatibility. 
+ +- [ ] **Step 6: Commit the execution-extensibility slice** + +```bash +git add aworld/evaluations/execution.py aworld/evaluations/execution_adapters.py aworld/evaluations/substrate.py aworld/evaluations/eval_targets/agent_eval.py tests/evaluations/test_execution_state.py tests/evaluations/test_execution_adapters.py tests/evaluations/test_evaluation_substrate.py +git commit -m "feat: add adapter-backed evaluator execution" +``` + +### Task 2: Add typed judge-output contracts with a legacy compatibility bridge + +**Files:** +- Modify: `aworld/evaluations/substrate.py` +- Modify: `aworld/evaluations/scorers/suite_judge.py` +- Modify: `aworld/evaluations/report.py` +- Test: `tests/evaluations/test_evaluation_substrate.py` +- Test: `tests/core/test_evaluator_runtime.py` + +- [ ] **Step 1: Write failing tests for typed judge validation and legacy fallback** + +```python +# tests/evaluations/test_evaluation_substrate.py +from pydantic import BaseModel + +from aworld.evaluations.substrate import EvalSuiteDef, JudgeSchemaDef + + +class DemoJudgeOutput(BaseModel): + score: float + verdict: str + + +def test_typed_judge_model_accepts_valid_payload(): + suite = EvalSuiteDef( + suite_id="demo", + judge_schema=JudgeSchemaDef(output_model=DemoJudgeOutput), + ) + + payload = suite.judge_schema.validate_payload({"score": 0.8, "verdict": "ok"}) + assert payload["score"] == 0.8 + assert payload["verdict"] == "ok" + + +def test_typed_judge_model_rejects_invalid_payload(): + suite = EvalSuiteDef( + suite_id="demo", + judge_schema=JudgeSchemaDef(output_model=DemoJudgeOutput), + ) + + with pytest.raises(ValueError, match="verdict"): + suite.judge_schema.validate_payload({"score": 0.8}) + + +def test_legacy_required_fields_schema_still_validates(): + schema = JudgeSchemaDef(required_fields=("score", "rank")) + payload = schema.validate_payload({"score": 0.9, "rank": 1}) + assert payload["rank"] == 1 +``` + +- [ ] **Step 2: Run judge-schema tests to confirm failure** + +Run: `pytest 
tests/evaluations/test_evaluation_substrate.py -q` +Expected: FAIL because `JudgeSchemaDef` does not yet support `output_model` or `validate_payload()`. + +- [ ] **Step 3: Evolve `JudgeSchemaDef` into a typed contract with compatibility bridging** + +```python +# aworld/evaluations/substrate.py +from pydantic import BaseModel, ValidationError + + +@dataclass(frozen=True) +class JudgeSchemaDef: + required_fields: tuple[str, ...] = tuple() + output_model: type[BaseModel] | None = None + + def validate_payload(self, payload: Mapping[str, Any]) -> dict[str, Any]: + if self.output_model is not None: + try: + model = self.output_model.model_validate(dict(payload)) + except ValidationError as exc: + raise ValueError(str(exc)) from exc + return model.model_dump(mode="json") + + missing = [field for field in self.required_fields if field not in payload] + if missing: + raise ValueError(f"missing required judge fields: {', '.join(missing)}") + return dict(payload) + + def json_schema(self) -> dict[str, Any]: + if self.output_model is not None: + return self.output_model.model_json_schema() + return { + "type": "object", + "required": list(self.required_fields), + "properties": {field: {} for field in self.required_fields}, + } +``` + +- [ ] **Step 4: Route judge scoring and report metadata through the typed schema contract** + +```python +# aworld/evaluations/scorers/suite_judge.py +payload = self.suite.judge_schema.validate_payload(dict(execution.payload)) + +metric_result = { + "value": float(payload["score"]), + "metadata": { + **payload, + "_judge_backend": execution.backend_id, + }, +} +``` + +```python +# aworld/evaluations/report.py +"judge_backend": {"type": "object"}, +"judge_schema": {"type": ["object", "null"]}, +``` + +`run_evaluation_flow()` should attach `report["judge_schema"] = suite.judge_schema.json_schema()` once at the top level when the schema is non-empty. Do not copy schema metadata into every case result. 
+ +- [ ] **Step 5: Run substrate and runtime tests until green** + +Run: `pytest tests/evaluations/test_evaluation_substrate.py tests/core/test_evaluator_runtime.py -q` +Expected: PASS, including typed-model validation and unchanged legacy required-field flows. + +- [ ] **Step 6: Commit the typed-judge-contract slice** + +```bash +git add aworld/evaluations/substrate.py aworld/evaluations/scorers/suite_judge.py aworld/evaluations/report.py tests/evaluations/test_evaluation_substrate.py tests/core/test_evaluator_runtime.py +git commit -m "feat: add typed evaluator judge schemas" +``` + +### Task 3: Add structured composite gate policies with threshold compatibility lowering + +**Files:** +- Modify: `aworld/evaluations/substrate.py` +- Modify: `aworld/evaluations/manifests.py` +- Modify: `aworld/evaluations/report.py` +- Test: `tests/evaluations/test_evaluation_substrate.py` +- Test: `tests/docs/test_evaluator_report_docs.py` + +- [ ] **Step 1: Write failing tests for composite gates and legacy threshold compatibility** + +```python +# tests/evaluations/test_evaluation_substrate.py +from aworld.evaluations.substrate import GateMetricCondition, GatePolicyDef + + +def test_composite_gate_returns_pass_when_all_conditions_hold(): + policy = GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=0.9), + GateMetricCondition(metric_name="latency", op="<=", threshold=5.0), + ) + ) + + decision = policy.evaluate({"score": 0.95, "latency": 4.2}) + assert decision.status == "pass" + + +def test_composite_gate_returns_needs_approval_when_approval_conditions_hold(): + policy = GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.9),), + approval_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.75),), + ) + + decision = policy.evaluate({"score": 0.8}) + assert decision.status == "needs_approval" + + +def test_legacy_threshold_gate_lowers_to_structured_policy(): + policy = 
GatePolicyDef(metric_name="score", pass_threshold=0.9, approval_threshold=0.8) + decision = policy.evaluate({"score": 0.85}) + assert decision.status == "needs_approval" + + +@pytest.mark.parametrize( + ("op", "threshold", "value"), + [ + (">", 0.9, 0.91), + ("<", 0.9, 0.89), + (">=", 0.9, 0.9), + ("<=", 0.9, 0.9), + ("==", "approved", "approved"), + ("!=", "blocked", "approved"), + ], +) +def test_gate_metric_condition_supports_all_declared_operators(op, threshold, value): + policy = GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="metric", op=op, threshold=threshold),) + ) + + assert policy.evaluate({"metric": value}).status == "pass" +``` + +- [ ] **Step 2: Run gate tests to confirm failure** + +Run: `pytest tests/evaluations/test_evaluation_substrate.py -q` +Expected: FAIL because structured gate condition types do not exist yet. + +- [ ] **Step 3: Add structured gate condition objects and compatibility lowering** + +```python +# aworld/evaluations/substrate.py +@dataclass(frozen=True) +class GateMetricCondition: + metric_name: str + op: str + threshold: float | int | str | bool + + def matches(self, metrics: Mapping[str, Any]) -> bool: + value = metrics[self.metric_name] + if self.op == ">=": + return float(value) >= float(self.threshold) + if self.op == "<=": + return float(value) <= float(self.threshold) + if self.op == ">": + return float(value) > float(self.threshold) + if self.op == "<": + return float(value) < float(self.threshold) + if self.op == "==": + return value == self.threshold + if self.op == "!=": + return value != self.threshold + raise ValueError(f"unsupported gate operator: {self.op}") + + +@dataclass(frozen=True) +class GatePolicyDef: + metric_name: str | None = None + pass_threshold: float | None = None + approval_threshold: float | None = None + pass_all: tuple[GateMetricCondition, ...] = tuple() + approval_all: tuple[GateMetricCondition, ...] 
= tuple() + + def normalized_conditions(self) -> tuple[tuple[GateMetricCondition, ...], tuple[GateMetricCondition, ...]]: + pass_all = self.pass_all + approval_all = self.approval_all + if not pass_all and self.metric_name is not None and self.pass_threshold is not None: + pass_all = (GateMetricCondition(metric_name=self.metric_name, op=">=", threshold=self.pass_threshold),) + if not approval_all and self.metric_name is not None and self.approval_threshold is not None: + approval_all = (GateMetricCondition(metric_name=self.metric_name, op=">=", threshold=self.approval_threshold),) + return pass_all, approval_all +``` + +Gate evaluation should collect every metric referenced by normalized pass/approval conditions. A condition that references a missing metric should fail the gate closed: record that condition in `failed_conditions` and still return the completed case results and available metrics, rather than letting the `KeyError` from `matches()` abort the run. Unsupported operators should raise `ValueError`. + +- [ ] **Step 4: Reflect the richer gate structure into manifests and report payloads** + +```python +# aworld/evaluations/manifests.py +"gate_policy": { + "type": "object", + "properties": { + "metric_name": {"type": "string"}, + "pass_threshold": {"type": "number"}, + "approval_threshold": {"type": ["number", "null"]}, + "pass_all": {"type": "array"}, + "approval_all": {"type": "array"}, + }, +} +``` + +```python +# aworld/evaluations/report.py +"gateDecision": { + "type": "object", + "required": ["status", "metric_name", "value"], + "properties": { + "status": {"type": "string", "enum": ["pass", "fail", "needs_approval"]}, + "metric_name": {"type": ["string", "null"]}, + "value": {"type": ["number", "null"]}, + "matched_conditions": {"type": "array"}, + "failed_conditions": {"type": "array"}, + }, +} +``` + +- [ ] **Step 5: Run gate and report-contract tests until green** + +Run: `pytest tests/evaluations/test_evaluation_substrate.py tests/docs/test_evaluator_report_docs.py -q` +Expected: PASS, including both composite-gate and legacy-threshold cases. 
+ +- [ ] **Step 6: Commit the composite-gate slice** + +```bash +git add aworld/evaluations/substrate.py aworld/evaluations/manifests.py aworld/evaluations/report.py tests/evaluations/test_evaluation_substrate.py tests/docs/test_evaluator_report_docs.py +git commit -m "feat: add composite evaluator gate policies" +``` + +### Task 4: Add trajectory scorer declarations, migrate builtin suites, document the new substrate, and run full verification + +**Files:** +- Modify: `aworld/evaluations/substrate.py` +- Modify: `aworld/evaluations/README.md` +- Modify: `openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/tasks.md` +- Test: `tests/evaluations/test_execution_state.py` +- Test: `tests/evaluations/test_execution_adapters.py` +- Test: `tests/evaluations/test_evaluation_substrate.py` +- Test: `tests/core/test_evaluator_runtime.py` +- Test: `tests/core/test_evaluator_top_level_command.py` +- Test: `tests/plugins/test_plugin_hooks.py` +- Test: `tests/test_plugin_cli_entrypoint.py` +- Test: `tests/docs/test_evaluator_report_docs.py` + +- [ ] **Step 1: Add suite-declared trajectory scorer lowering** + +```python +# aworld/evaluations/substrate.py +@dataclass(frozen=True) +class TrajectoryScorerDef: + metric_name: str + scorer_class: str | None = None + threshold: float = 0.0 + scorer_params: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class EvalSuiteDef: + # ... existing fields ... + trajectory_scorers: tuple[TrajectoryScorerDef, ...] = tuple() +``` + +`compile_evaluation_flow()` should append one `EvalCriteria` per trajectory scorer after the suite judge criterion. This reuses the existing scorer registry and report metric shape instead of creating a separate trajectory report path. 
+ +- [ ] **Step 2: Add or migrate a builtin suite to exercise the new contracts end to end** + +```python +# aworld/evaluations/substrate.py +class AppEvaluatorJudgeOutput(BaseModel): + score: float + rank: int + criticism: str + praise: str + improvement_advice: str + + +def get_builtin_eval_suite(suite_id: str) -> EvalSuiteDef: + if suite_id == "app-evaluator": + return EvalSuiteDef( + suite_id="app-evaluator", + judge=_app_evaluator_judge, + judge_schema=JudgeSchemaDef(output_model=AppEvaluatorJudgeOutput), + gate_policy=GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.85),), + approval_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.7),), + ), + metadata={"builtin": True, "preferred_backend": "callable"}, + ) +``` + +- [ ] **Step 3: Update framework documentation and task checklist after implementation** + +```md + +- `program`: execute an importable callable through the evaluator adapter layer +- typed judge schemas: Pydantic-backed validation with JSON schema export +- composite gates: structured conditions with compatibility for threshold-style suites +- trajectory scorers: suite-declared process metrics that lower into normal evaluator criteria +``` + +```md + +- [x] 1.0 Add a lightweight `EvalHarnessDef` boundary and compatibility lowering from direct `suite.execution`. +- [x] 1.1 Add a `PROGRAM` execution mode to the framework-owned evaluation execution model. +- [x] 1.2 Introduce an internal execution adapter boundary under `aworld/evaluations/` for static, agent, task, and program-backed execution. +- [x] 2.1 Add typed judge-output model support as the primary suite-backed validation contract. +- [x] 3.1 Expand gate definitions from single-threshold checks to structured composite metric conditions. +- [x] 4.1 Add suite-declared trajectory scorer definitions that lower into normal evaluator criteria. 
+``` + +- [ ] **Step 4: Run the full evaluator regression suite** + +Run: `pytest tests/evaluations/test_execution_state.py tests/evaluations/test_execution_adapters.py tests/evaluations/test_evaluation_substrate.py tests/core/test_evaluator_runtime.py tests/core/test_evaluator_top_level_command.py tests/plugins/test_plugin_hooks.py tests/test_plugin_cli_entrypoint.py tests/docs/test_evaluator_report_docs.py -q` +Expected: PASS with all evaluator framework and CLI consumer tests green. + +- [ ] **Step 5: Validate the OpenSpec change after code and docs are aligned** + +Run: `openspec validate aworld-evaluator-v2-extensibility-2026-06-09 --strict` +Expected: `Change 'aworld-evaluator-v2-extensibility-2026-06-09' is valid` + +- [ ] **Step 6: Commit the migration and verification slice** + +```bash +git add aworld/evaluations/substrate.py aworld/evaluations/README.md openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/tasks.md tests/evaluations/test_execution_state.py tests/evaluations/test_execution_adapters.py tests/evaluations/test_evaluation_substrate.py tests/core/test_evaluator_runtime.py tests/core/test_evaluator_top_level_command.py tests/plugins/test_plugin_hooks.py tests/test_plugin_cli_entrypoint.py tests/docs/test_evaluator_report_docs.py +git commit -m "docs: finalize evaluator v2 extensibility rollout" +``` + +## Self-Review + +- Spec coverage: + - harness lowering, execution adapters, and `PROGRAM` mode -> Task 1 + - typed judge-output contracts with legacy compatibility -> Task 2 + - composite gate policies with threshold lowering -> Task 3 + - trajectory scorer declarations, verification, and spec alignment -> Task 4 +- Placeholder scan: + - no `TODO`, `TBD`, or "similar to previous task" shortcuts remain + - remaining `...` tokens only appear inside Python variadic tuple type annotations and the deliberate `# ... existing fields ...` elision comment in the Task 4 `EvalSuiteDef` snippet, not as unfinished placeholders + - each code-changing step contains concrete file paths and code snippets +- Type consistency: + - `EvalExecutionMode.PROGRAM`, 
`ExecutionAdapter`, `JudgeSchemaDef.validate_payload`, `GateMetricCondition`, and `GatePolicyDef` names are used consistently across tasks diff --git a/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/proposal.md b/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/proposal.md new file mode 100644 index 000000000..2c1d275a1 --- /dev/null +++ b/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/proposal.md @@ -0,0 +1,34 @@ +## Why + +`aworld-evaluation-substrate-2026-06-01` established the first execution-backed evaluator substrate and completed the v1 CLI flow, but it intentionally stopped short of several extensibility and contract-hardening steps: + +- suite execution modes are still limited to `static`, `agent`, and `task` +- execution targets still couple directly to current runner entrypoints +- judge schemas are still lightweight required-field checks +- gate policies still only express single-metric threshold decisions + +Those tradeoffs were acceptable for v1, but they limit AWorld's ability to expose evaluation as a broader framework capability for non-agent programs, stricter automation, and richer reusable evaluator definitions. + +This change is an incremental hardening of the v1 single-shot evaluator substrate. It is not intended to claim verifiers v1 parity: multi-turn rollout ownership, user simulators, lifecycle hooks, child-state composition, and training reward semantics remain out of scope for a later runtime-composition change. + +## What Changes + +- Add a lightweight first-class `EvalHarnessDef` so suites have an explicit execution boundary in the suite/case/harness/state hierarchy without adopting a full rollout-owning harness object model. +- Extend the framework-owned evaluation substrate with a bounded `PROGRAM` execution mode for importable program-backed evaluators that do not use AWorld's agent/task runtime. 
+- Add an internal execution adapter layer under `aworld/evaluations/` so suite-backed evaluation no longer hardcodes runner invocation details into eval targets. +- Promote judge output contracts from required-field-only validation to typed model validation with JSON-schema-friendly structure and a compatibility bridge for existing suites. +- Expand gate policies from single-threshold checks into structured composite conditions with explicit comparison operators while preserving the current simple threshold shape as compatibility sugar. +- Add suite-declared trajectory scorers so result evaluation and normalized trajectory/process metric evaluation can be configured side by side in the current single-shot flow. +- Keep `aworld-cli evaluator` compatible as an assembly layer on top of the evolved framework substrate rather than introducing a second evaluator stack. + +## Capabilities + +### Modified Capabilities + +- `evaluation-substrate`: add adapter-backed program execution, typed judge schemas, and richer composite gate policies without breaking the v1 evaluator substrate shape. + +## Impact + +- Affected code: `aworld/evaluations/**`, especially execution specification, substrate compilation, eval target execution, judge validation, and gate evaluation paths. +- Affected APIs: internal evaluation composition APIs gain additive extensions; existing suite-backed and legacy evaluation callers remain valid through compatibility paths. +- Affected systems: framework-owned evaluator execution and scoring; `aworld-cli evaluator` should inherit the new framework capabilities without owning their semantics. 
diff --git a/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/specs/evaluation-substrate/spec.md b/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/specs/evaluation-substrate/spec.md new file mode 100644 index 000000000..c4130959e --- /dev/null +++ b/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/specs/evaluation-substrate/spec.md @@ -0,0 +1,101 @@ +## MODIFIED Requirements + +### Requirement: Execution-backed suite flows reuse framework evaluation primitives + +The framework SHALL support suite-backed evaluation flows that execute targets through existing AWorld runtime primitives and adapter-backed program execution while exposing reusable execution results for downstream scoring. + +#### Scenario: Suite-backed flow lowers execution through a reusable harness boundary +- **WHEN** a suite-backed evaluator declares execution directly or through an evaluator harness +- **THEN** the framework SHALL compile the flow through a single harness boundary that owns execution configuration and adapter selection while preserving the existing suite/case/state model + +#### Scenario: Suite-backed flow executes through current task or agent runtime +- **WHEN** a suite-backed evaluator is configured to execute through the existing AWorld agent or task runtime +- **THEN** the framework SHALL adapt the suite flow through framework-owned execution adapters instead of hardcoding runner invocation details across evaluator targets + +#### Scenario: Suite-backed flow executes through a program-backed runtime +- **WHEN** a suite-backed evaluator is configured with a program-backed execution reference +- **THEN** the framework SHALL execute that program through a framework-owned execution adapter and normalize the result into the common evaluator execution state + +#### Scenario: Program-backed runtime is bounded to importable callables +- **WHEN** a suite-backed evaluator declares program-backed execution +- **THEN** the framework SHALL require an 
importable callable reference and reject unsupported command, sandbox, workflow-engine, or missing-reference configuration for this change + +#### Scenario: Importable callable execution is trusted +- **WHEN** a suite-backed evaluator declares a program reference or task-builder reference +- **THEN** the framework SHALL treat that reference as trusted in-process code controlled by the runner or workspace owner and SHALL NOT expose it through declared JSON manifests in this change + +#### Scenario: Program-backed runtime returns supported output +- **WHEN** a program-backed evaluator returns an `EvalState`, an `EvalState`-shaped mapping, a `TaskResponse`, or a bare answer value +- **THEN** the framework SHALL normalize that output into the common evaluator execution state without storing live runtime handles in the state + +#### Scenario: Existing static suite execution remains available +- **WHEN** a suite-backed evaluator is defined without execution-backed target settings +- **THEN** the framework SHALL continue to support the current static evaluation path as a valid suite execution mode + +### Requirement: Schema-constrained judge outputs + +Suite-backed evaluation flows SHALL validate judge outputs against an explicit typed judge schema before final scoring and reporting are completed, while preserving compatibility for current lightweight schema definitions. 
+ +#### Scenario: Judge output matches the declared typed schema +- **WHEN** a suite-backed evaluator returns a result that satisfies the declared typed judge-output model +- **THEN** the framework SHALL accept the result for downstream scoring, gating, and reporting + +#### Scenario: Judge output violates the declared typed schema +- **WHEN** a suite-backed evaluator returns a result that fails the declared typed judge-output model +- **THEN** the framework SHALL surface the typed schema violation as an evaluation failure or invalid result state rather than silently accepting malformed output + +#### Scenario: Legacy schema definitions remain valid during migration +- **WHEN** an existing suite-backed evaluator still uses the current lightweight required-field schema definition +- **THEN** the framework SHALL continue to validate that suite through a compatibility path without forcing immediate migration + +#### Scenario: Judge schema metadata is exposed once per report +- **WHEN** a suite-backed evaluator has a typed or compatibility judge schema +- **THEN** the framework SHALL expose the derived judge schema metadata at the report level rather than duplicating the schema in every case result + +### Requirement: First-class gate outcomes + +Suite-backed evaluation flows SHALL evaluate a declared structured gate policy and produce a gate outcome of `pass`, `fail`, or `needs_approval`. 
+ +#### Scenario: Composite pass conditions succeed +- **WHEN** all required pass conditions in the structured gate policy are satisfied +- **THEN** the framework SHALL emit a `pass` gate outcome + +#### Scenario: Approval-stage conditions match +- **WHEN** pass conditions are not satisfied but the structured gate policy marks the result as eligible for human review +- **THEN** the framework SHALL emit a `needs_approval` gate outcome + +#### Scenario: Composite gate conditions fail without approval path +- **WHEN** required pass conditions are not satisfied and no approval-stage conditions apply +- **THEN** the framework SHALL emit a `fail` gate outcome + +#### Scenario: Legacy threshold gates remain valid +- **WHEN** an existing suite-backed evaluator uses the current single-threshold gate definition +- **THEN** the framework SHALL preserve that behavior through a compatibility lowering into the structured gate policy model + +#### Scenario: Gate conditions use explicit comparison operators +- **WHEN** a structured gate condition compares a metric to a threshold +- **THEN** the framework SHALL support `>=`, `<=`, `>`, `<`, `==`, and `!=` operators and surface unsupported operators as invalid gate configuration + +#### Scenario: Gate references a missing metric +- **WHEN** a structured gate condition references a metric that is not present in aggregate results +- **THEN** the framework SHALL fail the gate closed, include the missing condition in `failed_conditions`, and still return the completed case results and available metrics + +### Requirement: Suite-declared trajectory evaluation + +Suite-backed evaluation flows SHALL support normalized trajectory-level scoring alongside result-level judge scoring while preserving the common report metric shape. 
+ +#### Scenario: Suite declares trajectory scorers +- **WHEN** a suite-backed evaluator declares trajectory scorer definitions +- **THEN** the framework SHALL lower those definitions into normal evaluator scoring criteria that inspect the normalized execution state trajectory + +#### Scenario: Trajectory evaluation remains single-shot in this change +- **WHEN** a suite-backed evaluator uses trajectory scorers in this change +- **THEN** the framework SHALL score the trajectory already captured in `EvalState` and SHALL NOT claim multi-turn rollout ownership, user simulation, lifecycle hooks, or step-level training reward semantics + +#### Scenario: Trajectory scorer results participate in gates and reports +- **WHEN** a trajectory scorer emits a metric result +- **THEN** the framework SHALL include that metric in case metrics, aggregate metrics, and structured gate evaluation without replacing the final-result judge score + +#### Scenario: Suite has no trajectory scorers +- **WHEN** a suite-backed evaluator omits trajectory scorer definitions +- **THEN** the framework SHALL preserve the current result-focused scoring behavior diff --git a/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/tasks.md b/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/tasks.md new file mode 100644 index 000000000..72a881572 --- /dev/null +++ b/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/tasks.md @@ -0,0 +1,37 @@ +## 1. Execution Extensibility + +- [x] 1.0 Add a lightweight `EvalHarnessDef` boundary and compatibility lowering from direct `suite.execution`. +- [x] 1.1 Add a `PROGRAM` execution mode to the framework-owned evaluation execution model. +- [x] 1.2 Introduce an internal execution adapter boundary under `aworld/evaluations/` for static, agent, task, and program-backed execution. +- [x] 1.3 Keep existing `static`, `agent`, and `task` suite-backed flows working through compatibility paths. 
+- [x] 1.4 Normalize program-backed execution results into the same `EvalState` shape used by current execution-backed evaluation. +- [x] 1.5 Validate `PROGRAM` specs up front, including required importable `target_ref` and unsupported command/workflow forms. +- [x] 1.6 Keep importable callable execution as a trusted in-memory framework contract and reject executable refs in declared JSON manifests. + +## 2. Typed Judge Contracts + +- [x] 2.1 Add typed judge-output model support as the primary suite-backed validation contract. +- [x] 2.2 Preserve compatibility for current required-field-based judge schema definitions during migration. +- [x] 2.3 Expose judge-model-derived schema metadata once at the report level for docs or downstream tooling. + +## 3. Composite Gate Policies + +- [x] 3.1 Expand gate definitions from single-threshold checks to structured composite metric conditions. +- [x] 3.2 Support `pass`, `fail`, and `needs_approval` outcomes from composite gate evaluation. +- [x] 3.3 Keep current threshold-style gate definitions valid as compatibility sugar over the richer gate model. +- [x] 3.4 Support `>=`, `<=`, `>`, `<`, `==`, and `!=` gate operators. +- [x] 3.5 Fail structured gates closed when a condition references a missing metric while preserving the completed report payload. + +## 4. Trajectory Evaluation + +- [x] 4.1 Add suite-declared trajectory scorer definitions that lower into normal evaluator criteria. +- [x] 4.2 Keep existing trajectory scorer/extractor behavior and report metric shapes compatible. +- [x] 4.3 Add coverage for trajectory metrics participating in reports and gate evaluation. + +## 5. Verification + +- [x] 5.1 Add regression coverage for adapter-backed execution across static, agent, task, and program-backed suites. +- [x] 5.2 Add coverage for typed judge validation success, failure, and legacy compatibility paths. 
+- [x] 5.3 Add coverage for composite gate evaluation, all supported operators, missing metrics, and legacy threshold compatibility. +- [x] 5.4 Add error-path coverage for program exceptions and malformed program output where applicable. +- [x] 5.5 Validate the OpenSpec change and keep it aligned with the framework-owned evaluator scope. From 9feb9b6658685ef5bf9f8058039371ad41b07801 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 11:33:33 +0800 Subject: [PATCH 20/41] fix: harden evaluator trajectory configuration --- aworld/evaluations/eval_targets/agent_eval.py | 19 ++--- aworld/evaluations/execution_adapters.py | 16 +++-- aworld/evaluations/scorers/__init__.py | 3 + aworld/evaluations/substrate.py | 26 +++++++ .../evaluations/test_evaluation_substrate.py | 41 +++++++++++ tests/evaluations/test_execution_adapters.py | 72 +++++++++++++++++++ 6 files changed, 159 insertions(+), 18 deletions(-) diff --git a/aworld/evaluations/eval_targets/agent_eval.py b/aworld/evaluations/eval_targets/agent_eval.py index 0f4744a0f..1139405dd 100644 --- a/aworld/evaluations/eval_targets/agent_eval.py +++ b/aworld/evaluations/eval_targets/agent_eval.py @@ -81,20 +81,15 @@ async def predict(self, index: int, input: Union[EvalDataCase[dict], dict]) -> d "input": dict(case_data), }, )() - state = await resolve_execution_adapter( - EvalExecutionSpec( - mode=EvalExecutionMode.AGENT, - target_config={"agent": self.agent}, - query_column=query_column, - ) - ).execute( + spec = EvalExecutionSpec( + mode=EvalExecutionMode.AGENT, + target_config={"agent": self.agent}, + query_column=query_column, + ) + state = await resolve_execution_adapter(spec).execute( case=case, target=dict(case_data.get("_target", {})), - spec=EvalExecutionSpec( - mode=EvalExecutionMode.AGENT, - target_config={"agent": self.agent}, - query_column=query_column, - ), + spec=spec, ) return {"answer": state.answer, "state": state.to_dict()} diff --git a/aworld/evaluations/execution_adapters.py 
b/aworld/evaluations/execution_adapters.py index 2929f3381..1e40bba2c 100644 --- a/aworld/evaluations/execution_adapters.py +++ b/aworld/evaluations/execution_adapters.py @@ -5,7 +5,6 @@ from dataclasses import dataclass from typing import Any, Protocol -from aworld.core.task import TaskResponse from aworld.evaluations.execution import ( EvalExecutionMode, EvalExecutionSpec, @@ -22,6 +21,13 @@ async def execute(self, *, case: Any, target: dict[str, Any], spec: EvalExecutio raise NotImplementedError +def _execution_metadata(*, mode: EvalExecutionMode | None = None) -> dict[str, Any]: + metadata: dict[str, Any] = {} + if mode is not None: + metadata["_execution_mode"] = mode.value + return metadata + + @dataclass(frozen=True) class StaticExecutionAdapter: async def execute(self, *, case: Any, target: dict[str, Any], spec: EvalExecutionSpec) -> EvalState: @@ -44,7 +50,7 @@ async def execute(self, *, case: Any, target: dict[str, Any], spec: EvalExecutio case_id=case.case_id, response=response, target=target, - metadata=case.input, + metadata=_execution_metadata(mode=spec.mode), ) @@ -65,14 +71,12 @@ async def execute(self, *, case: Any, target: dict[str, Any], spec: EvalExecutio result = result[task.id] elif isinstance(result, dict) and len(result) == 1 and not {"status", "answer", "completion"} & result.keys(): result = next(iter(result.values())) - elif isinstance(result, TaskResponse): - result = result return normalize_task_response_to_eval_state( case_id=case.case_id, response=result, target=target, - metadata=case.input, + metadata=_execution_metadata(mode=spec.mode), ) @@ -89,7 +93,7 @@ async def execute(self, *, case: Any, target: dict[str, Any], spec: EvalExecutio case_id=case.case_id, response=result, target=target, - metadata={**case.input, "_execution_mode": spec.mode.value}, + metadata=_execution_metadata(mode=spec.mode), ) diff --git a/aworld/evaluations/scorers/__init__.py b/aworld/evaluations/scorers/__init__.py index d4619379f..b0a178000 100644 --- 
a/aworld/evaluations/scorers/__init__.py +++ b/aworld/evaluations/scorers/__init__.py @@ -51,6 +51,9 @@ def unregister(self, name: str): if scorer_id in self._metric_to_scorers: del self._default_scorer_params[scorer_id] + def get_scorer_class(self, metric_name: str) -> Type[Scorer] | None: + return self._metric_to_scorers.get(metric_name) + def create_scorer_instance(self, scorer_class: Type[Scorer], criteria: EvalCriteria = None) -> Scorer: """Create a scorer instance using parameters from EvalCriteria and defaults. diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py index dbd8b9cac..83a575c86 100644 --- a/aworld/evaluations/substrate.py +++ b/aworld/evaluations/substrate.py @@ -22,6 +22,7 @@ from aworld.evaluations.eval_targets.agent_eval import AworldAgentEvalTarget, AworldTaskEvalTarget from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec, load_program_callable from aworld.evaluations.manifests import validate_declared_eval_suite_manifest +from aworld.evaluations.scorers import scorer_factory from aworld.evaluations.execution_adapters import resolve_execution_adapter from aworld.evaluations.report import ( CaseEvaluationReport, @@ -686,6 +687,7 @@ def _build_eval_target(flow: EvaluationFlowDef, target: dict[str, Any]): def _trajectory_eval_criteria(suite: EvalSuiteDef) -> list[dict[str, Any]]: criteria: list[dict[str, Any]] = [] for scorer in suite.trajectory_scorers: + _validate_trajectory_scorer_def(scorer) item: dict[str, Any] = { "metric_name": scorer.metric_name, "threshold": scorer.threshold, @@ -697,6 +699,30 @@ def _trajectory_eval_criteria(suite: EvalSuiteDef) -> list[dict[str, Any]]: return criteria +def _validate_trajectory_scorer_def(scorer: TrajectoryScorerDef) -> None: + scorer_class = scorer_factory.get_scorer_class(scorer.metric_name) + if scorer_class is None: + raise ValueError(f"unknown trajectory metric: {scorer.metric_name}") + if scorer.scorer_class is not None and scorer.scorer_class != 
scorer_class.__name__: + raise ValueError( + f"trajectory metric {scorer.metric_name} is registered to {scorer_class.__name__}, " + f"not {scorer.scorer_class}" + ) + if not scorer.scorer_params: + return + + signature = inspect.signature(scorer_class) + has_kwargs = any(param.kind == inspect.Parameter.VAR_KEYWORD for param in signature.parameters.values()) + unsupported = [ + key + for key in scorer.scorer_params + if key not in signature.parameters and not has_kwargs + ] + if unsupported: + joined = ", ".join(sorted(unsupported)) + raise ValueError(f"unsupported trajectory scorer_params for {scorer.metric_name}: {joined}") + + def compile_evaluation_flow(flow: EvaluationFlowDef) -> CompiledEvaluationPlan: normalized_target = _normalize_target(flow.target) dataset = build_eval_dataset(flow.suite.cases, normalized_target) diff --git a/tests/evaluations/test_evaluation_substrate.py b/tests/evaluations/test_evaluation_substrate.py index fe86972a7..5c0a411bc 100644 --- a/tests/evaluations/test_evaluation_substrate.py +++ b/tests/evaluations/test_evaluation_substrate.py @@ -109,6 +109,47 @@ def test_compile_evaluation_flow_lowers_trajectory_scorers_to_eval_criteria() -> assert metric_names == ["score", MetricNames.TRAJECTORY_TOOL_CALLS] +def test_compile_evaluation_flow_rejects_unknown_trajectory_metric() -> None: + suite = EvalSuiteDef( + suite_id="trajectory-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello world"})], + judge=lambda case_input, target: {"score": 1.0}, + trajectory_scorers=( + TrajectoryScorerDef(metric_name="trajectory_typo"), + ), + ) + + with pytest.raises(ValueError, match="unknown trajectory metric"): + compile_evaluation_flow( + EvaluationFlowDef( + target={"kind": "inline", "value": {"target_path": "demo.txt"}}, + suite=suite, + ) + ) + + +def test_compile_evaluation_flow_rejects_unsupported_trajectory_scorer_params() -> None: + suite = EvalSuiteDef( + suite_id="trajectory-suite", + cases=[EvalCaseDef(case_id="case-1", 
input={"query": "hello world"})], + judge=lambda case_input, target: {"score": 1.0}, + trajectory_scorers=( + TrajectoryScorerDef( + metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, + scorer_params={"minimum_calls": 2}, + ), + ), + ) + + with pytest.raises(ValueError, match="unsupported trajectory scorer_params"): + compile_evaluation_flow( + EvaluationFlowDef( + target={"kind": "inline", "value": {"target_path": "demo.txt"}}, + suite=suite, + ) + ) + + def test_eval_case_def_supports_expected_and_runtime_overrides() -> None: case = EvalCaseDef( case_id="case-1", diff --git a/tests/evaluations/test_execution_adapters.py b/tests/evaluations/test_execution_adapters.py index 4dc188bba..59c100178 100644 --- a/tests/evaluations/test_execution_adapters.py +++ b/tests/evaluations/test_execution_adapters.py @@ -2,8 +2,10 @@ import pytest +from aworld.core.task import TaskResponse from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec from aworld.evaluations.execution_adapters import resolve_execution_adapter +from aworld.evaluations.eval_targets.agent_eval import AworldAgentEvalTarget from aworld.evaluations.substrate import EvalCaseDef @@ -39,6 +41,76 @@ async def test_program_execution_adapter_normalizes_result(monkeypatch: pytest.M assert state.usage["total_tokens"] == 7 +@pytest.mark.asyncio +async def test_program_execution_adapter_does_not_copy_case_input_into_state_metadata( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + "aworld.evaluations.execution_adapters.load_program_callable", + lambda ref: _demo_program, + ) + spec = EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case") + + state = await resolve_execution_adapter(spec).execute( + case=EvalCaseDef(case_id="case-1", input={"query": "demo", "large_blob": "x" * 1000}), + target={"target_kind": "directory"}, + spec=spec, + ) + + assert state.metadata["_execution_mode"] == "program" + assert state.metadata["_target"] == {"target_kind": 
"directory"} + assert "query" not in state.metadata + assert "large_blob" not in state.metadata + + +@pytest.mark.asyncio +async def test_task_execution_adapter_does_not_copy_case_input_into_state_metadata( + monkeypatch: pytest.MonkeyPatch, +) -> None: + class DemoTask: + id = "task-1" + + async def fake_run_task(*, task): + return TaskResponse(success=True, answer="ok") + + monkeypatch.setattr("aworld.evaluations.execution_adapters.Runners.run_task", fake_run_task) + spec = EvalExecutionSpec(mode=EvalExecutionMode.TASK, target_config={"task": DemoTask()}) + + state = await resolve_execution_adapter(spec).execute( + case=EvalCaseDef(case_id="case-1", input={"query": "demo", "large_blob": "x" * 1000}), + target={"target_kind": "task"}, + spec=spec, + ) + + assert state.metadata["_target"] == {"target_kind": "task"} + assert "query" not in state.metadata + assert "large_blob" not in state.metadata + + +@pytest.mark.asyncio +async def test_agent_eval_target_reuses_resolved_execution_spec(monkeypatch: pytest.MonkeyPatch) -> None: + seen_specs = [] + + class FakeAdapter: + def __init__(self, expected_spec): + self.expected_spec = expected_spec + + async def execute(self, *, case, target, spec): + assert spec is self.expected_spec + return TaskResponse(success=True, answer="ok") + + def fake_resolve(spec): + seen_specs.append(spec) + return FakeAdapter(spec) + + monkeypatch.setattr("aworld.evaluations.eval_targets.agent_eval.resolve_execution_adapter", fake_resolve) + + result = await AworldAgentEvalTarget(agent=object()).predict(0, {"query": "hello"}) + + assert result["answer"] == "ok" + assert len(seen_specs) == 1 + + def test_resolve_execution_adapter_rejects_missing_program_ref() -> None: with pytest.raises(ValueError, match="target_ref"): resolve_execution_adapter(EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM)) From 34202d4839a196dd86a3ae633ead5f8e3efe5c58 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 11:41:27 +0800 Subject: [PATCH 21/41] 
docs: add evaluator runtime composition change --- .../.openspec.yaml | 2 + .../design.md | 166 +++++++++++++ .../implementation-plan.md | 219 ++++++++++++++++++ .../proposal.md | 33 +++ .../specs/evaluation-substrate/spec.md | 89 +++++++ .../tasks.md | 41 ++++ 6 files changed, 550 insertions(+) create mode 100644 openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/.openspec.yaml create mode 100644 openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md create mode 100644 openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md create mode 100644 openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/proposal.md create mode 100644 openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md create mode 100644 openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/.openspec.yaml b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/.openspec.yaml new file mode 100644 index 000000000..2cb80411e --- /dev/null +++ b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-06-10 diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md new file mode 100644 index 000000000..5c27f61a1 --- /dev/null +++ b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md @@ -0,0 +1,166 @@ +## Context + +Evaluator v2 extensibility made the AWorld evaluator substrate more configurable, but it deliberately left verifiers-style runtime behavior out of scope. The current pipeline still compiles suites into the existing `EvalTarget -> Evaluator -> EvaluateRunner` skeleton and scores a single normalized state after execution. 
That is enough for post-hoc result and trajectory checks, but it cannot express: + +- a harness that owns rollout lifecycle and produces state +- controlled multi-turn user simulation +- per-step reward/reason records +- retry/fallback/wrapper harness composition +- child-state borrowing or links between rollout attempts +- an adoption suite that exercises these capabilities outside tests + +This change introduces runtime composition as a framework-owned layer under `aworld/evaluations/` while preserving the v2 single-shot substrate. + +## Goals / Non-Goals + +**Goals:** + +- Add a rollout-owning harness contract that executes evaluation cases and returns normalized rollout state. +- Support multi-turn rollout state with turns, messages, tool calls, usage, timing, terminal outcome, child-state links, and step rewards. +- Add a user simulator abstraction that can drive controlled multi-turn interactions. +- Add step-level reward definitions and aggregation so process quality can participate in reports and gates. +- Add at least one runtime composition wrapper, such as retry or fallback, that composes around a base harness. +- Add one builtin/adoption suite that uses typed judge schema, composite gate, trajectory scorer, and rollout harness together. +- Keep current static/agent/task/program single-shot flows compatible. + +**Non-Goals:** + +- Implementing a verifiers public API compatibility layer. +- Building a training optimizer, RL loop, or policy update system. +- Adding untrusted code execution, sandbox command execution, or package registry loading. +- Reworking `aworld-cli evaluator` UX or command syntax. +- Migrating every builtin suite in this change. +- Replacing `EvaluateRunner`; runtime composition should integrate with it through framework-owned targets/adapters. 
+ +## Ownership Model + +| Concept | Owns | Must not own | +| --- | --- | --- | +| `EvalSuiteDef` / `EvalCaseDef` | Domain inputs, judge schema, gates, scorer declarations, runtime references | Live runtime handles in declarative manifests | +| `EvalRuntimeHarnessDef` | Rollout lifecycle configuration, simulator wiring, reward hooks, composition wrappers | Judge/scorer report assembly | +| `RuntimeHarness` | Executing one case through a rollout and returning rollout state | Gate policy decisions | +| `UserSimulator` | Producing user turns from case, rollout state, and previous assistant output | Agent execution internals | +| `StepRewarder` | Per-step reward values and reasons | Mutating rollout state or model behavior | +| `RolloutState` / `EvalState` bridge | Serializable rollout transcript and state normalization | Live clients, sandboxes, runners | + +## Decisions + +### 1. Add a rollout-owning harness layer + +Introduce a framework-owned runtime harness abstraction separate from the lightweight `EvalHarnessDef` from v2 extensibility. The runtime harness owns the lifecycle of a rollout: + +1. initialize state for one case +2. ask the user simulator or case input for the next user turn +3. execute the target runtime for one assistant/tool step +4. record messages, tool calls, observations, rewards, usage, and timing +5. decide whether the rollout is terminal +6. return a normalized rollout state + +The first implementation should keep the public surface small: + +- `EvalRuntimeHarnessDef`: immutable configuration object +- `RuntimeHarness`: protocol or base class for executing one case +- `run_rollout(case, target, harness) -> RolloutState`: internal framework entry point +- compatibility bridge from rollout state into existing `EvalState` + +This is the first AWorld harness that owns rollout. The older `EvalHarnessDef` remains a compatibility holder for single-shot execution specs. + +### 2. 
Model rollout state explicitly + +Add a serializable rollout state model rather than overloading arbitrary trajectory dictionaries. It should include: + +- `case_id` +- `status` and terminal `answer` +- `turns`: ordered user/assistant/tool records +- `messages`: normalized conversation messages when available +- `trajectory`: scorer-compatible trajectory view +- `tool_calls` +- `step_rewards` +- `child_states` or `attempts` for composed runtimes +- `usage` +- `timing` +- `error` +- `metadata` + +The bridge into `EvalState` should preserve the existing state summary and scorer helpers. Existing trajectory scorers should work against the bridge without needing a report format fork. + +### 3. Add user simulator contracts + +Add a user simulator interface that can be deterministic and testable: + +- input: case, target, rollout state, last assistant output +- output: next user message, terminal signal, or simulator error + +Built-in simulators should start small: + +- scripted simulator over case-provided turns +- static single-prompt simulator for compatibility + +LLM-backed simulators are a future extension unless there is already an internal judge/backend pattern that can be reused without adding product scope. + +### 4. Add step-level rewards + +Add reward records independent of final judge output: + +- `metric_name` +- `step_index` +- `value` +- `reason` +- `metadata` + +Rewarders should be pure evaluators over rollout state or an individual step. They must not mutate state or call model execution. Aggregation should produce normal evaluator metrics, for example mean reward, total reward, pass/fail threshold status, and report-level gate inputs. + +### 5. Add runtime composition wrappers + +Add one wrapper mechanism in this change so composition is real, not only a type hierarchy. 
The first wrapper should be retry or fallback: + +- retry wrapper: reruns a base harness when terminal state is failed or a configured reward/gate condition is not met +- fallback wrapper: tries alternate harnesses when one fails + +The wrapper must preserve child/attempt state so reports can explain which attempt passed or failed. If supporting both wrapper styles would make the change too large, the first implementation should support only one of them. + +### 6. Add one adoption suite + +Add one builtin or framework-registered adoption suite that consumes the new runtime: + +- typed judge schema +- composite gate +- trajectory scorer +- step-level reward metric +- rollout-owning harness with scripted simulator + +This suite can be narrow and deterministic. Its purpose is to prove that the substrate is active in production code paths, not only in isolated unit tests. It should not replace `app-evaluator` unless that public contract is ready to change. + +### 7. Keep CLI additive + +`aworld-cli evaluator` should discover and run the adoption suite through existing suite selection paths. Do not add CLI-only runtime syntax in this change. If CLI ergonomics are needed later, handle them in a product-focused change after the framework contract settles. + +## Risks / Trade-offs + +- [Scope growth] -> Mitigation: ship one simulator, one wrapper style, and one adoption suite; defer untrusted execution, LLM simulators, and training loops. +- [Duplicate state models] -> Mitigation: rollout state must bridge into `EvalState` and reuse existing scorer/report helpers. +- [Hard-to-debug composed runs] -> Mitigation: preserve attempt/child state and reward reasons in serializable report metadata. +- [Adoption suite changes public behavior] -> Mitigation: add a new suite or opt-in registration rather than silently changing `app-evaluator`. 
+- [Runtime harness conflicts with existing adapter layer] -> Mitigation: keep adapters for single-shot execution; runtime harnesses own multi-turn rollout and may call adapters internally. + +## Migration Plan + +1. Add rollout state and harness interfaces without changing existing suite behavior. +2. Add scripted user simulator and reward records. +3. Add rollout target/adapter bridge into `EvalState`. +4. Add one runtime wrapper style with child-state reporting. +5. Add adoption suite that consumes typed schema, composite gate, trajectory scorer, rollout harness, and step rewards. +6. Keep existing evaluator regression suite green and add focused runtime-composition coverage. + +Rollback strategy: + +- runtime-composition suites are opt-in +- single-shot suite behavior and existing report fields remain compatible +- adoption suite can be unregistered or hidden without removing the underlying framework interfaces + +## Deferred Questions + +- LLM-backed user simulators should wait until deterministic scripted simulators are stable. +- Sandbox/command-backed harness execution should wait for a dedicated trusted execution change. +- Public API naming can be refined after the internal framework contract proves itself. +- Training reward integration should wait for a separate optimizer/training change. diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md new file mode 100644 index 000000000..e899c8f50 --- /dev/null +++ b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md @@ -0,0 +1,219 @@ +# AWorld Evaluator Runtime Composition Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Add rollout-owning evaluator runtime composition with multi-turn harnesses, user simulation, step-level rewards, and one adoption suite that actively consumes v2 evaluator capabilities. + +**Architecture:** Keep the current single-shot evaluator substrate intact. Add a runtime-composition layer under `aworld/evaluations/` that can execute multi-turn rollouts, normalize them into `EvalState`, aggregate reward metrics, and compose retry attempts while preserving child state. + +**Tech Stack:** Python dataclasses/protocols, AWorld evaluator substrate, existing scorer/report infrastructure, Pydantic for typed judge outputs, pytest, OpenSpec. + +--- + +## File Structure + +- Create: `aworld/evaluations/runtime_composition.py` + Rollout state, turn records, user simulator protocols, runtime harness protocols, and retry wrapper primitives. +- Modify: `aworld/evaluations/substrate.py` + Compile opt-in runtime-composition suites and register the adoption suite. +- Modify: `aworld/evaluations/execution.py` + Add rollout-state-to-`EvalState` normalization helpers if they do not fit cleanly in `runtime_composition.py`. +- Modify: `aworld/evaluations/report.py` + Preserve attempt/reward metadata in existing report shape without breaking schema. +- Modify: `aworld/evaluations/scorers/**` + Add reward aggregation scorers or reuse existing scorer infrastructure for step reward metrics. +- Test: `tests/evaluations/test_runtime_composition.py` + Focused tests for rollout state, simulator, harness, retry wrapper, reward aggregation, and adoption suite. +- Test: existing evaluator regression tests + Ensure single-shot behavior remains compatible. 
+ +## Task 1: Rollout State and Harness Contracts + +- [ ] **Step 1: Write failing rollout state tests** + +Add tests in `tests/evaluations/test_runtime_composition.py` for: + +```python +def test_rollout_state_to_eval_state_excludes_live_handles(): + live_agent = object() + state = RolloutState( + case_id="case-1", + status="success", + answer="done", + turns=[RolloutTurn(role="user", content="hello")], + metadata={"live_agent": live_agent, "safe": "ok"}, + ) + + eval_state = state.to_eval_state(target={"target_kind": "inline"}) + + assert eval_state.case_id == "case-1" + assert eval_state.answer == "done" + assert eval_state.trajectory + assert "live_agent" not in eval_state.metadata + assert eval_state.metadata["safe"] == "ok" +``` + +- [ ] **Step 2: Run test and confirm failure** + +Run: `pytest tests/evaluations/test_runtime_composition.py::test_rollout_state_to_eval_state_excludes_live_handles -q` + +Expected: FAIL because `runtime_composition.py` and `RolloutState` do not exist. + +- [ ] **Step 3: Add minimal rollout models** + +Create `aworld/evaluations/runtime_composition.py` with serializable dataclasses for `RolloutTurn`, `StepReward`, `RolloutState`, `EvalRuntimeHarnessDef`, `RuntimeHarness`, and `UserSimulator`. + +- [ ] **Step 4: Run rollout tests until green** + +Run: `pytest tests/evaluations/test_runtime_composition.py -q` + +Expected: PASS for initial rollout state tests. 
+ +## Task 2: Scripted User Simulator + +- [ ] **Step 1: Write failing simulator tests** + +Cover scripted turns and single-prompt behavior: + +```python +def test_scripted_user_simulator_emits_turns_in_order(): + simulator = ScriptedUserSimulator() + state = RolloutState(case_id="case-1") + case = EvalCaseDef(case_id="case-1", input={"turns": ["hi", "again"]}) + + first = simulator.next_turn(case=case, target={}, state=state, last_output=None) + state.turns.append(first) + second = simulator.next_turn(case=case, target={}, state=state, last_output="ok") + + assert first.content == "hi" + assert second.content == "again" +``` + +- [ ] **Step 2: Run simulator tests and confirm failure** + +Run: `pytest tests/evaluations/test_runtime_composition.py::test_scripted_user_simulator_emits_turns_in_order -q` + +Expected: FAIL because simulator implementation does not exist. + +- [ ] **Step 3: Implement scripted and single-prompt simulators** + +Add `ScriptedUserSimulator` and `SinglePromptUserSimulator` to `runtime_composition.py`. + +- [ ] **Step 4: Run simulator tests until green** + +Run: `pytest tests/evaluations/test_runtime_composition.py -q` + +Expected: PASS. + +## Task 3: Runtime Harness Execution + +- [ ] **Step 1: Write failing harness execution tests** + +Add a deterministic harness test that consumes simulator turns and returns rollout state with assistant turns and trajectory. + +- [ ] **Step 2: Run harness test and confirm failure** + +Run: `pytest tests/evaluations/test_runtime_composition.py::test_runtime_harness_executes_multi_turn_rollout -q` + +Expected: FAIL because harness implementation does not exist. + +- [ ] **Step 3: Implement a minimal scripted runtime harness** + +Add a framework test harness or deterministic harness class that uses a simulator and a callable assistant step function. + +- [ ] **Step 4: Run harness tests until green** + +Run: `pytest tests/evaluations/test_runtime_composition.py -q` + +Expected: PASS. 
+ +## Task 4: Step Rewards and Aggregation + +- [ ] **Step 1: Write failing reward aggregation tests** + +Cover reward records becoming case and aggregate metrics. + +- [ ] **Step 2: Run reward tests and confirm failure** + +Run: `pytest tests/evaluations/test_runtime_composition.py::test_step_rewards_aggregate_into_metrics -q` + +Expected: FAIL because reward aggregation is not wired. + +- [ ] **Step 3: Implement step reward records and aggregation scorer** + +Use existing scorer/report metric shapes. Keep reward metrics distinct from judge metrics. + +- [ ] **Step 4: Run reward tests until green** + +Run: `pytest tests/evaluations/test_runtime_composition.py -q` + +Expected: PASS. + +## Task 5: Retry Wrapper Composition + +- [ ] **Step 1: Write failing retry wrapper tests** + +Cover failed first attempt, successful second attempt, and preserved child/attempt state. + +- [ ] **Step 2: Run retry tests and confirm failure** + +Run: `pytest tests/evaluations/test_runtime_composition.py::test_retry_wrapper_preserves_failed_attempts -q` + +Expected: FAIL because retry wrapper does not exist. + +- [ ] **Step 3: Implement retry wrapper** + +Add a retry wrapper around a base `RuntimeHarness` with max attempts and selected terminal attempt. + +- [ ] **Step 4: Run retry tests until green** + +Run: `pytest tests/evaluations/test_runtime_composition.py -q` + +Expected: PASS. + +## Task 6: Adoption Suite + +- [ ] **Step 1: Write failing adoption suite tests** + +Assert the new suite is registered, uses typed judge schema, composite gate, trajectory scorer, step reward metric, scripted simulator, and runtime harness. + +- [ ] **Step 2: Run adoption tests and confirm failure** + +Run: `pytest tests/evaluations/test_runtime_composition.py::test_runtime_composition_adoption_suite_runs_end_to_end -q` + +Expected: FAIL because suite does not exist. + +- [ ] **Step 3: Implement opt-in adoption suite** + +Add a narrow deterministic suite without changing `app-evaluator` behavior. 
+ +- [ ] **Step 4: Run adoption tests until green** + +Run: `pytest tests/evaluations/test_runtime_composition.py tests/evaluations/test_evaluation_substrate.py -q` + +Expected: PASS. + +## Task 7: Verification and Commit + +- [ ] **Step 1: Run evaluator regression suite** + +Run: + +```bash +pytest tests/evaluations/test_execution_state.py tests/evaluations/test_execution_adapters.py tests/evaluations/test_evaluation_substrate.py tests/evaluations/test_runtime_composition.py tests/core/test_evaluator_runtime.py tests/core/test_evaluator_top_level_command.py tests/plugins/test_plugin_hooks.py tests/test_plugin_cli_entrypoint.py tests/docs/test_evaluator_report_docs.py -q +``` + +Expected: PASS. + +- [ ] **Step 2: Validate OpenSpec** + +Run: `openspec validate aworld-evaluator-runtime-composition-2026-06-10 --strict` + +Expected: `Change 'aworld-evaluator-runtime-composition-2026-06-10' is valid` + +- [ ] **Step 3: Commit** + +```bash +git add aworld/evaluations/runtime_composition.py aworld/evaluations/substrate.py aworld/evaluations/execution.py aworld/evaluations/report.py aworld/evaluations/scorers tests/evaluations/test_runtime_composition.py openspec/changes/aworld-evaluator-runtime-composition-2026-06-10 +git commit -m "feat: add evaluator runtime composition" +``` diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/proposal.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/proposal.md new file mode 100644 index 000000000..5b0ef8bcc --- /dev/null +++ b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/proposal.md @@ -0,0 +1,33 @@ +## Why + +`aworld-evaluator-v2-extensibility-2026-06-09` hardened the single-shot evaluator substrate with execution adapters, typed judge schemas, composite gates, bounded `PROGRAM` execution, and suite-declared trajectory scorers. 
That change intentionally stopped short of verifiers-style runtime composition: + +- `EvalHarnessDef` is a lightweight execution-spec holder, not a rollout-owning runtime object +- trajectory evaluation inspects the already captured single-shot `EvalState.trajectory` +- there is no user simulator, lifecycle hook model, child-state composition, retry/fallback harness composition, or step-level reward contract +- no builtin or adoption suite currently exercises typed judge + composite gate + trajectory scorer + rollout runtime together + +The result is useful framework substrate, but not yet a complete runtime-composition evaluation capability. This change adds the missing runtime layer and proves it through one concrete adoption suite. + +## What Changes + +- Add a rollout-owning evaluator runtime harness abstraction that can execute multi-turn cases and produce normalized rollout state. +- Add multi-turn rollout state with turns, messages, tool calls, terminal outcome, step rewards, and child-state links. +- Add a user simulator contract for controlled multi-turn agent/user evaluation. +- Add step-level reward definitions and aggregation into report metrics and gates. +- Add runtime composition wrappers, starting with retry/fallback or equivalent wrapper harness semantics. +- Add one builtin/adoption suite that actually uses typed judge output, composite gates, trajectory scoring, and the new rollout-owning harness. +- Keep existing single-shot evaluator flows compatible and avoid changing the `aworld-cli evaluator` command shape. + +## Capabilities + +### Modified Capabilities + +- `evaluation-substrate`: add rollout-owning runtime composition, multi-turn harness execution, user simulation, step-level reward scoring, and one adoption suite that consumes the v2 substrate capabilities end to end. 
+ +## Impact + +- Affected code: `aworld/evaluations/**`, especially substrate definitions, execution/runtime orchestration, scorer integration, report assembly, and builtin suite registration. +- Affected APIs: framework-owned evaluator APIs gain additive runtime-composition contracts; existing suite-backed and legacy evaluation callers remain valid. +- Affected tests: add focused coverage for harness rollout, user simulation, reward aggregation, runtime wrappers, report/gate integration, and adoption suite behavior. +- Affected docs: clarify the difference between single-shot evaluation and rollout-owning runtime composition. diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md new file mode 100644 index 000000000..c9470449b --- /dev/null +++ b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md @@ -0,0 +1,89 @@ +## MODIFIED Requirements + +### Requirement: Runtime-composed evaluation harnesses + +Suite-backed evaluation flows SHALL support opt-in rollout-owning runtime harnesses that execute multi-turn cases and produce normalized rollout state while preserving existing single-shot evaluator behavior. 
+ +#### Scenario: Suite selects a rollout-owning harness +- **WHEN** a suite-backed evaluator declares a runtime-composition harness +- **THEN** the framework SHALL execute the case through that harness lifecycle rather than treating the harness as only an execution-spec holder + +#### Scenario: Existing single-shot suites remain compatible +- **WHEN** a suite-backed evaluator does not declare a runtime-composition harness +- **THEN** the framework SHALL preserve the current static, agent, task, and program execution behavior + +#### Scenario: Runtime harness returns rollout state +- **WHEN** a runtime harness completes a case rollout +- **THEN** the framework SHALL normalize the rollout into evaluator state containing terminal answer, trajectory, tool calls, usage, timing, error, and metadata fields usable by existing scorer helpers + +### Requirement: Multi-turn rollout state + +Runtime-composed evaluation flows SHALL represent multi-turn execution as serializable rollout state. + +#### Scenario: Rollout has multiple turns +- **WHEN** a runtime-composed harness executes multiple user/assistant/tool turns +- **THEN** the framework SHALL preserve ordered turns, normalized messages, trajectory entries, tool calls, terminal status, and terminal answer + +#### Scenario: Runtime composition creates child states +- **WHEN** a runtime wrapper retries or falls back to another harness attempt +- **THEN** the framework SHALL preserve child or attempt state so reports can explain the composed execution path + +#### Scenario: Rollout state is serializable +- **WHEN** rollout state is converted into evaluator state or report payloads +- **THEN** the framework SHALL exclude live runtime handles, clients, agent instances, and simulator objects + +### Requirement: User simulation + +Runtime-composed evaluation flows SHALL support deterministic user simulators that drive controlled multi-turn rollouts. 
+ +#### Scenario: Scripted simulator provides turns +- **WHEN** a case includes scripted user turns +- **THEN** the framework SHALL let the scripted simulator provide those turns in order until it reaches a terminal condition + +#### Scenario: Single-prompt simulator preserves one-shot behavior +- **WHEN** a case only includes a single prompt or query +- **THEN** the framework SHALL support a simulator that emits one user turn and then terminates unless the harness requests additional turns + +#### Scenario: Simulator errors are captured +- **WHEN** a user simulator cannot produce a valid next turn +- **THEN** the framework SHALL mark the rollout state as failed with a serializable error rather than storing the simulator object + +### Requirement: Step-level rewards + +Runtime-composed evaluation flows SHALL support step-level reward records that can be aggregated into normal evaluator metrics. + +#### Scenario: Step rewarder evaluates rollout steps +- **WHEN** a step rewarder inspects a rollout step +- **THEN** it SHALL emit a reward record containing metric name, step index, numeric value, reason, and serializable metadata + +#### Scenario: Rewards aggregate into metrics +- **WHEN** a rollout contains step reward records +- **THEN** the framework SHALL aggregate configured reward metrics into case metrics, aggregate metrics, and structured gate inputs + +#### Scenario: Rewards do not replace final judge output +- **WHEN** a suite uses both typed judge output and step rewards +- **THEN** the framework SHALL keep judge metrics and reward metrics distinct while allowing composite gates to reference both + +### Requirement: Runtime composition wrappers + +Runtime-composed evaluation flows SHALL support at least one wrapper harness that composes around a base harness and preserves attempt state. 
+ +#### Scenario: Retry wrapper reruns failed rollout +- **WHEN** a retry wrapper receives a failed terminal rollout or a configured failed reward condition +- **THEN** it SHALL rerun the base harness up to the configured limit and preserve each attempt as child or attempt state + +#### Scenario: Retry wrapper reports terminal attempt +- **WHEN** a retry wrapper finishes +- **THEN** it SHALL expose the selected terminal attempt as the main rollout state while retaining previous attempts for inspection + +### Requirement: Runtime-composition adoption suite + +The framework SHALL include one opt-in adoption suite that exercises runtime composition and v2 extensibility together. + +#### Scenario: Adoption suite uses active v2 capabilities +- **WHEN** the adoption suite is selected +- **THEN** it SHALL use a typed judge schema, composite gate, trajectory scorer, step-level reward metric, scripted user simulator, and rollout-owning harness + +#### Scenario: App evaluator remains unchanged +- **WHEN** callers use the existing `app-evaluator` suite +- **THEN** its behavior SHALL remain compatible unless a later explicit migration change updates that suite diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md new file mode 100644 index 000000000..4e39aad4c --- /dev/null +++ b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md @@ -0,0 +1,41 @@ +## 1. Runtime Harness Model + +- [ ] 1.1 Add a rollout-owning runtime harness definition separate from lightweight `EvalHarnessDef`. +- [ ] 1.2 Add a `RuntimeHarness` protocol or base class that executes one case and returns rollout state. +- [ ] 1.3 Preserve existing single-shot static/agent/task/program flows unchanged. + +## 2. Rollout State + +- [ ] 2.1 Add a serializable rollout state model with turns, messages, trajectory, tool calls, usage, timing, errors, metadata, and child/attempt state. 
+- [ ] 2.2 Bridge rollout state into existing `EvalState` so current scorer helpers and report summaries keep working. +- [ ] 2.3 Add tests proving rollout state does not store live runtime handles. + +## 3. User Simulation + +- [ ] 3.1 Add a deterministic user simulator contract. +- [ ] 3.2 Add a scripted simulator that reads turns from case input. +- [ ] 3.3 Add a single-prompt simulator for compatibility with current one-shot cases. + +## 4. Step-Level Rewards + +- [ ] 4.1 Add step reward records with metric name, step index, value, reason, and metadata. +- [ ] 4.2 Add rewarder interfaces that inspect rollout state without mutating it. +- [ ] 4.3 Aggregate step rewards into normal evaluator metrics and gate inputs. + +## 5. Runtime Composition + +- [ ] 5.1 Add one runtime wrapper style, preferably retry, around a base runtime harness. +- [ ] 5.2 Preserve child/attempt state for composed runs. +- [ ] 5.3 Add tests for retry/fallback state, terminal status, and report visibility. + +## 6. Adoption Suite + +- [ ] 6.1 Add one builtin or framework-registered adoption suite that uses the runtime-composition path. +- [ ] 6.2 The adoption suite uses typed judge schema, composite gate, trajectory scorer, step-level reward, and scripted simulator. +- [ ] 6.3 Keep `app-evaluator` behavior unchanged unless explicitly selected for migration later. + +## 7. Verification + +- [ ] 7.1 Add focused tests for harness rollout, user simulator, reward aggregation, runtime wrapper composition, and adoption suite execution. +- [ ] 7.2 Run the evaluator regression suite. +- [ ] 7.3 Validate this OpenSpec change with `openspec validate aworld-evaluator-runtime-composition-2026-06-10 --strict`. 
From 444a705734c41986f273e760cf481f33b289cf31 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 14:19:24 +0800 Subject: [PATCH 22/41] docs: align runtime composition outcomes and trials --- .../design.md | 86 ++++++++++++--- .../implementation-plan.md | 102 ++++++++++++++---- .../proposal.md | 6 +- .../specs/evaluation-substrate/spec.md | 48 ++++++++- .../tasks.md | 54 ++++++---- 5 files changed, 237 insertions(+), 59 deletions(-) diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md index 5c27f61a1..4f42450fe 100644 --- a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md +++ b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md @@ -5,11 +5,13 @@ Evaluator v2 extensibility made the AWorld evaluator substrate more configurable - a harness that owns rollout lifecycle and produces state - controlled multi-turn user simulation - per-step reward/reason records +- outcome checks against final environment or artifact state - retry/fallback/wrapper harness composition - child-state borrowing or links between rollout attempts +- trial-based pass@k/pass^k metrics for nondeterministic agents - an adoption suite that exercises these capabilities outside tests -This change introduces runtime composition as a framework-owned layer under `aworld/evaluations/` while preserving the v2 single-shot substrate. +This change introduces runtime composition as a framework-owned layer under `aworld/evaluations/` while preserving the v2 single-shot substrate. It also adds outcome/state-check grading because outcome verification is tightly coupled to rollout state and environment snapshots. Trial-based pass@k/pass^k execution is explicitly deferred because retry composition and independent trials have different semantics. 
## Goals / Non-Goals @@ -18,9 +20,11 @@ This change introduces runtime composition as a framework-owned layer under `awo - Add a rollout-owning harness contract that executes evaluation cases and returns normalized rollout state. - Support multi-turn rollout state with turns, messages, tool calls, usage, timing, terminal outcome, child-state links, and step rewards. - Add a user simulator abstraction that can drive controlled multi-turn interactions. +- Add outcome/state-check grader definitions that verify final environment or artifact state separately from final text answer and trajectory. - Add step-level reward definitions and aggregation so process quality can participate in reports and gates. - Add at least one runtime composition wrapper, such as retry or fallback, that composes around a base harness. -- Add one builtin/adoption suite that uses typed judge schema, composite gate, trajectory scorer, and rollout harness together. +- Add one builtin/adoption suite that uses typed judge schema, composite gate, outcome grader, trajectory scorer, step reward, and rollout harness together. +- Add standard rollout metrics such as turn count, tool-call count, token usage, and timing/latency when the underlying runtime exposes them. - Keep current static/agent/task/program single-shot flows compatible. **Non-Goals:** @@ -28,6 +32,9 @@ This change introduces runtime composition as a framework-owned layer under `awo - Implementing a verifiers public API compatibility layer. - Building a training optimizer, RL loop, or policy update system. - Adding untrusted code execution, sandbox command execution, or package registry loading. +- Adding clean-environment isolation or sandbox reset semantics for each trial. +- Adding multi-trial execution, pass@k, pass^k, or trial-distribution metrics. +- Adding LLM-backed adaptive user simulation. - Reworking `aworld-cli evaluator` UX or command syntax. - Migrating every builtin suite in this change. 
- Replacing `EvaluateRunner`; runtime composition should integrate with it through framework-owned targets/adapters. @@ -40,6 +47,7 @@ This change introduces runtime composition as a framework-owned layer under `awo | `EvalRuntimeHarnessDef` | Rollout lifecycle configuration, simulator wiring, reward hooks, composition wrappers | Judge/scorer report assembly | | `RuntimeHarness` | Executing one case through a rollout and returning rollout state | Gate policy decisions | | `UserSimulator` | Producing user turns from case, rollout state, and previous assistant output | Agent execution internals | +| `OutcomeGrader` / `StateCheckGrader` | Checking final environment, artifact, or domain state | Driving rollout turns or replacing trajectory scoring | | `StepRewarder` | Per-step reward values and reasons | Mutating rollout state or model behavior | | `RolloutState` / `EvalState` bridge | Serializable rollout transcript and state normalization | Live clients, sandboxes, runners | @@ -76,15 +84,36 @@ Add a serializable rollout state model rather than overloading arbitrary traject - `trajectory`: scorer-compatible trajectory view - `tool_calls` - `step_rewards` +- `outcome`: final answer plus optional environment/artifact snapshot references and state-check results - `child_states` or `attempts` for composed runtimes - `usage` - `timing` +- `standard_metrics`: turn count, tool-call count, token counts, and latency/timing metrics derived from rollout state - `error` - `metadata` The bridge into `EvalState` should preserve the existing state summary and scorer helpers. Existing trajectory scorers should work against the bridge without needing a report format fork. -### 3. Add user simulator contracts +### 3. Add outcome/state-check grading + +Outcome evaluation must not be limited to terminal text. 
Add an outcome/state-check contract that can verify the final state produced by a rollout: + +- file or artifact existence/content checks +- structured environment snapshot checks +- database or domain-state assertions when the harness provides a serializable snapshot +- test-command or sandbox checks only through a future trusted execution/sandbox change + +The first implementation should keep state checks deterministic and in-process. A state-check grader receives the `RolloutState`, case, target, and optional serializable environment snapshot. It returns normal evaluator metric results plus structured details explaining which checks passed or failed. + +Outcome graders are distinct from: + +- typed judge output, which evaluates semantic result quality +- trajectory scorers, which evaluate process/transcript quality +- step rewarders, which evaluate individual rollout steps + +Composite gates may reference all of these metric families side by side. + +### 4. Add user simulator contracts Add a user simulator interface that can be deterministic and testable: @@ -96,21 +125,23 @@ Built-in simulators should start small: - scripted simulator over case-provided turns - static single-prompt simulator for compatibility -LLM-backed simulators are a future extension unless there is already an internal judge/backend pattern that can be reused without adding product scope. +LLM-backed simulators are a future extension. Scripted simulators are sufficient for this change's deterministic adoption suite, but they do not complete adaptive conversation-agent evaluation. -### 4. Add step-level rewards +### 5. Add step-level rewards Add reward records independent of final judge output: - `metric_name` - `step_index` - `value` +- `weight` +- `partial_credit` - `reason` - `metadata` -Rewarders should be pure evaluators over rollout state or an individual step. They must not mutate state or call model execution. 
Aggregation should produce normal evaluator metrics, for example mean reward, total reward, pass/fail threshold status, and report-level gate inputs. +Rewarders should be pure evaluators over rollout state or an individual step. They must not mutate state or call model execution. Aggregation should produce normal evaluator metrics, for example weighted mean reward, total reward, partial-credit rate, pass/fail threshold status, and report-level gate inputs. -### 5. Add runtime composition wrappers +### 6. Add runtime composition wrappers Add one wrapper mechanism in this change so composition is real, not only a type hierarchy. The first wrapper should be retry or fallback: @@ -119,38 +150,61 @@ Add one wrapper mechanism in this change so composition is real, not only a type The wrapper must preserve child/attempt state so reports can explain which attempt passed or failed. The first implementation should support one wrapper style only if both would make the change too large. -### 6. Add one adoption suite +Retry and fallback are not trials. Retry is an execution strategy that tries to produce one terminal rollout; trials are independent repeated evaluations used to estimate nondeterministic performance. This change must not label retry results as pass@k or pass^k. + +### 7. Add suite purpose metadata and standard metrics + +Suites should be able to describe whether they are intended for capability evaluation or regression evaluation. 
The first implementation can use suite metadata, for example: + +- `evaluation_purpose`: `capability` or `regression` +- `expected_pass_rate`: optional descriptive threshold or range + +Runtime-composed harnesses should derive standard transcript and latency metrics when data is available: + +- `n_turns` +- `n_tool_calls` +- `n_tokens` or token usage fields +- wall-clock duration / time cost +- optional first-token or first-action latency when exposed by the runtime + +Suites can still declare custom metrics, but these baseline metrics should not require every suite to hand-roll them. + +### 8. Add one adoption suite Add one builtin or framework-registered adoption suite that consumes the new runtime: - typed judge schema - composite gate +- outcome/state-check grader - trajectory scorer - step-level reward metric - rollout-owning harness with scripted simulator +- suite metadata marking whether the suite is for capability or regression use This suite can be narrow and deterministic. Its purpose is to prove that the substrate is active in production code paths, not only in isolated unit tests. It should not replace `app-evaluator` unless that public contract is ready to change. -### 7. Keep CLI additive +### 9. Keep CLI additive `aworld-cli evaluator` should discover and run the adoption suite through existing suite selection paths. Do not add CLI-only runtime syntax in this change. If CLI ergonomics are needed later, handle them in a product-focused change after the framework contract settles. ## Risks / Trade-offs -- [Scope growth] -> Mitigation: ship one simulator, one wrapper style, and one adoption suite; defer untrusted execution, LLM simulators, and training loops. +- [Scope growth] -> Mitigation: ship one simulator, one wrapper style, deterministic outcome checks, and one adoption suite; defer untrusted execution, LLM simulators, trials, and training loops. 
- [Duplicate state models] -> Mitigation: rollout state must bridge into `EvalState` and reuse existing scorer/report helpers. - [Hard-to-debug composed runs] -> Mitigation: preserve attempt/child state and reward reasons in serializable report metadata. - [Adoption suite changes public behavior] -> Mitigation: add a new suite or opt-in registration rather than silently changing `app-evaluator`. - [Runtime harness conflicts with existing adapter layer] -> Mitigation: keep adapters for single-shot execution; runtime harnesses own multi-turn rollout and may call adapters internally. +- [Retry metrics confused with pass@k] -> Mitigation: document retry as one composed rollout, not independent trials, and defer pass@k/pass^k to a separate multi-trial change. ## Migration Plan 1. Add rollout state and harness interfaces without changing existing suite behavior. -2. Add scripted user simulator and reward records. -3. Add rollout target/adapter bridge into `EvalState`. -4. Add one runtime wrapper style with child-state reporting. -5. Add adoption suite that consumes typed schema, composite gate, trajectory scorer, rollout harness, and step rewards. -6. Keep existing evaluator regression suite green and add focused runtime-composition coverage. +2. Add outcome/state-check grader contracts and deterministic state-check metrics. +3. Add scripted user simulator and reward records with weights and partial credit. +4. Add rollout target/adapter bridge into `EvalState`. +5. Add one runtime wrapper style with child-state reporting while keeping retry distinct from trials. +6. Add adoption suite that consumes typed schema, composite gate, outcome grader, trajectory scorer, rollout harness, and step rewards. +7. Keep existing evaluator regression suite green and add focused runtime-composition coverage. Rollback strategy: @@ -162,5 +216,7 @@ Rollback strategy: - LLM-backed user simulators should wait until deterministic scripted simulators are stable. 
- Sandbox/command-backed harness execution should wait for a dedicated trusted execution change. +- Clean-environment reset semantics should wait for a sandbox/environment isolation change. +- Multi-trial execution, pass@k, and pass^k should be handled in a separate evaluator-trials change. - Public API naming can be refined after the internal framework contract proves itself. - Training reward integration should wait for a separate optimizer/training change. diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md index e899c8f50..8c84ba236 100644 --- a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md +++ b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md @@ -2,9 +2,9 @@ > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. -**Goal:** Add rollout-owning evaluator runtime composition with multi-turn harnesses, user simulation, step-level rewards, and one adoption suite that actively consumes v2 evaluator capabilities. +**Goal:** Add rollout-owning evaluator runtime composition with multi-turn harnesses, outcome/state-check grading, user simulation, step-level rewards, and one adoption suite that actively consumes v2 evaluator capabilities. -**Architecture:** Keep the current single-shot evaluator substrate intact. Add a runtime-composition layer under `aworld/evaluations/` that can execute multi-turn rollouts, normalize them into `EvalState`, aggregate reward metrics, and compose retry attempts while preserving child state. +**Architecture:** Keep the current single-shot evaluator substrate intact. 
Add a runtime-composition layer under `aworld/evaluations/` that can execute multi-turn rollouts, normalize them into `EvalState`, evaluate final outcome snapshots, aggregate weighted reward metrics, derive standard rollout metrics, and compose retry attempts while preserving child state. Retry remains an execution wrapper, not pass@k/pass^k trial evaluation. **Tech Stack:** Python dataclasses/protocols, AWorld evaluator substrate, existing scorer/report infrastructure, Pydantic for typed judge outputs, pytest, OpenSpec. @@ -13,17 +13,17 @@ ## File Structure - Create: `aworld/evaluations/runtime_composition.py` - Rollout state, turn records, user simulator protocols, runtime harness protocols, and retry wrapper primitives. + Rollout state, turn records, outcome check records, user simulator protocols, runtime harness protocols, reward records, and retry wrapper primitives. - Modify: `aworld/evaluations/substrate.py` Compile opt-in runtime-composition suites and register the adoption suite. - Modify: `aworld/evaluations/execution.py` Add rollout-state-to-`EvalState` normalization helpers if they do not fit cleanly in `runtime_composition.py`. - Modify: `aworld/evaluations/report.py` - Preserve attempt/reward metadata in existing report shape without breaking schema. + Preserve attempt/reward/outcome metadata in existing report shape without breaking schema. - Modify: `aworld/evaluations/scorers/**` - Add reward aggregation scorers or reuse existing scorer infrastructure for step reward metrics. + Add outcome and reward aggregation scorers or reuse existing scorer infrastructure for those metrics. - Test: `tests/evaluations/test_runtime_composition.py` - Focused tests for rollout state, simulator, harness, retry wrapper, reward aggregation, and adoption suite. + Focused tests for rollout state, outcome grading, simulator, harness, retry wrapper, reward aggregation, standard metrics, and adoption suite. 
- Test: existing evaluator regression tests Ensure single-shot behavior remains compatible. @@ -41,6 +41,7 @@ def test_rollout_state_to_eval_state_excludes_live_handles(): status="success", answer="done", turns=[RolloutTurn(role="user", content="hello")], + outcome={"artifact_exists": True}, metadata={"live_agent": live_agent, "safe": "ok"}, ) @@ -49,6 +50,7 @@ def test_rollout_state_to_eval_state_excludes_live_handles(): assert eval_state.case_id == "case-1" assert eval_state.answer == "done" assert eval_state.trajectory + assert eval_state.artifacts["outcome"]["artifact_exists"] is True assert "live_agent" not in eval_state.metadata assert eval_state.metadata["safe"] == "ok" ``` @@ -61,7 +63,7 @@ Expected: FAIL because `runtime_composition.py` and `RolloutState` do not exist. - [ ] **Step 3: Add minimal rollout models** -Create `aworld/evaluations/runtime_composition.py` with serializable dataclasses for `RolloutTurn`, `StepReward`, `RolloutState`, `EvalRuntimeHarnessDef`, `RuntimeHarness`, and `UserSimulator`. +Create `aworld/evaluations/runtime_composition.py` with serializable dataclasses for `RolloutTurn`, `OutcomeCheckResult`, `StepReward`, `RolloutState`, `EvalRuntimeHarnessDef`, `RuntimeHarness`, and `UserSimulator`. - [ ] **Step 4: Run rollout tests until green** @@ -69,7 +71,49 @@ Run: `pytest tests/evaluations/test_runtime_composition.py -q` Expected: PASS for initial rollout state tests. 
-## Task 2: Scripted User Simulator +## Task 2: Outcome / State-Check Grading + +- [ ] **Step 1: Write failing outcome grader tests** + +Cover deterministic final-state checks: + +```python +def test_state_check_grader_emits_outcome_metric(): + state = RolloutState( + case_id="case-1", + status="success", + outcome={"ticket": {"status": "resolved"}}, + ) + grader = StateCheckGrader( + metric_name="ticket_resolved", + path=("ticket", "status"), + expected="resolved", + ) + + result = grader.grade(state=state, case=None, target={}) + + assert result.metric_name == "ticket_resolved" + assert result.value == 1.0 + assert result.passed is True +``` + +- [ ] **Step 2: Run outcome tests and confirm failure** + +Run: `pytest tests/evaluations/test_runtime_composition.py::test_state_check_grader_emits_outcome_metric -q` + +Expected: FAIL because `StateCheckGrader` does not exist. + +- [ ] **Step 3: Implement deterministic state-check grader** + +Add an in-process state-check grader that reads serializable rollout `outcome` data and emits normal metric-compatible results. Reject checks that require command execution, sandbox reset, or non-serializable live handles. + +- [ ] **Step 4: Run outcome tests until green** + +Run: `pytest tests/evaluations/test_runtime_composition.py -q` + +Expected: PASS. + +## Task 3: Scripted User Simulator - [ ] **Step 1: Write failing simulator tests** @@ -105,7 +149,7 @@ Run: `pytest tests/evaluations/test_runtime_composition.py -q` Expected: PASS. -## Task 3: Runtime Harness Execution +## Task 4: Runtime Harness Execution - [ ] **Step 1: Write failing harness execution tests** @@ -127,11 +171,11 @@ Run: `pytest tests/evaluations/test_runtime_composition.py -q` Expected: PASS. -## Task 4: Step Rewards and Aggregation +## Task 5: Step Rewards and Aggregation - [ ] **Step 1: Write failing reward aggregation tests** -Cover reward records becoming case and aggregate metrics. 
+Cover aggregation of reward records into weighted and partial-credit case and aggregate metrics. - [ ] **Step 2: Run reward tests and confirm failure** @@ -141,7 +185,7 @@ Expected: FAIL because reward aggregation is not wired. - [ ] **Step 3: Implement step reward records and aggregation scorer** -Use existing scorer/report metric shapes. Keep reward metrics distinct from judge metrics. +Use existing scorer/report metric shapes. Keep reward metrics distinct from judge and outcome metrics. - [ ] **Step 4: Run reward tests until green** @@ -149,11 +193,11 @@ Run: `pytest tests/evaluations/test_runtime_composition.py -q` Expected: PASS. -## Task 5: Retry Wrapper Composition +## Task 6: Retry Wrapper Composition - [ ] **Step 1: Write failing retry wrapper tests** -Cover failed first attempt, successful second attempt, and preserved child/attempt state. +Cover failed first attempt, successful second attempt, preserved child/attempt state, and explicit absence of pass@k/pass^k labels. - [ ] **Step 2: Run retry tests and confirm failure** @@ -163,7 +207,7 @@ Expected: FAIL because retry wrapper does not exist. - [ ] **Step 3: Implement retry wrapper** -Add a retry wrapper around a base `RuntimeHarness` with max attempts and selected terminal attempt. +Add a retry wrapper around a base `RuntimeHarness` with max attempts and selected terminal attempt. Preserve attempts as child state and do not emit trial metrics. - [ ] **Step 4: Run retry tests until green** @@ -171,11 +215,33 @@ Run: `pytest tests/evaluations/test_runtime_composition.py -q` Expected: PASS. -## Task 6: Adoption Suite +## Task 7: Standard Metrics and Suite Purpose + +- [ ] **Step 1: Write failing standard metric tests** + +Cover `n_turns`, `n_tool_calls`, token usage, and duration derivation from rollout state.
+ +- [ ] **Step 2: Run standard metric tests and confirm failure** + +Run: `pytest tests/evaluations/test_runtime_composition.py::test_rollout_standard_metrics_are_derived -q` + +Expected: FAIL because standard metric derivation does not exist. + +- [ ] **Step 3: Implement standard metric derivation and purpose metadata preservation** + +Add rollout standard metrics and preserve suite metadata such as `evaluation_purpose="capability"` or `evaluation_purpose="regression"` in report context. + +- [ ] **Step 4: Run standard metric tests until green** + +Run: `pytest tests/evaluations/test_runtime_composition.py -q` + +Expected: PASS. + +## Task 8: Adoption Suite - [ ] **Step 1: Write failing adoption suite tests** -Assert the new suite is registered, uses typed judge schema, composite gate, trajectory scorer, step reward metric, scripted simulator, and runtime harness. +Assert the new suite is registered, uses typed judge schema, composite gate, outcome/state-check grader, trajectory scorer, step reward metric, scripted simulator, purpose metadata, and runtime harness. - [ ] **Step 2: Run adoption tests and confirm failure** @@ -193,7 +259,7 @@ Run: `pytest tests/evaluations/test_runtime_composition.py tests/evaluations/tes Expected: PASS. 
-## Task 7: Verification and Commit +## Task 9: Verification and Commit - [ ] **Step 1: Run evaluator regression suite** diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/proposal.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/proposal.md index 5b0ef8bcc..ba5d591bf 100644 --- a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/proposal.md +++ b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/proposal.md @@ -5,18 +5,22 @@ - `EvalHarnessDef` is a lightweight execution-spec holder, not a rollout-owning runtime object - trajectory evaluation inspects the already captured single-shot `EvalState.trajectory` - there is no user simulator, lifecycle hook model, child-state composition, retry/fallback harness composition, or step-level reward contract +- there is no explicit outcome/environment-state grader for verifying final external state +- there is no multi-trial execution model for pass@k- or pass^k-style nondeterminism metrics - no builtin or adoption suite currently exercises typed judge + composite gate + trajectory scorer + rollout runtime together -The result is useful framework substrate, but not yet a complete runtime-composition evaluation capability. This change adds the missing runtime layer and proves it through one concrete adoption suite. +The result is useful framework substrate, but not yet a complete runtime-composition evaluation capability. This change adds the missing rollout/runtime layer, adds outcome/state-check grading, and proves it through one concrete adoption suite. Multi-trial pass@k/pass^k execution remains a separate follow-up because it cuts across execution scheduling and statistical aggregation rather than harness retry behavior. ## What Changes - Add a rollout-owning evaluator runtime harness abstraction that can execute multi-turn cases and produce normalized rollout state.
- Add multi-turn rollout state with turns, messages, tool calls, terminal outcome, step rewards, and child-state links. - Add a user simulator contract for controlled multi-turn agent/user evaluation. +- Add an outcome/state-check grader contract for verifying final environment or artifact state separately from text answer and trajectory. - Add step-level reward definitions and aggregation into report metrics and gates. - Add runtime composition wrappers, starting with retry/fallback or equivalent wrapper harness semantics. - Add one builtin/adoption suite that actually uses typed judge output, composite gates, trajectory scoring, and the new rollout-owning harness. +- Explicitly document that retry/fallback wrappers are not trials and must not be used as pass@k/pass^k metrics. - Keep existing single-shot evaluator flows compatible and avoid changing the `aworld-cli evaluator` command shape. ## Capabilities diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md index c9470449b..88043bcca 100644 --- a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md +++ b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md @@ -14,7 +14,7 @@ Suite-backed evaluation flows SHALL support opt-in rollout-owning runtime harnes #### Scenario: Runtime harness returns rollout state - **WHEN** a runtime harness completes a case rollout -- **THEN** the framework SHALL normalize the rollout into evaluator state containing terminal answer, trajectory, tool calls, usage, timing, error, and metadata fields usable by existing scorer helpers +- **THEN** the framework SHALL normalize the rollout into evaluator state containing terminal answer, outcome data, trajectory, tool calls, usage, timing, standard rollout metrics, error, and metadata fields 
usable by existing scorer helpers ### Requirement: Multi-turn rollout state @@ -32,6 +32,26 @@ Runtime-composed evaluation flows SHALL represent multi-turn execution as serial - **WHEN** rollout state is converted into evaluator state or report payloads - **THEN** the framework SHALL exclude live runtime handles, clients, agent instances, and simulator objects +#### Scenario: Standard rollout metrics are derived +- **WHEN** rollout state contains turns, tool calls, token usage, or timing data +- **THEN** the framework SHALL derive standard metrics such as turn count, tool-call count, token usage, and duration without requiring suite-specific custom scorers + +### Requirement: Outcome and state-check grading + +Runtime-composed evaluation flows SHALL support outcome graders that verify final environment, artifact, or domain state separately from final text answer and trajectory. + +#### Scenario: Outcome grader checks final state +- **WHEN** a runtime-composed suite declares an outcome or state-check grader +- **THEN** the framework SHALL evaluate the rollout state's terminal outcome or serializable environment snapshot and emit normal evaluator metrics with pass/fail details + +#### Scenario: Outcome metrics remain distinct +- **WHEN** a suite uses typed judge output, trajectory scorers, step rewards, and outcome graders together +- **THEN** the framework SHALL keep outcome metrics distinct while allowing composite gates to reference them alongside judge, trajectory, and reward metrics + +#### Scenario: Environment check needs sandbox reset +- **WHEN** an outcome grader requires clean-environment isolation, command execution, or sandbox reset semantics +- **THEN** the framework SHALL treat that as unsupported in this change and leave it to a dedicated environment-isolation change + ### Requirement: User simulation Runtime-composed evaluation flows SHALL support deterministic user simulators that drive controlled multi-turn rollouts. 
@@ -54,11 +74,11 @@ Runtime-composed evaluation flows SHALL support step-level reward records that c #### Scenario: Step rewarder evaluates rollout steps - **WHEN** a step rewarder inspects a rollout step -- **THEN** it SHALL emit a reward record containing metric name, step index, numeric value, reason, and serializable metadata +- **THEN** it SHALL emit a reward record containing metric name, step index, numeric value, optional weight, optional partial-credit marker, reason, and serializable metadata #### Scenario: Rewards aggregate into metrics - **WHEN** a rollout contains step reward records -- **THEN** the framework SHALL aggregate configured reward metrics into case metrics, aggregate metrics, and structured gate inputs +- **THEN** the framework SHALL aggregate configured reward metrics into case metrics, aggregate metrics, and structured gate inputs, including weighted and partial-credit summaries when configured #### Scenario: Rewards do not replace final judge output - **WHEN** a suite uses both typed judge output and step rewards @@ -76,14 +96,34 @@ Runtime-composed evaluation flows SHALL support at least one wrapper harness tha - **WHEN** a retry wrapper finishes - **THEN** it SHALL expose the selected terminal attempt as the main rollout state while retaining previous attempts for inspection +#### Scenario: Retry is not trial evaluation +- **WHEN** retry or fallback wrapper results are reported +- **THEN** the framework SHALL NOT label those attempts as independent trials, pass@k, or pass^k metrics + +### Requirement: Evaluation purpose metadata + +Suite-backed evaluation flows SHALL allow suites to declare whether they are intended for capability evaluation or regression evaluation. 
+ +#### Scenario: Suite declares evaluation purpose +- **WHEN** a suite declares evaluation-purpose metadata +- **THEN** the framework SHALL preserve that metadata in the resolved suite/report context without changing scorer semantics + ### Requirement: Runtime-composition adoption suite The framework SHALL include one opt-in adoption suite that exercises runtime composition and v2 extensibility together. #### Scenario: Adoption suite uses active v2 capabilities - **WHEN** the adoption suite is selected -- **THEN** it SHALL use a typed judge schema, composite gate, trajectory scorer, step-level reward metric, scripted user simulator, and rollout-owning harness +- **THEN** it SHALL use a typed judge schema, composite gate, outcome/state-check grader, trajectory scorer, step-level reward metric, scripted user simulator, and rollout-owning harness #### Scenario: App evaluator remains unchanged - **WHEN** callers use the existing `app-evaluator` suite - **THEN** its behavior SHALL remain compatible unless a later explicit migration change updates that suite + +### Requirement: Multi-trial metrics are deferred + +Runtime composition SHALL distinguish retry/fallback execution from independent trial-based evaluation. 
+ +#### Scenario: Caller requests pass@k or pass^k +- **WHEN** a caller needs independent repeated trials, pass@k, pass^k, or trial-distribution metrics +- **THEN** the framework SHALL treat that as out of scope for this change and require a later multi-trial evaluator change diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md index 4e39aad4c..ad850bbe9 100644 --- a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md +++ b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md @@ -8,34 +8,46 @@ - [ ] 2.1 Add a serializable rollout state model with turns, messages, trajectory, tool calls, usage, timing, errors, metadata, and child/attempt state. - [ ] 2.2 Bridge rollout state into existing `EvalState` so current scorer helpers and report summaries keep working. -- [ ] 2.3 Add tests proving rollout state does not store live runtime handles. +- [ ] 2.3 Include outcome data and optional serializable environment/artifact snapshots in rollout state. +- [ ] 2.4 Derive standard rollout metrics such as turn count, tool-call count, token usage, and duration. +- [ ] 2.5 Add tests proving rollout state does not store live runtime handles. -## 3. User Simulation +## 3. Outcome / State-Check Grading -- [ ] 3.1 Add a deterministic user simulator contract. -- [ ] 3.2 Add a scripted simulator that reads turns from case input. -- [ ] 3.3 Add a single-prompt simulator for compatibility with current one-shot cases. +- [ ] 3.1 Add deterministic outcome/state-check grader definitions. +- [ ] 3.2 Emit outcome metrics separately from judge, trajectory, and reward metrics. +- [ ] 3.3 Allow composite gates to reference outcome metrics. +- [ ] 3.4 Explicitly reject state checks that require sandbox reset, command execution, or clean-environment isolation in this change. -## 4. Step-Level Rewards +## 4. 
User Simulation -- [ ] 4.1 Add step reward records with metric name, step index, value, reason, and metadata. -- [ ] 4.2 Add rewarder interfaces that inspect rollout state without mutating it. -- [ ] 4.3 Aggregate step rewards into normal evaluator metrics and gate inputs. +- [ ] 4.1 Add a deterministic user simulator contract. +- [ ] 4.2 Add a scripted simulator that reads turns from case input. +- [ ] 4.3 Add a single-prompt simulator for compatibility with current one-shot cases. +- [ ] 4.4 Document that LLM-backed adaptive user simulation is deferred. -## 5. Runtime Composition +## 5. Step-Level Rewards -- [ ] 5.1 Add one runtime wrapper style, preferably retry, around a base runtime harness. -- [ ] 5.2 Preserve child/attempt state for composed runs. -- [ ] 5.3 Add tests for retry/fallback state, terminal status, and report visibility. +- [ ] 5.1 Add step reward records with metric name, step index, value, weight, partial-credit marker, reason, and metadata. +- [ ] 5.2 Add rewarder interfaces that inspect rollout state without mutating it. +- [ ] 5.3 Aggregate step rewards into normal evaluator metrics and gate inputs, including weighted and partial-credit summaries. -## 6. Adoption Suite +## 6. Runtime Composition -- [ ] 6.1 Add one builtin or framework-registered adoption suite that uses the runtime-composition path. -- [ ] 6.2 The adoption suite uses typed judge schema, composite gate, trajectory scorer, step-level reward, and scripted simulator. -- [ ] 6.3 Keep `app-evaluator` behavior unchanged unless explicitly selected for migration later. +- [ ] 6.1 Add one runtime wrapper style, preferably retry, around a base runtime harness. +- [ ] 6.2 Preserve child/attempt state for composed runs. +- [ ] 6.3 Add tests for retry/fallback state, terminal status, and report visibility. +- [ ] 6.4 Document and test that retry/fallback attempts are not independent trials and do not produce pass@k/pass^k metrics. -## 7. Verification +## 7. 
Adoption Suite -- [ ] 7.1 Add focused tests for harness rollout, user simulator, reward aggregation, runtime wrapper composition, and adoption suite execution. -- [ ] 7.2 Run the evaluator regression suite. -- [ ] 7.3 Validate this OpenSpec change with `openspec validate aworld-evaluator-runtime-composition-2026-06-10 --strict`. +- [ ] 7.1 Add one builtin or framework-registered adoption suite that uses the runtime-composition path. +- [ ] 7.2 The adoption suite uses typed judge schema, composite gate, outcome/state-check grader, trajectory scorer, step-level reward, and scripted simulator. +- [ ] 7.3 Mark the adoption suite with capability/regression purpose metadata. +- [ ] 7.4 Keep `app-evaluator` behavior unchanged unless explicitly selected for migration later. + +## 8. Verification + +- [ ] 8.1 Add focused tests for harness rollout, outcome grading, user simulator, reward aggregation, runtime wrapper composition, standard metrics, and adoption suite execution. +- [ ] 8.2 Run the evaluator regression suite. +- [ ] 8.3 Validate this OpenSpec change with `openspec validate aworld-evaluator-runtime-composition-2026-06-10 --strict`. 
From c2fb3c04c3670994dceab73aba07ce1a58c0cbf0 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 14:26:30 +0800 Subject: [PATCH 23/41] docs: specify runtime outcome and trial boundaries --- .../design.md | 39 +++++++++++++++---- .../specs/evaluation-substrate/spec.md | 30 +++++++++++++- 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md index 4f42450fe..1cce886cb 100644 --- a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md +++ b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md @@ -11,7 +11,7 @@ Evaluator v2 extensibility made the AWorld evaluator substrate more configurable - trial-based pass@k/pass^k metrics for nondeterministic agents - an adoption suite that exercises these capabilities outside tests -This change introduces runtime composition as a framework-owned layer under `aworld/evaluations/` while preserving the v2 single-shot substrate. It also adds outcome/state-check grading because outcome verification is tightly coupled to rollout state and environment snapshots. Trial-based pass@k/pass^k execution is explicitly deferred because retry composition and independent trials have different semantics. +This change introduces runtime composition as a framework-owned layer under `aworld/evaluations/` while preserving the v2 single-shot substrate. It also adds outcome/state-check grading because outcome verification is tightly coupled to rollout state and environment snapshots. Trial-based pass@k/pass^k execution is explicitly deferred because retry composition and independent trials have different semantics. This change should therefore be described as the runtime-composition and outcome-grading slice of complete evaluation capability, not as the full evaluator roadmap. 
## Goals / Non-Goals @@ -96,15 +96,28 @@ The bridge into `EvalState` should preserve the existing state summary and score ### 3. Add outcome/state-check grading -Outcome evaluation must not be limited to terminal text. Add an outcome/state-check contract that can verify the final state produced by a rollout: +Outcome evaluation must not be limited to terminal text. In this design, `answer` is the target's terminal response, while `outcome` is the serializable final environment, artifact, or domain state captured by the harness after rollout. Add an outcome/state-check contract that can verify that final state: - file or artifact existence/content checks - structured environment snapshot checks - database or domain-state assertions when the harness provides a serializable snapshot -- test-command or sandbox checks only through a future trusted execution/sandbox change +- coding-task results such as precomputed test summaries when produced by a trusted harness +- test-command or sandbox execution only through a future trusted execution/sandbox change The first implementation should keep state checks deterministic and in-process. A state-check grader receives the `RolloutState`, case, target, and optional serializable environment snapshot. It returns normal evaluator metric results plus structured details explaining which checks passed or failed. +A minimal state-check definition should support: + +- `metric_name` +- `source`: for example `outcome`, `metadata`, or `artifacts` +- `path`: a structured path into the selected source +- `op`: equality or numeric comparison against an expected value +- `expected` +- `weight` +- `required` + +Outcome graders must emit numeric metric values for gate compatibility, plus pass/fail details for report inspection. They must not open live databases, inspect arbitrary file paths, run shell commands, or retain environment handles. 
If a harness needs external checks, it must capture a serializable snapshot or summary into `RolloutState.outcome` before grading. + Outcome graders are distinct from: - typed judge output, which evaluates semantic result quality @@ -152,7 +165,19 @@ The wrapper must preserve child/attempt state so reports can explain which attem Retry and fallback are not trials. Retry is an execution strategy that tries to produce one terminal rollout; trials are independent repeated evaluations used to estimate nondeterministic performance. This change must not label retry results as pass@k or pass^k. -### 7. Add suite purpose metadata and standard metrics +### 7. Defer multi-trial evaluation explicitly + +Complete agent evaluation needs independent trial execution and distribution-level metrics, but that is a scheduler and aggregation concern rather than a harness-wrapper concern. A later evaluator-trials change should own: + +- `num_trials` or equivalent independent repeat configuration +- clean-environment reset requirements per trial +- trial-level report records +- pass@k and pass^k aggregation +- separation between retry attempts inside a trial and independent trials across the same case + +Until that exists, runtime-composed reports should expose retry attempts only as child states for one rollout and should not compute nondeterminism metrics from them. + +### 8. Add suite purpose metadata and standard metrics Suites should be able to describe whether they are intended for capability evaluation or regression evaluation. 
The first implementation can use suite metadata, for example: @@ -163,13 +188,13 @@ Runtime-composed harnesses should derive standard transcript and latency metrics - `n_turns` - `n_tool_calls` -- `n_tokens` or token usage fields +- `n_tokens` or token usage fields such as prompt, completion, and total tokens - wall-clock duration / time cost - optional first-token or first-action latency when exposed by the runtime Suites can still declare custom metrics, but these baseline metrics should not require every suite to hand-roll them. -### 8. Add one adoption suite +### 9. Add one adoption suite Add one builtin or framework-registered adoption suite that consumes the new runtime: @@ -183,7 +208,7 @@ Add one builtin or framework-registered adoption suite that consumes the new run This suite can be narrow and deterministic. Its purpose is to prove that the substrate is active in production code paths, not only in isolated unit tests. It should not replace `app-evaluator` unless that public contract is ready to change. -### 9. Keep CLI additive +### 10. Keep CLI additive `aworld-cli evaluator` should discover and run the adoption suite through existing suite selection paths. Do not add CLI-only runtime syntax in this change. If CLI ergonomics are needed later, handle them in a product-focused change after the framework contract settles. 
diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md index 88043bcca..c577fa1d8 100644 --- a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md +++ b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md @@ -40,14 +40,26 @@ Runtime-composed evaluation flows SHALL represent multi-turn execution as serial Runtime-composed evaluation flows SHALL support outcome graders that verify final environment, artifact, or domain state separately from final text answer and trajectory. +#### Scenario: Outcome is distinct from terminal answer +- **WHEN** a runtime-composed rollout completes +- **THEN** the framework SHALL treat the terminal answer as response text and the outcome as serializable final environment, artifact, or domain state captured by the harness + #### Scenario: Outcome grader checks final state - **WHEN** a runtime-composed suite declares an outcome or state-check grader -- **THEN** the framework SHALL evaluate the rollout state's terminal outcome or serializable environment snapshot and emit normal evaluator metrics with pass/fail details +- **THEN** the framework SHALL evaluate the rollout state's terminal outcome or serializable environment snapshot and emit normal evaluator metrics with numeric values and pass/fail details + +#### Scenario: State check addresses a structured snapshot +- **WHEN** a state-check grader declares a source, path, operator, and expected value +- **THEN** the framework SHALL resolve that path against the selected serializable rollout state source and compare it without opening live files, databases, clients, or runtime handles #### Scenario: Outcome metrics remain distinct - **WHEN** a suite uses typed judge output, trajectory scorers, step rewards, and outcome graders together - 
**THEN** the framework SHALL keep outcome metrics distinct while allowing composite gates to reference them alongside judge, trajectory, and reward metrics +#### Scenario: Trusted harness provides coding-task results +- **WHEN** a trusted harness runs external checks before grading and records a serializable test summary or artifact summary in rollout outcome +- **THEN** outcome graders SHALL evaluate that recorded summary rather than invoking test commands themselves + #### Scenario: Environment check needs sandbox reset - **WHEN** an outcome grader requires clean-environment isolation, command execution, or sandbox reset semantics - **THEN** the framework SHALL treat that as unsupported in this change and leave it to a dedicated environment-isolation change @@ -64,6 +76,10 @@ Runtime-composed evaluation flows SHALL support deterministic user simulators th - **WHEN** a case only includes a single prompt or query - **THEN** the framework SHALL support a simulator that emits one user turn and then terminates unless the harness requests additional turns +#### Scenario: Adaptive LLM user simulator is requested +- **WHEN** a suite requires an LLM-backed adaptive user simulator +- **THEN** the framework SHALL treat it as out of scope for this change and require a later simulator extension + #### Scenario: Simulator errors are captured - **WHEN** a user simulator cannot produce a valid next turn - **THEN** the framework SHALL mark the rollout state as failed with a serializable error rather than storing the simulator object @@ -108,6 +124,10 @@ Suite-backed evaluation flows SHALL allow suites to declare whether they are int - **WHEN** a suite declares evaluation-purpose metadata - **THEN** the framework SHALL preserve that metadata in the resolved suite/report context without changing scorer semantics +#### Scenario: Purpose uses supported values +- **WHEN** a suite declares `evaluation_purpose` +- **THEN** the framework SHALL accept `capability` and `regression` as 
supported values and leave scorer thresholds under the suite's explicit gate policy + ### Requirement: Runtime-composition adoption suite The framework SHALL include one opt-in adoption suite that exercises runtime composition and v2 extensibility together. @@ -124,6 +144,14 @@ The framework SHALL include one opt-in adoption suite that exercises runtime com Runtime composition SHALL distinguish retry/fallback execution from independent trial-based evaluation. +#### Scenario: Retry attempts are reported +- **WHEN** retry or fallback attempts are retained as child rollout state +- **THEN** the framework SHALL NOT count those attempts as independent trials or use them to calculate pass@k, pass^k, or trial-distribution metrics + #### Scenario: Caller requests pass@k or pass^k - **WHEN** a caller needs independent repeated trials, pass@k, pass^k, or trial-distribution metrics - **THEN** the framework SHALL treat that as out of scope for this change and require a later multi-trial evaluator change + +#### Scenario: Future trial evaluation is added +- **WHEN** a later change adds independent trial execution +- **THEN** it SHALL keep trial scheduling, clean-environment reset semantics, and pass@k/pass^k aggregation separate from retry wrapper behavior From 9f486000a005fcc66730e0836e4c8fee28522b2d Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 14:48:48 +0800 Subject: [PATCH 24/41] feat: add evaluator runtime composition --- aworld/evaluations/report.py | 9 + aworld/evaluations/runtime_composition.py | 471 ++++++++++++++++++ aworld/evaluations/scorers/__init__.py | 11 +- .../scorers/runtime_composition.py | 94 ++++ aworld/evaluations/scorers/suite_judge.py | 5 + aworld/evaluations/substrate.py | 171 ++++++- .../implementation-plan.md | 70 +-- .../tasks.md | 60 +-- tests/evaluations/test_runtime_composition.py | 312 ++++++++++++ 9 files changed, 1135 insertions(+), 68 deletions(-) create mode 100644 aworld/evaluations/runtime_composition.py create mode 
100644 aworld/evaluations/scorers/runtime_composition.py create mode 100644 tests/evaluations/test_runtime_composition.py diff --git a/aworld/evaluations/report.py b/aworld/evaluations/report.py index 44b2a1381..73e19e3c2 100644 --- a/aworld/evaluations/report.py +++ b/aworld/evaluations/report.py @@ -18,6 +18,9 @@ def __init__( judge: dict[str, Any], judge_backend: dict[str, Any] | None = None, state_summary: dict[str, Any] | None = None, + artifacts: dict[str, Any] | None = None, + metadata: dict[str, Any] | None = None, + metric_details: dict[str, Any] | None = None, ) -> None: payload = { "case_id": case_id, @@ -27,6 +30,12 @@ def __init__( "judge_backend": judge_backend, "state_summary": state_summary or {}, } + if artifacts: + payload["artifacts"] = artifacts + if metadata: + payload["metadata"] = metadata + if metric_details: + payload["metric_details"] = metric_details super().__init__(payload) def to_dict(self) -> dict[str, Any]: diff --git a/aworld/evaluations/runtime_composition.py b/aworld/evaluations/runtime_composition.py new file mode 100644 index 000000000..84e3ce002 --- /dev/null +++ b/aworld/evaluations/runtime_composition.py @@ -0,0 +1,471 @@ +# coding: utf-8 +from __future__ import annotations + +import inspect +from dataclasses import dataclass, field +from typing import Any, Callable, Mapping, Protocol + +from aworld.evaluations.execution import EvalExecutionSpec, EvalState + + +_SCALAR_TYPES = (str, int, float, bool, type(None)) + + +def _is_serializable_value(value: Any) -> bool: + if isinstance(value, _SCALAR_TYPES): + return True + if isinstance(value, list): + return all(_is_serializable_value(item) for item in value) + if isinstance(value, tuple): + return all(_is_serializable_value(item) for item in value) + if isinstance(value, Mapping): + return all(isinstance(key, str) and _is_serializable_value(item) for key, item in value.items()) + return False + + +def _serializable_dict(payload: Mapping[str, Any] | None) -> dict[str, Any]: + 
return { + str(key): value + for key, value in dict(payload or {}).items() + if isinstance(key, str) and _is_serializable_value(value) + } + + +@dataclass(frozen=True) +class RolloutTurn: + role: str + content: Any | None = None + metadata: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + payload = { + "role": self.role, + "content": self.content, + } + metadata = _serializable_dict(self.metadata) + if metadata: + payload["metadata"] = metadata + return payload + + +@dataclass(frozen=True) +class OutcomeCheckResult: + metric_name: str + value: float + passed: bool + reason: str = "" + metadata: dict[str, Any] = field(default_factory=dict) + + def to_metric_result(self) -> dict[str, Any]: + return { + "value": self.value, + "metadata": { + "passed": self.passed, + "reason": self.reason, + **_serializable_dict(self.metadata), + }, + } + + +@dataclass(frozen=True) +class StepReward: + metric_name: str + step_index: int + value: float + weight: float = 1.0 + partial_credit: bool = False + reason: str = "" + metadata: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return { + "metric_name": self.metric_name, + "step_index": self.step_index, + "value": self.value, + "weight": self.weight, + "partial_credit": self.partial_credit, + "reason": self.reason, + "metadata": _serializable_dict(self.metadata), + } + + +def _resolve_path(source: Mapping[str, Any], path: tuple[str, ...]) -> Any: + current: Any = source + for part in path: + if not isinstance(current, Mapping) or part not in current: + raise KeyError(".".join(path)) + current = current[part] + return current + + +def _compare_values(value: Any, op: str, expected: Any) -> bool: + if op == "==": + return value == expected + if op == "!=": + return value != expected + if op == ">=": + return float(value) >= float(expected) + if op == "<=": + return float(value) <= float(expected) + if op == ">": + return float(value) > float(expected) + if 
op == "<": + return float(value) < float(expected) + raise ValueError(f"unsupported state-check operator: {op}") + + +@dataclass(frozen=True) +class StateCheckGrader: + metric_name: str + path: tuple[str, ...] + expected: Any + source: str = "outcome" + op: str = "==" + weight: float = 1.0 + required: bool = True + + def to_dict(self) -> dict[str, Any]: + return { + "metric_name": self.metric_name, + "source": self.source, + "path": list(self.path), + "op": self.op, + "expected": self.expected, + "weight": self.weight, + "required": self.required, + } + + def grade(self, *, state: RolloutState, case: Any, target: Mapping[str, Any]) -> OutcomeCheckResult: + sources = { + "outcome": state.outcome, + "metadata": state.metadata, + "artifacts": state.to_eval_state(target=target).artifacts, + } + if self.source not in sources: + raise ValueError(f"unsupported state-check source: {self.source}") + try: + actual = _resolve_path(sources[self.source], self.path) + passed = _compare_values(actual, self.op, self.expected) + reason = "matched" if passed else f"expected {self.expected!r}, got {actual!r}" + except KeyError: + actual = None + passed = False + reason = f"missing path: {'.'.join(self.path)}" + return OutcomeCheckResult( + metric_name=self.metric_name, + value=1.0 if passed else 0.0, + passed=passed, + reason=reason, + metadata={ + "source": self.source, + "path": list(self.path), + "op": self.op, + "expected": self.expected, + "actual": actual, + "weight": self.weight, + "required": self.required, + }, + ) + + +@dataclass +class RolloutState: + case_id: str + status: str = "success" + answer: Any | None = None + turns: list[RolloutTurn] = field(default_factory=list) + messages: list[dict[str, Any]] = field(default_factory=list) + trajectory: list[dict[str, Any]] = field(default_factory=list) + tool_calls: list[dict[str, Any]] = field(default_factory=list) + step_rewards: list[StepReward] = field(default_factory=list) + outcome: dict[str, Any] = 
field(default_factory=dict) + attempts: list["RolloutState"] = field(default_factory=list) + child_states: list["RolloutState"] = field(default_factory=list) + usage: dict[str, Any] = field(default_factory=dict) + timing: dict[str, Any] = field(default_factory=dict) + standard_metrics: dict[str, Any] = field(default_factory=dict) + error: dict[str, Any] | None = None + metadata: dict[str, Any] = field(default_factory=dict) + + def to_eval_state(self, target: Mapping[str, Any] | None = None) -> EvalState: + trajectory = list(self.trajectory) + if not trajectory: + trajectory = [turn.to_dict() for turn in self.turns] + artifacts = { + "outcome": _serializable_dict(self.outcome), + "attempts": [attempt.to_dict(include_children=False) for attempt in self.attempts], + "child_states": [state.to_dict(include_children=False) for state in self.child_states], + } + metadata = _serializable_dict(self.metadata) + metadata["_target"] = dict(target or {}) + if self.standard_metrics: + metadata["standard_metrics"] = _serializable_dict(self.standard_metrics) + return EvalState( + case_id=self.case_id, + status=self.status, + answer=self.answer, + completion=[] if self.answer is None else [self.answer], + artifacts=artifacts, + trajectory=trajectory, + tool_calls=list(self.tool_calls), + usage=_serializable_dict(self.usage), + timing=_serializable_dict(self.timing), + error=self.error, + raw_response=self.to_dict(include_children=False), + metadata=metadata, + ) + + def to_dict(self, *, include_children: bool = True) -> dict[str, Any]: + payload = { + "case_id": self.case_id, + "status": self.status, + "answer": self.answer, + "turns": [turn.to_dict() for turn in self.turns], + "messages": list(self.messages), + "trajectory": list(self.trajectory), + "tool_calls": list(self.tool_calls), + "step_rewards": [reward.to_dict() for reward in self.step_rewards], + "outcome": _serializable_dict(self.outcome), + "usage": _serializable_dict(self.usage), + "timing": 
_serializable_dict(self.timing), + "standard_metrics": _serializable_dict(self.standard_metrics), + "error": self.error, + "metadata": _serializable_dict(self.metadata), + } + if include_children: + payload["attempts"] = [attempt.to_dict(include_children=False) for attempt in self.attempts] + payload["child_states"] = [state.to_dict(include_children=False) for state in self.child_states] + return payload + + +@dataclass(frozen=True) +class EvalRuntimeHarnessDef: + harness_id: str + execution: EvalExecutionSpec = field(default_factory=EvalExecutionSpec) + simulator: str = "single_prompt" + metadata: dict[str, Any] = field(default_factory=dict) + + +class RuntimeHarness(Protocol): + async def run_rollout(self, *, case: Any, target: Mapping[str, Any]) -> RolloutState: + ... + + +class UserSimulator(Protocol): + def next_turn( + self, + *, + case: Any, + target: Mapping[str, Any], + state: RolloutState, + last_output: Any | None = None, + ) -> RolloutTurn | None: + ... + + +def _case_input(case: Any) -> dict[str, Any]: + if hasattr(case, "input") and isinstance(case.input, Mapping): + return dict(case.input) + if hasattr(case, "case_data") and isinstance(case.case_data, Mapping): + return dict(case.case_data) + if isinstance(case, Mapping): + return dict(case) + return {} + + +class ScriptedUserSimulator: + def next_turn( + self, + *, + case: Any, + target: Mapping[str, Any], + state: RolloutState, + last_output: Any | None = None, + ) -> RolloutTurn | None: + turns = _case_input(case).get("turns") or [] + user_turn_count = sum(1 for turn in state.turns if turn.role == "user") + if user_turn_count >= len(turns): + return None + return RolloutTurn(role="user", content=turns[user_turn_count]) + + +class SinglePromptUserSimulator: + def __init__(self, query_key: str = "query"): + self.query_key = query_key + + def next_turn( + self, + *, + case: Any, + target: Mapping[str, Any], + state: RolloutState, + last_output: Any | None = None, + ) -> RolloutTurn | None: + if 
any(turn.role == "user" for turn in state.turns): + return None + case_input = _case_input(case) + content = case_input.get(self.query_key, case_input.get("prompt")) + if content is None: + return None + return RolloutTurn(role="user", content=content) + + +async def _maybe_await(value: Any) -> Any: + if inspect.isawaitable(value): + return await value + return value + + +class CallableRuntimeHarness: + def __init__( + self, + *, + simulator: UserSimulator | None = None, + assistant_step: Callable[..., Any], + max_turns: int = 1, + ): + self.simulator = simulator or SinglePromptUserSimulator() + self.assistant_step = assistant_step + self.max_turns = max_turns + + async def run_rollout(self, *, case: Any, target: Mapping[str, Any]) -> RolloutState: + case_id = getattr(case, "case_id", None) or getattr(case, "eval_case_id", "case") + state = RolloutState(case_id=str(case_id)) + last_output: Any | None = None + for _ in range(self.max_turns): + user_turn = self.simulator.next_turn( + case=case, + target=target, + state=state, + last_output=last_output, + ) + if user_turn is None: + break + state.turns.append(user_turn) + state.trajectory.append(user_turn.to_dict()) + step_output = await _maybe_await( + self.assistant_step( + user_turn=user_turn, + state=state, + case=case, + target=target, + ) + ) + assistant_turn = self._assistant_turn(step_output) + state.turns.append(assistant_turn) + state.trajectory.append(assistant_turn.to_dict()) + if isinstance(step_output, Mapping): + if "answer" in step_output: + state.answer = step_output["answer"] + last_output = step_output["answer"] + for call in step_output.get("tool_calls") or []: + if isinstance(call, Mapping): + state.tool_calls.append(dict(call)) + if isinstance(step_output.get("outcome"), Mapping): + state.outcome.update(dict(step_output["outcome"])) + if isinstance(step_output.get("usage"), Mapping): + state.usage.update(dict(step_output["usage"])) + if isinstance(step_output.get("timing"), Mapping): + 
state.timing.update(dict(step_output["timing"])) + for reward in step_output.get("step_rewards") or []: + if isinstance(reward, StepReward): + state.step_rewards.append(reward) + elif isinstance(reward, Mapping): + state.step_rewards.append( + StepReward( + metric_name=str(reward["metric_name"]), + step_index=int(reward["step_index"]), + value=float(reward["value"]), + weight=float(reward.get("weight", 1.0)), + partial_credit=bool(reward.get("partial_credit", False)), + reason=str(reward.get("reason", "")), + metadata=dict(reward.get("metadata") or {}), + ) + ) + else: + state.answer = step_output + last_output = step_output + state.standard_metrics.update(derive_standard_metrics(state)) + return state + + def _assistant_turn(self, step_output: Any) -> RolloutTurn: + if isinstance(step_output, Mapping): + return RolloutTurn( + role="assistant", + content=step_output.get("answer"), + metadata={ + "tool_calls": list(step_output.get("tool_calls") or []), + }, + ) + return RolloutTurn(role="assistant", content=step_output) + + +def derive_standard_metrics(state: RolloutState) -> dict[str, Any]: + token_total = state.usage.get("total_tokens") + if token_total is None and isinstance(state.usage.get("tokens"), (int, float)): + token_total = state.usage["tokens"] + duration = state.timing.get("duration_ms", state.timing.get("time_cost_ms")) + return { + "n_turns": len(state.turns), + "n_tool_calls": len(state.tool_calls), + "n_tokens": token_total or 0, + "duration_ms": duration or 0, + } + + +def aggregate_step_rewards(state: RolloutState) -> dict[str, dict[str, Any]]: + grouped: dict[str, list[StepReward]] = {} + for reward in state.step_rewards: + grouped.setdefault(reward.metric_name, []).append(reward) + + metrics: dict[str, dict[str, Any]] = {} + for metric_name, rewards in grouped.items(): + weighted_sum = sum(float(reward.value) * float(reward.weight) for reward in rewards) + weight_total = sum(float(reward.weight) for reward in rewards) or 1.0 + total = 
sum(float(reward.value) for reward in rewards) + partial_count = sum(1 for reward in rewards if reward.partial_credit) + metrics[metric_name] = { + "value": weighted_sum / weight_total, + "metadata": { + "count": len(rewards), + "weight_total": weight_total, + "rewards": [reward.to_dict() for reward in rewards], + }, + } + metrics[f"{metric_name}_total"] = { + "value": total, + "metadata": {"count": len(rewards)}, + } + metrics[f"{metric_name}_partial_credit_rate"] = { + "value": partial_count / len(rewards), + "metadata": {"partial_credit_count": partial_count, "count": len(rewards)}, + } + return metrics + + +class RetryRuntimeHarness: + def __init__(self, *, base_harness: RuntimeHarness, max_attempts: int = 2): + if max_attempts < 1: + raise ValueError("max_attempts must be >= 1") + self.base_harness = base_harness + self.max_attempts = max_attempts + + async def run_rollout(self, *, case: Any, target: Mapping[str, Any]) -> RolloutState: + attempts: list[RolloutState] = [] + terminal: RolloutState | None = None + for _ in range(self.max_attempts): + attempt = await self.base_harness.run_rollout(case=case, target=target) + attempts.append(attempt) + terminal = attempt + if attempt.status == "success": + break + assert terminal is not None + terminal.attempts = attempts + terminal.child_states = attempts[:-1] + terminal.metadata = { + **terminal.metadata, + "runtime_composition": "retry", + "attempt_count": len(attempts), + } + terminal.standard_metrics.update(derive_standard_metrics(terminal)) + return terminal diff --git a/aworld/evaluations/scorers/__init__.py b/aworld/evaluations/scorers/__init__.py index b0a178000..82bd3a09e 100644 --- a/aworld/evaluations/scorers/__init__.py +++ b/aworld/evaluations/scorers/__init__.py @@ -14,6 +14,7 @@ class ScorerFactory(Factory): def __init__(self, type_name: str = None): super().__init__(type_name) self._metric_to_scorers: Dict[str, Type[Scorer]] = {} + self._name_to_scorers: Dict[str, Type[Scorer]] = {} 
self._default_scorer_params: Dict[int, Dict[str, Any]] = {} def __call__(self, name: str = None, criterias: Union[EvalCriteria, List[EvalCriteria]] = None, *args, **kwargs): @@ -41,6 +42,8 @@ def register(self, name: str, desc: str = '', scorer_cls: Type[Scorer] = None, * if name not in self._metric_to_scorers: self._metric_to_scorers[name] = scorer_cls + self._name_to_scorers[scorer_cls.__name__] = scorer_cls + self._name_to_scorers[f"{scorer_cls.__module__}.{scorer_cls.__name__}"] = scorer_cls else: raise ValueError(f'Scorer class {scorer_cls.__name__} already registered for metric {name}') @@ -87,11 +90,17 @@ def get_scorer_instances_for_criterias(self, criterias: Union[EvalCriteria, List for criteria in criterias: scorer_class = self._metric_to_scorers.get(criteria.metric_name) + if not scorer_class and criteria.scorer_class: + scorer_class = self._name_to_scorers.get(criteria.scorer_class) if not scorer_class: logger.error(f'No scorer class found for metric {criteria.metric_name}') raise ValueError(f'No scorer class found for metric {criteria.metric_name}') - if criteria.scorer_class and scorer_class.__name__ != criteria.scorer_class: + scorer_class_names = { + scorer_class.__name__, + f"{scorer_class.__module__}.{scorer_class.__name__}", + } + if criteria.scorer_class and criteria.scorer_class not in scorer_class_names: raise ValueError(f"registered scorer class {scorer_class.__name__} does not match criteria {criteria.scorer_class}") if scorer_class not in scorer_instances: diff --git a/aworld/evaluations/scorers/runtime_composition.py b/aworld/evaluations/scorers/runtime_composition.py new file mode 100644 index 000000000..6484dcce4 --- /dev/null +++ b/aworld/evaluations/scorers/runtime_composition.py @@ -0,0 +1,94 @@ +# coding: utf-8 +from __future__ import annotations + +from typing import Any, Mapping + +from aworld.evaluations.base import EvalDataCase, MetricResult, Scorer, ScorerResult +from aworld.evaluations.runtime_composition import ( + 
RolloutState, + StateCheckGrader, + StepReward, + aggregate_step_rewards, +) +from aworld.evaluations.scorers import scorer_register +from aworld.evaluations.scorers.state_extractors import get_eval_state + + +def _rollout_state_from_output(input: EvalDataCase[dict], output: Any) -> RolloutState: + state = get_eval_state(output) + raw = state.get("raw_response") if isinstance(state.get("raw_response"), Mapping) else {} + artifacts = state.get("artifacts") if isinstance(state.get("artifacts"), Mapping) else {} + outcome = raw.get("outcome") if isinstance(raw.get("outcome"), Mapping) else artifacts.get("outcome", {}) + rewards = [] + for reward in raw.get("step_rewards") or []: + if isinstance(reward, Mapping): + rewards.append( + StepReward( + metric_name=str(reward["metric_name"]), + step_index=int(reward["step_index"]), + value=float(reward["value"]), + weight=float(reward.get("weight", 1.0)), + partial_credit=bool(reward.get("partial_credit", False)), + reason=str(reward.get("reason", "")), + metadata=dict(reward.get("metadata") or {}), + ) + ) + return RolloutState( + case_id=getattr(input, "eval_case_id", str(state.get("case_id", ""))), + status=str(state.get("status", "success")), + answer=state.get("answer"), + outcome=dict(outcome or {}), + step_rewards=rewards, + usage=dict(state.get("usage") or {}), + timing=dict(state.get("timing") or {}), + standard_metrics=dict((state.get("metadata") or {}).get("standard_metrics") or {}), + metadata=dict(state.get("metadata") or {}), + ) + + +@scorer_register("runtime_outcome") +class RuntimeOutcomeScorer(Scorer): + def __init__(self, name: str = "runtime_outcome", **kwargs): + super().__init__(name=name) + + async def score(self, index: int, input: EvalDataCase[dict], output: dict) -> ScorerResult: + state = _rollout_state_from_output(input, output) + metric_results: dict[str, MetricResult] = {} + target = dict(input.case_data.get("_target", {})) if isinstance(input.case_data, Mapping) else {} + for metric_name, 
criteria in self.eval_criterias.items(): + params = dict(criteria.scorer_params or {}) + grader_payload = params.get("grader") or {} + grader = StateCheckGrader( + metric_name=metric_name, + source=str(grader_payload.get("source", "outcome")), + path=tuple(grader_payload.get("path") or ()), + op=str(grader_payload.get("op", "==")), + expected=grader_payload.get("expected"), + weight=float(grader_payload.get("weight", 1.0)), + required=bool(grader_payload.get("required", True)), + ) + metric_results[metric_name] = grader.grade(state=state, case=input, target=target).to_metric_result() + return ScorerResult(scorer_name=self.name, metric_results=metric_results) + + +@scorer_register("runtime_reward") +class RuntimeRewardScorer(Scorer): + def __init__(self, name: str = "runtime_reward", **kwargs): + super().__init__(name=name) + + async def score(self, index: int, input: EvalDataCase[dict], output: dict) -> ScorerResult: + state = _rollout_state_from_output(input, output) + return ScorerResult(scorer_name=self.name, metric_results=aggregate_step_rewards(state)) + + +@scorer_register("runtime_standard_metric") +class RuntimeStandardMetricScorer(Scorer): + def __init__(self, name: str = "runtime_standard_metric", **kwargs): + super().__init__(name=name) + + async def score(self, index: int, input: EvalDataCase[dict], output: dict) -> ScorerResult: + state = _rollout_state_from_output(input, output) + metric_results: dict[str, MetricResult] = {} + for metric_name in self.eval_criterias: + metric_results[metric_name] = {"value": state.standard_metrics.get(metric_name, 0)} + return ScorerResult(scorer_name=self.name, metric_results=metric_results) diff --git a/aworld/evaluations/scorers/suite_judge.py b/aworld/evaluations/scorers/suite_judge.py index 0a5ad8adc..ad62d7405 100644 --- a/aworld/evaluations/scorers/suite_judge.py +++ b/aworld/evaluations/scorers/suite_judge.py @@ -38,10 +38,15 @@ async def score(self, index: int, input: EvalDataCase[dict], output: dict) -> Sc 
scorer.metric_name for scorer in getattr(self.suite, "trajectory_scorers", tuple()) } + declared_runtime_metrics = { + scorer.metric_name + for scorer in getattr(self.suite, "outcome_scorers", tuple()) + } | set(getattr(self.suite, "reward_metrics", tuple())) | set(getattr(self.suite, "standard_metrics", tuple())) for metric_name, value in payload.items(): if ( metric_name == "score" or metric_name in declared_trajectory_metrics + or metric_name in declared_runtime_metrics or not isinstance(value, (int, float, bool, str)) ): continue diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py index 83a575c86..fe562af24 100644 --- a/aworld/evaluations/substrate.py +++ b/aworld/evaluations/substrate.py @@ -22,7 +22,15 @@ from aworld.evaluations.eval_targets.agent_eval import AworldAgentEvalTarget, AworldTaskEvalTarget from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec, load_program_callable from aworld.evaluations.manifests import validate_declared_eval_suite_manifest +from aworld.evaluations.runtime_composition import ( + CallableRuntimeHarness, + RuntimeHarness, + SinglePromptUserSimulator, + StateCheckGrader, + StepReward, +) from aworld.evaluations.scorers import scorer_factory +from aworld.evaluations.types import MetricNames from aworld.evaluations.execution_adapters import resolve_execution_adapter from aworld.evaluations.report import ( CaseEvaluationReport, @@ -247,6 +255,11 @@ class JudgeExecution: payload: dict[str, Any] +class _RuntimeCompositionJudgeOutput(BaseModel): + score: float + verdict: str + + class JudgeBackend: backend_id: ClassVar[str] = "judge-backend" @@ -370,6 +383,10 @@ class EvalSuiteDef: gate_policy: GatePolicyDef | None = None execution: EvalExecutionSpec | None = None harness: EvalHarnessDef | None = None + runtime_harness: RuntimeHarness | None = None + outcome_scorers: tuple[StateCheckGrader, ...] = tuple() + reward_metrics: tuple[str, ...] = tuple() + standard_metrics: tuple[str, ...] 
= tuple() trajectory_scorers: tuple[TrajectoryScorerDef, ...] = tuple() judge: JudgeCallable | None = None judge_backend: JudgeBackend | None = None @@ -436,7 +453,7 @@ class EvalSuiteSelection: _EVAL_SUITE_REGISTRY: dict[tuple[str | None, str], EvalSuiteRegistration] = {} _LOADED_EVAL_MANIFEST_PATHS: set[str] = set() _DECLARED_EVAL_SUITE_IDS_BY_WORKSPACE: dict[str, set[tuple[str | None, str]]] = {} -_BUILTIN_EVAL_SUITE_IDS = {"app-evaluator"} +_BUILTIN_EVAL_SUITE_IDS = {"app-evaluator", "runtime-composition-adoption"} def _eval_suite_registry_key(suite_id: str, workspace_root: str | None = None) -> tuple[str | None, str]: @@ -660,7 +677,27 @@ async def predict(self, index: int, input: EvalDataCase[dict]) -> dict: return {"answer": state.answer, "state": state.to_dict()} +class _RuntimeCompositionEvalTarget(EvalTarget[dict]): + def __init__(self, *, target: dict[str, Any], harness: RuntimeHarness): + super().__init__() + self._target = dict(target) + self._harness = harness + + async def predict(self, index: int, input: EvalDataCase[dict]) -> dict: + case = EvalCaseDef( + case_id=getattr(input, "eval_case_id", str(index)), + input=dict(input.case_data if isinstance(input, EvalDataCase) else input), + expected=(input.case_data or {}).get("_expected") if isinstance(input, EvalDataCase) else None, + metadata=(input.case_data or {}).get("_case_metadata", {}) if isinstance(input, EvalDataCase) else {}, + ) + rollout_state = await self._harness.run_rollout(case=case, target=self._target) + eval_state = rollout_state.to_eval_state(target=self._target) + return {"answer": eval_state.answer, "state": eval_state.to_dict()} + + def _build_eval_target(flow: EvaluationFlowDef, target: dict[str, Any]): + if flow.suite.runtime_harness is not None: + return _RuntimeCompositionEvalTarget(target=target, harness=flow.suite.runtime_harness) harness = resolve_eval_harness(flow.suite) execution = harness.execution if execution is None or execution.mode == EvalExecutionMode.STATIC: @@ 
-699,6 +736,36 @@ def _trajectory_eval_criteria(suite: EvalSuiteDef) -> list[dict[str, Any]]: return criteria +def _runtime_eval_criteria(suite: EvalSuiteDef) -> list[dict[str, Any]]: + criteria: list[dict[str, Any]] = [] + for scorer in suite.outcome_scorers: + criteria.append( + { + "metric_name": scorer.metric_name, + "threshold": 1.0, + "scorer_class": "RuntimeOutcomeScorer", + "scorer_params": {"grader": scorer.to_dict()}, + } + ) + for metric_name in suite.reward_metrics: + criteria.append( + { + "metric_name": metric_name, + "threshold": 0.0, + "scorer_class": "RuntimeRewardScorer", + } + ) + for metric_name in suite.standard_metrics: + criteria.append( + { + "metric_name": metric_name, + "threshold": 0.0, + "scorer_class": "RuntimeStandardMetricScorer", + } + ) + return criteria + + def _validate_trajectory_scorer_def(scorer: TrajectoryScorerDef) -> None: scorer_class = scorer_factory.get_scorer_class(scorer.metric_name) if scorer_class is None: @@ -740,7 +807,7 @@ def compile_evaluation_flow(flow: EvaluationFlowDef) -> CompiledEvaluationPlan: eval_config = EvaluationConfig( eval_suite_id=flow.suite.suite_id, eval_target=_build_eval_target(flow, normalized_target), - eval_criterias=[eval_criteria, *_trajectory_eval_criteria(flow.suite)], + eval_criterias=[eval_criteria, *_trajectory_eval_criteria(flow.suite), *_runtime_eval_criteria(flow.suite)], eval_dataset=dataset, ) return CompiledEvaluationPlan( @@ -868,10 +935,29 @@ def _build_state_summary(output: Mapping[str, Any] | Any) -> dict[str, Any]: "tool_call_count": len(state.get("tool_calls") or []) if isinstance(state, Mapping) else 0, "usage": dict(state.get("usage") or {}) if isinstance(state, Mapping) else {}, "timing": dict(state.get("timing") or {}) if isinstance(state, Mapping) else {}, + "standard_metrics": dict((state.get("metadata") or {}).get("standard_metrics") or {}) if isinstance(state, Mapping) else {}, "error": state.get("error") if isinstance(state, Mapping) else None, } +def 
_build_state_artifacts(output: Mapping[str, Any] | Any) -> dict[str, Any]: + if not isinstance(output, Mapping): + return {} + state = output.get("state") if isinstance(output.get("state"), Mapping) else output + if not isinstance(state, Mapping): + return {} + return dict(state.get("artifacts") or {}) + + +def _build_state_metadata(output: Mapping[str, Any] | Any) -> dict[str, Any]: + if not isinstance(output, Mapping): + return {} + state = output.get("state") if isinstance(output.get("state"), Mapping) else output + if not isinstance(state, Mapping): + return {} + return dict(state.get("metadata") or {}) + + async def run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport: compiled = compile_evaluation_flow(flow) eval_result = await EvaluateRunner(config=compiled.eval_config).run() @@ -899,6 +985,7 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport: for case_result in eval_result.eval_case_results: judge_payload = {} case_metrics: dict[str, Any] = {} + case_metric_details: dict[str, Any] = {} case_backend_id = None if case_result.score_rows: cases_with_metrics += 1 @@ -917,6 +1004,8 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport: if status is not None: case_metrics[metric_name]["status"] = status metadata = metric_result.get("metadata") or {} + if isinstance(metadata, Mapping) and metadata: + case_metric_details[metric_name] = dict(metadata) if case_backend_id is None and isinstance(metadata, Mapping): case_backend_id = metadata.get("_judge_backend") else: @@ -936,6 +1025,9 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport: judge=judge_payload, judge_backend={"backend_id": case_backend_id} if case_backend_id is not None else None, state_summary=_build_state_summary(case_result.output), + artifacts=_build_state_artifacts(case_result.output), + metadata=_build_state_metadata(case_result.output), + metric_details=case_metric_details, ) ) @@ -970,6 +1062,7 @@ async def 
run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport: "resolved": False, "approved": None, }, + "suite_metadata": dict(compiled.suite.metadata), }) judge_schema = compiled.suite.judge_schema.json_schema() if judge_schema: @@ -1281,7 +1374,75 @@ async def _default_agent_judge_executor(prompt: JudgePrompt, system_prompt: str) return str(response.answer) +async def _runtime_adoption_assistant_step(*, user_turn, state, case, target) -> dict[str, Any]: + return { + "answer": "runtime composition resolved the scripted case", + "outcome": {"ticket": {"status": "resolved"}}, + "step_rewards": [ + StepReward( + metric_name="process_quality", + step_index=len(state.turns), + value=1.0, + reason="scripted runtime reached the expected terminal state", + ) + ], + "tool_calls": [{"id": "call-1", "function": {"name": "resolve_ticket", "arguments": "{}"}}], + "usage": {"total_tokens": 8}, + "timing": {"duration_ms": 1}, + } + + +async def _runtime_adoption_judge(case_input: dict[str, Any], target: dict[str, Any]) -> dict[str, Any]: + outcome = ((target.get("artifacts") or {}).get("outcome") or {}) + resolved = ((outcome.get("ticket") or {}).get("status") == "resolved") + return { + "score": 1.0 if resolved else 0.0, + "verdict": "approved" if resolved else "blocked", + } + + +def _get_runtime_composition_adoption_suite() -> EvalSuiteDef: + return EvalSuiteDef( + suite_id="runtime-composition-adoption", + runtime_harness=CallableRuntimeHarness( + simulator=SinglePromptUserSimulator(), + assistant_step=_runtime_adoption_assistant_step, + max_turns=1, + ), + judge_schema=JudgeSchemaDef(output_model=_RuntimeCompositionJudgeOutput), + judge=_runtime_adoption_judge, + outcome_scorers=( + StateCheckGrader( + metric_name="ticket_resolved", + path=("ticket", "status"), + expected="resolved", + ), + ), + reward_metrics=("process_quality",), + standard_metrics=("n_turns", "n_tool_calls", "n_tokens", "duration_ms"), + trajectory_scorers=( + 
TrajectoryScorerDef(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, threshold=1.0), + ), + gate_policy=GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=0.9), + GateMetricCondition(metric_name="ticket_resolved", op="==", threshold=1.0), + GateMetricCondition(metric_name="process_quality", op=">=", threshold=1.0), + GateMetricCondition(metric_name="n_turns", op="==", threshold=2), + GateMetricCondition(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, op="==", threshold=1.0), + ) + ), + metadata={ + "evaluation_purpose": "capability", + "adoption_suite": True, + "runtime_composition": True, + }, + ) + + def get_builtin_eval_suite(name: str, judge_backend: JudgeBackend | None = None) -> EvalSuiteDef: + if name == "runtime-composition-adoption": + return _get_runtime_composition_adoption_suite() if name != "app-evaluator": raise KeyError(name) @@ -1385,3 +1546,9 @@ def resolve_eval_suite(name: str | None, target: str | Path) -> EvalSuiteDef: matcher=lambda target: target.get("target_kind") in {"file", "directory", "image"}, priority=10, ) +register_eval_suite( + "runtime-composition-adoption", + lambda target: get_builtin_eval_suite("runtime-composition-adoption"), + matcher=lambda target: target.get("target_kind") in {"file", "directory", "image", "inline"}, + priority=1, +) diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md index 8c84ba236..47044256a 100644 --- a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md +++ b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md @@ -29,7 +29,7 @@ ## Task 1: Rollout State and Harness Contracts -- [ ] **Step 1: Write failing rollout state tests** +- [x] **Step 1: Write failing rollout state tests** Add tests in `tests/evaluations/test_runtime_composition.py` for: @@ -55,17 +55,17 @@ 
def test_rollout_state_to_eval_state_excludes_live_handles(): assert eval_state.metadata["safe"] == "ok" ``` -- [ ] **Step 2: Run test and confirm failure** +- [x] **Step 2: Run test and confirm failure** Run: `pytest tests/evaluations/test_runtime_composition.py::test_rollout_state_to_eval_state_excludes_live_handles -q` Expected: FAIL because `runtime_composition.py` and `RolloutState` do not exist. -- [ ] **Step 3: Add minimal rollout models** +- [x] **Step 3: Add minimal rollout models** Create `aworld/evaluations/runtime_composition.py` with serializable dataclasses for `RolloutTurn`, `OutcomeCheckResult`, `StepReward`, `RolloutState`, `EvalRuntimeHarnessDef`, `RuntimeHarness`, and `UserSimulator`. -- [ ] **Step 4: Run rollout tests until green** +- [x] **Step 4: Run rollout tests until green** Run: `pytest tests/evaluations/test_runtime_composition.py -q` @@ -73,7 +73,7 @@ Expected: PASS for initial rollout state tests. ## Task 2: Outcome / State-Check Grading -- [ ] **Step 1: Write failing outcome grader tests** +- [x] **Step 1: Write failing outcome grader tests** Cover deterministic final-state checks: @@ -97,17 +97,17 @@ def test_state_check_grader_emits_outcome_metric(): assert result.passed is True ``` -- [ ] **Step 2: Run outcome tests and confirm failure** +- [x] **Step 2: Run outcome tests and confirm failure** Run: `pytest tests/evaluations/test_runtime_composition.py::test_state_check_grader_emits_outcome_metric -q` Expected: FAIL because `StateCheckGrader` does not exist. -- [ ] **Step 3: Implement deterministic state-check grader** +- [x] **Step 3: Implement deterministic state-check grader** Add an in-process state-check grader that reads serializable rollout `outcome` data and emits normal metric-compatible results. Reject checks that require command execution, sandbox reset, or non-serializable live handles. 
-- [ ] **Step 4: Run outcome tests until green** +- [x] **Step 4: Run outcome tests until green** Run: `pytest tests/evaluations/test_runtime_composition.py -q` @@ -115,7 +115,7 @@ Expected: PASS. ## Task 3: Scripted User Simulator -- [ ] **Step 1: Write failing simulator tests** +- [x] **Step 1: Write failing simulator tests** Cover scripted turns and single-prompt behavior: @@ -133,17 +133,17 @@ def test_scripted_user_simulator_emits_turns_in_order(): assert second.content == "again" ``` -- [ ] **Step 2: Run simulator tests and confirm failure** +- [x] **Step 2: Run simulator tests and confirm failure** Run: `pytest tests/evaluations/test_runtime_composition.py::test_scripted_user_simulator_emits_turns_in_order -q` Expected: FAIL because simulator implementation does not exist. -- [ ] **Step 3: Implement scripted and single-prompt simulators** +- [x] **Step 3: Implement scripted and single-prompt simulators** Add `ScriptedUserSimulator` and `SinglePromptUserSimulator` to `runtime_composition.py`. -- [ ] **Step 4: Run simulator tests until green** +- [x] **Step 4: Run simulator tests until green** Run: `pytest tests/evaluations/test_runtime_composition.py -q` @@ -151,21 +151,21 @@ Expected: PASS. ## Task 4: Runtime Harness Execution -- [ ] **Step 1: Write failing harness execution tests** +- [x] **Step 1: Write failing harness execution tests** Add a deterministic harness test that consumes simulator turns and returns rollout state with assistant turns and trajectory. -- [ ] **Step 2: Run harness test and confirm failure** +- [x] **Step 2: Run harness test and confirm failure** Run: `pytest tests/evaluations/test_runtime_composition.py::test_runtime_harness_executes_multi_turn_rollout -q` Expected: FAIL because harness implementation does not exist. 
-- [ ] **Step 3: Implement a minimal scripted runtime harness** +- [x] **Step 3: Implement a minimal scripted runtime harness** Add a framework test harness or deterministic harness class that uses a simulator and a callable assistant step function. -- [ ] **Step 4: Run harness tests until green** +- [x] **Step 4: Run harness tests until green** Run: `pytest tests/evaluations/test_runtime_composition.py -q` @@ -173,21 +173,21 @@ Expected: PASS. ## Task 5: Step Rewards and Aggregation -- [ ] **Step 1: Write failing reward aggregation tests** +- [x] **Step 1: Write failing reward aggregation tests** Cover reward records becoming weighted and partial-credit case/aggregate metrics. -- [ ] **Step 2: Run reward tests and confirm failure** +- [x] **Step 2: Run reward tests and confirm failure** Run: `pytest tests/evaluations/test_runtime_composition.py::test_step_rewards_aggregate_into_metrics -q` Expected: FAIL because reward aggregation is not wired. -- [ ] **Step 3: Implement step reward records and aggregation scorer** +- [x] **Step 3: Implement step reward records and aggregation scorer** Use existing scorer/report metric shapes. Keep reward metrics distinct from judge and outcome metrics. -- [ ] **Step 4: Run reward tests until green** +- [x] **Step 4: Run reward tests until green** Run: `pytest tests/evaluations/test_runtime_composition.py -q` @@ -195,21 +195,21 @@ Expected: PASS. ## Task 6: Retry Wrapper Composition -- [ ] **Step 1: Write failing retry wrapper tests** +- [x] **Step 1: Write failing retry wrapper tests** Cover failed first attempt, successful second attempt, preserved child/attempt state, and explicit absence of pass@k/pass^k labels. -- [ ] **Step 2: Run retry tests and confirm failure** +- [x] **Step 2: Run retry tests and confirm failure** Run: `pytest tests/evaluations/test_runtime_composition.py::test_retry_wrapper_preserves_failed_attempts -q` Expected: FAIL because retry wrapper does not exist. 
-- [ ] **Step 3: Implement retry wrapper** +- [x] **Step 3: Implement retry wrapper** Add a retry wrapper around a base `RuntimeHarness` with max attempts and selected terminal attempt. Preserve attempts as child state and do not emit trial metrics. -- [ ] **Step 4: Run retry tests until green** +- [x] **Step 4: Run retry tests until green** Run: `pytest tests/evaluations/test_runtime_composition.py -q` @@ -217,21 +217,21 @@ Expected: PASS. ## Task 7: Standard Metrics and Suite Purpose -- [ ] **Step 1: Write failing standard metric tests** +- [x] **Step 1: Write failing standard metric tests** Cover `n_turns`, `n_tool_calls`, token usage, and duration derivation from rollout state. -- [ ] **Step 2: Run standard metric tests and confirm failure** +- [x] **Step 2: Run standard metric tests and confirm failure** Run: `pytest tests/evaluations/test_runtime_composition.py::test_rollout_standard_metrics_are_derived -q` Expected: FAIL because standard metric derivation does not exist. -- [ ] **Step 3: Implement standard metric derivation and purpose metadata preservation** +- [x] **Step 3: Implement standard metric derivation and purpose metadata preservation** Add rollout standard metrics and preserve suite metadata such as `evaluation_purpose="capability"` or `evaluation_purpose="regression"` in report context. -- [ ] **Step 4: Run standard metric tests until green** +- [x] **Step 4: Run standard metric tests until green** Run: `pytest tests/evaluations/test_runtime_composition.py -q` @@ -239,21 +239,21 @@ Expected: PASS. ## Task 8: Adoption Suite -- [ ] **Step 1: Write failing adoption suite tests** +- [x] **Step 1: Write failing adoption suite tests** Assert the new suite is registered, uses typed judge schema, composite gate, outcome/state-check grader, trajectory scorer, step reward metric, scripted simulator, purpose metadata, and runtime harness. 
-- [ ] **Step 2: Run adoption tests and confirm failure** +- [x] **Step 2: Run adoption tests and confirm failure** Run: `pytest tests/evaluations/test_runtime_composition.py::test_runtime_composition_adoption_suite_runs_end_to_end -q` Expected: FAIL because suite does not exist. -- [ ] **Step 3: Implement opt-in adoption suite** +- [x] **Step 3: Implement opt-in adoption suite** Add a narrow deterministic suite without changing `app-evaluator` behavior. -- [ ] **Step 4: Run adoption tests until green** +- [x] **Step 4: Run adoption tests until green** Run: `pytest tests/evaluations/test_runtime_composition.py tests/evaluations/test_evaluation_substrate.py -q` @@ -261,7 +261,7 @@ Expected: PASS. ## Task 9: Verification and Commit -- [ ] **Step 1: Run evaluator regression suite** +- [x] **Step 1: Run evaluator regression suite** Run: @@ -271,13 +271,13 @@ pytest tests/evaluations/test_execution_state.py tests/evaluations/test_executio Expected: PASS. -- [ ] **Step 2: Validate OpenSpec** +- [x] **Step 2: Validate OpenSpec** Run: `openspec validate aworld-evaluator-runtime-composition-2026-06-10 --strict` Expected: `Change 'aworld-evaluator-runtime-composition-2026-06-10' is valid` -- [ ] **Step 3: Commit** +- [x] **Step 3: Commit** ```bash git add aworld/evaluations/runtime_composition.py aworld/evaluations/substrate.py aworld/evaluations/execution.py aworld/evaluations/report.py aworld/evaluations/scorers tests/evaluations/test_runtime_composition.py openspec/changes/aworld-evaluator-runtime-composition-2026-06-10 diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md index ad850bbe9..3911601cb 100644 --- a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md +++ b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md @@ -1,53 +1,53 @@ ## 1. 
Runtime Harness Model -- [ ] 1.1 Add a rollout-owning runtime harness definition separate from lightweight `EvalHarnessDef`. -- [ ] 1.2 Add a `RuntimeHarness` protocol or base class that executes one case and returns rollout state. -- [ ] 1.3 Preserve existing single-shot static/agent/task/program flows unchanged. +- [x] 1.1 Add a rollout-owning runtime harness definition separate from lightweight `EvalHarnessDef`. +- [x] 1.2 Add a `RuntimeHarness` protocol or base class that executes one case and returns rollout state. +- [x] 1.3 Preserve existing single-shot static/agent/task/program flows unchanged. ## 2. Rollout State -- [ ] 2.1 Add a serializable rollout state model with turns, messages, trajectory, tool calls, usage, timing, errors, metadata, and child/attempt state. -- [ ] 2.2 Bridge rollout state into existing `EvalState` so current scorer helpers and report summaries keep working. -- [ ] 2.3 Include outcome data and optional serializable environment/artifact snapshots in rollout state. -- [ ] 2.4 Derive standard rollout metrics such as turn count, tool-call count, token usage, and duration. -- [ ] 2.5 Add tests proving rollout state does not store live runtime handles. +- [x] 2.1 Add a serializable rollout state model with turns, messages, trajectory, tool calls, usage, timing, errors, metadata, and child/attempt state. +- [x] 2.2 Bridge rollout state into existing `EvalState` so current scorer helpers and report summaries keep working. +- [x] 2.3 Include outcome data and optional serializable environment/artifact snapshots in rollout state. +- [x] 2.4 Derive standard rollout metrics such as turn count, tool-call count, token usage, and duration. +- [x] 2.5 Add tests proving rollout state does not store live runtime handles. ## 3. Outcome / State-Check Grading -- [ ] 3.1 Add deterministic outcome/state-check grader definitions. -- [ ] 3.2 Emit outcome metrics separately from judge, trajectory, and reward metrics. 
-- [ ] 3.3 Allow composite gates to reference outcome metrics. -- [ ] 3.4 Explicitly reject state checks that require sandbox reset, command execution, or clean-environment isolation in this change. +- [x] 3.1 Add deterministic outcome/state-check grader definitions. +- [x] 3.2 Emit outcome metrics separately from judge, trajectory, and reward metrics. +- [x] 3.3 Allow composite gates to reference outcome metrics. +- [x] 3.4 Explicitly reject state checks that require sandbox reset, command execution, or clean-environment isolation in this change. ## 4. User Simulation -- [ ] 4.1 Add a deterministic user simulator contract. -- [ ] 4.2 Add a scripted simulator that reads turns from case input. -- [ ] 4.3 Add a single-prompt simulator for compatibility with current one-shot cases. -- [ ] 4.4 Document that LLM-backed adaptive user simulation is deferred. +- [x] 4.1 Add a deterministic user simulator contract. +- [x] 4.2 Add a scripted simulator that reads turns from case input. +- [x] 4.3 Add a single-prompt simulator for compatibility with current one-shot cases. +- [x] 4.4 Document that LLM-backed adaptive user simulation is deferred. ## 5. Step-Level Rewards -- [ ] 5.1 Add step reward records with metric name, step index, value, weight, partial-credit marker, reason, and metadata. -- [ ] 5.2 Add rewarder interfaces that inspect rollout state without mutating it. -- [ ] 5.3 Aggregate step rewards into normal evaluator metrics and gate inputs, including weighted and partial-credit summaries. +- [x] 5.1 Add step reward records with metric name, step index, value, weight, partial-credit marker, reason, and metadata. +- [x] 5.2 Add rewarder interfaces that inspect rollout state without mutating it. +- [x] 5.3 Aggregate step rewards into normal evaluator metrics and gate inputs, including weighted and partial-credit summaries. ## 6. Runtime Composition -- [ ] 6.1 Add one runtime wrapper style, preferably retry, around a base runtime harness. 
-- [ ] 6.2 Preserve child/attempt state for composed runs. -- [ ] 6.3 Add tests for retry/fallback state, terminal status, and report visibility. -- [ ] 6.4 Document and test that retry/fallback attempts are not independent trials and do not produce pass@k/pass^k metrics. +- [x] 6.1 Add one runtime wrapper style, preferably retry, around a base runtime harness. +- [x] 6.2 Preserve child/attempt state for composed runs. +- [x] 6.3 Add tests for retry/fallback state, terminal status, and report visibility. +- [x] 6.4 Document and test that retry/fallback attempts are not independent trials and do not produce pass@k/pass^k metrics. ## 7. Adoption Suite -- [ ] 7.1 Add one builtin or framework-registered adoption suite that uses the runtime-composition path. -- [ ] 7.2 The adoption suite uses typed judge schema, composite gate, outcome/state-check grader, trajectory scorer, step-level reward, and scripted simulator. -- [ ] 7.3 Mark the adoption suite with capability/regression purpose metadata. -- [ ] 7.4 Keep `app-evaluator` behavior unchanged unless explicitly selected for migration later. +- [x] 7.1 Add one builtin or framework-registered adoption suite that uses the runtime-composition path. +- [x] 7.2 The adoption suite uses typed judge schema, composite gate, outcome/state-check grader, trajectory scorer, step-level reward, and scripted simulator. +- [x] 7.3 Mark the adoption suite with capability/regression purpose metadata. +- [x] 7.4 Keep `app-evaluator` behavior unchanged unless explicitly selected for migration later. ## 8. Verification -- [ ] 8.1 Add focused tests for harness rollout, outcome grading, user simulator, reward aggregation, runtime wrapper composition, standard metrics, and adoption suite execution. -- [ ] 8.2 Run the evaluator regression suite. -- [ ] 8.3 Validate this OpenSpec change with `openspec validate aworld-evaluator-runtime-composition-2026-06-10 --strict`. 
+- [x] 8.1 Add focused tests for harness rollout, outcome grading, user simulator, reward aggregation, runtime wrapper composition, standard metrics, and adoption suite execution. +- [x] 8.2 Run the evaluator regression suite. +- [x] 8.3 Validate this OpenSpec change with `openspec validate aworld-evaluator-runtime-composition-2026-06-10 --strict`. diff --git a/tests/evaluations/test_runtime_composition.py b/tests/evaluations/test_runtime_composition.py new file mode 100644 index 000000000..d93cc0b45 --- /dev/null +++ b/tests/evaluations/test_runtime_composition.py @@ -0,0 +1,312 @@ +from __future__ import annotations + +import pytest +from pydantic import BaseModel + +from aworld.evaluations.base import EvalCriteria +from aworld.evaluations.runtime_composition import ( + CallableRuntimeHarness, + RetryRuntimeHarness, + RolloutState, + RolloutTurn, + ScriptedUserSimulator, + SinglePromptUserSimulator, + StateCheckGrader, + StepReward, + aggregate_step_rewards, + derive_standard_metrics, +) +from aworld.evaluations.scorers import scorer_factory +from aworld.evaluations.substrate import ( + EvalCaseDef, + EvalSuiteDef, + EvaluationFlowDef, + GateMetricCondition, + GatePolicyDef, + JudgeSchemaDef, + TrajectoryScorerDef, + get_builtin_eval_suite, + run_evaluation_flow, +) +from aworld.evaluations.types import MetricNames + + +class RuntimeJudgeOutput(BaseModel): + score: float + verdict: str + + +def test_rollout_state_to_eval_state_excludes_live_handles(): + live_agent = object() + state = RolloutState( + case_id="case-1", + status="success", + answer="done", + turns=[RolloutTurn(role="user", content="hello")], + outcome={"artifact_exists": True}, + metadata={"live_agent": live_agent, "safe": "ok"}, + ) + + eval_state = state.to_eval_state(target={"target_kind": "inline"}) + + assert eval_state.case_id == "case-1" + assert eval_state.answer == "done" + assert eval_state.trajectory + assert eval_state.artifacts["outcome"]["artifact_exists"] is True + assert 
"live_agent" not in eval_state.metadata + assert eval_state.metadata["safe"] == "ok" + + +def test_state_check_grader_emits_outcome_metric(): + state = RolloutState( + case_id="case-1", + status="success", + outcome={"ticket": {"status": "resolved"}}, + ) + grader = StateCheckGrader( + metric_name="ticket_resolved", + path=("ticket", "status"), + expected="resolved", + ) + + result = grader.grade(state=state, case=None, target={}) + + assert result.metric_name == "ticket_resolved" + assert result.value == 1.0 + assert result.passed is True + + +def test_scripted_user_simulator_emits_turns_in_order(): + simulator = ScriptedUserSimulator() + state = RolloutState(case_id="case-1") + case = EvalCaseDef(case_id="case-1", input={"turns": ["hi", "again"]}) + + first = simulator.next_turn(case=case, target={}, state=state, last_output=None) + state.turns.append(first) + second = simulator.next_turn(case=case, target={}, state=state, last_output="ok") + + assert first.content == "hi" + assert second.content == "again" + + +def test_single_prompt_user_simulator_emits_one_turn(): + simulator = SinglePromptUserSimulator() + case = EvalCaseDef(case_id="case-1", input={"query": "hello"}) + state = RolloutState(case_id="case-1") + + first = simulator.next_turn(case=case, target={}, state=state) + state.turns.append(first) + second = simulator.next_turn(case=case, target={}, state=state) + + assert first.content == "hello" + assert second is None + + +@pytest.mark.asyncio +async def test_runtime_harness_executes_multi_turn_rollout(): + async def assistant_step(*, user_turn, state, case, target): + return { + "answer": f"ack:{user_turn.content}", + "tool_calls": [{"id": f"call-{len(state.turns)}"}], + } + + harness = CallableRuntimeHarness( + simulator=ScriptedUserSimulator(), + assistant_step=assistant_step, + max_turns=2, + ) + case = EvalCaseDef(case_id="case-1", input={"turns": ["hi", "again"]}) + + state = await harness.run_rollout(case=case, target={"target_kind": "inline"}) 
+ + assert state.answer == "ack:again" + assert [turn.role for turn in state.turns] == ["user", "assistant", "user", "assistant"] + assert len(state.trajectory) == 4 + assert state.tool_calls == [{"id": "call-1"}, {"id": "call-3"}] + + +def test_step_rewards_aggregate_into_metrics(): + state = RolloutState( + case_id="case-1", + step_rewards=[ + StepReward(metric_name="process_quality", step_index=0, value=1.0, weight=2.0), + StepReward( + metric_name="process_quality", + step_index=1, + value=0.5, + weight=1.0, + partial_credit=True, + ), + ], + ) + + metrics = aggregate_step_rewards(state) + + assert metrics["process_quality"]["value"] == pytest.approx((1.0 * 2.0 + 0.5) / 3.0) + assert metrics["process_quality_total"]["value"] == pytest.approx(1.5) + assert metrics["process_quality_partial_credit_rate"]["value"] == pytest.approx(0.5) + + +@pytest.mark.asyncio +async def test_retry_wrapper_preserves_failed_attempts(): + attempts = [] + + class FlakyHarness: + async def run_rollout(self, *, case, target): + attempts.append(len(attempts) + 1) + if len(attempts) == 1: + return RolloutState(case_id=case.case_id, status="failed", answer="bad") + return RolloutState(case_id=case.case_id, status="success", answer="ok") + + wrapper = RetryRuntimeHarness(base_harness=FlakyHarness(), max_attempts=2) + case = EvalCaseDef(case_id="case-1", input={"query": "hello"}) + + state = await wrapper.run_rollout(case=case, target={}) + + assert state.status == "success" + assert state.answer == "ok" + assert [attempt.status for attempt in state.attempts] == ["failed", "success"] + assert "pass@1" not in state.standard_metrics + assert "pass^1" not in state.standard_metrics + + +@pytest.mark.asyncio +async def test_retry_wrapper_attempts_serialize_without_self_recursion(): + class FlakyHarness: + def __init__(self): + self.calls = 0 + + async def run_rollout(self, *, case, target): + self.calls += 1 + return RolloutState( + case_id=case.case_id, + status="success" if self.calls == 2 
else "failed", + answer=f"attempt-{self.calls}", + ) + + wrapper = RetryRuntimeHarness(base_harness=FlakyHarness(), max_attempts=2) + case = EvalCaseDef(case_id="case-1", input={"query": "hello"}) + + state = await wrapper.run_rollout(case=case, target={}) + eval_state = state.to_eval_state(target={}) + state_dict = state.to_dict() + + assert [attempt["answer"] for attempt in eval_state.artifacts["attempts"]] == ["attempt-1", "attempt-2"] + assert [attempt["answer"] for attempt in state_dict["attempts"]] == ["attempt-1", "attempt-2"] + + +def test_rollout_standard_metrics_are_derived(): + state = RolloutState( + case_id="case-1", + turns=[ + RolloutTurn(role="user", content="hello"), + RolloutTurn(role="assistant", content="ok"), + ], + tool_calls=[{"id": "call-1"}], + usage={"prompt_tokens": 3, "completion_tokens": 2, "total_tokens": 5}, + timing={"duration_ms": 120}, + ) + + metrics = derive_standard_metrics(state) + + assert metrics == { + "n_turns": 2, + "n_tool_calls": 1, + "n_tokens": 5, + "duration_ms": 120, + } + + +@pytest.mark.asyncio +async def test_runtime_composition_adoption_suite_runs_end_to_end(): + async def assistant_step(*, user_turn, state, case, target): + return { + "answer": "ticket resolved", + "outcome": {"ticket": {"status": "resolved"}}, + "step_rewards": [ + StepReward(metric_name="process_quality", step_index=0, value=1.0, reason="direct resolution") + ], + "tool_calls": [{"id": "call-1", "function": {"name": "resolve_ticket", "arguments": "{}"}}], + "usage": {"total_tokens": 7}, + "timing": {"duration_ms": 25}, + } + + async def fake_judge(case_input, target): + assert target["artifacts"]["outcome"]["ticket"]["status"] == "resolved" + return {"score": 1.0, "verdict": "approved"} + + suite = EvalSuiteDef( + suite_id="runtime-adoption", + cases=[EvalCaseDef(case_id="case-1", input={"query": "resolve ticket"})], + runtime_harness=CallableRuntimeHarness( + simulator=SinglePromptUserSimulator(), + assistant_step=assistant_step, + 
max_turns=1, + ), + judge_schema=JudgeSchemaDef(output_model=RuntimeJudgeOutput), + judge=fake_judge, + outcome_scorers=( + StateCheckGrader( + metric_name="ticket_resolved", + path=("ticket", "status"), + expected="resolved", + ), + ), + reward_metrics=("process_quality",), + standard_metrics=("n_turns", "n_tool_calls", "n_tokens", "duration_ms"), + trajectory_scorers=( + TrajectoryScorerDef(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, threshold=1.0), + ), + gate_policy=GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=0.9), + GateMetricCondition(metric_name="ticket_resolved", op="==", threshold=1.0), + GateMetricCondition(metric_name="process_quality", op=">=", threshold=1.0), + GateMetricCondition(metric_name="n_turns", op="==", threshold=2), + GateMetricCondition(metric_name=MetricNames.TRAJECTORY_TOOL_CALLS, op="==", threshold=1.0), + ) + ), + metadata={"evaluation_purpose": "capability"}, + ) + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite) + ) + + assert report["gate"]["status"] == "pass" + assert report["metrics"]["ticket_resolved"]["mean"] == pytest.approx(1.0) + assert report["metrics"]["process_quality"]["mean"] == pytest.approx(1.0) + assert report["metrics"]["n_turns"]["mean"] == pytest.approx(2.0) + assert report["results"][0]["metric_details"]["ticket_resolved"]["passed"] is True + assert report["results"][0]["artifacts"]["outcome"]["ticket"]["status"] == "resolved" + assert report["suite_metadata"]["evaluation_purpose"] == "capability" + + +def test_builtin_runtime_composition_adoption_suite_is_registered(): + suite = get_builtin_eval_suite("runtime-composition-adoption") + + assert suite.suite_id == "runtime-composition-adoption" + assert suite.runtime_harness is not None + assert suite.judge_schema.output_model is not None + assert suite.outcome_scorers + assert suite.reward_metrics == ("process_quality",) + assert 
suite.metadata["evaluation_purpose"] == "capability" + + +def test_runtime_scorer_can_be_selected_by_full_class_name_for_dynamic_metric(): + scorers = scorer_factory( + criterias=[ + EvalCriteria( + metric_name="custom_outcome", + scorer_class="aworld.evaluations.scorers.runtime_composition.RuntimeOutcomeScorer", + scorer_params={ + "grader": { + "path": ["ok"], + "expected": True, + } + }, + ) + ] + ) + + assert scorers[0].__class__.__name__ == "RuntimeOutcomeScorer" From ef661970d2d189d67f2d40c4de26c8293a50278b Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 15:00:06 +0800 Subject: [PATCH 25/41] docs: add evaluator trials pass metrics change --- .../.openspec.yaml | 2 + .../design.md | 97 +++++++++ .../implementation-plan.md | 200 ++++++++++++++++++ .../proposal.md | 34 +++ .../specs/evaluation-substrate/spec.md | 65 ++++++ .../tasks.md | 40 ++++ 6 files changed, 438 insertions(+) create mode 100644 openspec/changes/aworld-evaluator-trials-passk-2026-06-10/.openspec.yaml create mode 100644 openspec/changes/aworld-evaluator-trials-passk-2026-06-10/design.md create mode 100644 openspec/changes/aworld-evaluator-trials-passk-2026-06-10/implementation-plan.md create mode 100644 openspec/changes/aworld-evaluator-trials-passk-2026-06-10/proposal.md create mode 100644 openspec/changes/aworld-evaluator-trials-passk-2026-06-10/specs/evaluation-substrate/spec.md create mode 100644 openspec/changes/aworld-evaluator-trials-passk-2026-06-10/tasks.md diff --git a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/.openspec.yaml b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/.openspec.yaml new file mode 100644 index 000000000..2cb80411e --- /dev/null +++ b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-06-10 diff --git a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/design.md b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/design.md new file 
mode 100644 index 000000000..933e216f8 --- /dev/null +++ b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/design.md @@ -0,0 +1,97 @@ +## Context + +Runtime composition now gives AWorld a rollout-owning harness and serializable rollout state. That solves multi-turn execution and outcome inspection for one evaluation attempt. It does not solve nondeterminism measurement: a model or agent can fail one rollout and pass another under the same case. Agent evaluation needs independent repeated trials and distribution-level metrics. + +This change adds trial execution above the existing suite/harness layer. A trial is one independent evaluation of one case. A retry attempt is not a trial; retry is an execution strategy inside one trial. + +## Goals / Non-Goals + +**Goals:** + +- Add trial configuration with a default of one trial. +- Execute each case for `num_trials` independent trials. +- Preserve trial index, trial id, terminal status, metrics, and state summary in reports. +- Compute pass@k and pass^k from independent trial outcomes. +- Keep retry/fallback attempts nested inside a trial and excluded from trial metrics. +- Support both single-shot suites and runtime-composed suites. +- Keep existing evaluator behavior unchanged when no trial configuration is supplied. + +**Non-Goals:** + +- Adding sandbox reset, filesystem/database isolation, or clean-environment orchestration. +- Adding LLM-backed adaptive user simulators. +- Adding training-loop or optimizer integration. +- Redesigning `EvaluateRunner` public API beyond additive evaluator-substrate wiring. +- Treating retry/fallback attempts as trials. + +## Decisions + +### 1. 
Model trials as evaluator-level repetition, not harness retries + +Add a `TrialPolicyDef` on `EvalSuiteDef`: + +- `num_trials`: positive integer, default `1` +- `pass_at_k`: tuple of k values to report, default empty +- `pass_caret_k`: tuple of k values to report, default empty +- `success_metric`: metric used to decide whether a trial passed, default derived from the gate primary metric or `score` + +The framework should normalize invalid values at compile time: `num_trials >= 1`, k values between `1` and `num_trials`, and `success_metric` must be a declared or gate-referenced metric. + +### 2. Preserve one trial as current behavior + +If `TrialPolicyDef.num_trials == 1`, report shape and existing aggregate metrics should remain compatible. Trial-specific fields may be absent or present as additive metadata, but no existing required field should change. + +### 3. Expand cases without changing case identity + +The evaluator should execute `case_id` repeatedly with trial metadata: + +- stable original case id +- `trial_index` +- `trial_id` +- optional deterministic seed metadata + +Reports should group results by original case id while still exposing individual trial case results. A practical first implementation can expand dataset case ids to `case_id::trial-N` and retain `original_case_id` in case metadata. + +### 4. Compute pass@k and pass^k from trial outcomes + +For each original case and metric: + +- pass@k is true if any of the first k independent trials passed +- pass^k is true if all of the first k independent trials passed + +Aggregate report metrics should include rates across original cases: + +- `{metric}_pass@k` +- `{metric}_pass^k` + +These values are report-level metrics and may be referenced by composite gates. + +### 5. Keep retry/fallback inside each trial + +If a runtime harness uses retry, the selected terminal attempt determines that trial's metric outcome.
Child attempts stay in rollout artifacts/metadata for inspection, but pass@k/pass^k must count the trial once. + +### 6. Defer environment isolation + +Trials need independence, but this change does not create sandboxes. The implementation should provide metadata hooks for later environment reset integration and document that true clean-state independence requires the follow-up environment-isolation change. + +## Risks / Trade-offs + +- [Retry confused with trials] -> Mitigation: explicit report fields and tests that retry child attempts do not increase trial count. +- [Report bloat] -> Mitigation: per-trial state summaries stay per case result; full child states remain artifacts or references. +- [Existing repeat_times behavior collision] -> Mitigation: keep suite trial policy framework-owned and avoid relying on legacy `Evaluator.repeat_times` pass@k behavior unless it can preserve required report semantics. +- [False independence without sandbox] -> Mitigation: document clean-state reset as out of scope and preserve hooks for later isolation. + +## Migration Plan + +1. Add `TrialPolicyDef` and compile-time validation. +2. Expand suite cases into trial cases with metadata while preserving original case id. +3. Add trial-aware result grouping and pass@k/pass^k aggregation. +4. Add report fields for trial policy, trial counts, and trial metrics. +5. Add tests proving retry attempts do not count as trials. +6. Keep existing one-trial suites and `app-evaluator` behavior compatible. + +## Deferred Questions + +- Environment reset semantics should be handled in an `evaluator-environment-isolation` change. +- LLM-backed adaptive user simulators should stay in a simulator-focused change. +- Training/optimizer integration should wait until trial metrics and environment isolation are stable. 
diff --git a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/implementation-plan.md b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/implementation-plan.md new file mode 100644 index 000000000..a4255218f --- /dev/null +++ b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/implementation-plan.md @@ -0,0 +1,200 @@ +# AWorld Evaluator Trials and pass@k Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add independent evaluator trials with pass@k/pass^k metrics while keeping retry/fallback attempts distinct from trials. + +**Architecture:** Add a small trial policy layer to the suite substrate. Expand cases into trial rows before evaluation, preserve original case metadata, then aggregate pass@k/pass^k from independent trial case results during report assembly. + +**Tech Stack:** Python dataclasses, existing evaluator substrate/report/scorer infrastructure, pytest, OpenSpec. + +--- + +## File Structure + +- Modify: `aworld/evaluations/substrate.py` + Add `TrialPolicyDef`, trial case expansion, trial aggregation, report metadata, and gate integration. +- Modify: `aworld/evaluations/report.py` + Allow additive trial metadata fields if needed. +- Test: `tests/evaluations/test_evaluator_trials.py` + Focused TDD coverage for trial policy, expansion, pass@k/pass^k, retry separation, and report shape. +- Test: existing evaluator regression tests + Ensure one-trial behavior remains compatible. 
+ +## Task 1: Trial Policy + +- [ ] **Step 1: Write failing trial policy tests** + +Add tests in `tests/evaluations/test_evaluator_trials.py`: + +```python +from aworld.evaluations.substrate import TrialPolicyDef + + +def test_trial_policy_rejects_invalid_k_values(): + with pytest.raises(ValueError, match="k values"): + TrialPolicyDef(num_trials=2, pass_at_k=(3,)).validate() +``` + +- [ ] **Step 2: Run test and confirm failure** + +Run: `pytest tests/evaluations/test_evaluator_trials.py::test_trial_policy_rejects_invalid_k_values -q` + +Expected: FAIL because `TrialPolicyDef` does not exist. + +- [ ] **Step 3: Implement `TrialPolicyDef`** + +Add a frozen dataclass in `aworld/evaluations/substrate.py`: + +```python +@dataclass(frozen=True) +class TrialPolicyDef: + num_trials: int = 1 + pass_at_k: tuple[int, ...] = tuple() + pass_caret_k: tuple[int, ...] = tuple() + success_metric: str | None = None + + def validate(self) -> None: + if self.num_trials < 1: + raise ValueError("num_trials must be >= 1") + invalid = [k for k in (*self.pass_at_k, *self.pass_caret_k) if k < 1 or k > self.num_trials] + if invalid: + raise ValueError("k values must be between 1 and num_trials") +``` + +- [ ] **Step 4: Run policy tests until green** + +Run: `pytest tests/evaluations/test_evaluator_trials.py -q` + +Expected: PASS for initial policy tests. 
+ +## Task 2: Trial Case Expansion + +- [ ] **Step 1: Write failing expansion tests** + +```python +def test_build_eval_dataset_expands_trial_cases(): + suite = EvalSuiteDef( + suite_id="trial-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + trial_policy=TrialPolicyDef(num_trials=3), + ) + compiled = compile_evaluation_flow(EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite)) + + ids = [case.eval_case_id for case in compiled.dataset.eval_cases] + assert ids == ["case-1::trial-1", "case-1::trial-2", "case-1::trial-3"] + assert compiled.dataset.eval_cases[0].case_data["_trial"]["original_case_id"] == "case-1" + assert compiled.dataset.eval_cases[0].case_data["_trial"]["trial_index"] == 1 +``` + +- [ ] **Step 2: Run expansion test and confirm failure** + +Run: `pytest tests/evaluations/test_evaluator_trials.py::test_build_eval_dataset_expands_trial_cases -q` + +Expected: FAIL because `EvalSuiteDef` does not accept `trial_policy`. + +- [ ] **Step 3: Implement trial expansion** + +Add `trial_policy: TrialPolicyDef = field(default_factory=TrialPolicyDef)` to `EvalSuiteDef`. Update `compile_evaluation_flow()` to expand `flow.suite.cases` before `build_eval_dataset()`, preserving `_trial` metadata. + +- [ ] **Step 4: Run expansion tests until green** + +Run: `pytest tests/evaluations/test_evaluator_trials.py -q` + +Expected: PASS. + +## Task 3: pass@k/pass^k Aggregation + +- [ ] **Step 1: Write failing aggregation tests** + +Use a deterministic judge that passes trial 2 and fails trials 1/3. Assert `score_pass@2 == 1.0` and `score_pass^2 == 0.0`. + +- [ ] **Step 2: Run aggregation test and confirm failure** + +Run: `pytest tests/evaluations/test_evaluator_trials.py::test_run_evaluation_flow_reports_pass_at_k_and_pass_caret_k -q` + +Expected: FAIL because trial aggregation does not exist. 
+ +- [ ] **Step 3: Implement trial aggregation** + +In `run_evaluation_flow()`, group case results by `_trial.original_case_id`, derive each trial pass/fail from `TrialPolicyDef.success_metric` or gate primary metric, then add aggregate metrics named `{metric}_pass@k` and `{metric}_pass^k`. + +- [ ] **Step 4: Run aggregation tests until green** + +Run: `pytest tests/evaluations/test_evaluator_trials.py -q` + +Expected: PASS. + +## Task 4: Retry Separation + +- [ ] **Step 1: Write failing retry/trial separation test** + +Use a runtime harness wrapped in retry with `num_trials=2`. Assert report trial count is `2`, not the number of retry attempts, and pass@k counts terminal trial outcomes only. + +- [ ] **Step 2: Run retry separation test and confirm failure** + +Run: `pytest tests/evaluations/test_evaluator_trials.py::test_retry_attempts_do_not_count_as_trials -q` + +Expected: FAIL until trial grouping ignores retry child attempts. + +- [ ] **Step 3: Preserve retry attempts as artifacts only** + +Ensure trial aggregation reads only top-level trial results and never inspects `artifacts.attempts` as independent outcomes. + +- [ ] **Step 4: Run retry separation tests until green** + +Run: `pytest tests/evaluations/test_evaluator_trials.py -q` + +Expected: PASS. + +## Task 5: Report Shape and Compatibility + +- [ ] **Step 1: Write failing report compatibility tests** + +Assert one-trial `app-evaluator` style suites keep current required report fields, while multi-trial reports include `trial_policy`, `trial_counts`, and per-result trial metadata. + +- [ ] **Step 2: Run report tests and confirm failure** + +Run: `pytest tests/evaluations/test_evaluator_trials.py::test_multi_trial_report_exposes_trial_metadata -q` + +Expected: FAIL until report metadata is added.
+ +- [ ] **Step 3: Add additive report fields** + +Add report fields without changing existing required fields: + +```python +report["trial_policy"] = {...} +report["trial_counts"] = {"original_cases": n, "trials_total": m} +``` + +- [ ] **Step 4: Run report tests until green** + +Run: `pytest tests/evaluations/test_evaluator_trials.py tests/evaluations/test_evaluation_substrate.py -q` + +Expected: PASS. + +## Task 6: Verification and Commit + +- [ ] **Step 1: Run evaluator regression suite** + +Run: + +```bash +pytest tests/evaluations/test_execution_state.py tests/evaluations/test_execution_adapters.py tests/evaluations/test_evaluation_substrate.py tests/evaluations/test_runtime_composition.py tests/evaluations/test_evaluator_trials.py tests/core/test_evaluator_runtime.py tests/core/test_evaluator_top_level_command.py tests/plugins/test_plugin_hooks.py tests/test_plugin_cli_entrypoint.py tests/docs/test_evaluator_report_docs.py -q +``` + +Expected: PASS. + +- [ ] **Step 2: Validate OpenSpec** + +Run: `openspec validate aworld-evaluator-trials-passk-2026-06-10 --strict` + +Expected: `Change 'aworld-evaluator-trials-passk-2026-06-10' is valid` + +- [ ] **Step 3: Commit** + +```bash +git add aworld/evaluations/substrate.py aworld/evaluations/report.py tests/evaluations/test_evaluator_trials.py openspec/changes/aworld-evaluator-trials-passk-2026-06-10 +git commit -m "feat: add evaluator trial pass metrics" +``` diff --git a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/proposal.md b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/proposal.md new file mode 100644 index 000000000..e33cab757 --- /dev/null +++ b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/proposal.md @@ -0,0 +1,34 @@ +## Why + +`aworld-evaluator-runtime-composition-2026-06-10` added rollout-owning harnesses, outcome/state-check grading, step rewards, retry wrappers, and an adoption suite. It deliberately kept retry/fallback separate from independent trial evaluation. 
+ +Complete agent evaluation still needs a first-class way to measure nondeterministic target behavior: + +- run the same case multiple independent times +- preserve per-trial rollout state and metrics +- compute pass@k and pass^k without confusing retry attempts for trials +- report trial distributions without changing existing single-shot suite behavior + +Without this layer, users can run a deterministic regression suite, but cannot answer "does this agent solve the task at least once in k attempts?" or "does it solve the task every time across k attempts?". + +## What Changes + +- Add suite-level trial configuration for independent repeated evaluation. +- Add trial-aware execution/report structures that retain per-trial case results. +- Add pass@k and pass^k aggregate metrics computed from independent trial outcomes. +- Keep retry/fallback attempts inside a trial and explicitly exclude them from pass@k/pass^k calculation. +- Add one opt-in adoption suite or test fixture proving trials work with runtime-composed suites and existing single-shot suites remain compatible. +- Defer clean-environment reset/sandbox orchestration to a dedicated environment-isolation change. + +## Capabilities + +### Modified Capabilities + +- `evaluation-substrate`: add independent trial execution, trial reports, and pass@k/pass^k aggregation for suite-backed evaluation flows. + +## Impact + +- Affected code: `aworld/evaluations/**`, especially suite definitions, flow compilation, evaluator orchestration, report assembly, and runtime-composition integration. +- Affected APIs: additive trial configuration on suite-backed evaluator APIs; existing callers default to one trial. +- Affected tests: add focused coverage for trial expansion, pass@k/pass^k math, retry/trial separation, report shape, and compatibility. +- Affected docs: clarify trial semantics and how they differ from retry/fallback wrappers. 
diff --git a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/specs/evaluation-substrate/spec.md b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/specs/evaluation-substrate/spec.md new file mode 100644 index 000000000..ee1f8e36e --- /dev/null +++ b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/specs/evaluation-substrate/spec.md @@ -0,0 +1,65 @@ +## MODIFIED Requirements + +### Requirement: Trial-based evaluation + +Suite-backed evaluation flows SHALL support independent repeated trials for each evaluation case while preserving current one-trial behavior by default. + +#### Scenario: Suite declares multiple trials +- **WHEN** a suite-backed evaluator declares a trial policy with `num_trials` greater than one +- **THEN** the framework SHALL execute each original case independently for the configured number of trials + +#### Scenario: Suite does not declare trials +- **WHEN** a suite-backed evaluator does not declare a trial policy +- **THEN** the framework SHALL execute each case once and preserve existing report behavior + +#### Scenario: Trial metadata is attached +- **WHEN** a case is expanded into trial executions +- **THEN** each trial execution SHALL preserve the original case id, trial index, and trial id in serializable case or state metadata + +### Requirement: pass@k and pass^k metrics + +Trial-based evaluation SHALL compute pass@k and pass^k metrics from independent trial outcomes. 
+ +#### Scenario: pass@k is computed +- **WHEN** a suite declares pass@k for a metric and a case has at least k trials +- **THEN** the framework SHALL mark that case as pass@k when any of the first k independent trials passes the configured success metric + +#### Scenario: pass^k is computed +- **WHEN** a suite declares pass^k for a metric and a case has at least k trials +- **THEN** the framework SHALL mark that case as pass^k when all of the first k independent trials pass the configured success metric + +#### Scenario: Trial metrics are aggregated +- **WHEN** pass@k or pass^k is computed for all cases +- **THEN** the framework SHALL expose aggregate pass@k/pass^k rates as normal report metrics that composite gates can reference + +### Requirement: Retry attempts are not trials + +Trial-based evaluation SHALL keep runtime retry/fallback attempts separate from independent trials. + +#### Scenario: Retry wrapper runs inside a trial +- **WHEN** a runtime-composed trial uses a retry or fallback wrapper +- **THEN** the framework SHALL count the selected terminal rollout as one trial and preserve child attempts only as trial artifacts or metadata + +#### Scenario: pass@k excludes retry attempts +- **WHEN** pass@k or pass^k metrics are calculated +- **THEN** retry or fallback child attempts SHALL NOT increase the number of trials or directly contribute separate trial outcomes + +### Requirement: Trial reports + +Evaluator reports SHALL expose trial metadata and aggregate trial metrics additively. 
+ +#### Scenario: Multiple trials are reported +- **WHEN** a suite runs multiple trials +- **THEN** the report SHALL include trial policy metadata, total trial counts, and per-trial metadata sufficient to group trials by original case id + +#### Scenario: Single-trial reports remain compatible +- **WHEN** a suite runs one trial +- **THEN** existing required report fields SHALL remain compatible and trial-specific fields SHALL be additive only + +### Requirement: Environment reset is deferred + +Trial-based evaluation SHALL acknowledge clean-environment isolation as a separate concern. + +#### Scenario: Suite requires clean environment per trial +- **WHEN** a suite requires filesystem, database, sandbox, or external state reset between trials +- **THEN** the framework SHALL treat that reset orchestration as out of scope for this change and leave it to a dedicated environment-isolation change diff --git a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/tasks.md b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/tasks.md new file mode 100644 index 000000000..28b80ebac --- /dev/null +++ b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/tasks.md @@ -0,0 +1,40 @@ +## 1. Trial Policy Model + +- [ ] 1.1 Add `TrialPolicyDef` with `num_trials`, `pass_at_k`, `pass_caret_k`, and `success_metric`. +- [ ] 1.2 Add `trial_policy` to `EvalSuiteDef` with a default one-trial policy. +- [ ] 1.3 Validate trial policy during flow compilation. +- [ ] 1.4 Preserve existing single-shot behavior when `num_trials == 1`. + +## 2. Trial Case Expansion + +- [ ] 2.1 Expand each suite case into independent trial case rows when `num_trials > 1`. +- [ ] 2.2 Preserve `original_case_id`, `trial_index`, and `trial_id` in case metadata. +- [ ] 2.3 Ensure runtime-composed harnesses receive trial metadata without storing live handles. +- [ ] 2.4 Add tests for stable case grouping and trial metadata. + +## 3. 
Trial Outcome Aggregation + +- [ ] 3.1 Determine per-trial pass/fail from the configured `success_metric`. +- [ ] 3.2 Compute pass@k per original case. +- [ ] 3.3 Compute pass^k per original case. +- [ ] 3.4 Aggregate pass@k/pass^k rates across original cases. +- [ ] 3.5 Allow composite gates to reference trial aggregate metrics. + +## 4. Retry / Trial Separation + +- [ ] 4.1 Add tests proving retry child attempts do not increase trial count. +- [ ] 4.2 Ensure pass@k/pass^k uses the selected terminal rollout of each trial. +- [ ] 4.3 Preserve retry attempts only inside trial artifacts/metadata. + +## 5. Report Shape + +- [ ] 5.1 Add report-level trial policy metadata. +- [ ] 5.2 Add report-level trial count summaries. +- [ ] 5.3 Add per-case trial grouping or trial metadata sufficient to reconstruct groups. +- [ ] 5.4 Keep existing report schema compatible via additive fields. + +## 6. Verification + +- [ ] 6.1 Add focused tests for trial policy validation, trial expansion, pass@k/pass^k aggregation, retry separation, and report shape. +- [ ] 6.2 Run evaluator regression tests. +- [ ] 6.3 Validate this OpenSpec change with `openspec validate aworld-evaluator-trials-passk-2026-06-10 --strict`. 
From ff2c1ac9c8ba0057bd38996e5e93b310c88b2171 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 15:14:08 +0800 Subject: [PATCH 26/41] feat: add evaluator trial pass metrics --- aworld/evaluations/report.py | 3 + aworld/evaluations/substrate.py | 150 ++++++++++++++++- .../implementation-plan.md | 46 +++--- .../tasks.md | 46 +++--- tests/evaluations/test_evaluator_trials.py | 153 ++++++++++++++++++ 5 files changed, 351 insertions(+), 47 deletions(-) create mode 100644 tests/evaluations/test_evaluator_trials.py diff --git a/aworld/evaluations/report.py b/aworld/evaluations/report.py index 73e19e3c2..029b2cefb 100644 --- a/aworld/evaluations/report.py +++ b/aworld/evaluations/report.py @@ -21,6 +21,7 @@ def __init__( artifacts: dict[str, Any] | None = None, metadata: dict[str, Any] | None = None, metric_details: dict[str, Any] | None = None, + trial: dict[str, Any] | None = None, ) -> None: payload = { "case_id": case_id, @@ -36,6 +37,8 @@ def __init__( payload["metadata"] = metadata if metric_details: payload["metric_details"] = metric_details + if trial: + payload["trial"] = trial super().__init__(payload) def to_dict(self) -> dict[str, Any]: diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py index fe562af24..9ecf59889 100644 --- a/aworld/evaluations/substrate.py +++ b/aworld/evaluations/substrate.py @@ -82,6 +82,33 @@ class TrajectoryScorerDef: scorer_params: dict[str, Any] = field(default_factory=dict) +@dataclass(frozen=True) +class TrialPolicyDef: + num_trials: int = 1 + pass_at_k: tuple[int, ...] = tuple() + pass_caret_k: tuple[int, ...] 
= tuple() + success_metric: str | None = None + + def validate(self) -> None: + if self.num_trials < 1: + raise ValueError("num_trials must be >= 1") + invalid = [ + k + for k in (*self.pass_at_k, *self.pass_caret_k) + if k < 1 or k > self.num_trials + ] + if invalid: + raise ValueError("k values must be between 1 and num_trials") + + def to_dict(self) -> dict[str, Any]: + return { + "num_trials": self.num_trials, + "pass_at_k": list(self.pass_at_k), + "pass_caret_k": list(self.pass_caret_k), + "success_metric": self.success_metric, + } + + @dataclass(frozen=True) class JudgeSchemaDef: required_fields: tuple[str, ...] = tuple() @@ -387,6 +414,7 @@ class EvalSuiteDef: outcome_scorers: tuple[StateCheckGrader, ...] = tuple() reward_metrics: tuple[str, ...] = tuple() standard_metrics: tuple[str, ...] = tuple() + trial_policy: TrialPolicyDef = field(default_factory=TrialPolicyDef) trajectory_scorers: tuple[TrajectoryScorerDef, ...] = tuple() judge: JudgeCallable | None = None judge_backend: JudgeBackend | None = None @@ -637,6 +665,31 @@ def build_eval_dataset(cases: list[EvalCaseDef], target: dict[str, Any]) -> Eval return EvalDataset(eval_dataset_id=dataset_id, eval_dataset_name="suite_eval_dataset", eval_cases=eval_cases) +def _expand_trial_cases(cases: list[EvalCaseDef], trial_policy: TrialPolicyDef) -> list[EvalCaseDef]: + trial_policy.validate() + if trial_policy.num_trials == 1: + return cases + + expanded: list[EvalCaseDef] = [] + for case in cases: + for trial_index in range(1, trial_policy.num_trials + 1): + trial_id = f"{case.case_id}::trial-{trial_index}" + trial_metadata = { + "original_case_id": case.case_id, + "trial_index": trial_index, + "trial_id": trial_id, + } + expanded.append( + replace( + case, + case_id=trial_id, + input={**case.input, "_trial": trial_metadata}, + metadata={**case.metadata, "_trial": trial_metadata}, + ) + ) + return expanded + + def resolve_eval_harness(suite: EvalSuiteDef) -> EvalHarnessDef: if suite.harness is not None: return 
suite.harness @@ -792,7 +845,8 @@ def _validate_trajectory_scorer_def(scorer: TrajectoryScorerDef) -> None: def compile_evaluation_flow(flow: EvaluationFlowDef) -> CompiledEvaluationPlan: normalized_target = _normalize_target(flow.target) - dataset = build_eval_dataset(flow.suite.cases, normalized_target) + trial_cases = _expand_trial_cases(flow.suite.cases, flow.suite.trial_policy) + dataset = build_eval_dataset(trial_cases, normalized_target) harness = resolve_eval_harness(flow.suite) gate_policy = flow.suite.gate_policy or GatePolicyDef(metric_name="score", pass_threshold=0.0) score_bounds = _gate_metric_eval_bounds(gate_policy, "score") @@ -846,6 +900,96 @@ def _extract_metric_value_from_result_summary(summary: Mapping[str, Any], metric raise KeyError(f"metric {metric_name} is missing aggregate summary") +def _case_trial_metadata(case_result: Any) -> dict[str, Any]: + input_obj = getattr(case_result, "input", None) + case_data = getattr(input_obj, "case_data", {}) if input_obj is not None else {} + trial = case_data.get("_trial") if isinstance(case_data, Mapping) else None + return dict(trial or {}) + + +def _case_metric_value(case_result: Any, metric_name: str) -> Any: + for score_row in getattr(case_result, "score_rows", {}).values(): + metric_result = getattr(score_row, "metric_results", {}).get(metric_name) + if isinstance(metric_result, Mapping) and "value" in metric_result: + return metric_result["value"] + if metric_result is not None: + return metric_result + raise KeyError(metric_name) + + +def _metric_value_passed(value: Any) -> bool: + if isinstance(value, bool): + return value + if isinstance(value, (int, float)): + return float(value) > 0.0 + return bool(value) + + +def _summarize_binary_values(values: list[float]) -> dict[str, Any]: + if not values: + return {"mean": 0.0, "min": 0.0, "max": 0.0, "std": 0.0} + mean = sum(values) / len(values) + return { + "mean": mean, + "min": min(values), + "max": max(values), + "std": 0.0, + } + + +def 
_trial_base_success_metric(metric_name: str) -> str: + for marker in ("_pass@", "_pass^"): + if marker in metric_name: + return metric_name.split(marker, 1)[0] + return metric_name + + +def _apply_trial_metrics(eval_result: Any, suite: EvalSuiteDef, gate_policy: GatePolicyDef | None) -> dict[str, Any]: + policy = suite.trial_policy + if policy.num_trials == 1 and not policy.pass_at_k and not policy.pass_caret_k: + return { + "original_cases": len(eval_result.eval_case_results), + "trials_total": len(eval_result.eval_case_results), + } + + configured_metric = policy.success_metric or (gate_policy.primary_metric_name() if gate_policy else "score") + success_metric = _trial_base_success_metric(configured_metric) + groups: dict[str, list[Any]] = {} + for case_result in eval_result.eval_case_results: + trial = _case_trial_metadata(case_result) + original_case_id = trial.get("original_case_id") or case_result.eval_case_id + groups.setdefault(str(original_case_id), []).append(case_result) + + trial_metrics: dict[str, dict[str, Any]] = {} + for k in policy.pass_at_k: + values: list[float] = [] + for results in groups.values(): + ordered = sorted(results, key=lambda result: int(_case_trial_metadata(result).get("trial_index", 1))) + selected = ordered[:k] + passed = any(_metric_value_passed(_case_metric_value(result, success_metric)) for result in selected) + values.append(1.0 if passed else 0.0) + trial_metrics[f"{success_metric}_pass@{k}"] = _summarize_binary_values(values) + + for k in policy.pass_caret_k: + values = [] + for results in groups.values(): + ordered = sorted(results, key=lambda result: int(_case_trial_metadata(result).get("trial_index", 1))) + selected = ordered[:k] + passed = len(selected) >= k and all( + _metric_value_passed(_case_metric_value(result, success_metric)) + for result in selected + ) + values.append(1.0 if passed else 0.0) + trial_metrics[f"{success_metric}_pass^{k}"] = _summarize_binary_values(values) + + if trial_metrics: + 
eval_result.summary["trial_metrics"] = trial_metrics + return { + "original_cases": len(groups), + "trials_total": len(eval_result.eval_case_results), + } + + def _flatten_result_metrics(summary: Mapping[str, Any]) -> dict[str, Any]: metrics: dict[str, Any] = {} for scorer_summary in summary.values(): @@ -961,6 +1105,7 @@ def _build_state_metadata(output: Mapping[str, Any] | Any) -> dict[str, Any]: async def run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport: compiled = compile_evaluation_flow(flow) eval_result = await EvaluateRunner(config=compiled.eval_config).run() + trial_counts = _apply_trial_metrics(eval_result, compiled.suite, compiled.gate_policy) suite_summary = eval_result.summary.get(compiled.suite.suite_id, {}) gate_metrics = {} @@ -1028,6 +1173,7 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport: artifacts=_build_state_artifacts(case_result.output), metadata=_build_state_metadata(case_result.output), metric_details=case_metric_details, + trial=_case_trial_metadata(case_result), ) ) @@ -1063,6 +1209,8 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport: "approved": None, }, "suite_metadata": dict(compiled.suite.metadata), + "trial_policy": compiled.suite.trial_policy.to_dict(), + "trial_counts": trial_counts, }) judge_schema = compiled.suite.judge_schema.json_schema() if judge_schema: diff --git a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/implementation-plan.md b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/implementation-plan.md index a4255218f..8d4e7c1b3 100644 --- a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/implementation-plan.md +++ b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/implementation-plan.md @@ -23,7 +23,7 @@ ## Task 1: Trial Policy -- [ ] **Step 1: Write failing trial policy tests** +- [x] **Step 1: Write failing trial policy tests** Add tests in `tests/evaluations/test_evaluator_trials.py`: @@ -36,13 +36,13 @@ def 
test_trial_policy_rejects_invalid_k_values(): TrialPolicyDef(num_trials=2, pass_at_k=(3,)).validate() ``` -- [ ] **Step 2: Run test and confirm failure** +- [x] **Step 2: Run test and confirm failure** Run: `pytest tests/evaluations/test_evaluator_trials.py::test_trial_policy_rejects_invalid_k_values -q` Expected: FAIL because `TrialPolicyDef` does not exist. -- [ ] **Step 3: Implement `TrialPolicyDef`** +- [x] **Step 3: Implement `TrialPolicyDef`** Add a frozen dataclass in `aworld/evaluations/substrate.py`: @@ -62,7 +62,7 @@ class TrialPolicyDef: raise ValueError("k values must be between 1 and num_trials") ``` -- [ ] **Step 4: Run policy tests until green** +- [x] **Step 4: Run policy tests until green** Run: `pytest tests/evaluations/test_evaluator_trials.py -q` @@ -70,7 +70,7 @@ Expected: PASS for initial policy tests. ## Task 2: Trial Case Expansion -- [ ] **Step 1: Write failing expansion tests** +- [x] **Step 1: Write failing expansion tests** ```python def test_build_eval_dataset_expands_trial_cases(): @@ -87,17 +87,17 @@ def test_build_eval_dataset_expands_trial_cases(): assert compiled.dataset.eval_cases[0].case_data["_trial"]["trial_index"] == 1 ``` -- [ ] **Step 2: Run expansion test and confirm failure** +- [x] **Step 2: Run expansion test and confirm failure** Run: `pytest tests/evaluations/test_evaluator_trials.py::test_build_eval_dataset_expands_trial_cases -q` Expected: FAIL because `EvalSuiteDef` does not accept `trial_policy`. -- [ ] **Step 3: Implement trial expansion** +- [x] **Step 3: Implement trial expansion** Add `trial_policy: TrialPolicyDef = field(default_factory=TrialPolicyDef)` to `EvalSuiteDef`. Update `compile_evaluation_flow()` to expand `flow.suite.cases` before `build_eval_dataset()`, preserving `_trial` metadata. -- [ ] **Step 4: Run expansion tests until green** +- [x] **Step 4: Run expansion tests until green** Run: `pytest tests/evaluations/test_evaluator_trials.py -q` @@ -105,21 +105,21 @@ Expected: PASS. 
## Task 3: pass@k/pass^k Aggregation -- [ ] **Step 1: Write failing aggregation tests** +- [x] **Step 1: Write failing aggregation tests** Use a deterministic judge that passes trial 2 and fails trials 1/3. Assert `score_pass@2 == 1.0` and `score_pass^2 == 0.0`. -- [ ] **Step 2: Run aggregation test and confirm failure** +- [x] **Step 2: Run aggregation test and confirm failure** Run: `pytest tests/evaluations/test_evaluator_trials.py::test_run_evaluation_flow_reports_pass_at_k_and_pass_caret_k -q` Expected: FAIL because trial aggregation does not exist. -- [ ] **Step 3: Implement trial aggregation** +- [x] **Step 3: Implement trial aggregation** In `run_evaluation_flow()`, group case results by `_trial.original_case_id`, derive each trial pass/fail from `TrialPolicyDef.success_metric` or gate primary metric, then add aggregate metrics named `{success_metric}_pass@k` and `{success_metric}_pass^k`. -- [ ] **Step 4: Run aggregation tests until green** +- [x] **Step 4: Run aggregation tests until green** Run: `pytest tests/evaluations/test_evaluator_trials.py -q` @@ -127,21 +127,21 @@ Expected: PASS. ## Task 4: Retry Separation -- [ ] **Step 1: Write failing retry/trial separation test** +- [x] **Step 1: Write failing retry/trial separation test** Use a runtime harness wrapped in retry with `num_trials=2`. Assert report trial count is `2`, not the number of retry attempts, and pass@k counts terminal trial outcomes only. -- [ ] **Step 2: Run retry separation test and confirm failure** +- [x] **Step 2: Run retry separation test and confirm failure** Run: `pytest tests/evaluations/test_evaluator_trials.py::test_retry_attempts_do_not_count_as_trials -q` Expected: FAIL until trial grouping ignores retry child attempts. -- [ ] **Step 3: Preserve retry attempts as artifacts only** +- [x] **Step 3: Preserve retry attempts as artifacts only** Ensure trial aggregation reads only top-level trial results and never inspects `artifacts.attempts` as independent outcomes. 
-- [ ] **Step 4: Run retry separation tests until green** +- [x] **Step 4: Run retry separation tests until green** Run: `pytest tests/evaluations/test_evaluator_trials.py -q` @@ -149,17 +149,17 @@ Expected: PASS. ## Task 5: Report Shape and Compatibility -- [ ] **Step 1: Write failing report compatibility tests** +- [x] **Step 1: Write failing report compatibility tests** Assert one-trial `app-evaluator` style suites keep current required report fields, while multi-trial reports include `trial_policy`, `trial_counts`, and per-result trial metadata. -- [ ] **Step 2: Run report tests and confirm failure** +- [x] **Step 2: Run report tests and confirm failure** Run: `pytest tests/evaluations/test_evaluator_trials.py::test_multi_trial_report_exposes_trial_metadata -q` Expected: FAIL until report metadata is added. -- [ ] **Step 3: Add additive report fields** +- [x] **Step 3: Add additive report fields** Add report fields without changing existing required fields: @@ -168,7 +168,7 @@ report["trial_policy"] = {...} report["trial_counts"] = {"original_cases": n, "trials_total": m} ``` -- [ ] **Step 4: Run report tests until green** +- [x] **Step 4: Run report tests until green** Run: `pytest tests/evaluations/test_evaluator_trials.py tests/evaluations/test_evaluation_substrate.py -q` @@ -176,7 +176,7 @@ Expected: PASS. ## Task 6: Verification and Commit -- [ ] **Step 1: Run evaluator regression suite** +- [x] **Step 1: Run evaluator regression suite** Run: @@ -186,13 +186,13 @@ pytest tests/evaluations/test_execution_state.py tests/evaluations/test_executio Expected: PASS. 
-- [ ] **Step 2: Validate OpenSpec** +- [x] **Step 2: Validate OpenSpec** Run: `openspec validate aworld-evaluator-trials-passk-2026-06-10 --strict` Expected: `Change 'aworld-evaluator-trials-passk-2026-06-10' is valid` -- [ ] **Step 3: Commit** +- [x] **Step 3: Commit** ```bash git add aworld/evaluations/substrate.py aworld/evaluations/report.py tests/evaluations/test_evaluator_trials.py openspec/changes/aworld-evaluator-trials-passk-2026-06-10 diff --git a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/tasks.md b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/tasks.md index 28b80ebac..d9f181c5c 100644 --- a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/tasks.md +++ b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/tasks.md @@ -1,40 +1,40 @@ ## 1. Trial Policy Model -- [ ] 1.1 Add `TrialPolicyDef` with `num_trials`, `pass_at_k`, `pass_caret_k`, and `success_metric`. -- [ ] 1.2 Add `trial_policy` to `EvalSuiteDef` with a default one-trial policy. -- [ ] 1.3 Validate trial policy during flow compilation. -- [ ] 1.4 Preserve existing single-shot behavior when `num_trials == 1`. +- [x] 1.1 Add `TrialPolicyDef` with `num_trials`, `pass_at_k`, `pass_caret_k`, and `success_metric`. +- [x] 1.2 Add `trial_policy` to `EvalSuiteDef` with a default one-trial policy. +- [x] 1.3 Validate trial policy during flow compilation. +- [x] 1.4 Preserve existing single-shot behavior when `num_trials == 1`. ## 2. Trial Case Expansion -- [ ] 2.1 Expand each suite case into independent trial case rows when `num_trials > 1`. -- [ ] 2.2 Preserve `original_case_id`, `trial_index`, and `trial_id` in case metadata. -- [ ] 2.3 Ensure runtime-composed harnesses receive trial metadata without storing live handles. -- [ ] 2.4 Add tests for stable case grouping and trial metadata. +- [x] 2.1 Expand each suite case into independent trial case rows when `num_trials > 1`. +- [x] 2.2 Preserve `original_case_id`, `trial_index`, and `trial_id` in case metadata. 
+- [x] 2.3 Ensure runtime-composed harnesses receive trial metadata without storing live handles. +- [x] 2.4 Add tests for stable case grouping and trial metadata. ## 3. Trial Outcome Aggregation -- [ ] 3.1 Determine per-trial pass/fail from the configured `success_metric`. -- [ ] 3.2 Compute pass@k per original case. -- [ ] 3.3 Compute pass^k per original case. -- [ ] 3.4 Aggregate pass@k/pass^k rates across original cases. -- [ ] 3.5 Allow composite gates to reference trial aggregate metrics. +- [x] 3.1 Determine per-trial pass/fail from the configured `success_metric`. +- [x] 3.2 Compute pass@k per original case. +- [x] 3.3 Compute pass^k per original case. +- [x] 3.4 Aggregate pass@k/pass^k rates across original cases. +- [x] 3.5 Allow composite gates to reference trial aggregate metrics. ## 4. Retry / Trial Separation -- [ ] 4.1 Add tests proving retry child attempts do not increase trial count. -- [ ] 4.2 Ensure pass@k/pass^k uses the selected terminal rollout of each trial. -- [ ] 4.3 Preserve retry attempts only inside trial artifacts/metadata. +- [x] 4.1 Add tests proving retry child attempts do not increase trial count. +- [x] 4.2 Ensure pass@k/pass^k uses the selected terminal rollout of each trial. +- [x] 4.3 Preserve retry attempts only inside trial artifacts/metadata. ## 5. Report Shape -- [ ] 5.1 Add report-level trial policy metadata. -- [ ] 5.2 Add report-level trial count summaries. -- [ ] 5.3 Add per-case trial grouping or trial metadata sufficient to reconstruct groups. -- [ ] 5.4 Keep existing report schema compatible via additive fields. +- [x] 5.1 Add report-level trial policy metadata. +- [x] 5.2 Add report-level trial count summaries. +- [x] 5.3 Add per-case trial grouping or trial metadata sufficient to reconstruct groups. +- [x] 5.4 Keep existing report schema compatible via additive fields. ## 6. 
Verification -- [ ] 6.1 Add focused tests for trial policy validation, trial expansion, pass@k/pass^k aggregation, retry separation, and report shape. -- [ ] 6.2 Run evaluator regression tests. -- [ ] 6.3 Validate this OpenSpec change with `openspec validate aworld-evaluator-trials-passk-2026-06-10 --strict`. +- [x] 6.1 Add focused tests for trial policy validation, trial expansion, pass@k/pass^k aggregation, retry separation, and report shape. +- [x] 6.2 Run evaluator regression tests. +- [x] 6.3 Validate this OpenSpec change with `openspec validate aworld-evaluator-trials-passk-2026-06-10 --strict`. diff --git a/tests/evaluations/test_evaluator_trials.py b/tests/evaluations/test_evaluator_trials.py new file mode 100644 index 000000000..76e4020d0 --- /dev/null +++ b/tests/evaluations/test_evaluator_trials.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +import pytest + +from aworld.evaluations.runtime_composition import RetryRuntimeHarness, RolloutState +from aworld.evaluations.substrate import ( + EvalCaseDef, + EvalSuiteDef, + EvaluationFlowDef, + GateMetricCondition, + GatePolicyDef, + TrialPolicyDef, + compile_evaluation_flow, + run_evaluation_flow, +) +from aworld.evaluations.report import validate_evaluator_report + + +def test_trial_policy_rejects_invalid_k_values(): + with pytest.raises(ValueError, match="k values"): + TrialPolicyDef(num_trials=2, pass_at_k=(3,)).validate() + + +def test_build_eval_dataset_expands_trial_cases(): + suite = EvalSuiteDef( + suite_id="trial-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + trial_policy=TrialPolicyDef(num_trials=3), + ) + + compiled = compile_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite) + ) + + ids = [case.eval_case_id for case in compiled.dataset.eval_cases] + assert ids == ["case-1::trial-1", "case-1::trial-2", "case-1::trial-3"] + assert 
compiled.dataset.eval_cases[0].case_data["_trial"]["original_case_id"] == "case-1" + assert compiled.dataset.eval_cases[0].case_data["_trial"]["trial_index"] == 1 + + +@pytest.mark.asyncio +async def test_run_evaluation_flow_reports_pass_at_k_and_pass_caret_k(): + async def fake_judge(case_input, target): + trial_index = case_input["_trial"]["trial_index"] + return {"score": 1.0 if trial_index == 2 else 0.0} + + suite = EvalSuiteDef( + suite_id="trial-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + trial_policy=TrialPolicyDef( + num_trials=3, + pass_at_k=(2,), + pass_caret_k=(2,), + success_metric="score", + ), + gate_policy=GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="score_pass@2", op=">=", threshold=1.0),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite) + ) + + assert report["metrics"]["score_pass@2"]["mean"] == pytest.approx(1.0) + assert report["metrics"]["score_pass^2"]["mean"] == pytest.approx(0.0) + assert report["gate"]["status"] == "pass" + + +@pytest.mark.asyncio +async def test_trial_success_metric_defaults_from_trial_gate_metric_base_name(): + async def fake_judge(case_input, target): + return {"score": 1.0 if case_input["_trial"]["trial_index"] == 2 else 0.0} + + suite = EvalSuiteDef( + suite_id="trial-default-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + trial_policy=TrialPolicyDef(num_trials=2, pass_at_k=(2,)), + gate_policy=GatePolicyDef( + pass_all=(GateMetricCondition(metric_name="score_pass@2", op=">=", threshold=1.0),) + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite) + ) + + assert report["metrics"]["score_pass@2"]["mean"] == pytest.approx(1.0) + assert report["gate"]["status"] == "pass" + + +@pytest.mark.asyncio +async def 
test_retry_attempts_do_not_count_as_trials(): + class RetryInsideTrialHarness: + def __init__(self): + self.calls = 0 + + async def run_rollout(self, *, case, target): + self.calls += 1 + if self.calls % 2 == 1: + return RolloutState(case_id=case.case_id, status="failed", answer="failed-attempt") + return RolloutState(case_id=case.case_id, status="success", answer="passed-trial") + + async def fake_judge(case_input, target): + return {"score": 1.0 if target.get("answer") == "passed-trial" else 0.0} + + suite = EvalSuiteDef( + suite_id="retry-trial-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + runtime_harness=RetryRuntimeHarness( + base_harness=RetryInsideTrialHarness(), + max_attempts=2, + ), + judge=fake_judge, + trial_policy=TrialPolicyDef( + num_trials=2, + pass_at_k=(2,), + success_metric="score", + ), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite) + ) + + assert report["trial_counts"]["trials_total"] == 2 + assert report["metrics"]["score_pass@2"]["mean"] == pytest.approx(1.0) + assert len(report["results"][0]["artifacts"]["attempts"]) == 2 + + +@pytest.mark.asyncio +async def test_multi_trial_report_exposes_trial_metadata(): + async def fake_judge(case_input, target): + return {"score": 1.0} + + suite = EvalSuiteDef( + suite_id="trial-report-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + judge=fake_judge, + trial_policy=TrialPolicyDef(num_trials=2, pass_at_k=(2,), success_metric="score"), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite) + ) + + assert report["trial_policy"]["num_trials"] == 2 + assert report["trial_counts"] == {"original_cases": 1, "trials_total": 2} + assert report["results"][0]["trial"]["original_case_id"] == "case-1" + assert report["results"][0]["trial"]["trial_index"] == 1 + 
validate_evaluator_report(report.to_dict()) From e4ebc7c9bc4a5b94e9decd55b6db1f72d10b9f30 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 15:22:57 +0800 Subject: [PATCH 27/41] feat: add evaluator environment isolation --- aworld/evaluations/runtime_composition.py | 128 +++++++- .../.openspec.yaml | 2 + .../design.md | 96 ++++++ .../implementation-plan.md | 150 ++++++++++ .../proposal.md | 19 ++ .../specs/evaluation-substrate/spec.md | 58 ++++ .../tasks.md | 35 +++ .../evaluations/test_environment_isolation.py | 281 ++++++++++++++++++ 8 files changed, 768 insertions(+), 1 deletion(-) create mode 100644 openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/.openspec.yaml create mode 100644 openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/design.md create mode 100644 openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/implementation-plan.md create mode 100644 openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/proposal.md create mode 100644 openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/specs/evaluation-substrate/spec.md create mode 100644 openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/tasks.md create mode 100644 tests/evaluations/test_environment_isolation.py diff --git a/aworld/evaluations/runtime_composition.py b/aworld/evaluations/runtime_composition.py index 84e3ce002..74855512c 100644 --- a/aworld/evaluations/runtime_composition.py +++ b/aworld/evaluations/runtime_composition.py @@ -2,7 +2,7 @@ from __future__ import annotations import inspect -from dataclasses import dataclass, field +from dataclasses import dataclass, field, replace from typing import Any, Callable, Mapping, Protocol from aworld.evaluations.execution import EvalExecutionSpec, EvalState @@ -89,6 +89,20 @@ def to_dict(self) -> dict[str, Any]: } +@dataclass(frozen=True) +class EnvironmentSnapshot: + environment_id: str + trial_id: str | None = None + metadata: dict[str, Any] = 
field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return { + "environment_id": self.environment_id, + "trial_id": self.trial_id, + "metadata": _serializable_dict(self.metadata), + } + + def _resolve_path(source: Mapping[str, Any], path: tuple[str, ...]) -> Any: current: Any = source for part in path: @@ -251,6 +265,21 @@ async def run_rollout(self, *, case: Any, target: Mapping[str, Any]) -> RolloutS ... +class EnvironmentFixture(Protocol): + def reset(self, *, case: Any, target: Mapping[str, Any]) -> EnvironmentSnapshot | Mapping[str, Any]: + ... + + def cleanup( + self, + *, + snapshot: EnvironmentSnapshot, + case: Any, + target: Mapping[str, Any], + state: RolloutState, + ) -> EnvironmentSnapshot | Mapping[str, Any] | None: + ... + + class UserSimulator(Protocol): def next_turn( self, @@ -316,6 +345,103 @@ async def _maybe_await(value: Any) -> Any: return value +def _environment_snapshot_from(value: EnvironmentSnapshot | Mapping[str, Any], *, case: Any) -> EnvironmentSnapshot: + if isinstance(value, EnvironmentSnapshot): + return value + if not isinstance(value, Mapping): + raise TypeError("environment fixture must return EnvironmentSnapshot or mapping") + environment_id = value.get("environment_id") + if environment_id is None: + raise ValueError("environment snapshot requires environment_id") + case_input = _case_input(case) + trial = case_input.get("_trial") if isinstance(case_input.get("_trial"), Mapping) else {} + return EnvironmentSnapshot( + environment_id=str(environment_id), + trial_id=value.get("trial_id") or trial.get("trial_id"), + metadata=dict(value.get("metadata") or {}), + ) + + +def _case_with_environment(case: Any, snapshot: EnvironmentSnapshot) -> Any: + snapshot_dict = snapshot.to_dict() + case_input = _case_input(case) + metadata = getattr(case, "metadata", {}) + if not isinstance(metadata, Mapping): + metadata = {} + try: + return replace( + case, + input={**case_input, "_environment": snapshot_dict}, + 
metadata={**dict(metadata), "_environment": snapshot_dict}, + ) + except TypeError: + return case + + +class EnvironmentIsolatedRuntimeHarness: + def __init__(self, *, base_harness: RuntimeHarness, fixture: EnvironmentFixture): + self.base_harness = base_harness + self.fixture = fixture + + async def run_rollout(self, *, case: Any, target: Mapping[str, Any]) -> RolloutState: + reset_value = await _maybe_await(self.fixture.reset(case=case, target=target)) + snapshot = _environment_snapshot_from(reset_value, case=case) + snapshot_dict = snapshot.to_dict() + isolated_case = _case_with_environment(case, snapshot) + isolated_target = {**dict(target), "_environment": snapshot_dict} + + try: + state = await self.base_harness.run_rollout(case=isolated_case, target=isolated_target) + except Exception: + cleanup_state = RolloutState(case_id=str(getattr(case, "case_id", "case")), status="failed") + try: + await _maybe_await( + self.fixture.cleanup( + snapshot=snapshot, + case=isolated_case, + target=isolated_target, + state=cleanup_state, + ) + ) + except Exception: + pass + raise + + state.metadata = { + **state.metadata, + "environment": snapshot_dict, + } + try: + cleanup_value = await _maybe_await( + self.fixture.cleanup( + snapshot=snapshot, + case=isolated_case, + target=isolated_target, + state=state, + ) + ) + except Exception as exc: + state.status = "failed" + state.error = { + "type": exc.__class__.__name__, + "message": str(exc), + "phase": "environment_cleanup", + } + state.metadata = { + **state.metadata, + "environment_cleanup_error": dict(state.error), + } + return state + + if cleanup_value is not None: + cleanup_snapshot = _environment_snapshot_from(cleanup_value, case=isolated_case) + state.metadata = { + **state.metadata, + "environment_cleanup": cleanup_snapshot.to_dict(), + } + return state + + class CallableRuntimeHarness: def __init__( self, diff --git a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/.openspec.yaml 
b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/.openspec.yaml new file mode 100644 index 000000000..2cb80411e --- /dev/null +++ b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-06-10 diff --git a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/design.md b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/design.md new file mode 100644 index 000000000..92a376619 --- /dev/null +++ b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/design.md @@ -0,0 +1,96 @@ +## Context + +The evaluator roadmap now has three layers in place: + +- rollout-owning runtime harnesses with serializable `RolloutState` +- deterministic outcome/state-check graders +- independent trials with pass@k/pass^k aggregation + +The remaining correctness gap is trial independence. Re-running a case without resetting environment state can inflate pass rates or hide regressions. This change adds the environment lifecycle boundary that lets a suite declare reset semantics without embedding live handles in suite state or report state. + +## Goals / Non-Goals + +**Goals:** + +- Add a trusted environment fixture protocol for reset and cleanup. +- Provide a runtime-harness wrapper that applies the fixture around each rollout. +- Ensure each expanded trial receives its own environment metadata. +- Record environment reset/cleanup metadata in serializable rollout/report state. +- Clean up after failed rollouts or raised exceptions where possible. +- Keep retry attempts inside a single environment reset unless the suite explicitly wraps retry differently. + +**Non-Goals:** + +- Running shell commands, test commands, or arbitrary workflow engines. +- Providing a production sandbox/container implementation. +- Managing external databases or filesystem snapshots directly. +- Supporting untrusted suite manifests for environment fixture references. 
+- Adding LLM-backed adaptive user simulators or training/optimizer integration. + +## Decisions + +### 1. Add a trusted fixture lifecycle + +Define a small in-process contract: + +- `reset(case, target) -> EnvironmentSnapshot` +- `cleanup(snapshot, case, target, state) -> EnvironmentSnapshot | None` + +The fixture is trusted Python code supplied by the suite author, not a declared JSON manifest capability. Returned metadata must be serializable. Live clients, file handles, subprocesses, and credentials must not be retained in rollout state. + +### 2. Represent reset output as serializable environment snapshot + +Add `EnvironmentSnapshot` with: + +- `environment_id` +- `trial_id` +- `metadata` + +The snapshot is injected into: + +- `case.input["_environment"]` +- `case.metadata["_environment"]` +- `target["_environment"]` +- `RolloutState.metadata["environment"]` + +This lets the base harness find a workspace id, database schema id, or seed without coupling to a concrete sandbox implementation. + +### 3. Use wrapper composition instead of changing every harness + +Add `EnvironmentIsolatedRuntimeHarness(base_harness, fixture)`. The wrapper owns reset and cleanup around exactly one call to `base_harness.run_rollout`. + +For multi-trial suites, case expansion already creates one case row per trial, so the wrapper naturally runs one reset per trial. For retry suites, the recommended composition is: + +- `EnvironmentIsolatedRuntimeHarness(RetryRuntimeHarness(base))`: one environment per trial, retry attempts share that trial environment. + +If a suite intentionally needs one environment per retry attempt, it can wrap in the opposite order: + +- `RetryRuntimeHarness(EnvironmentIsolatedRuntimeHarness(base))` + +### 4. Fail closed on lifecycle errors + +If reset fails, the rollout should not run. 
If cleanup fails after a successful rollout, the terminal state should record cleanup failure metadata and mark the state failed only if the fixture declares cleanup failure as fatal. The first implementation keeps cleanup failure fatal by default to avoid silently reporting contaminated environments as clean. + +If the base harness raises, the wrapper must still attempt cleanup and then re-raise the original error unless cleanup failure is the only error. + +## Risks / Trade-offs + +- [False sandbox confidence] -> Mitigation: name the feature environment fixture lifecycle, not production sandboxing, and document sandbox adapters as future work. +- [Live handle leakage] -> Mitigation: serialize snapshots through existing serializable filtering before storing state. +- [Retry/trial confusion] -> Mitigation: document wrapper order and add tests proving one reset per trial when retry is inside isolation. +- [Cleanup masking rollout errors] -> Mitigation: preserve original rollout exception when both rollout and cleanup fail. + +## Migration Plan + +1. Add environment snapshot and fixture protocol primitives. +2. Add environment-isolated runtime harness wrapper. +3. Inject serializable environment metadata into case, target, and rollout state. +4. Add trial integration tests proving one reset per trial. +5. Add failure-path tests for cleanup on raised rollout. +6. Keep existing suites unchanged unless they opt into the wrapper. + +## Deferred Questions + +- Concrete filesystem/database/container adapters should be handled in a later environment-adapter change. +- LLM-backed adaptive user simulation remains a simulator-focused follow-up. +- Training/optimizer integration should wait until environment isolation and trial metrics stabilize. 
diff --git a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/implementation-plan.md b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/implementation-plan.md new file mode 100644 index 000000000..141743ad4 --- /dev/null +++ b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/implementation-plan.md @@ -0,0 +1,150 @@ +# AWorld Evaluator Environment Isolation Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add trusted per-rollout environment reset/cleanup lifecycle support for runtime-composed evaluator suites. + +**Architecture:** Extend `runtime_composition.py` with serializable environment snapshots, a fixture protocol, and a wrapper harness. The wrapper resets before one base rollout, injects environment metadata into case/target context, cleans up afterward, and records lifecycle metadata in `RolloutState` without exposing live handles. + +**Tech Stack:** Python dataclasses/protocols, existing runtime harness and trial substrate, pytest, OpenSpec. + +--- + +## File Structure + +- Modify: `aworld/evaluations/runtime_composition.py` + Add environment snapshot, fixture protocol, wrapper harness, context injection, and cleanup semantics. +- Test: `tests/evaluations/test_environment_isolation.py` + Focused TDD coverage for reset/cleanup, trial integration, retry composition, failure cleanup, and report metadata. +- Validate: `openspec/changes/aworld-evaluator-environment-isolation-2026-06-10` + Keep tasks/spec/design aligned with implementation. 
+ +## Task 1: Environment Snapshot And Fixture + +- [x] **Step 1: Write failing snapshot serialization test** + +Create `tests/evaluations/test_environment_isolation.py`: + +```python +from aworld.evaluations.runtime_composition import EnvironmentSnapshot + + +def test_environment_snapshot_excludes_live_handles(): + snapshot = EnvironmentSnapshot( + environment_id="env-1", + trial_id="case-1::trial-1", + metadata={"workspace": "/tmp/demo", "client": object()}, + ) + + assert snapshot.to_dict() == { + "environment_id": "env-1", + "trial_id": "case-1::trial-1", + "metadata": {"workspace": "/tmp/demo"}, + } +``` + +- [x] **Step 2: Run test and confirm failure** + +Run: `pytest tests/evaluations/test_environment_isolation.py::test_environment_snapshot_excludes_live_handles -q` + +Expected: FAIL because `EnvironmentSnapshot` does not exist. + +- [x] **Step 3: Implement `EnvironmentSnapshot`** + +Add a frozen dataclass in `aworld/evaluations/runtime_composition.py` with `to_dict()` that uses `_serializable_dict()`. + +- [x] **Step 4: Run snapshot test until green** + +Run: `pytest tests/evaluations/test_environment_isolation.py::test_environment_snapshot_excludes_live_handles -q` + +Expected: PASS. + +## Task 2: Reset/Cleanup Wrapper + +- [x] **Step 1: Write failing reset and cleanup test** + +Add a test where a fixture records `reset` and `cleanup`, and the base harness asserts `_environment` exists in both case and target. + +- [x] **Step 2: Run wrapper test and confirm failure** + +Run: `pytest tests/evaluations/test_environment_isolation.py::test_environment_isolated_harness_resets_and_cleans_up -q` + +Expected: FAIL because `EnvironmentIsolatedRuntimeHarness` does not exist. + +- [x] **Step 3: Implement wrapper harness** + +Add `EnvironmentFixture` protocol and `EnvironmentIsolatedRuntimeHarness`. Use `_maybe_await()` for sync/async fixture hooks. Inject snapshot dictionaries into copied case input/metadata and copied target. 
+ +- [x] **Step 4: Run wrapper tests until green** + +Run: `pytest tests/evaluations/test_environment_isolation.py -q` + +Expected: PASS for initial wrapper tests. + +## Task 3: Trial And Retry Semantics + +- [x] **Step 1: Write failing trial reset count test** + +Use `EvalSuiteDef(trial_policy=TrialPolicyDef(num_trials=2))` with `EnvironmentIsolatedRuntimeHarness`. Assert two resets, two cleanups, and distinct trial ids. + +- [x] **Step 2: Run test and confirm failure** + +Run: `pytest tests/evaluations/test_environment_isolation.py::test_environment_isolation_resets_once_per_trial -q` + +Expected: FAIL until wrapper metadata flows through expanded trial cases. + +- [x] **Step 3: Write retry-inside-isolation test** + +Compose `EnvironmentIsolatedRuntimeHarness(base_harness=RetryRuntimeHarness(...))`. Assert reset count equals trial count, not retry attempt count. + +- [x] **Step 4: Run trial/retry tests until green** + +Run: `pytest tests/evaluations/test_environment_isolation.py tests/evaluations/test_evaluator_trials.py -q` + +Expected: PASS. + +## Task 4: Failure Cleanup + +- [x] **Step 1: Write failing cleanup-on-rollout-error test** + +Create a base harness that raises after reset. Assert cleanup is attempted and the original rollout exception is raised. + +- [x] **Step 2: Implement failure cleanup path** + +Wrap base rollout execution in `try/except/finally`. Preserve original exception when cleanup also fails. + +- [x] **Step 3: Write cleanup-failure-after-success test** + +Create a cleanup hook that raises after a successful rollout. Assert the returned state has `status == "failed"` and environment cleanup error metadata. + +- [x] **Step 4: Run failure tests until green** + +Run: `pytest tests/evaluations/test_environment_isolation.py -q` + +Expected: PASS. 
+ +## Task 5: Verification And Commit + +- [x] **Step 1: Run evaluator regression suite** + +Run: + +```bash +pytest tests/evaluations/test_environment_isolation.py tests/evaluations/test_runtime_composition.py tests/evaluations/test_evaluator_trials.py tests/evaluations/test_evaluation_substrate.py tests/core/test_evaluator_runtime.py -q +``` + +Expected: PASS. + +- [x] **Step 2: Validate OpenSpec** + +Run: `openspec validate aworld-evaluator-environment-isolation-2026-06-10 --strict` + +Expected: `Change 'aworld-evaluator-environment-isolation-2026-06-10' is valid` + +- [x] **Step 3: Commit** + +```bash +git add aworld/evaluations/runtime_composition.py tests/evaluations/test_environment_isolation.py +git add -f openspec/changes/aworld-evaluator-environment-isolation-2026-06-10 +git commit -m "feat: add evaluator environment isolation" +``` diff --git a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/proposal.md b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/proposal.md new file mode 100644 index 000000000..60a625216 --- /dev/null +++ b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/proposal.md @@ -0,0 +1,19 @@ +# AWorld Evaluator Environment Isolation + +## Why + +Runtime composition can now run one rollout and trial policy can repeat cases for pass@k/pass^k. Those trials are not truly independent if they share filesystem, database, service, or in-memory state. Evaluator users need a framework-owned reset lifecycle so every trial can start from a declared clean environment and record enough metadata to audit the reset. + +## What Changes + +- Add a trusted environment fixture contract for setup/reset/cleanup around each runtime-composed rollout. +- Add a wrapper harness that runs environment reset before the base harness and cleanup after the terminal rollout. +- Inject serializable environment metadata into the case/target visible to the base harness. 
+- Preserve environment metadata in rollout state, evaluator state, and report artifacts/metadata. +- Keep real sandbox/container/database adapters out of scope; this change defines the lifecycle and trusted in-process contract. + +## Impact + +- Affected code: `aworld/evaluations/runtime_composition.py`, runtime-composed substrate paths, report metadata through existing state serialization. +- Affected tests: add focused coverage for reset-per-trial, retry separation, cleanup-on-failure, and report metadata. +- Follow-ups: concrete filesystem/database/container environment fixtures and LLM-backed user simulators remain separate changes. diff --git a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/specs/evaluation-substrate/spec.md b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/specs/evaluation-substrate/spec.md new file mode 100644 index 000000000..3bf2a61e8 --- /dev/null +++ b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/specs/evaluation-substrate/spec.md @@ -0,0 +1,58 @@ +## MODIFIED Requirements + +### Requirement: Environment lifecycle for runtime evaluation + +Runtime-composed evaluation flows SHALL support an opt-in trusted environment lifecycle that resets environment state before a rollout and cleans it up afterward. 
+ +#### Scenario: Runtime harness uses environment fixture +- **WHEN** a runtime-composed suite wraps its harness with an environment fixture +- **THEN** the framework SHALL call the fixture reset hook before executing the base rollout +- **AND** the framework SHALL call the fixture cleanup hook after the base rollout finishes + +#### Scenario: Environment metadata is serializable +- **WHEN** a fixture returns environment metadata +- **THEN** the framework SHALL preserve only serializable metadata in rollout state, evaluator state, and reports +- **AND** the framework SHALL exclude live handles such as clients, file handles, subprocesses, and credentials from serialized state + +#### Scenario: Base harness needs environment context +- **WHEN** environment reset succeeds +- **THEN** the framework SHALL expose the environment snapshot to the base harness through case input, case metadata, and target metadata + +### Requirement: Environment isolation across trials + +Trial-based evaluation SHALL be able to reset environment state independently for each trial. + +#### Scenario: Multi-trial evaluation uses environment isolation +- **WHEN** a suite declares multiple trials and wraps its runtime harness with environment isolation +- **THEN** each expanded trial SHALL receive a distinct reset lifecycle + +#### Scenario: Retry runs inside one isolated trial +- **WHEN** a suite composes retry inside the environment-isolated harness +- **THEN** retry attempts SHALL share one environment reset for that trial +- **AND** retry attempts SHALL NOT increase environment reset count + +### Requirement: Environment lifecycle failure handling + +Environment lifecycle handling SHALL fail closed and preserve cleanup attempts. 
+ +#### Scenario: Reset fails +- **WHEN** an environment reset hook fails +- **THEN** the framework SHALL NOT execute the base rollout +- **AND** the evaluation SHALL surface the reset error through the normal runtime error path + +#### Scenario: Rollout fails +- **WHEN** the base rollout raises after reset succeeds +- **THEN** the framework SHALL attempt cleanup +- **AND** the framework SHALL preserve the original rollout error if cleanup also fails + +#### Scenario: Cleanup fails after rollout success +- **WHEN** cleanup fails after the base rollout returns a terminal state +- **THEN** the framework SHALL mark the rollout state failed and record cleanup error metadata unless the fixture explicitly declares cleanup failure non-fatal + +### Requirement: Sandbox execution remains deferred + +Environment lifecycle support SHALL define trusted reset/cleanup boundaries without introducing untrusted sandbox command execution. + +#### Scenario: Suite requests command-backed sandbox reset +- **WHEN** a suite requires shell commands, container lifecycle, workflow engines, database snapshotting, or filesystem reset +- **THEN** this change SHALL treat that as adapter-specific future work rather than executing arbitrary commands in the evaluator substrate diff --git a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/tasks.md b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/tasks.md new file mode 100644 index 000000000..a9512a44c --- /dev/null +++ b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/tasks.md @@ -0,0 +1,35 @@ +## 1. Environment Fixture Primitives + +- [x] 1.1 Add `EnvironmentSnapshot` with serializable `environment_id`, `trial_id`, and metadata. +- [x] 1.2 Add an `EnvironmentFixture` protocol with async-compatible `reset` and `cleanup`. +- [x] 1.3 Add validation/serialization helpers that exclude live handles from snapshot metadata. + +## 2. 
Runtime Harness Wrapper + +- [x] 2.1 Add `EnvironmentIsolatedRuntimeHarness`. +- [x] 2.2 Reset before exactly one base rollout. +- [x] 2.3 Inject snapshot metadata into case input, case metadata, and target. +- [x] 2.4 Add cleanup after rollout and preserve cleanup metadata in rollout state. + +## 3. Trial And Retry Semantics + +- [x] 3.1 Prove multi-trial suites reset once per trial. +- [x] 3.2 Prove retry attempts do not increase reset count when retry is inside environment isolation. +- [x] 3.3 Document wrapper-order semantics for one-environment-per-trial versus one-environment-per-attempt. + +## 4. Failure Semantics + +- [x] 4.1 Attempt cleanup when the base harness raises. +- [x] 4.2 Preserve the original rollout exception when rollout and cleanup both fail. +- [x] 4.3 Record cleanup failure metadata on terminal rollout state when cleanup fails after rollout success. + +## 5. Report Shape + +- [x] 5.1 Ensure environment metadata appears through existing state metadata/artifacts. +- [x] 5.2 Keep report schema additive and compatible. + +## 6. Verification + +- [x] 6.1 Add focused tests for reset, cleanup, trial count, retry composition, failure cleanup, and report metadata. +- [x] 6.2 Run evaluator regression tests. +- [x] 6.3 Validate this OpenSpec change with `openspec validate aworld-evaluator-environment-isolation-2026-06-10 --strict`. 
diff --git a/tests/evaluations/test_environment_isolation.py b/tests/evaluations/test_environment_isolation.py new file mode 100644 index 000000000..51f052ff1 --- /dev/null +++ b/tests/evaluations/test_environment_isolation.py @@ -0,0 +1,281 @@ +from __future__ import annotations + +import pytest + +from aworld.evaluations.runtime_composition import ( + EnvironmentIsolatedRuntimeHarness, + EnvironmentSnapshot, + RetryRuntimeHarness, + RolloutState, +) +from aworld.evaluations.substrate import ( + EvalCaseDef, + EvalSuiteDef, + EvaluationFlowDef, + TrialPolicyDef, + run_evaluation_flow, +) + + +def test_environment_snapshot_excludes_live_handles(): + snapshot = EnvironmentSnapshot( + environment_id="env-1", + trial_id="case-1::trial-1", + metadata={"workspace": "/tmp/demo", "client": object()}, + ) + + assert snapshot.to_dict() == { + "environment_id": "env-1", + "trial_id": "case-1::trial-1", + "metadata": {"workspace": "/tmp/demo"}, + } + + +@pytest.mark.asyncio +async def test_environment_isolated_harness_resets_and_cleans_up(): + events = [] + + class RecordingFixture: + async def reset(self, *, case, target): + events.append(("reset", case.case_id)) + return EnvironmentSnapshot( + environment_id="env-1", + trial_id=case.input["_trial"]["trial_id"], + metadata={"workspace": "/tmp/demo", "client": object()}, + ) + + async def cleanup(self, *, snapshot, case, target, state): + events.append(("cleanup", snapshot.environment_id, state.status)) + return EnvironmentSnapshot( + environment_id=snapshot.environment_id, + trial_id=snapshot.trial_id, + metadata={"cleaned": True}, + ) + + class InspectingHarness: + async def run_rollout(self, *, case, target): + assert case.input["_environment"]["environment_id"] == "env-1" + assert case.metadata["_environment"]["trial_id"] == "case-1::trial-1" + assert target["_environment"]["metadata"]["workspace"] == "/tmp/demo" + return RolloutState(case_id=case.case_id, status="success", answer="ok") + + harness = 
EnvironmentIsolatedRuntimeHarness( + base_harness=InspectingHarness(), + fixture=RecordingFixture(), + ) + case = EvalCaseDef( + case_id="case-1::trial-1", + input={"_trial": {"trial_id": "case-1::trial-1"}}, + ) + + state = await harness.run_rollout(case=case, target={}) + + assert events == [("reset", "case-1::trial-1"), ("cleanup", "env-1", "success")] + assert state.metadata["environment"]["environment_id"] == "env-1" + assert state.metadata["environment_cleanup"]["metadata"]["cleaned"] is True + + +@pytest.mark.asyncio +async def test_environment_isolation_resets_once_per_trial(): + class CountingFixture: + def __init__(self): + self.resets = [] + self.cleanups = [] + + async def reset(self, *, case, target): + trial_id = case.input["_trial"]["trial_id"] + self.resets.append(trial_id) + return EnvironmentSnapshot( + environment_id=f"env-{len(self.resets)}", + trial_id=trial_id, + metadata={"trial_id": trial_id}, + ) + + async def cleanup(self, *, snapshot, case, target, state): + self.cleanups.append(snapshot.trial_id) + return EnvironmentSnapshot( + environment_id=snapshot.environment_id, + trial_id=snapshot.trial_id, + metadata={"cleaned": True}, + ) + + class EnvironmentAwareHarness: + async def run_rollout(self, *, case, target): + environment_id = case.input["_environment"]["environment_id"] + return RolloutState( + case_id=case.case_id, + status="success", + answer=environment_id, + ) + + async def fake_judge(case_input, target): + return {"score": 1.0} + + fixture = CountingFixture() + suite = EvalSuiteDef( + suite_id="environment-trial-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + runtime_harness=EnvironmentIsolatedRuntimeHarness( + base_harness=EnvironmentAwareHarness(), + fixture=fixture, + ), + judge=fake_judge, + trial_policy=TrialPolicyDef(num_trials=2), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite) + ) + + assert 
fixture.resets == ["case-1::trial-1", "case-1::trial-2"] + assert fixture.cleanups == ["case-1::trial-1", "case-1::trial-2"] + assert report["results"][0]["metadata"]["environment"]["environment_id"] == "env-1" + assert report["results"][1]["metadata"]["environment"]["environment_id"] == "env-2" + + +@pytest.mark.asyncio +async def test_retry_inside_environment_isolation_does_not_increase_reset_count(): + class CountingFixture: + def __init__(self): + self.reset_count = 0 + + async def reset(self, *, case, target): + self.reset_count += 1 + return EnvironmentSnapshot(environment_id=f"env-{self.reset_count}") + + async def cleanup(self, *, snapshot, case, target, state): + return None + + class FlakyHarness: + def __init__(self): + self.calls = 0 + + async def run_rollout(self, *, case, target): + self.calls += 1 + if self.calls % 2 == 1: + return RolloutState(case_id=case.case_id, status="failed", answer="failed-attempt") + return RolloutState(case_id=case.case_id, status="success", answer="passed-trial") + + async def fake_judge(case_input, target): + return {"score": 1.0 if target.get("answer") == "passed-trial" else 0.0} + + fixture = CountingFixture() + suite = EvalSuiteDef( + suite_id="environment-retry-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], + runtime_harness=EnvironmentIsolatedRuntimeHarness( + base_harness=RetryRuntimeHarness(base_harness=FlakyHarness(), max_attempts=2), + fixture=fixture, + ), + judge=fake_judge, + trial_policy=TrialPolicyDef(num_trials=2), + ) + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite) + ) + + assert fixture.reset_count == 2 + assert len(report["results"][0]["artifacts"]["attempts"]) == 2 + assert len(report["results"][1]["artifacts"]["attempts"]) == 2 + + +@pytest.mark.asyncio +async def test_environment_cleanup_runs_when_rollout_raises(): + events = [] + + class RecordingFixture: + async def reset(self, *, case, 
target): + events.append("reset") + return EnvironmentSnapshot(environment_id="env-1") + + async def cleanup(self, *, snapshot, case, target, state): + events.append(("cleanup", state.status)) + return None + + class RaisingHarness: + async def run_rollout(self, *, case, target): + raise RuntimeError("rollout boom") + + harness = EnvironmentIsolatedRuntimeHarness( + base_harness=RaisingHarness(), + fixture=RecordingFixture(), + ) + + with pytest.raises(RuntimeError, match="rollout boom"): + await harness.run_rollout(case=EvalCaseDef(case_id="case-1", input={}), target={}) + + assert events == ["reset", ("cleanup", "failed")] + + +@pytest.mark.asyncio +async def test_reset_failure_prevents_rollout_execution(): + class FailingResetFixture: + async def reset(self, *, case, target): + raise RuntimeError("reset boom") + + async def cleanup(self, *, snapshot, case, target, state): + raise AssertionError("cleanup should not run when reset fails") + + class UnexpectedHarness: + async def run_rollout(self, *, case, target): + raise AssertionError("rollout should not run when reset fails") + + harness = EnvironmentIsolatedRuntimeHarness( + base_harness=UnexpectedHarness(), + fixture=FailingResetFixture(), + ) + + with pytest.raises(RuntimeError, match="reset boom"): + await harness.run_rollout(case=EvalCaseDef(case_id="case-1", input={}), target={}) + + +@pytest.mark.asyncio +async def test_cleanup_failure_during_rollout_error_preserves_rollout_error(): + class FailingCleanupFixture: + async def reset(self, *, case, target): + return EnvironmentSnapshot(environment_id="env-1") + + async def cleanup(self, *, snapshot, case, target, state): + raise RuntimeError("cleanup boom") + + class RaisingHarness: + async def run_rollout(self, *, case, target): + raise RuntimeError("rollout boom") + + harness = EnvironmentIsolatedRuntimeHarness( + base_harness=RaisingHarness(), + fixture=FailingCleanupFixture(), + ) + + with pytest.raises(RuntimeError, match="rollout boom"): + await 
harness.run_rollout(case=EvalCaseDef(case_id="case-1", input={}), target={}) + + +@pytest.mark.asyncio +async def test_cleanup_failure_after_success_marks_rollout_failed(): + class FailingCleanupFixture: + async def reset(self, *, case, target): + return EnvironmentSnapshot(environment_id="env-1") + + async def cleanup(self, *, snapshot, case, target, state): + raise RuntimeError("cleanup boom") + + class PassingHarness: + async def run_rollout(self, *, case, target): + return RolloutState(case_id=case.case_id, status="success", answer="ok") + + harness = EnvironmentIsolatedRuntimeHarness( + base_harness=PassingHarness(), + fixture=FailingCleanupFixture(), + ) + + state = await harness.run_rollout(case=EvalCaseDef(case_id="case-1", input={}), target={}) + + assert state.status == "failed" + assert state.error == { + "type": "RuntimeError", + "message": "cleanup boom", + "phase": "environment_cleanup", + } + assert state.metadata["environment_cleanup_error"]["message"] == "cleanup boom" From caee5e5e1733a63b099e68822ada3c654cccf9dd Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 15:27:25 +0800 Subject: [PATCH 28/41] feat: add adaptive evaluator user simulator --- aworld/evaluations/runtime_composition.py | 57 +++++++++- .../.openspec.yaml | 2 + .../design.md | 77 +++++++++++++ .../implementation-plan.md | 94 ++++++++++++++++ .../proposal.md | 19 ++++ .../specs/evaluation-substrate/spec.md | 34 ++++++ .../tasks.md | 17 +++ tests/evaluations/test_llm_user_simulator.py | 104 ++++++++++++++++++ 8 files changed, 399 insertions(+), 5 deletions(-) create mode 100644 openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/.openspec.yaml create mode 100644 openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/design.md create mode 100644 openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/implementation-plan.md create mode 100644 openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/proposal.md create mode 100644 
openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/specs/evaluation-substrate/spec.md create mode 100644 openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/tasks.md create mode 100644 tests/evaluations/test_llm_user_simulator.py diff --git a/aworld/evaluations/runtime_composition.py b/aworld/evaluations/runtime_composition.py index 74855512c..56c290823 100644 --- a/aworld/evaluations/runtime_composition.py +++ b/aworld/evaluations/runtime_composition.py @@ -339,6 +339,51 @@ def next_turn( return RolloutTurn(role="user", content=content) +class LLMUserSimulator: + def __init__(self, *, turn_generator: Callable[..., Any]): + self.turn_generator = turn_generator + + def next_turn( + self, + *, + case: Any, + target: Mapping[str, Any], + state: RolloutState, + last_output: Any | None = None, + ) -> RolloutTurn | None | Any: + user_turn_count = sum(1 for turn in state.turns if turn.role == "user") + generated = self.turn_generator( + case=case, + target=target, + state=state, + last_output=last_output, + turn_index=user_turn_count, + ) + if inspect.isawaitable(generated): + return self._await_turn(generated) + return self._normalize_turn(generated) + + async def _await_turn(self, generated: Any) -> RolloutTurn | None: + return self._normalize_turn(await generated) + + def _normalize_turn(self, generated: Any) -> RolloutTurn | None: + if generated is None: + return None + if isinstance(generated, RolloutTurn): + return generated + if isinstance(generated, str): + return RolloutTurn(role="user", content=generated) + if isinstance(generated, Mapping): + if generated.get("stop") is True: + return None + return RolloutTurn( + role=str(generated.get("role", "user")), + content=generated.get("content"), + metadata=dict(generated.get("metadata") or {}), + ) + raise TypeError("LLMUserSimulator generator must return str, mapping, RolloutTurn, or None") + + async def _maybe_await(value: Any) -> Any: if inspect.isawaitable(value): return await value @@ 
-459,11 +504,13 @@ async def run_rollout(self, *, case: Any, target: Mapping[str, Any]) -> RolloutS state = RolloutState(case_id=str(case_id)) last_output: Any | None = None for _ in range(self.max_turns): - user_turn = self.simulator.next_turn( - case=case, - target=target, - state=state, - last_output=last_output, + user_turn = await _maybe_await( + self.simulator.next_turn( + case=case, + target=target, + state=state, + last_output=last_output, + ) ) if user_turn is None: break diff --git a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/.openspec.yaml b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/.openspec.yaml new file mode 100644 index 000000000..2cb80411e --- /dev/null +++ b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-06-10 diff --git a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/design.md b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/design.md new file mode 100644 index 000000000..38b88c158 --- /dev/null +++ b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/design.md @@ -0,0 +1,77 @@ +## Context + +Runtime composition currently includes: + +- `ScriptedUserSimulator` for fixed user turns +- `SinglePromptUserSimulator` for one-shot prompts +- `CallableRuntimeHarness` for multi-turn rollout execution + +This is enough for deterministic tests but not for adaptive dialog evaluation. A realistic user simulator should inspect the current conversation, the last assistant output, and case goal before deciding whether to continue, clarify, challenge, or stop. + +## Goals / Non-Goals + +**Goals:** + +- Add a provider-agnostic adaptive simulator class. +- Support sync and async simulator generation. +- Keep generated turns serializable and report-safe. +- Give the generator enough structured context to implement LLM-backed user behavior. 
+- Preserve existing scripted and single-prompt behavior. + +**Non-Goals:** + +- Adding a concrete LLM provider client. +- Storing live model clients, credentials, or API responses in suite/report state. +- Adding training or optimizer integration. +- Replacing deterministic scripted simulators. + +## Decisions + +### 1. Use injected generator callable + +`LLMUserSimulator` accepts a `turn_generator` callable. The callable receives: + +- `case` +- `target` +- `state` +- `last_output` +- `turn_index` + +It may return: + +- `str`: user content +- `RolloutTurn`: full turn +- `Mapping`: `{"content": "...", "metadata": {...}}` +- `Mapping` with `{"stop": True}` or `None`: stop conversation + +This keeps provider integration outside the substrate while making the runtime API ready for LLM-backed adapters. + +### 2. Await simulator outputs in the harness + +`CallableRuntimeHarness` should call `await _maybe_await(simulator.next_turn(...))`. Existing sync simulators keep working, and async LLM-backed simulators become first-class. + +### 3. Keep metadata serializable + +Generated mapping metadata is filtered through existing `RolloutTurn.to_dict()` serialization. Live clients remain in the simulator instance, not in `RolloutState`. + +### 4. Stop behavior is explicit + +The simulator returns `None` or `{"stop": True}` to end the rollout. This keeps max-turn enforcement in `CallableRuntimeHarness` and stop-decision semantics in the simulator. + +## Risks / Trade-offs + +- [Provider ambiguity] -> Mitigation: this change is adapter-ready but provider-neutral. +- [Non-determinism in tests] -> Mitigation: tests use deterministic fake generators. +- [Live handle leakage] -> Mitigation: turn metadata is serialized through existing filtering; simulator internals are never copied into state. + +## Migration Plan + +1. Add async simulator support to `CallableRuntimeHarness`. +2. Add `LLMUserSimulator`. +3. Add tests for string, mapping, turn, stop, and async generation behavior. +4. 
Keep existing simulator tests green. + +## Deferred Questions + +- Concrete provider adapters should be separate changes. +- Training/optimizer integration remains deferred until evaluator runtime primitives stabilize. diff --git a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/implementation-plan.md b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/implementation-plan.md new file mode 100644 index 000000000..937dd8029 --- /dev/null +++ b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/implementation-plan.md @@ -0,0 +1,94 @@ +# AWorld Evaluator LLM User Simulator Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add provider-neutral adaptive user simulation for runtime-composed evaluator rollouts. + +**Architecture:** Extend `CallableRuntimeHarness` to await simulator outputs, then add `LLMUserSimulator` as a thin adapter around an injected generator callable. The simulator normalizes string/mapping/turn/stop outputs into `RolloutTurn | None` and leaves provider clients outside serializable rollout state. + +**Tech Stack:** Python protocols/dataclasses, existing runtime-composition harness, pytest, OpenSpec. + +--- + +## File Structure + +- Modify: `aworld/evaluations/runtime_composition.py` + Add async simulator support and `LLMUserSimulator`. +- Test: `tests/evaluations/test_llm_user_simulator.py` + Focused TDD coverage for adaptive generation and stop behavior. + +## Task 1: Async Simulator Support + +- [x] **Step 1: Write failing async simulator test** + +Create `tests/evaluations/test_llm_user_simulator.py` with an async simulator whose `next_turn` returns a `RolloutTurn`. 
+ +- [x] **Step 2: Run and confirm failure** + +Run: `pytest tests/evaluations/test_llm_user_simulator.py::test_callable_runtime_harness_awaits_async_simulator -q` + +Expected: FAIL because `CallableRuntimeHarness` does not await simulator output. + +- [x] **Step 3: Await simulator next turn** + +Change `CallableRuntimeHarness.run_rollout()` to call `await _maybe_await(self.simulator.next_turn(...))`. + +- [x] **Step 4: Run test until green** + +Run: `pytest tests/evaluations/test_llm_user_simulator.py -q` + +Expected: PASS. + +## Task 2: LLMUserSimulator + +- [x] **Step 1: Write failing adaptive generation tests** + +Add tests for string output, mapping output with metadata, explicit stop output, and generator context arguments. + +- [x] **Step 2: Run and confirm failure** + +Run: `pytest tests/evaluations/test_llm_user_simulator.py -q` + +Expected: FAIL because `LLMUserSimulator` does not exist. + +- [x] **Step 3: Implement `LLMUserSimulator`** + +Add a class accepting `turn_generator`. Normalize outputs: + +- `None` -> `None` +- `{"stop": True}` -> `None` +- `str` -> `RolloutTurn(role="user", content=value)` +- `RolloutTurn` -> returned directly +- mapping -> `RolloutTurn(role=..., content=..., metadata=...)` + +- [x] **Step 4: Run simulator tests until green** + +Run: `pytest tests/evaluations/test_llm_user_simulator.py -q` + +Expected: PASS. + +## Task 3: Verification And Commit + +- [x] **Step 1: Run runtime/evaluator regression** + +Run: + +```bash +pytest tests/evaluations/test_llm_user_simulator.py tests/evaluations/test_runtime_composition.py tests/evaluations/test_environment_isolation.py tests/evaluations/test_evaluator_trials.py -q +``` + +Expected: PASS. 
+ +- [x] **Step 2: Validate OpenSpec** + +Run: `openspec validate aworld-evaluator-llm-user-simulator-2026-06-10 --strict` + +Expected: `Change 'aworld-evaluator-llm-user-simulator-2026-06-10' is valid` + +- [x] **Step 3: Commit** + +```bash +git add aworld/evaluations/runtime_composition.py tests/evaluations/test_llm_user_simulator.py +git add -f openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10 +git commit -m "feat: add adaptive evaluator user simulator" +``` diff --git a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/proposal.md b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/proposal.md new file mode 100644 index 000000000..e91363319 --- /dev/null +++ b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/proposal.md @@ -0,0 +1,19 @@ +# AWorld Evaluator LLM User Simulator + +## Why + +Scripted and single-prompt simulators are useful for deterministic smoke tests, but conversational agent evaluation often needs an adaptive user that reacts to the assistant's previous output and rollout state. The evaluator runtime already owns turns and rollout state, so the next step is a provider-agnostic LLM-backed simulator contract that can drive multi-turn conversations without coupling the substrate to one model vendor. + +## What Changes + +- Add an adaptive `LLMUserSimulator` that delegates user-turn generation to an injected sync or async callable. +- Allow `CallableRuntimeHarness` to await simulator `next_turn` implementations. +- Pass case input, target metadata, current rollout state, previous assistant output, and turn index to the simulator generator. +- Support generator outputs as strings, mappings, `RolloutTurn`, or explicit stop signals. +- Preserve only serializable simulator metadata in emitted turns. + +## Impact + +- Affected code: `aworld/evaluations/runtime_composition.py`. 
+- Affected tests: add focused coverage for async simulator support, adaptive LLM-style generation, stop behavior, and report-safe metadata. +- Non-goal: this change does not ship a concrete OpenAI/Anthropic client adapter or manage API keys. diff --git a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/specs/evaluation-substrate/spec.md b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/specs/evaluation-substrate/spec.md new file mode 100644 index 000000000..1b7a61b96 --- /dev/null +++ b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/specs/evaluation-substrate/spec.md @@ -0,0 +1,34 @@ +## MODIFIED Requirements + +### Requirement: Adaptive user simulation + +Runtime-composed evaluation flows SHALL support adaptive user simulators that can react to previous assistant outputs and rollout state. + +#### Scenario: Simulator generates user turn from rollout context +- **WHEN** a runtime harness requests the next user turn from an adaptive simulator +- **THEN** the simulator SHALL receive the evaluation case, target metadata, current rollout state, last assistant output, and turn index +- **AND** it SHALL be able to return the next serializable user turn + +#### Scenario: Simulator is async +- **WHEN** a simulator returns an awaitable next-turn result +- **THEN** the runtime harness SHALL await the result before appending the user turn + +#### Scenario: Simulator stops conversation +- **WHEN** a simulator returns `None` or an explicit stop signal +- **THEN** the runtime harness SHALL stop requesting additional user turns for that rollout + +#### Scenario: Simulator returns metadata +- **WHEN** a simulator returns turn metadata +- **THEN** the framework SHALL preserve only serializable metadata in trajectory/report state + +### Requirement: Provider-neutral LLM simulator boundary + +LLM-backed user simulation SHALL be provider-neutral at the evaluator substrate layer. 
+ +#### Scenario: Suite uses external LLM client +- **WHEN** a suite author wants to use an OpenAI, Anthropic, local, or custom model-backed user simulator +- **THEN** the evaluator substrate SHALL accept an injected callable or simulator instance rather than constructing a provider client itself + +#### Scenario: Simulator contains live model client +- **WHEN** a simulator instance holds a live client, credential, or transport handle +- **THEN** the framework SHALL NOT serialize that handle into rollout state, evaluator state, or reports diff --git a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/tasks.md b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/tasks.md new file mode 100644 index 000000000..38f833b02 --- /dev/null +++ b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/tasks.md @@ -0,0 +1,17 @@ +## 1. Async Simulator Support + +- [x] 1.1 Update `CallableRuntimeHarness` to await awaitable simulator `next_turn` results. +- [x] 1.2 Preserve existing scripted and single-prompt simulator behavior. + +## 2. Adaptive LLM User Simulator + +- [x] 2.1 Add `LLMUserSimulator`. +- [x] 2.2 Pass case, target, rollout state, last output, and turn index to its generator. +- [x] 2.3 Support string, mapping, `RolloutTurn`, `None`, and explicit stop outputs. +- [x] 2.4 Filter generated metadata through existing serializable turn serialization. + +## 3. Verification + +- [x] 3.1 Add focused tests for async simulator support, adaptive generation, stop behavior, and metadata filtering. +- [x] 3.2 Run runtime/evaluator regression tests. +- [x] 3.3 Validate this OpenSpec change with `openspec validate aworld-evaluator-llm-user-simulator-2026-06-10 --strict`. 
diff --git a/tests/evaluations/test_llm_user_simulator.py b/tests/evaluations/test_llm_user_simulator.py new file mode 100644 index 000000000..60a11ebc5 --- /dev/null +++ b/tests/evaluations/test_llm_user_simulator.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +import pytest + +from aworld.evaluations.runtime_composition import ( + CallableRuntimeHarness, + LLMUserSimulator, + RolloutState, + RolloutTurn, +) +from aworld.evaluations.substrate import EvalCaseDef + + +@pytest.mark.asyncio +async def test_callable_runtime_harness_awaits_async_simulator(): + class AsyncSimulator: + async def next_turn(self, *, case, target, state, last_output=None): + if any(turn.role == "user" for turn in state.turns): + return None + return RolloutTurn(role="user", content="async hello") + + async def assistant_step(*, user_turn, state, case, target): + return {"answer": f"ack:{user_turn.content}"} + + harness = CallableRuntimeHarness( + simulator=AsyncSimulator(), + assistant_step=assistant_step, + max_turns=1, + ) + + state = await harness.run_rollout( + case=EvalCaseDef(case_id="case-1", input={}), + target={}, + ) + + assert state.answer == "ack:async hello" + assert state.turns[0].content == "async hello" + + +@pytest.mark.asyncio +async def test_llm_user_simulator_generates_adaptive_turns_from_context(): + calls = [] + + async def turn_generator(*, case, target, state, last_output, turn_index): + calls.append( + { + "case_id": case.case_id, + "goal": target["goal"], + "last_output": last_output, + "turn_index": turn_index, + "turn_count": len(state.turns), + } + ) + if turn_index == 0: + return "start" + if turn_index == 1: + return { + "content": f"clarify after {last_output}", + "metadata": {"intent": "clarify", "client": object()}, + } + return {"stop": True, "metadata": {"reason": "done"}} + + async def assistant_step(*, user_turn, state, case, target): + return {"answer": f"assistant:{user_turn.content}"} + + harness = CallableRuntimeHarness( + 
simulator=LLMUserSimulator(turn_generator=turn_generator), + assistant_step=assistant_step, + max_turns=3, + ) + + state = await harness.run_rollout( + case=EvalCaseDef(case_id="case-1", input={}), + target={"goal": "resolve ticket"}, + ) + + assert [turn.content for turn in state.turns if turn.role == "user"] == [ + "start", + "clarify after assistant:start", + ] + assert calls[0]["turn_index"] == 0 + assert calls[1]["last_output"] == "assistant:start" + assert calls[2]["turn_count"] == 4 + assert state.turns[2].metadata["intent"] == "clarify" + assert "client" not in state.trajectory[2]["metadata"] + + +def test_llm_user_simulator_accepts_rollout_turn_output(): + simulator = LLMUserSimulator( + turn_generator=lambda **kwargs: RolloutTurn( + role="user", + content="custom", + metadata={"safe": True}, + ) + ) + + turn = simulator.next_turn( + case=EvalCaseDef(case_id="case-1", input={}), + target={}, + state=RolloutState(case_id="case-1"), + last_output=None, + ) + + assert turn == RolloutTurn(role="user", content="custom", metadata={"safe": True}) From de008e9d18afb623ddbbf4bd583f5890bbe9c7db Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 17:03:08 +0800 Subject: [PATCH 29/41] test: add manual trajectory evaluator replay case --- eval/trajectory_evaluator/agent.md | 217 +++++++ tests/evaluations/conftest.py | 46 ++ .../test_trajectory_log_manual_case.py | 585 ++++++++++++++++++ 3 files changed, 848 insertions(+) create mode 100644 eval/trajectory_evaluator/agent.md create mode 100644 tests/evaluations/conftest.py create mode 100644 tests/evaluations/test_trajectory_log_manual_case.py diff --git a/eval/trajectory_evaluator/agent.md b/eval/trajectory_evaluator/agent.md new file mode 100644 index 000000000..832216f0c --- /dev/null +++ b/eval/trajectory_evaluator/agent.md @@ -0,0 +1,217 @@ +--- +name: trajectory-evaluator +description: 使用 LLM-as-judge 对 AWorld agent 的单条 trajectory 做「输出质量 + 执行过程」双维评估。无参考基准(reference-free):以轨迹内实际抓取到的源内容作为 groundedness 
依据。必须显式传入 task_id、trajectory log 与输出目录。 +tools: Bash, Read, Write +model: opus +--- + +# Trajectory Evaluator(LLM-as-Judge) + +你是一名严格、以证据为准的 **AI agent 轨迹评审员**。你的职责是对一条 AWorld trajectory 做可复现、可量化的评估,覆盖**最终输出质量**与**执行过程质量**两个范围,并产出结构化评估报告。 + +你**就是**这里的 LLM judge:所有打分由你基于抽取出的证据完成,不调用外部模型。 + +## 评估输入(参数) + +- `TRAJECTORY_LOG`:轨迹日志路径,必须显式提供 +- `TASK_ID`:待评估任务 id,必须显式提供 +- `OUT_DIR`:报告输出目录,必须显式提供 + +若用户在 directive 中给出了不同的值,以用户提供的为准。 + +--- + +## 阶段 0 · 解析与抽取(确定性,必须先做) + +日志为「每行一个 Python dict repr」格式,且尾部可能带 ANSI 转义码;`trajectory` 字段是一个 **JSON 字符串**。**禁止**用 Read 直接读整行(单行可达数百 KB,会污染上下文)。必须用下面这段已验证的脚本抽取,把干净的结构化数据落盘后再读: + +```bash +mkdir -p "${OUT_DIR:?OUT_DIR is required}" +python3 - "$@" << 'PYEOF' +import ast, json, re, os, sys, glob + +LOG = os.path.expanduser(os.environ["TRAJECTORY_LOG"]) +TASK_ID = os.environ["TASK_ID"] +OUT_DIR = os.environ["OUT_DIR"] +os.makedirs(OUT_DIR, exist_ok=True) + +# 1) 定位 task_id 所在行(每行一条记录) +target = None +with open(LOG, encoding="utf-8", errors="replace") as f: + for line in f: + if TASK_ID in line: + target = line + break +if target is None: + sys.exit(f"[FATAL] task_id {TASK_ID} not found in {LOG}") + +# 2) 去 ANSI + 去首尾空白,再用 literal_eval 解析 Python dict repr +clean = re.sub(r'\x1b\[[0-9;]*m', '', target).strip() +rec = ast.literal_eval(clean) +traj = json.loads(rec["trajectory"]) # trajectory 是 JSON 字符串 + +def first_str(x): # is_agent_finished 在该数据里是字符串 "True"/"False" + return str(x).strip().lower() in ("true", "1") + +# 3) 抽取关键字段 +question = (traj[0].get("state", {}).get("input", {}) or {}).get("content") +system_prompt = "" +msgs0 = traj[0].get("state", {}).get("messages", []) or [] +if msgs0 and msgs0[0].get("role") == "system": + system_prompt = str(msgs0[0].get("content") or "") + +steps = [] +final_answer = None +for s in traj: + meta = s.get("meta", {}) + act = s.get("action") or {} + tcs = act.get("tool_calls") or [] + calls = [] + for tc in tcs: + fn = tc.get("function") or {} + calls.append({"name": fn.get("name"), "arguments": 
str(fn.get("arguments"))}) + finished = first_str(act.get("is_agent_finished")) + steps.append({ + "step": meta.get("step"), + "pre_agent": meta.get("pre_agent"), + "agent_id": meta.get("agent_id"), + "tool_calls": calls, + "assistant_content": str(act.get("content") or ""), + "is_agent_finished": finished, + }) + if finished and act.get("content"): + final_answer = str(act.get("content")) + +# 4) 抽取「源证据」= 最终对话里的所有 tool 结果(groundedness 依据) +final_msgs = traj[-1].get("state", {}).get("messages", []) or [] +evidence = [] +for i, m in enumerate(final_msgs): + if m.get("role") == "tool": + evidence.append({"msg_index": i, "content": str(m.get("content") or "")}) + +extract = { + "task_id": TASK_ID, + "is_sub_task": rec.get("is_sub_task"), + "num_steps": len(traj), + "question": question, + "system_prompt_excerpt": system_prompt[:8000], # 仅用于约束合规检查,截断以省 token + "steps": steps, + "final_answer": final_answer, + "evidence": evidence, +} +out = os.path.join(OUT_DIR, f"extracted_{TASK_ID}.json") +with open(out, "w", encoding="utf-8") as f: + json.dump(extract, f, ensure_ascii=False, indent=2) + +# 控制台打印一份紧凑摘要,便于你快速判断 +print(f"[OK] task_id={TASK_ID} steps={len(traj)} evidence_blocks={len(evidence)}") +print(f"[OK] question: {question}") +print(f"[OK] final_answer_len: {len(final_answer or '')}") +print(f"[OK] extracted -> {out}") +for st in steps: + names = [c['name'] for c in st['tool_calls']] + print(f" step{st['step']}: tools={names} finished={st['is_agent_finished']}") +PYEOF +``` + +> 运行前用环境变量传参:`TRAJECTORY_LOG=... TASK_ID=... OUT_DIR=... 
python3 ...`。 +> 运行后用 `Read` 读取 `OUT_DIR/extracted_{TASK_ID}.json`,再进入评估阶段。**只读这个抽取文件**,不要回头读原始日志行。 + +--- + +## 阶段 1 · 构建证据集(groundedness 基线) + +从 `evidence[]`(所有 tool 结果)中归纳出「本次运行实际获取到的事实集合」——即 agent 真正从外部(网页 Show Notes、`innerText`、命令输出等)拿到的内容。这是判断**忠实度/幻觉**的唯一基准。 + +关键判据: +- 系统提示中明确「模型知识截止 2024」。若 `final_answer` 中出现具体的、**证据集里不存在**的事实性断言(人名、数字、专有名词、引述金句、章节结构),默认按**潜在幻觉**处理,除非能在 evidence 中找到出处。 +- 区分「可被证据支撑的断言」与「模型基于先验/常识的合理推断」——后者也要标注为「未经证据证实」。 + +--- + +## 阶段 2 · 评分(八维,1–5 分,带锚点) + +对每个维度给出 1–5 的整数分,并**引用证据**(步骤号 / `msg_index` / final_answer 中的具体句子)作为依据。严禁仅凭印象打分。 + +锚点统一含义:**5=优秀无明显问题 / 4=良好有小瑕疵 / 3=合格但有明确缺陷 / 2=较差影响可用性 / 1=不合格**。 + +### A. 输出质量(权重合计 60%) + +| 维度 | 权重 | 评什么 | 扣分信号 | +|---|---|---|---| +| A1 忠实度 / Groundedness | 25% | 每条事实性断言是否被证据集支撑,是否有幻觉 | 出现证据集外的具体事实;把先验当事实;编造引述/数字 | +| A2 覆盖度 / Completeness | 15% | 是否同时覆盖「核心内容」与「关键洞察」(本任务的双重诉求) | 只复述梗概无洞察;漏掉主线 | +| A3 相关性 / 目标贴合 | 10% | 是否回答了实际问题、是否锁定了正确的对象(该 episode),无主题漂移 | 答非所问;张冠李戴;偷换目标 | +| A4 结构与可读性 | 10% | 组织、清晰度、长度适配、语言与提问一致 | 冗长堆砌;无结构;语言不一致 | + +### B. 执行过程质量(权重合计 40%) + +| 维度 | 权重 | 评什么 | 扣分信号 | +|---|---|---|---| +| B1 工具使用恰当性 | 12% | 工具选择与参数是否合理、是否达成目的 | 错用工具;参数错误;无效调用 | +| B2 效率 | 10% | 步数 / 调用数相对必要工作量是否经济 | 冗余探测、重复弯路、无谓的全量重试 | +| B3 约束合规 | 10% | 是否遵守 system_prompt 的硬性约束(工作目录、不 rm -rf、不写 /tmp、完成校验、不臆造) | 违反明确禁令;越权 | +| B4 鲁棒性 / 错误恢复 | 8% | 失败后是否定位并转向有效路径 | 反复撞同一墙;放弃;忽略错误 | + +> 本条轨迹的已知特征(供参考,不要照抄结论,须自行用证据复核):steps 1–3 尝试 curl+grep+regex 抽取网页失败;steps 4–7 在探测 `kimi-webbridge`/`agent-browser` 工具(疑似弯路,计入 B2/B4);steps 8–9 改用 `agent-browser` 复用 CDP 9222 成功抓到 Show Notes 与 `innerText`;step 10 输出。请据此核实 B2(效率)与 B4(恢复)的真实表现。 + +--- + +## 阶段 3 · 汇总与判定 + +1. 计算加权总分(百分制):`score = Σ(dim_score/5 × weight) × 100`。 +2. 给出等级:`≥85 优秀(Excellent) / 70–84 合格(Pass) / 55–69 需改进(Marginal) / <55 不合格(Fail)`。 +3. **一票否决项**:若 A1 忠实度 ≤2(存在实质性幻觉),无论总分多少,最终判定不得高于「需改进」,并在报告中显著标红。 +4. 
列出 **Top-3 优点** 与 **Top-3 待改进项**,每条附证据指针与可执行的改进建议。 + +### 评判纪律(消除 judge 偏差) + +- 不因答案**更长/更华丽**而加分;只认证据与目标贴合度。 +- 不被 agent 的自信措辞影响——「已成功拿到完整 Show Notes」这类自述必须用 evidence 核实。 +- 不确定是否有出处时,标注为「未证实」而非默认正确。 +- 打分先写推理(引证),后给分数,避免先入为主。 + +--- + +## 阶段 4 · 产出报告(两份) + +用 `Write` 写出: + +1. `OUT_DIR/eval_report_{TASK_ID}.json` —— 机器可读,严格遵循以下 schema: + +```json +{ + "task_id": "string", + "question": "string", + "verdict": "Excellent|Pass|Marginal|Fail", + "weighted_score": 0, + "veto_triggered": false, + "dimensions": { + "A1_groundedness": {"score": 0, "weight": 0.25, "evidence": ["..."], "rationale": "..."}, + "A2_completeness": {"score": 0, "weight": 0.15, "evidence": ["..."], "rationale": "..."}, + "A3_relevance": {"score": 0, "weight": 0.10, "evidence": ["..."], "rationale": "..."}, + "A4_readability": {"score": 0, "weight": 0.10, "evidence": ["..."], "rationale": "..."}, + "B1_tool_use": {"score": 0, "weight": 0.12, "evidence": ["..."], "rationale": "..."}, + "B2_efficiency": {"score": 0, "weight": 0.10, "evidence": ["..."], "rationale": "..."}, + "B3_compliance": {"score": 0, "weight": 0.10, "evidence": ["..."], "rationale": "..."}, + "B4_robustness": {"score": 0, "weight": 0.08, "evidence": ["..."], "rationale": "..."} + }, + "hallucinations": [{"claim": "...", "why_unsupported": "..."}], + "top_strengths": ["..."], + "top_improvements": [{"issue": "...", "evidence": "...", "suggestion": "..."}] +} +``` + +2. 
`OUT_DIR/eval_report_.md` —— 人类可读报告,包含:评估对象与问题、判定与总分、八维评分表(分数+证据+理由)、幻觉清单、Top-3 优点、Top-3 改进建议。语言与被评估答案保持一致(本任务为中文)。 + +最后在对话中回复一段 ≤8 行的高信号摘要:判定 + 总分 + 最关键的 1–2 个发现 + 两份报告的路径。 + +--- + +## 执行清单(按序) + +- [ ] 阶段 0:运行解析脚本,落盘 `extracted_.json`,Read 之 +- [ ] 阶段 1:构建证据集,标出无出处的断言 +- [ ] 阶段 2:八维逐项打分(先证据后分数) +- [ ] 阶段 3:加权汇总 + 一票否决检查 + 优缺点 +- [ ] 阶段 4:写 JSON + MD 报告,回复摘要 diff --git a/tests/evaluations/conftest.py b/tests/evaluations/conftest.py new file mode 100644 index 000000000..baa2605bf --- /dev/null +++ b/tests/evaluations/conftest.py @@ -0,0 +1,46 @@ +from __future__ import annotations + + +def pytest_addoption(parser): + group = parser.getgroup("trajectory evaluator") + group.addoption( + "--task-id", + "--task_id", + action="store", + dest="trajectory_task_id", + default=None, + help="Task id to replay from the trajectory log.", + ) + group.addoption( + "--trajectory-log", + "--trajectory_log", + action="store", + dest="trajectory_log", + default=None, + help="Path to the trajectory log used by the manual replay test.", + ) + group.addoption( + "--agent-prompt", + "--agent_prompt", + action="store", + dest="trajectory_agent_prompt", + default=None, + help="Path to the trajectory evaluator agent.md prompt.", + ) + group.addoption( + "--out-dir", + "--out_dir", + action="store", + dest="trajectory_out_dir", + default=None, + help="Directory for extracted trajectory and evaluator report outputs.", + ) + group.addoption( + "--judge-timeout", + "--judge_timeout", + action="store", + dest="trajectory_judge_timeout", + default=None, + type=float, + help="Timeout in seconds for the trajectory evaluator judge agent.", + ) diff --git a/tests/evaluations/test_trajectory_log_manual_case.py b/tests/evaluations/test_trajectory_log_manual_case.py new file mode 100644 index 000000000..6c85a6fcd --- /dev/null +++ b/tests/evaluations/test_trajectory_log_manual_case.py @@ -0,0 +1,585 @@ +from __future__ import annotations + +import ast +import asyncio +import json +import os +import 
re +import tempfile +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Literal, Mapping + +import pytest +from pydantic import BaseModel, model_validator + +from aworld.config.task_loader import _load_skill_agent +from aworld.evaluations.runtime_composition import RolloutState +from aworld.evaluations.substrate import ( + EvalCaseDef, + EvalSuiteDef, + EvaluationFlowDef, + GateMetricCondition, + GatePolicyDef, + JudgeExecution, + JudgeSchemaDef, + StateCheckGrader, + _coerce_judge_payload, + run_evaluation_flow, +) +from aworld.evaluations.report import validate_evaluator_report +from aworld.runner import Runners +from aworld.utils.skill_loader import extract_front_matter + + +DEFAULT_JUDGE_TIMEOUT_SECONDS = 600.0 + + +class _FakePytestConfig: + def __init__(self, values: Mapping[str, Any]): + self._values = values + + def getoption(self, name: str) -> Any: + return self._values.get(name) + + +class TrajectoryEvalJudgeOutput(BaseModel): + score: float + verdict: Literal["Excellent", "Pass", "Marginal", "Fail"] + A1_groundedness: int + A2_completeness: int + A3_relevance: int + A4_readability: int + B1_tool_use: int + B2_efficiency: int + B3_compliance: int + B4_robustness: int + veto_triggered: bool = False + + @model_validator(mode="before") + @classmethod + def flatten_agent_report(cls, value: Any) -> Any: + if not isinstance(value, Mapping) or "dimensions" not in value: + return value + flattened = dict(value) + if "score" not in flattened and "weighted_score" in flattened: + flattened["score"] = flattened["weighted_score"] + dimensions = value.get("dimensions") or {} + for metric_name in ( + "A1_groundedness", + "A2_completeness", + "A3_relevance", + "A4_readability", + "B1_tool_use", + "B2_efficiency", + "B3_compliance", + "B4_robustness", + ): + metric_payload = dimensions.get(metric_name) if isinstance(dimensions, Mapping) else None + if isinstance(metric_payload, Mapping) and "score" in metric_payload: + 
flattened[metric_name] = metric_payload["score"] + return flattened + + +def _truthy_string(value: Any) -> bool: + return str(value).strip().lower() in {"true", "1"} + + +def _manual_replay_config(pytest_config: Any) -> dict[str, Any]: + required_options = { + "--task-id": pytest_config.getoption("trajectory_task_id"), + "--trajectory-log": pytest_config.getoption("trajectory_log"), + "--agent-prompt": pytest_config.getoption("trajectory_agent_prompt"), + "--out-dir": pytest_config.getoption("trajectory_out_dir"), + } + missing = [name for name, value in required_options.items() if not value] + if missing: + raise pytest.UsageError( + "manual trajectory replay requires explicit pytest options: " + + ", ".join(missing) + ) + + task_id = required_options["--task-id"] + log_path = Path(str(required_options["--trajectory-log"])).expanduser() + agent_prompt_path = Path(str(required_options["--agent-prompt"])) + out_dir = Path(str(required_options["--out-dir"])) + judge_timeout_seconds = pytest_config.getoption("trajectory_judge_timeout") or DEFAULT_JUDGE_TIMEOUT_SECONDS + return { + "task_id": str(task_id), + "log_path": log_path, + "agent_prompt_path": agent_prompt_path, + "out_dir": out_dir, + "judge_timeout_seconds": float(judge_timeout_seconds), + } + + +def _safe_skill_name(value: str) -> str: + return re.sub(r"[^A-Za-z0-9_.-]+", "-", value).strip("-._") or "markdown-agent" + + +def _frontmatter_scalar(value: Any, default: str) -> str: + text = str(value if value not in (None, "") else default) + return " ".join(text.splitlines()).strip() + + +def _normalize_tool_list(value: Any) -> dict[str, Any]: + if isinstance(value, Mapping): + return dict(value) + if isinstance(value, str) and value.strip(): + try: + parsed = json.loads(value) + except json.JSONDecodeError: + return {} + if isinstance(parsed, Mapping): + return dict(parsed) + return {} + + +def _materialize_agent_markdown_as_skill( + agent_markdown_path: Path, + *, + skills_root: Path, + skill_name: str, +) 
-> Path: + lines = agent_markdown_path.read_text(encoding="utf-8").splitlines() + frontmatter, body_start = extract_front_matter(lines) + body = "\n".join(lines[body_start:]).strip() + description = _frontmatter_scalar( + frontmatter.get("description", frontmatter.get("desc")), + f"Agent loaded from {agent_markdown_path}", + ) + tool_list = _normalize_tool_list(frontmatter.get("tool_list", {})) + + skill_dir = skills_root / skill_name + skill_dir.mkdir(parents=True, exist_ok=True) + skill_path = skill_dir / "SKILL.md" + skill_path.write_text( + "---\n" + f"name: {_frontmatter_scalar(frontmatter.get('name'), skill_name)}\n" + f"description: {description}\n" + "type: agent\n" + f"tool_list: {json.dumps(tool_list, ensure_ascii=False)}\n" + "---\n\n" + f"{body}\n", + encoding="utf-8", + ) + return skill_path + + +async def _load_agent_markdown_as_aworld_agent(agent_markdown_path: Path, *, agent_id: str) -> Any: + skill_name = _safe_skill_name(agent_id) + api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") + with tempfile.TemporaryDirectory(prefix="aworld-agent-md-") as tmp_dir: + skills_root = Path(tmp_dir) / "skills" + _materialize_agent_markdown_as_skill( + agent_markdown_path, + skills_root=skills_root, + skill_name=skill_name, + ) + return await _load_skill_agent( + agent_id=agent_id, + agent_def={ + "skill_name": skill_name, + "config": { + "llm_config": { + "llm_model_name": os.getenv("LLM_MODEL_NAME"), + "llm_provider": os.getenv("LLM_PROVIDER"), + "llm_api_key": api_key, + "llm_base_url": os.getenv("LLM_BASE_URL"), + } + }, + }, + skills_path=skills_root, + global_mcp_config=None, + ) + + +@dataclass(frozen=True) +class MarkdownAgentJudgeBackend: + backend_id: str + agent_markdown_path: Path + prompt_builder: Any + timeout_seconds: float | None = None + + def is_available(self) -> bool: + model_name = os.getenv("LLM_MODEL_NAME") + api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") + return self.agent_markdown_path.exists() and 
bool(model_name and api_key) + + async def execute(self, case_input: dict[str, Any], target: dict[str, Any], suite: EvalSuiteDef) -> JudgeExecution: + if not self.is_available(): + raise RuntimeError(f"judge backend '{self.backend_id}' is not available") + + prompt = self.prompt_builder(case_input, target, suite) + if isinstance(prompt, tuple): + raise ValueError("MarkdownAgentJudgeBackend only supports text prompts in this manual replay test") + + agent = await _load_agent_markdown_as_aworld_agent( + self.agent_markdown_path, + agent_id=self.backend_id, + ) + + async def _run_agent() -> str: + response = await Runners.run(input=str(prompt), agent=agent) + return str(getattr(response, "answer", response)) + + if self.timeout_seconds is not None: + response_text = await asyncio.wait_for(_run_agent(), timeout=self.timeout_seconds) + else: + response_text = await _run_agent() + return JudgeExecution(backend_id=self.backend_id, payload=_coerce_judge_payload(response_text)) + + +def test_manual_replay_config_requires_explicit_pytest_options(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("AWORLD_TRAJECTORY_TASK_ID", "task_from_env") + monkeypatch.setenv("AWORLD_TRAJECTORY_LOG", "~/env/trajectory.log") + monkeypatch.setenv("AWORLD_TRAJECTORY_AGENT_PROMPT", "env/agent.md") + monkeypatch.setenv("AWORLD_TRAJECTORY_OUT_DIR", "env/reports") + + with pytest.raises(pytest.UsageError, match="--task-id"): + _manual_replay_config(_FakePytestConfig({})) + + config = _manual_replay_config( + _FakePytestConfig( + { + "trajectory_task_id": "task_from_cli", + "trajectory_log": "~/cli/trajectory.log", + "trajectory_agent_prompt": "cli/agent.md", + "trajectory_out_dir": "cli/reports", + "trajectory_judge_timeout": 12.5, + } + ) + ) + + assert config["task_id"] == "task_from_cli" + assert config["log_path"] == Path("~/cli/trajectory.log").expanduser() + assert config["agent_prompt_path"] == Path("cli/agent.md") + assert config["out_dir"] == Path("cli/reports") + assert 
config["judge_timeout_seconds"] == 12.5 + + +@pytest.mark.asyncio +async def test_agent_markdown_loads_as_aworld_agent_via_existing_skill_loader( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +): + monkeypatch.setenv("LLM_MODEL_NAME", "test-model") + monkeypatch.setenv("LLM_API_KEY", "test-key") + + agent_md = tmp_path / "agent.md" + agent_md.write_text( + "---\n" + "name: custom trajectory judge\n" + "description: Evaluates trajectories\n" + "tools: Bash, Read\n" + "model: opus\n" + "---\n\n" + "# Judge Contract\n" + "Return strict JSON.\n", + encoding="utf-8", + ) + + agent = await _load_agent_markdown_as_aworld_agent(agent_md, agent_id="custom-judge") + + assert agent.name() == "custom-judge" + assert agent.desc() == "Evaluates trajectories" + assert agent.mcp_servers == [] + assert "Return strict JSON." in agent.system_prompt + + +@pytest.mark.asyncio +async def test_markdown_agent_judge_backend_runs_loaded_agent_with_runners( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +): + monkeypatch.setenv("LLM_MODEL_NAME", "test-model") + monkeypatch.setenv("LLM_API_KEY", "test-key") + + agent_md = tmp_path / "agent.md" + agent_md.write_text( + "---\n" + "name: trajectory judge\n" + "description: Test judge\n" + "---\n\n" + "You are the test judge.\n", + encoding="utf-8", + ) + calls: dict[str, Any] = {} + + class _FakeTaskResponse: + answer = json.dumps( + { + "weighted_score": 88, + "verdict": "Pass", + "dimensions": {"A1_groundedness": {"score": 4}}, + } + ) + + async def fake_run(input: str, agent: Any, **kwargs: Any) -> _FakeTaskResponse: + calls["input"] = input + calls["agent_name"] = agent.name() + calls["system_prompt"] = agent.system_prompt + return _FakeTaskResponse() + + monkeypatch.setattr("aworld.runner.Runners.run", fake_run) + + backend = MarkdownAgentJudgeBackend( + backend_id="trajectory-evaluator-agent-md", + agent_markdown_path=agent_md, + prompt_builder=lambda case_input, target, suite: "judge this trajectory", + ) + execution = await 
backend.execute({}, {}, object()) + + assert calls == { + "input": "judge this trajectory", + "agent_name": "trajectory-evaluator-agent-md", + "system_prompt": "You are the test judge.", + } + assert execution.backend_id == "trajectory-evaluator-agent-md" + assert execution.payload["weighted_score"] == 88 + assert execution.payload["dimensions"]["A1_groundedness"]["score"] == 4 + + +def test_trajectory_step_assertion_uses_extracted_num_steps(tmp_path: Path): + extracted_path = tmp_path / "extracted_task.json" + extracted_path.write_text(json.dumps({"num_steps": 81}), encoding="utf-8") + result = { + "state_summary": {"trajectory_steps": 81}, + "metadata": {"extracted_path": str(extracted_path)}, + } + + _assert_report_trajectory_steps_match_extracted(result) + + +def _assert_report_trajectory_steps_match_extracted(result: Mapping[str, Any]) -> None: + extracted_path = Path(str(result["metadata"]["extracted_path"])) + extracted = json.loads(extracted_path.read_text(encoding="utf-8")) + assert result["state_summary"]["trajectory_steps"] == extracted["num_steps"] + + +def _extract_trajectory_record(log_path: Path, task_id: str) -> dict[str, Any]: + target_line = None + with log_path.open(encoding="utf-8", errors="replace") as handle: + for line in handle: + if task_id in line: + target_line = line + break + if target_line is None: + raise AssertionError(f"task_id {task_id} not found in {log_path}") + + clean = re.sub(r"\x1b\[[0-9;]*m", "", target_line).strip() + record = ast.literal_eval(clean) + trajectory = json.loads(record["trajectory"]) + + question = (trajectory[0].get("state", {}).get("input", {}) or {}).get("content") + system_prompt = "" + first_messages = trajectory[0].get("state", {}).get("messages", []) or [] + if first_messages and first_messages[0].get("role") == "system": + system_prompt = str(first_messages[0].get("content") or "") + + steps = [] + final_answer = None + for item in trajectory: + meta = item.get("meta", {}) + action = item.get("action") 
or {} + calls = [] + for tool_call in action.get("tool_calls") or []: + function = tool_call.get("function") or {} + calls.append({"name": function.get("name"), "arguments": str(function.get("arguments"))}) + finished = _truthy_string(action.get("is_agent_finished")) + steps.append( + { + "step": meta.get("step"), + "pre_agent": meta.get("pre_agent"), + "agent_id": meta.get("agent_id"), + "tool_calls": calls, + "assistant_content": str(action.get("content") or ""), + "is_agent_finished": finished, + } + ) + if finished and action.get("content"): + final_answer = str(action.get("content")) + + final_messages = trajectory[-1].get("state", {}).get("messages", []) or [] + evidence = [ + {"msg_index": index, "content": str(message.get("content") or "")} + for index, message in enumerate(final_messages) + if message.get("role") == "tool" + ] + + return { + "task_id": task_id, + "is_sub_task": record.get("is_sub_task"), + "num_steps": len(trajectory), + "question": question, + "system_prompt_excerpt": system_prompt[:8000], + "steps": steps, + "final_answer": final_answer, + "evidence": evidence, + } + + +class TrajectoryLogReplayHarness: + def __init__(self, *, out_dir: Path): + self.out_dir = out_dir + + async def run_rollout(self, *, case: EvalCaseDef, target: Mapping[str, Any]) -> RolloutState: + log_path = Path(str(case.input["trajectory_log"])).expanduser() + task_id = str(case.input["task_id"]) + extracted = _extract_trajectory_record(log_path, task_id) + self.out_dir.mkdir(parents=True, exist_ok=True) + extracted_path = self.out_dir / f"extracted_{task_id}.json" + extracted_path.write_text(json.dumps(extracted, ensure_ascii=False, indent=2), encoding="utf-8") + + final_answer = extracted.get("final_answer") or "" + is_finished = any(step.get("is_agent_finished") for step in extracted["steps"]) + return RolloutState( + case_id=case.case_id, + status="success" if is_finished and final_answer else "failed", + answer=final_answer, + trajectory=list(extracted["steps"]), 
+ outcome={ + "task_id": task_id, + "question": extracted.get("question"), + "evidence_blocks": len(extracted["evidence"]), + "num_steps": extracted["num_steps"], + "is_finished": is_finished, + "final_answer_len": len(final_answer), + "extracted_path": str(extracted_path), + }, + metadata={ + "trajectory_log": str(log_path), + "judge_agent_prompt": str(case.input["judge_agent_prompt"]), + "extracted_path": str(extracted_path), + }, + ) + + +def _trajectory_judge_prompt(case_input: dict[str, Any], target: dict[str, Any], suite: EvalSuiteDef) -> str: + outcome = (target.get("artifacts") or {}).get("outcome") or {} + extracted_path = outcome.get("extracted_path") + extracted_payload: dict[str, Any] = {} + if extracted_path: + extracted_payload = json.loads(Path(str(extracted_path)).read_text(encoding="utf-8")) + + payload = { + "case": { + "task_id": case_input["task_id"], + "trajectory_log": case_input["trajectory_log"], + }, + "extracted_trajectory": extracted_payload, + "required_output_schema": { + "score": "number, weighted score from 0 to 100", + "verdict": "Excellent|Pass|Marginal|Fail", + "A1_groundedness": "integer 1-5", + "A2_completeness": "integer 1-5", + "A3_relevance": "integer 1-5", + "A4_readability": "integer 1-5", + "B1_tool_use": "integer 1-5", + "B2_efficiency": "integer 1-5", + "B3_compliance": "integer 1-5", + "B4_robustness": "integer 1-5", + "veto_triggered": "boolean", + }, + "instruction": ( + "Apply the trajectory-evaluator agent contract to the extracted trajectory. " + "Do not call tools and do not re-read the raw log; all required evidence is in extracted_trajectory. " + "Return exactly one JSON object matching required_output_schema, with no markdown." 
+ ), + } + return json.dumps(payload, ensure_ascii=False, indent=2) + + +@pytest.mark.asyncio +async def test_manual_trajectory_log_case_runs_end_to_end_for_human_replay(request: pytest.FixtureRequest): + try: + config = _manual_replay_config(request.config) + except pytest.UsageError as exc: + pytest.skip(str(exc)) + task_id = config["task_id"] + log_path = config["log_path"] + agent_prompt_path = config["agent_prompt_path"] + out_dir = config["out_dir"] + judge_timeout_seconds = config["judge_timeout_seconds"] + + if not log_path.exists(): + pytest.skip(f"manual trajectory log not found: {log_path}") + if not agent_prompt_path.exists(): + pytest.skip(f"manual trajectory evaluator agent prompt not found: {agent_prompt_path}") + if not os.getenv("LLM_MODEL_NAME") or not (os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY")): + pytest.skip("real trajectory judge requires LLM_MODEL_NAME and LLM_API_KEY/OPENAI_API_KEY") + + suite = EvalSuiteDef( + suite_id="trajectory-log-manual-replay", + cases=[ + EvalCaseDef( + case_id=task_id, + input={ + "trajectory_log": str(log_path), + "task_id": task_id, + "judge_agent_prompt": str(agent_prompt_path), + }, + ) + ], + runtime_harness=TrajectoryLogReplayHarness(out_dir=out_dir), + judge_schema=JudgeSchemaDef(output_model=TrajectoryEvalJudgeOutput), + judge_backend=MarkdownAgentJudgeBackend( + backend_id="trajectory-evaluator-agent-md", + agent_markdown_path=agent_prompt_path, + prompt_builder=_trajectory_judge_prompt, + timeout_seconds=judge_timeout_seconds, + ), + outcome_scorers=( + StateCheckGrader( + metric_name="has_evidence", + source="outcome", + path=("evidence_blocks",), + op=">", + expected=0, + ), + StateCheckGrader( + metric_name="agent_finished", + source="outcome", + path=("is_finished",), + op="==", + expected=True, + ), + ), + gate_policy=GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=70.0), + GateMetricCondition(metric_name="A1_groundedness", op=">=", threshold=3), 
+ GateMetricCondition(metric_name="has_evidence", op="==", threshold=1.0), + GateMetricCondition(metric_name="agent_finished", op="==", threshold=1.0), + ) + ), + metadata={ + "manual_replay": True, + "judge_agent_prompt": str(agent_prompt_path), + "trajectory_log": str(log_path), + }, + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "inline", "value": {"target_path": str(log_path), "target_kind": "trajectory_log"}}, + suite=suite, + ) + ) + + report_dict = report.to_dict() + validate_evaluator_report(report_dict) + out_dir.mkdir(parents=True, exist_ok=True) + report_path = out_dir / f"evaluator_report_{task_id}.json" + report_path.write_text(json.dumps(report_dict, ensure_ascii=False, indent=2), encoding="utf-8") + + assert report["gate"]["status"] in {"pass", "fail", "needs_approval"} + assert report["metrics"]["has_evidence"]["mean"] == 1.0 + assert report["metrics"]["agent_finished"]["mean"] == 1.0 + assert report["judge_backend"]["backend_id"] == "trajectory-evaluator-agent-md" + assert report["results"][0]["judge"]["verdict"] in {"Excellent", "Pass", "Marginal", "Fail"} + assert 0 <= report["results"][0]["judge"]["score"] <= 100 + assert report["results"][0]["state_summary"]["answer"] + assert Path(report["results"][0]["metadata"]["extracted_path"]).exists() + _assert_report_trajectory_steps_match_extracted(report["results"][0]) + assert report_path.exists() From 56225deb4df3281a3a8e204c36bb5431cc67f65b Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 17:45:37 +0800 Subject: [PATCH 30/41] fix: trim evaluator report metadata and replay metrics --- aworld/evaluations/substrate.py | 4 +- .../evaluations/test_evaluation_substrate.py | 30 ++++++++ .../test_trajectory_log_manual_case.py | 71 +++++++++++++++++++ 3 files changed, 104 insertions(+), 1 deletion(-) diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py index 9ecf59889..827afdeff 100644 --- a/aworld/evaluations/substrate.py +++ 
b/aworld/evaluations/substrate.py @@ -1150,7 +1150,9 @@ async def run_evaluation_flow(flow: EvaluationFlowDef) -> EvaluatorReport: case_metrics[metric_name]["status"] = status metadata = metric_result.get("metadata") or {} if isinstance(metadata, Mapping) and metadata: - case_metric_details[metric_name] = dict(metadata) + is_judge_metric = "_judge_backend" in metadata + if not is_judge_metric or metric_name == "score": + case_metric_details[metric_name] = dict(metadata) if case_backend_id is None and isinstance(metadata, Mapping): case_backend_id = metadata.get("_judge_backend") else: diff --git a/tests/evaluations/test_evaluation_substrate.py b/tests/evaluations/test_evaluation_substrate.py index 5c0a411bc..5c085debc 100644 --- a/tests/evaluations/test_evaluation_substrate.py +++ b/tests/evaluations/test_evaluation_substrate.py @@ -896,6 +896,36 @@ async def fake_judge(case_input, target): assert result.metric_results["score"]["metadata"]["answer"] == "from-state" +@pytest.mark.asyncio +async def test_report_keeps_full_judge_metadata_only_on_score_metric() -> None: + async def fake_judge(case_input, target): + return { + "score": 0.5, + "verdict": "Fail", + "A1_groundedness": 1, + "veto_triggered": True, + } + + suite = EvalSuiteDef( + suite_id="demo-suite", + cases=[EvalCaseDef(case_id="case-1", input={"query": "demo"})], + judge=fake_judge, + ) + + report = await run_evaluation_flow( + EvaluationFlowDef( + target={"kind": "inline", "value": {"target_path": "demo"}}, + suite=suite, + ) + ) + + result = report["results"][0] + assert result["judge"]["A1_groundedness"] == 1 + assert result["metrics"]["verdict"]["value"] == "Fail" + assert set(result["metric_details"]) == {"score"} + assert result["metric_details"]["score"]["veto_triggered"] is True + + def test_builtin_app_evaluator_suite_has_required_schema_and_score_gate() -> None: suite = get_builtin_eval_suite("app-evaluator") diff --git a/tests/evaluations/test_trajectory_log_manual_case.py 
b/tests/evaluations/test_trajectory_log_manual_case.py index 6c85a6fcd..9e636703b 100644 --- a/tests/evaluations/test_trajectory_log_manual_case.py +++ b/tests/evaluations/test_trajectory_log_manual_case.py @@ -340,6 +340,59 @@ async def fake_run(input: str, agent: Any, **kwargs: Any) -> _FakeTaskResponse: assert execution.payload["dimensions"]["A1_groundedness"]["score"] == 4 +@pytest.mark.asyncio +async def test_trajectory_log_replay_harness_populates_tool_calls_and_standard_metrics(tmp_path: Path): + task_id = "task_with_tool" + trajectory = [ + { + "state": { + "input": {"content": "question"}, + "messages": [{"role": "system", "content": "system"}], + }, + "meta": {"step": 1, "pre_agent": "user", "agent_id": "agent"}, + "action": { + "tool_calls": [ + {"function": {"name": "search", "arguments": "{}"}}, + {"function": {"name": "open", "arguments": "{\"url\":\"https://example.com\"}"}}, + ], + "is_agent_finished": "False", + }, + }, + { + "state": { + "messages": [ + {"role": "tool", "content": "search result"}, + {"role": "tool", "content": "page text"}, + ], + }, + "meta": {"step": 2, "pre_agent": "agent", "agent_id": "agent"}, + "action": {"content": "final", "is_agent_finished": "True"}, + }, + ] + log_path = tmp_path / "trajectory.log" + log_path.write_text( + repr({"task_id": task_id, "is_sub_task": False, "trajectory": json.dumps(trajectory)}) + + "\n", + encoding="utf-8", + ) + case = EvalCaseDef( + case_id=task_id, + input={ + "trajectory_log": str(log_path), + "task_id": task_id, + "judge_agent_prompt": "agent.md", + }, + ) + + state = await TrajectoryLogReplayHarness(out_dir=tmp_path).run_rollout(case=case, target={}) + + assert [call["name"] for call in state.tool_calls] == ["search", "open"] + assert state.usage == {"total_tokens": 0} + assert state.timing == {"duration_ms": 0} + assert state.standard_metrics["n_turns"] == 2 + assert state.standard_metrics["n_tool_calls"] == 2 + + def 
test_trajectory_step_assertion_uses_extracted_num_steps(tmp_path: Path): extracted_path = tmp_path / "extracted_task.json" extracted_path.write_text(json.dumps({"num_steps": 81}), encoding="utf-8") @@ -433,11 +486,29 @@ async def run_rollout(self, *, case: EvalCaseDef, target: Mapping[str, Any]) -> final_answer = extracted.get("final_answer") or "" is_finished = any(step.get("is_agent_finished") for step in extracted["steps"]) + tool_calls = [ + dict(tool_call) + for step in extracted["steps"] + for tool_call in step.get("tool_calls", []) + if isinstance(tool_call, Mapping) + ] + usage = {"total_tokens": 0} + timing = {"duration_ms": 0} + standard_metrics = { + "n_turns": len(extracted["steps"]), + "n_tool_calls": len(tool_calls), + "n_tokens": usage["total_tokens"], + "duration_ms": timing["duration_ms"], + } return RolloutState( case_id=case.case_id, status="success" if is_finished and final_answer else "failed", answer=final_answer, trajectory=list(extracted["steps"]), + tool_calls=tool_calls, + usage=usage, + timing=timing, + standard_metrics=standard_metrics, outcome={ "task_id": task_id, "question": extracted.get("question"), From 5f8bc4fb7e4e7cf8a1b71d2699836b19c60121a8 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 19:47:08 +0800 Subject: [PATCH 31/41] feat: add evaluator input source framework --- aworld/evaluations/sources.py | 293 +++++++++++++++ aworld/evaluations/state_adapters.py | 116 ++++++ aworld/evaluations/substrate.py | 122 +++++++ aworld/evaluations/trajectory_judge.py | 54 +++ .../.openspec.yaml | 2 + .../design.md | 167 +++++++++ .../implementation-plan.md | 37 ++ .../proposal.md | 32 ++ .../specs/cli-evaluator-flow/spec.md | 91 +++++ .../tasks.md | 40 ++ .../.openspec.yaml | 2 + .../design.md | 238 ++++++++++++ .../implementation-plan.md | 85 +++++ .../proposal.md | 36 ++ .../specs/evaluation-substrate/spec.md | 105 ++++++ .../tasks.md | 60 +++ .../test_evaluation_input_sources.py | 202 ++++++++++ 
.../test_trajectory_log_manual_case.py | 344 ++---------------- 18 files changed, 1706 insertions(+), 320 deletions(-) create mode 100644 aworld/evaluations/sources.py create mode 100644 aworld/evaluations/state_adapters.py create mode 100644 aworld/evaluations/trajectory_judge.py create mode 100644 openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/.openspec.yaml create mode 100644 openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md create mode 100644 openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/implementation-plan.md create mode 100644 openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/proposal.md create mode 100644 openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/specs/cli-evaluator-flow/spec.md create mode 100644 openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md create mode 100644 openspec/changes/aworld-evaluator-input-sources-2026-06-10/.openspec.yaml create mode 100644 openspec/changes/aworld-evaluator-input-sources-2026-06-10/design.md create mode 100644 openspec/changes/aworld-evaluator-input-sources-2026-06-10/implementation-plan.md create mode 100644 openspec/changes/aworld-evaluator-input-sources-2026-06-10/proposal.md create mode 100644 openspec/changes/aworld-evaluator-input-sources-2026-06-10/specs/evaluation-substrate/spec.md create mode 100644 openspec/changes/aworld-evaluator-input-sources-2026-06-10/tasks.md create mode 100644 tests/evaluations/test_evaluation_input_sources.py diff --git a/aworld/evaluations/sources.py b/aworld/evaluations/sources.py new file mode 100644 index 000000000..67eaddfa4 --- /dev/null +++ b/aworld/evaluations/sources.py @@ -0,0 +1,293 @@ +# coding: utf-8 +from __future__ import annotations + +import ast +import json +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Iterable, Mapping, Protocol + + +_SCALAR_TYPES = (str, int, float, bool, type(None)) + + +def _is_serializable_value(value: 
Any) -> bool: + if isinstance(value, _SCALAR_TYPES): + return True + if isinstance(value, list): + return all(_is_serializable_value(item) for item in value) + if isinstance(value, tuple): + return all(_is_serializable_value(item) for item in value) + if isinstance(value, Mapping): + return all(isinstance(key, str) and _is_serializable_value(item) for key, item in value.items()) + return False + + +def _serializable_dict(payload: Mapping[str, Any] | None) -> dict[str, Any]: + return { + str(key): value + for key, value in dict(payload or {}).items() + if isinstance(key, str) and _is_serializable_value(value) + } + + +@dataclass(frozen=True) +class EvalSourceRecord: + case_id: str + input: Mapping[str, Any] + expected: Any | None = None + answer: Any | None = None + state: Mapping[str, Any] | None = None + metadata: Mapping[str, Any] = field(default_factory=dict) + raw_payload: Mapping[str, Any] = field(default_factory=dict) + + def to_dict(self, *, include_raw_payload: bool = False) -> dict[str, Any]: + payload = { + "case_id": self.case_id, + "input": _serializable_dict(self.input), + "expected": self.expected, + "answer": self.answer, + "state": _serializable_dict(self.state), + "metadata": _serializable_dict(self.metadata), + } + if include_raw_payload: + payload["raw_payload"] = _serializable_dict(self.raw_payload) + return {key: value for key, value in payload.items() if value not in (None, {}, [])} + + @classmethod + def from_dict(cls, payload: Mapping[str, Any]) -> "EvalSourceRecord": + return cls( + case_id=str(payload["case_id"]), + input=dict(payload.get("input") or {}), + expected=payload.get("expected"), + answer=payload.get("answer"), + state=dict(payload.get("state") or {}) if isinstance(payload.get("state"), Mapping) else None, + metadata=dict(payload.get("metadata") or {}), + raw_payload=dict(payload.get("raw_payload") or {}), + ) + + def to_case(self): + from aworld.evaluations.substrate import EvalCaseDef + + return EvalCaseDef( + 
case_id=self.case_id, + input=dict(self.input), + expected=self.expected, + metadata={ + **dict(self.metadata or {}), + "source_record": self.to_dict(), + }, + ) + + +class EvalSource(Protocol): + def iter_records(self) -> Iterable[EvalSourceRecord]: + ... + + def to_cases(self): + ... + + def default_adapter(self): + ... + + +class _BaseEvalSource: + def to_cases(self): + return tuple(record.to_case() for record in self.iter_records()) + + +@dataclass(frozen=True) +class JsonlTaskAnswerSource(_BaseEvalSource): + path: str | Path + id_field: str = "id" + input_field: str = "input" + answer_field: str = "answer" + expected_field: str | None = None + metadata_field: str | None = None + + def iter_records(self) -> Iterable[EvalSourceRecord]: + path = Path(self.path).expanduser() + with path.open(encoding="utf-8") as handle: + for line_number, line in enumerate(handle, start=1): + stripped = line.strip() + if not stripped: + continue + payload = json.loads(stripped) + if not isinstance(payload, Mapping): + raise ValueError(f"{path}:{line_number} must contain a JSON object") + for field_name in (self.id_field, self.input_field, self.answer_field): + if field_name not in payload: + raise ValueError(f"{path}:{line_number} missing required field: {field_name}") + metadata = {} + if self.metadata_field is not None and isinstance(payload.get(self.metadata_field), Mapping): + metadata.update(dict(payload[self.metadata_field])) + metadata.update({"source_kind": "task-answer", "source_path": str(path), "line_number": line_number}) + expected = payload.get(self.expected_field) if self.expected_field else None + yield EvalSourceRecord( + case_id=str(payload[self.id_field]), + input={"input": payload[self.input_field]}, + expected=expected, + answer=payload[self.answer_field], + metadata=metadata, + raw_payload=dict(payload), + ) + + def default_adapter(self): + from aworld.evaluations.state_adapters import AnswerStateAdapter + + return AnswerStateAdapter() + + +def 
_truthy_string(value: Any) -> bool: + return str(value).strip().lower() in {"true", "1", "yes"} + + +def _tool_calls_from_action(action: Mapping[str, Any]) -> list[dict[str, Any]]: + calls: list[dict[str, Any]] = [] + for tool_call in action.get("tool_calls") or []: + if not isinstance(tool_call, Mapping): + continue + function = tool_call.get("function") or {} + if isinstance(function, Mapping): + calls.append({"name": function.get("name"), "arguments": str(function.get("arguments"))}) + return calls + + +def extract_aworld_trajectory_record(log_path: str | Path, task_id: str) -> dict[str, Any]: + path = Path(log_path).expanduser() + target_line = None + with path.open(encoding="utf-8", errors="replace") as handle: + for line in handle: + if task_id in line: + target_line = line + break + if target_line is None: + raise ValueError(f"task_id {task_id} not found in {path}") + + clean = re.sub(r"\x1b\[[0-9;]*m", "", target_line).strip() + record = ast.literal_eval(clean) + trajectory = json.loads(record["trajectory"]) + if not isinstance(trajectory, list): + raise ValueError(f"task_id {task_id} trajectory must be a list") + + question = None + system_prompt = "" + if trajectory: + first_state = trajectory[0].get("state", {}) if isinstance(trajectory[0], Mapping) else {} + question = (first_state.get("input", {}) or {}).get("content") if isinstance(first_state, Mapping) else None + first_messages = first_state.get("messages", []) if isinstance(first_state, Mapping) else [] + if first_messages and isinstance(first_messages[0], Mapping) and first_messages[0].get("role") == "system": + system_prompt = str(first_messages[0].get("content") or "") + + steps = [] + final_answer = None + for item in trajectory: + if not isinstance(item, Mapping): + continue + meta = item.get("meta", {}) if isinstance(item.get("meta"), Mapping) else {} + action = item.get("action", {}) if isinstance(item.get("action"), Mapping) else {} + finished = 
_truthy_string(action.get("is_agent_finished")) + content = str(action.get("content") or "") + steps.append( + { + "step": meta.get("step"), + "pre_agent": meta.get("pre_agent"), + "agent_id": meta.get("agent_id"), + "tool_calls": _tool_calls_from_action(action), + "assistant_content": content, + "is_agent_finished": finished, + } + ) + if finished and content: + final_answer = content + + final_messages = [] + if trajectory and isinstance(trajectory[-1], Mapping): + final_state = trajectory[-1].get("state", {}) + if isinstance(final_state, Mapping): + final_messages = final_state.get("messages", []) or [] + evidence = [ + {"msg_index": index, "content": str(message.get("content") or "")} + for index, message in enumerate(final_messages) + if isinstance(message, Mapping) and message.get("role") == "tool" + ] + + return { + "task_id": task_id, + "is_sub_task": record.get("is_sub_task"), + "num_steps": len(trajectory), + "question": question, + "system_prompt_excerpt": system_prompt[:8000], + "steps": steps, + "final_answer": final_answer, + "evidence": evidence, + } + + +@dataclass(frozen=True) +class AWorldTrajectoryLogSource(_BaseEvalSource): + path: str | Path + task_ids: Iterable[str] + extraction_dir: str | Path | None = None + + def iter_records(self) -> Iterable[EvalSourceRecord]: + path = Path(self.path).expanduser() + for task_id in self.task_ids: + task_id = str(task_id) + extracted = extract_aworld_trajectory_record(path, task_id) + yield EvalSourceRecord( + case_id=task_id, + input={"task_id": task_id, "trajectory_log": str(path)}, + answer=extracted.get("final_answer"), + metadata={ + "source_kind": "aworld-trajectory-log", + "source_path": str(path), + "extraction_dir": str(Path(self.extraction_dir).expanduser()) if self.extraction_dir else None, + }, + raw_payload=extracted, + ) + + def default_adapter(self): + from aworld.evaluations.state_adapters import TrajectoryLogStateAdapter + + return 
TrajectoryLogStateAdapter(extraction_dir=self.extraction_dir) + + +def create_source_eval_suite( + *, + suite_id: str, + source: EvalSource, + judge_backend, + judge_schema, + gate_policy=None, + state_adapter=None, + outcome_scorers=tuple(), + reward_metrics=tuple(), + standard_metrics=tuple(), + trajectory_scorers=tuple(), + metadata: Mapping[str, Any] | None = None, +): + from aworld.evaluations.state_adapters import ReplayRuntimeHarness + from aworld.evaluations.substrate import EvalSuiteDef + + records = list(source.iter_records()) + adapter = state_adapter + if adapter is None: + adapter = source.default_adapter() + return EvalSuiteDef( + suite_id=suite_id, + cases=[record.to_case() for record in records], + runtime_harness=ReplayRuntimeHarness(adapter=adapter, records=tuple(records)), + judge_backend=judge_backend, + judge_schema=judge_schema, + gate_policy=gate_policy, + outcome_scorers=tuple(outcome_scorers), + reward_metrics=tuple(reward_metrics), + standard_metrics=tuple(standard_metrics), + trajectory_scorers=tuple(trajectory_scorers), + metadata={ + **dict(metadata or {}), + "source_backed": True, + }, + ) diff --git a/aworld/evaluations/state_adapters.py b/aworld/evaluations/state_adapters.py new file mode 100644 index 000000000..5810ecefe --- /dev/null +++ b/aworld/evaluations/state_adapters.py @@ -0,0 +1,116 @@ +# coding: utf-8 +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Mapping, Protocol + +from aworld.evaluations.runtime_composition import RolloutState +from aworld.evaluations.sources import EvalSourceRecord + + +class EvalStateAdapter(Protocol): + def adapt(self, *, record: EvalSourceRecord, case: Any, target: Mapping[str, Any]) -> RolloutState: + ... 
+ + +@dataclass(frozen=True) +class AnswerStateAdapter: + def adapt(self, *, record: EvalSourceRecord, case: Any, target: Mapping[str, Any]) -> RolloutState: + return RolloutState( + case_id=str(getattr(case, "case_id", record.case_id)), + status="success", + answer=record.answer, + outcome={"has_answer": record.answer is not None}, + metadata={ + **dict(record.metadata or {}), + "source_case_id": record.case_id, + }, + ) + + +@dataclass(frozen=True) +class TrajectoryLogStateAdapter: + extraction_dir: str | Path | None = None + + def adapt(self, *, record: EvalSourceRecord, case: Any, target: Mapping[str, Any]) -> RolloutState: + extracted = dict(record.raw_payload or {}) + final_answer = extracted.get("final_answer") or "" + steps = list(extracted.get("steps") or []) + is_finished = any(bool(step.get("is_agent_finished")) for step in steps if isinstance(step, Mapping)) + tool_calls = [ + dict(tool_call) + for step in steps + if isinstance(step, Mapping) + for tool_call in step.get("tool_calls", []) + if isinstance(tool_call, Mapping) + ] + usage = {"total_tokens": 0} + timing = {"duration_ms": 0} + standard_metrics = { + "n_turns": len(steps), + "n_tool_calls": len(tool_calls), + "n_tokens": usage["total_tokens"], + "duration_ms": timing["duration_ms"], + } + extracted_path = self._write_extracted(record, extracted) + metadata = { + **dict(record.metadata or {}), + "source_case_id": record.case_id, + } + if extracted_path is not None: + metadata["extracted_path"] = str(extracted_path) + return RolloutState( + case_id=str(getattr(case, "case_id", record.case_id)), + status="success" if is_finished and final_answer else "failed", + answer=final_answer, + trajectory=steps, + tool_calls=tool_calls, + usage=usage, + timing=timing, + standard_metrics=standard_metrics, + outcome={ + "task_id": record.case_id, + "question": extracted.get("question"), + "evidence_blocks": len(extracted.get("evidence") or []), + "num_steps": extracted.get("num_steps", len(steps)), + 
"is_finished": is_finished, + "final_answer_len": len(final_answer), + **({"extracted_path": str(extracted_path)} if extracted_path is not None else {}), + }, + metadata=metadata, + ) + + def _write_extracted(self, record: EvalSourceRecord, extracted: Mapping[str, Any]) -> Path | None: + extraction_dir = self.extraction_dir or record.metadata.get("extraction_dir") + if not extraction_dir: + return None + out_dir = Path(str(extraction_dir)).expanduser() + out_dir.mkdir(parents=True, exist_ok=True) + path = out_dir / f"extracted_{record.case_id}.json" + path.write_text(json.dumps(dict(extracted), ensure_ascii=False, indent=2), encoding="utf-8") + return path + + +@dataclass(frozen=True) +class ReplayRuntimeHarness: + adapter: EvalStateAdapter + records: tuple[EvalSourceRecord, ...] = tuple() + + async def run_rollout(self, *, case: Any, target: Mapping[str, Any]) -> RolloutState: + metadata = getattr(case, "metadata", {}) or {} + record_payload = metadata.get("source_record") + if not isinstance(record_payload, Mapping): + record_payload = (getattr(case, "input", {}) or {}).get("_source_record") + if not isinstance(record_payload, Mapping): + raise ValueError("replay source case is missing source_record metadata") + record = self._resolve_record(record_payload) + return self.adapter.adapt(record=record, case=case, target=target) + + def _resolve_record(self, record_payload: Mapping[str, Any]) -> EvalSourceRecord: + case_id = str(record_payload.get("case_id")) + for record in self.records: + if record.case_id == case_id: + return record + return EvalSourceRecord.from_dict(record_payload) diff --git a/aworld/evaluations/substrate.py b/aworld/evaluations/substrate.py index 827afdeff..a873e6b4c 100644 --- a/aworld/evaluations/substrate.py +++ b/aworld/evaluations/substrate.py @@ -8,6 +8,7 @@ import inspect import os import re +import tempfile import uuid from dataclasses import dataclass, field, replace from datetime import datetime, timezone @@ -113,11 +114,17 @@ def 
to_dict(self) -> dict[str, Any]: class JudgeSchemaDef: required_fields: tuple[str, ...] = tuple() output_model: type[BaseModel] | None = None + normalizer: Callable[[Mapping[str, Any]], Mapping[str, Any]] | None = None def validate(self, payload: Mapping[str, Any]) -> None: self.validate_payload(payload) def validate_payload(self, payload: Mapping[str, Any]) -> dict[str, Any]: + if self.normalizer is not None: + payload = self.normalizer(dict(payload)) + if not isinstance(payload, Mapping): + raise ValueError("judge schema normalizer must return a mapping") + if self.output_model is not None: try: model = self.output_model.model_validate(dict(payload)) @@ -318,6 +325,35 @@ class AgentJudgeBackend: prompt_builder: Callable[[dict[str, Any], dict[str, Any], "EvalSuiteDef"], JudgePrompt] | None = None timeout_seconds: float | None = None + @classmethod + def from_agent_markdown( + cls, + path: str | Path, + *, + backend_id: str | None = None, + prompt_builder: Callable[[dict[str, Any], dict[str, Any], "EvalSuiteDef"], JudgePrompt] | None = None, + timeout_seconds: float | None = None, + ) -> "AgentJudgeBackend": + agent_markdown_path = Path(path).expanduser() + resolved_backend_id = backend_id or agent_markdown_path.stem + + async def _executor(prompt: JudgePrompt, system_prompt: str) -> str: + if isinstance(prompt, tuple): + raise ValueError("agent markdown judge backend only supports text prompts") + from aworld.runner import Runners + + agent = await load_agent_markdown(agent_markdown_path, agent_id=resolved_backend_id) + response = await Runners.run(input=str(prompt), agent=agent) + return str(getattr(response, "answer", response)) + + return cls( + backend_id=resolved_backend_id, + system_prompt=f"Agent loaded from {agent_markdown_path}", + executor=_executor, + prompt_builder=prompt_builder, + timeout_seconds=timeout_seconds, + ) + def is_available(self) -> bool: if self.executor is not None: return True @@ -358,6 +394,92 @@ async def judge(self, case_input: 
dict[str, Any], target: dict[str, Any], suite: return execution.payload +def _safe_agent_markdown_name(value: str) -> str: + return re.sub(r"[^A-Za-z0-9_.-]+", "-", value).strip("-._") or "markdown-agent" + + +def _frontmatter_scalar(value: Any, default: str) -> str: + text = str(value if value not in (None, "") else default) + return " ".join(text.splitlines()).strip() + + +def _normalize_markdown_tool_list(value: Any) -> dict[str, Any]: + if isinstance(value, Mapping): + return dict(value) + if isinstance(value, str) and value.strip(): + try: + parsed = json.loads(value) + except json.JSONDecodeError: + return {} + if isinstance(parsed, Mapping): + return dict(parsed) + return {} + + +def _materialize_agent_markdown_as_skill( + agent_markdown_path: Path, + *, + skills_root: Path, + skill_name: str, +) -> Path: + from aworld.utils.skill_loader import extract_front_matter + + lines = agent_markdown_path.read_text(encoding="utf-8").splitlines() + frontmatter, body_start = extract_front_matter(lines) + body = "\n".join(lines[body_start:]).strip() + description = _frontmatter_scalar( + frontmatter.get("description", frontmatter.get("desc")), + f"Agent loaded from {agent_markdown_path}", + ) + tool_list = _normalize_markdown_tool_list(frontmatter.get("tool_list", {})) + + skill_dir = skills_root / skill_name + skill_dir.mkdir(parents=True, exist_ok=True) + skill_path = skill_dir / "SKILL.md" + skill_path.write_text( + "---\n" + f"name: {_frontmatter_scalar(frontmatter.get('name'), skill_name)}\n" + f"description: {description}\n" + "type: agent\n" + f"tool_list: {json.dumps(tool_list, ensure_ascii=False)}\n" + "---\n\n" + f"{body}\n", + encoding="utf-8", + ) + return skill_path + + +async def load_agent_markdown(path: str | Path, *, agent_id: str): + from aworld.config.task_loader import _load_skill_agent + + agent_markdown_path = Path(path).expanduser() + skill_name = _safe_agent_markdown_name(agent_id) + api_key = os.getenv("LLM_API_KEY") or 
os.getenv("OPENAI_API_KEY") + with tempfile.TemporaryDirectory(prefix="aworld-agent-md-") as tmp_dir: + skills_root = Path(tmp_dir) / "skills" + _materialize_agent_markdown_as_skill( + agent_markdown_path, + skills_root=skills_root, + skill_name=skill_name, + ) + return await _load_skill_agent( + agent_id=agent_id, + agent_def={ + "skill_name": skill_name, + "config": { + "llm_config": { + "llm_model_name": os.getenv("LLM_MODEL_NAME"), + "llm_provider": os.getenv("LLM_PROVIDER"), + "llm_api_key": api_key, + "llm_base_url": os.getenv("LLM_BASE_URL"), + } + }, + }, + skills_path=skills_root, + global_mcp_config=None, + ) + + @dataclass(frozen=True) class FallbackJudgeBackend: backend_id: str diff --git a/aworld/evaluations/trajectory_judge.py b/aworld/evaluations/trajectory_judge.py new file mode 100644 index 000000000..e4f96fae1 --- /dev/null +++ b/aworld/evaluations/trajectory_judge.py @@ -0,0 +1,54 @@ +# coding: utf-8 +from __future__ import annotations + +from typing import Any, Literal, Mapping + +from pydantic import BaseModel + +from aworld.evaluations.substrate import JudgeSchemaDef + + +class TrajectoryEvalJudgeOutput(BaseModel): + score: float + verdict: Literal["Excellent", "Pass", "Marginal", "Fail"] + A1_groundedness: int + A2_completeness: int + A3_relevance: int + A4_readability: int + B1_tool_use: int + B2_efficiency: int + B3_compliance: int + B4_robustness: int + veto_triggered: bool = False + + +def normalize_trajectory_judge_payload(payload: Mapping[str, Any]) -> dict[str, Any]: + if "dimensions" not in payload: + return dict(payload) + flattened = dict(payload) + if "score" not in flattened and "weighted_score" in flattened: + flattened["score"] = flattened["weighted_score"] + dimensions = payload.get("dimensions") or {} + for metric_name in ( + "A1_groundedness", + "A2_completeness", + "A3_relevance", + "A4_readability", + "B1_tool_use", + "B2_efficiency", + "B3_compliance", + "B4_robustness", + ): + metric_payload = dimensions.get(metric_name) 
if isinstance(dimensions, Mapping) else None + if isinstance(metric_payload, Mapping) and "score" in metric_payload: + flattened[metric_name] = metric_payload["score"] + return flattened + + +class TrajectoryJudgeSchema: + @staticmethod + def default() -> JudgeSchemaDef: + return JudgeSchemaDef( + output_model=TrajectoryEvalJudgeOutput, + normalizer=normalize_trajectory_judge_payload, + ) diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/.openspec.yaml b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/.openspec.yaml new file mode 100644 index 000000000..2cb80411e --- /dev/null +++ b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-06-10 diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md new file mode 100644 index 000000000..b12fac5fe --- /dev/null +++ b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md @@ -0,0 +1,167 @@ +# AWorld CLI Evaluator Source Run + +## Context + +`aworld-cli evaluator` currently runs suite-backed evaluations for local targets. It is already exposed through the builtin plugin command model and uses evaluator hooks around discovery, pre-run, post-run, and summary rendering. That is the right extension surface for CLI-level concerns. + +The new framework input-source layer will make evaluation inputs first-class: task-only files, task+answer files, serialized states, and AWorld trajectory logs all normalize into source records and framework state adapters. The CLI should not duplicate parsing or replay logic. Its job is to assemble a source-backed flow from user-facing arguments and then call the same `run_evaluation_flow` substrate used by code callers. + +## Goals / Non-Goals + +**Goals:** + +- Provide a simple CLI path for evaluating files/logs without writing a Python test harness. 
+- Keep the evaluator command plugin-backed and compatible with existing top-level command registration. +- Reuse existing evaluator hooks and extend their payloads for source-backed runs. +- Make common source kinds usable with a small argument set. +- Keep framework evaluation semantics in `aworld.evaluations`, not in CLI handlers. +- Preserve existing `--target` / `--suite` evaluator behavior. + +**Non-Goals:** + +- Adding a separate `aworld-cli trajectory-eval` command. +- Making trajectory logs a special CLI-only feature. +- Implementing source parsing, state replay, judge normalization, scoring, or gate logic in `aworld-cli`. +- Replacing the plugin command system or hook infrastructure. +- Adding remote storage connectors, sandbox lifecycle management, or training/optimizer flows. + +## Command Shape + +The canonical source-backed path should be: + +```bash +aworld-cli evaluator run \ + --input ~/Documents/logs/trajectory.log \ + --kind aworld-trajectory-log \ + --task-id task_20260609193335 \ + --judge-agent eval/trajectory_evaluator/agent.md \ + --out-dir eval/trajectory_evaluator/reports +``` + +Task+answer files: + +```bash +aworld-cli evaluator run \ + --input task_answers.jsonl \ + --kind task-answer \ + --judge-agent eval/answer_judge/agent.md \ + --out-dir reports +``` + +The default JSONL fields are `id`, `input`, and `answer`. `--id-field`, `--task-field`, and `--answer-field` are override flags for files that do not follow that convention. + +Task-only files are a follow-on source kind once the framework input-source layer adds task-only source support: + +```bash +aworld-cli evaluator run \ + --input tasks.jsonl \ + --kind task \ + --id-field task_id \ + --task-field task \ + --agent ./agent.md \ + --judge-agent eval/answer_judge/agent.md \ + --out-dir reports +``` + +`--kind auto` can be added once detection is reliable, but the first version should require explicit `--kind` to keep failures predictable. 
+ +## CLI Boundary + +The evaluator command owns: + +- argument parsing and validation +- path normalization +- selecting a framework source class by `--kind` +- passing field mappings and task filters to the source +- selecting a framework state adapter or execution spec +- loading a judge agent through framework helpers +- invoking framework flow execution +- writing the report and rendering a summary +- invoking evaluator hooks with source-aware payloads + +The evaluator command does not own: + +- parsing trajectory internals +- converting source records into `EvalState` or `RolloutState` +- judge payload normalization +- scorer implementation +- gate implementation +- report schema semantics +- trial, sandbox, or simulator semantics + +## Plugin And Hook Integration + +The implementation should follow existing CLI conventions: + +- keep `EvaluatorTopLevelCommand` as the command object exposed through the builtin evaluator plugin entrypoint +- add `run` as a subparser under `evaluator`, or otherwise route source-backed arguments through the same command object without creating a new top-level command +- keep source-backed flow assembly in `aworld_cli.evaluator_runtime` +- use `PluginManager`, `get_builtin_plugin_roots`, `load_plugin_hooks`, and `_run_evaluator_hooks` as the hook path + +Hook payloads should gain source-aware fields while preserving existing keys: + +- `mode`: `target` or `source` +- `input`: resolved input path for source mode +- `kind`: source kind for source mode +- `task_id` or `task_ids` when provided +- `judge_agent`: resolved judge-agent path when provided +- `agent`: resolved execution agent path/name when provided +- `workspace_path` +- `output_path` or report path after resolution + +Allowed hook behavior remains CLI-scoped: + +- pre-discover/pre-run hooks may add metadata or override summary fields +- post-run hooks may upload, notify, or record report metadata +- render hooks may append summary text +- hooks must not replace framework 
execution, scoring, gate decisions, or report contracts + +## Data Flow + +```text +CLI args + -> EvaluatorTopLevelCommand parser + -> aworld_cli.evaluator_runtime source runner + -> framework EvalSource + EvalStateAdapter / execution spec + -> create source-backed EvalSuiteDef / EvaluationFlowDef + -> run_evaluation_flow + -> report write + render summary + hooks +``` + +For the trajectory-log manual case, the CLI path should be equivalent to the current pytest invocation but without test-local glue: + +```text +--input trajectory.log + -> AWorldTrajectoryLogSource + -> TrajectoryLogStateAdapter + -> ReplayRuntimeHarness + -> AgentJudgeBackend.from_agent_markdown(agent.md) + -> typed schema + gate + report +``` + +## Compatibility + +Existing usage remains valid: + +```bash +aworld-cli evaluator --target ./some-target --suite app-evaluator +``` + +The new `evaluator run` source path should not break `--list-suites`, `--print-report-schema`, `--validate-report`, or interactive approval behavior. + +## Risks / Trade-offs + +- [Command ambiguity] `evaluator --target` and `evaluator run --input` can coexist, but parser errors must clearly explain which mode is active. +- [Too many flags] Field mappings are necessary for generic JSONL. Presets can reduce repeated arguments later. +- [Case-specific drift] Avoid canonical `evaluator trajectory-log`; if aliases are added later, they should delegate to `evaluator run --kind aworld-trajectory-log`. +- [Plugin overreach] Hook contracts must state that plugins customize CLI assembly and side effects only. + +## Migration Plan + +1. Land framework input sources first, including the manual test refactor. +2. Add source-run parser mode to the existing evaluator command. +3. Add source-run runtime helper that calls framework APIs. +4. Extend evaluator hook event payloads with source mode fields. +5. Add CLI tests for argument validation and runtime delegation. +6. 
Add one opt-in manual command example for the trajectory evaluator case. +7. Keep old target/suite command behavior unchanged. diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/implementation-plan.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/implementation-plan.md new file mode 100644 index 000000000..15b5feae5 --- /dev/null +++ b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/implementation-plan.md @@ -0,0 +1,37 @@ +# Implementation Plan + +## Commit 1: Parser Shape + +- Confirm the framework input-source change has landed. +- Add source-backed `run` parsing to `EvaluatorTopLevelCommand`. +- Add JSONL field defaults for task+answer sources. +- Keep the builtin evaluator plugin command as the registration path. +- Add tests for command parsing and incompatible argument combinations. + +## Commit 2: Runtime Delegation + +- Add a `run_evaluator_source_cli(...)` helper in `aworld_cli.evaluator_runtime`. +- Map initially supported `--kind` values to framework input-source APIs. +- Return clear unsupported-kind errors for source kinds not yet implemented by the framework layer. +- Build source-backed flows through framework helpers only. +- Add runtime delegation tests with monkeypatched framework helpers. + +## Commit 3: Hooks And Reporting + +- Extend evaluator hook payloads for source-backed mode. +- Preserve existing target-mode hook payloads. +- Add automation/report metadata for source input, kind, task ids, and output path. +- Add hook payload and summary tests. + +## Commit 4: Examples And Manual Regression + +- Document the trajectory-log command that replaces the pytest-specific manual invocation. +- Add task+answer examples. +- Mention task-only examples only after the framework source layer supports task-only sources. +- Keep the existing pytest manual regression as a lower-level framework e2e until the source API is fully adopted. + +## Verification + +- `pytest` for evaluator CLI tests. 
+- Evaluator framework tests from the input-source change. +- `openspec validate aworld-cli-evaluator-source-run-2026-06-10 --strict`. diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/proposal.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/proposal.md new file mode 100644 index 000000000..331b8ecf8 --- /dev/null +++ b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/proposal.md @@ -0,0 +1,32 @@ +# AWorld CLI Evaluator Source Run + +## Why + +The manual trajectory evaluator regression proved that AWorld can evaluate real task outputs, trajectories, outcome checks, typed judge schemas, and composite gates. It also showed that a user-facing CLI must not expose the full substrate assembly surface just to run a simple evaluation. + +The framework input-source change is responsible for normalizing task files, task+answer files, serialized states, and AWorld trajectory logs into framework-owned evaluation records and replay state. The CLI should be a thin consumer of that layer: parse user intent, select a source adapter and judge agent, run the suite-backed flow, and write a report. + +The existing CLI already has an official evaluator command implemented through the builtin plugin command path, with evaluator lifecycle hooks for discovery, pre-run, post-run, and rendering. This change extends that command shape instead of adding an ad hoc script or a separate evaluator CLI. + +## What Changes + +- Add a source-backed `aworld-cli evaluator run` mode to the existing evaluator command. +- Support source-oriented arguments: `--input`, `--kind`, optional field mappings, optional `--task-id`, `--agent`, `--judge-agent`, and output options. +- Use conventional JSONL field defaults (`id`, `input`, `answer`) so simple task+answer files do not require field-mapping flags. 
+- Keep the canonical command source-oriented rather than case-specific; trajectory-log, task-only, and task+answer are input kinds, not separate evaluator stacks. +- Build source-backed evaluation flows by calling the framework input-source APIs from `aworld.evaluations`. +- Preserve the existing target/suite evaluator path for current users. +- Integrate through the existing builtin plugin command and evaluator hook model; plugins may observe or customize CLI assembly metadata, but they may not redefine framework execution, scoring, gate, or report semantics. + +## Capabilities + +### Modified Capabilities + +- `cli-evaluator-flow`: add a source-backed run path for simple file/log based evaluation while preserving plugin-backed command registration and hook extensibility. + +## Impact + +- Affected code: `aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py`, `aworld-cli/src/aworld_cli/evaluator_runtime.py`, builtin evaluator plugin command wiring, CLI rendering/tests. +- Affected APIs: additive CLI flags and runtime helpers; existing `aworld-cli evaluator --target ...` behavior remains compatible. +- Dependencies: this change depends on the framework input-source layer from `aworld-evaluator-input-sources-2026-06-10` and should land after that change. +- Non-goals: no new framework source semantics, no case-specific `trajectory-log` command as the canonical API, no new plugin system, no CLI-owned scoring or gate implementation, no training/optimizer integration. 
diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/specs/cli-evaluator-flow/spec.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/specs/cli-evaluator-flow/spec.md new file mode 100644 index 000000000..d50dacc06 --- /dev/null +++ b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/specs/cli-evaluator-flow/spec.md @@ -0,0 +1,91 @@ +## MODIFIED Requirements + +### Requirement: CLI evaluator command runs a complete evaluation flow + +The CLI SHALL provide an evaluator command that can run a complete evaluation flow against a supported local target or a supported evaluation input source such as a task file, task+answer file, serialized evaluation state, or AWorld trajectory log. + +#### Scenario: User evaluates a local target +- **WHEN** a user invokes the evaluator command with a supported local evaluation target +- **THEN** the CLI SHALL resolve the target, build an evaluation flow, execute the selected suite, and return a completed evaluation result + +#### Scenario: User evaluates a source input +- **WHEN** a user invokes the evaluator command with a supported source input and source kind +- **THEN** the CLI SHALL resolve the input, select the matching framework source adapter, build a source-backed evaluation flow, and return a completed evaluation result + +### Requirement: CLI evaluator is an official plugin-backed command + +The evaluator command SHALL integrate with the CLI through the same builtin plugin command model used by other official top-level commands. 
+ +#### Scenario: CLI loads official evaluator command +- **WHEN** the CLI initializes builtin top-level command providers +- **THEN** the evaluator command SHALL be exposed through a builtin plugin-backed command entry rather than only through an ad hoc direct registration path + +#### Scenario: Source-backed evaluator mode uses existing command registration +- **WHEN** the CLI exposes source-backed evaluator usage +- **THEN** it SHALL do so through the existing evaluator command object and builtin evaluator plugin registration rather than a separate top-level command or standalone script + +### Requirement: CLI evaluator extensibility uses hooks for peripheral customization + +The evaluator command SHALL support plugin and hook-based extensibility for CLI-specific discovery, assembly, and output concerns without moving framework evaluation semantics into CLI handlers. + +#### Scenario: Plugin customizes evaluator discovery or assembly +- **WHEN** an installed or builtin CLI plugin participates in evaluator discovery or pre-run assembly +- **THEN** the CLI SHALL provide hook points for those lifecycle stages without requiring the plugin to redefine framework execution, scoring, or gate logic + +#### Scenario: Plugin extends evaluator rendering or post-run handling +- **WHEN** an installed or builtin CLI plugin needs to append summary output, upload reports, or trigger notifications after evaluation +- **THEN** the CLI SHALL provide hook points for rendering and post-run handling while preserving the framework-owned evaluation result and report contract + +#### Scenario: Source-backed evaluator flow invokes evaluator hooks +- **WHEN** a source-backed evaluator run is assembled or completed +- **THEN** the CLI SHALL invoke the same evaluator hook infrastructure used by target-backed runs, with source-aware event fields that identify mode, input path, source kind, task filters, judge agent, execution agent, workspace path, and output path when available + +### 
Requirement: CLI evaluator hook contracts are explicit + +The evaluator command SHALL document the event payloads, mutable state surface, and allowed side effects for evaluator-specific CLI hooks. + +#### Scenario: Plugin author implements an evaluator lifecycle hook +- **WHEN** a plugin author uses an evaluator-specific hook such as pre-run, post-run, or summary rendering +- **THEN** the CLI SHALL provide a documented hook contract describing which fields are guaranteed and what a hook may modify + +#### Scenario: Source-backed hook remains CLI-scoped +- **WHEN** a plugin hook observes or customizes a source-backed evaluator run +- **THEN** the hook SHALL be limited to CLI assembly metadata, side effects, and rendering, and SHALL NOT replace framework source parsing, state adaptation, execution, scoring, gate decisions, or report schema semantics + +## ADDED Requirements + +### Requirement: CLI evaluator supports source-backed run mode + +The evaluator command SHALL provide a source-backed run mode that accepts an input path, source kind, optional field mappings, optional task filters, optional execution agent, and judge agent configuration. 
+ +#### Scenario: User evaluates an AWorld trajectory log +- **WHEN** a user runs the evaluator with `--input`, `--kind aworld-trajectory-log`, `--task-id`, and `--judge-agent` +- **THEN** the CLI SHALL use framework trajectory-log source and replay adapters to evaluate the selected task without implementing trajectory parsing in CLI code + +#### Scenario: User evaluates task and answer records +- **WHEN** a user runs the evaluator with `--input`, `--kind task-answer`, and `--judge-agent` +- **THEN** the CLI SHALL use framework task+answer source and answer-state adapters to evaluate existing answers without re-executing the target + +#### Scenario: User overrides task and answer field names +- **WHEN** a user runs the evaluator with `--kind task-answer` and custom field mapping flags +- **THEN** the CLI SHALL pass those mappings to the framework source while defaulting omitted mappings to `id`, `input`, and `answer` + +#### Scenario: User requests a deferred source kind +- **WHEN** a user runs the evaluator with a source kind that is defined as a future framework source but not implemented yet +- **THEN** the CLI SHALL fail with a clear unsupported-kind error rather than implementing that source kind in CLI code + +### Requirement: CLI evaluator preserves source-oriented canonical commands + +The evaluator CLI SHALL treat source kinds as input adapters under a single canonical source-backed command path rather than creating independent evaluator stacks for each source format. 
+ +#### Scenario: Source kind selects adapter +- **WHEN** a user specifies a supported source kind such as `task-answer` or `aworld-trajectory-log` +- **THEN** the CLI SHALL select the matching framework source adapter while preserving the same evaluation flow and report semantics + +#### Scenario: Source kind is not yet supported by framework +- **WHEN** a user specifies a source kind that the framework source layer has not implemented yet +- **THEN** the CLI SHALL fail with a clear unsupported-kind error instead of implementing source parsing in CLI code + +#### Scenario: Case-specific alias delegates to canonical flow +- **WHEN** a future CLI alias is added for a common source kind +- **THEN** that alias SHALL delegate to the canonical source-backed evaluator flow rather than implementing separate parsing, judging, scoring, or gating behavior diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md new file mode 100644 index 000000000..a7c574b03 --- /dev/null +++ b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md @@ -0,0 +1,40 @@ +## 1. Parser And Command Shape + +- [ ] 1.1 Confirm `aworld-evaluator-input-sources-2026-06-10` has landed before implementing source-backed CLI behavior. +- [ ] 1.2 Extend the existing `EvaluatorTopLevelCommand` parser with a source-backed `run` mode. +- [ ] 1.3 Add `--input`, `--kind`, `--judge-agent`, `--out-dir`, `--output`, `--task-id`, `--agent`, and optional JSONL field mapping arguments for source mode. +- [ ] 1.4 Default task+answer JSONL field mappings to `id`, `input`, and `answer`. +- [ ] 1.5 Preserve existing `--target`, `--suite`, `--list-suites`, `--print-report-schema`, `--validate-report`, and `--interactive-approval` behavior. +- [ ] 1.6 Add clear validation errors for mixing incompatible target-mode and source-mode arguments. + +## 2. 
Runtime Assembly + +- [ ] 2.1 Add a source-backed runtime helper in `aworld_cli.evaluator_runtime`. +- [ ] 2.2 Resolve source kind to framework source/adapters from `aworld.evaluations`. +- [ ] 2.3 Resolve `agent.md` judge path through framework `AgentJudgeBackend.from_agent_markdown`. +- [ ] 2.4 For task+answer and trajectory-log sources, use framework replay/state adapters without re-execution. +- [ ] 2.5 Treat task-only and serialized-state source kinds as unsupported until the framework source layer provides those built-ins. +- [ ] 2.6 Persist reports with deterministic default names under the requested output directory. + +## 3. Plugin And Hook Integration + +- [ ] 3.1 Keep evaluator command exposure through the existing builtin plugin command entrypoint. +- [ ] 3.2 Reuse `_load_evaluator_hooks` and `_run_evaluator_hooks` for source-backed runs. +- [ ] 3.3 Extend evaluator hook event payloads with `mode`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, and output path fields. +- [ ] 3.4 Document that hooks may customize CLI metadata, side effects, and rendering but must not redefine framework execution, scoring, gate, or report semantics. + +## 4. UX And Reporting + +- [ ] 4.1 Render the same evaluator summary shape for source-backed reports. +- [ ] 4.2 Include resolved source mode, input path, kind, selected task ids, and report path in summary or automation metadata. +- [ ] 4.3 Keep exit codes based on gate status and approval state. +- [ ] 4.4 Add examples for trajectory-log and task+answer evaluation; add task-only examples only after the framework source layer supports task-only sources. + +## 5. Tests + +- [ ] 5.1 Add parser tests for source-backed `evaluator run` arguments. +- [ ] 5.2 Add validation tests for required source-mode arguments and incompatible argument combinations. +- [ ] 5.3 Add runtime delegation tests using fake framework source helpers. +- [ ] 5.4 Add hook payload tests for source-backed pre-run/post-run/render events. +- [ ] 5.5 Add compatibility tests for the existing target/suite evaluator path.
+- [ ] 5.6 Validate this OpenSpec change with `openspec validate aworld-cli-evaluator-source-run-2026-06-10 --strict`. diff --git a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/.openspec.yaml b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/.openspec.yaml new file mode 100644 index 000000000..2cb80411e --- /dev/null +++ b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-06-10 diff --git a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/design.md b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/design.md new file mode 100644 index 000000000..2bb1c3dfa --- /dev/null +++ b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/design.md @@ -0,0 +1,238 @@ +# AWorld Evaluator Input Sources + +## Context + +AWorld's evaluator stack now has the core pieces for serious agent evaluation: + +- suite/case/judge/gate/report substrate +- execution adapters for static, agent, task, and program modes +- runtime-composed rollout harnesses and serializable `RolloutState` +- outcome/state checks, trajectory scorers, standard metrics, trials, environment isolation hooks, and LLM user simulators + +The missing layer is input normalization. The framework can evaluate well once a caller has produced `EvalCaseDef` plus `EvalState` or `RolloutState`, but external evaluation data usually arrives as files or logs. The current manual trajectory-log test manually implements parsing, replay, markdown-agent loading, schema flattening, and suite wiring. That is useful as a spike, but it is not the framework-level integration experience AWorld should expose. + +This change introduces a framework-owned input source layer. It should not create a separate evaluator stack. It should feed existing `EvalSuiteDef`, `EvalCaseDef`, `EvalExecutionSpec`, `RuntimeHarness`, `JudgeBackend`, and report assembly paths. 
+ +## Goals / Non-Goals + +**Goals:** + +- Provide reusable source primitives for external evaluation input records. +- Support task+answer inputs that should be judged without runtime execution. +- Support AWorld trajectory-log inputs by parsing them once in framework code and replaying them into `RolloutState`. +- Keep task-only and serialized-state sources as follow-on implementations of the same protocol rather than first-version built-ins. +- Keep source parsing, state adaptation, judge backend wiring, and suite creation discoverable from `aworld.evaluations`. +- Make the manual trajectory-log regression a small consumer of framework APIs rather than a copy of framework internals. + +**Non-Goals:** + +- Adding CLI commands or argument parsing in this change. +- Adding first-version built-ins for task-only execution sources or generic serialized-state files. +- Adding database, object-store, or remote log connectors. +- Executing untrusted code from input files. +- Running shell commands or external environment checks from source adapters. +- Replacing `EvaluationConfig`, `EvaluateRunner`, `EvalSuiteDef`, or runtime-composition harnesses. +- Adding production sandbox or clean-environment reset implementations. + +## Proposed Abstractions + +### 1. `EvalSource` + +`EvalSource` is a trusted framework object that enumerates evaluation records and converts them into cases. + +Conceptually: + +```python +class EvalSource(Protocol): + def iter_records(self) -> Iterable[EvalSourceRecord]: ... + def to_cases(self) -> tuple[EvalCaseDef, ...]: ... +``` + +`EvalSourceRecord` should contain: + +- `case_id` +- `input` +- optional `expected` +- optional existing `answer` +- optional existing `state` +- optional source metadata +- optional raw source payload for trusted adapters + +Source records must be serializable or sanitize non-serializable values before report state. 
+ +If a source kind uniquely determines its replay adapter, the source should expose `default_adapter()` or equivalent metadata. Callers may override the adapter for advanced cases, but the happy path should not require both `source=AWorldTrajectoryLogSource(...)` and `state_adapter=TrajectoryLogStateAdapter()`. + +### 2. `EvalStateAdapter` + +`EvalStateAdapter` converts source records that already contain outputs into normalized state. + +First-version examples: + +- `AnswerStateAdapter`: turns a task+answer record into `EvalState(answer=answer, completion=[answer])` +- `TrajectoryLogStateAdapter`: turns one AWorld trajectory-log record into `RolloutState` + +Task-only records do not use a replay adapter; they flow through existing execution modes (`AGENT`, `TASK`, `PROGRAM`, or `STATIC` when judge-only). That path is intentionally deferred from this first version because the current simplification target is existing-output replay. + +### 3. `ReplayRuntimeHarness` + +`ReplayRuntimeHarness` is a runtime harness that receives source records and state adapters, then returns `RolloutState` or bridgeable state without re-executing the target. + +The harness owns: + +- selecting the source record for the case +- applying the adapter +- preserving source metadata +- deriving tool calls, usage, timing, and standard metrics where available + +It does not own judging, scoring, gate decisions, trial expansion, or environment reset. + +### 4. Built-in Sources + +The first implementation should include the file-backed sources that have immediate consumers: + +- `JsonlTaskAnswerSource` + - default fields: `id`, `input`, `answer`; optional `expected`, optional metadata + - field names may be overridden by constructor options + - used with `AnswerStateAdapter` +- `AWorldTrajectoryLogSource` + - reads AWorld line-oriented trajectory logs + - extracts records by task id + - used with `TrajectoryLogStateAdapter` + +The API should not hardcode these as evaluator types. 
They are source/adapters that feed the same suite-backed evaluator. + +Deferred source implementations: + +- `JsonlTaskSource` for task-only records that require runtime execution. +- `RolloutStateFileSource` for generic serialized `EvalState` or `RolloutState` records. + +### 5. Markdown Agent Loading + +The manual regression showed a separate but related gap: a judge agent may be provided as `agent.md`, while framework loading currently favors `SKILL.md`. + +This change should add a framework helper, not a test-local workaround: + +```python +load_agent_markdown(path) -> Agent +AgentJudgeBackend.from_agent_markdown(path, prompt_builder=..., timeout_seconds=...) +``` + +The helper can internally reuse skill loading or instantiate an AWorld agent directly, but callers should not materialize temporary `SKILL.md` files. + +### 6. Judge Payload Normalization + +The trajectory evaluator agent currently returns: + +```json +{ + "weighted_score": 78, + "dimensions": { + "A1_groundedness": {"score": 4} + } +} +``` + +The evaluator substrate prefers flat judge payload fields: + +```json +{"score": 78, "A1_groundedness": 4} +``` + +This change should avoid hidden global flattening. Instead, add explicit normalization support: + +- `JudgeSchemaDef(normalizer=callable)` or equivalent +- a built-in trajectory judge output model/normalizer for dimensions-style reports + +Suite authors should opt into a normalizer so report contracts remain explicit. 
+ +## Data Flow + +### Deferred: Task-only file + +```text +JsonlTaskSource -> EvalCaseDef -> existing EvalExecutionSpec -> EvalState -> judge/scorers/gate/report +``` + +### Task + answer file + +```text +JsonlTaskAnswerSource -> EvalCaseDef + answer record -> AnswerStateAdapter -> EvalState -> judge/scorers/gate/report +``` + +### Deferred: Serialized rollout state file + +```text +RolloutStateFileSource -> RolloutStateAdapter -> RolloutState/EvalState -> judge/scorers/gate/report +``` + +### AWorld trajectory log + +```text +AWorldTrajectoryLogSource -> TrajectoryLogStateAdapter -> RolloutState -> judge/scorers/gate/report +``` + +## API Shape + +Expected high-level usage: + +```python +source = JsonlTaskAnswerSource( + path="task_answers.jsonl", +) + +suite = create_source_eval_suite( + source=source, + judge_backend=AgentJudgeBackend.from_agent_markdown("eval/judge/agent.md"), + judge_schema=JudgeSchemaDef(output_model=AnswerJudgeOutput), + gate_policy=GatePolicyDef(metric_name="score", pass_threshold=70), +) + +report = await run_evaluation_flow(EvaluationFlowDef(target={"kind": "source"}, suite=suite)) +``` + +`create_source_eval_suite(...)` must return a normal `EvalSuiteDef`. It is syntax sugar over the existing suite substrate, not a second suite type or execution stack. + +Expected trajectory-log usage: + +```python +source = AWorldTrajectoryLogSource( + path="~/Documents/logs/trajectory.log", + task_ids=["task_20260609193335"], +) + +suite = create_source_eval_suite( + source=source, + judge_backend=AgentJudgeBackend.from_agent_markdown("eval/trajectory_evaluator/agent.md"), + judge_schema=TrajectoryJudgeSchema.default(), + gate_policy=TrajectoryJudgeGate.default(), +) +``` + +## Risks / Trade-offs + +- [Too much abstraction] -> Mitigation: keep first version limited to task+answer and trajectory-log file-backed sources; allow explicit adapter overrides only for advanced callers. 
+- [Case-by-case source creep] -> Mitigation: require new sources to implement the same record/state adapter contracts rather than custom evaluator flows. +- [Untrusted file assumptions] -> Mitigation: sources parse data only; they do not execute code or commands. +- [Schema normalization ambiguity] -> Mitigation: make normalizers explicit on schema/suite. +- [Runtime vs replay confusion] -> Mitigation: document that current task+answer and trajectory sources replay existing outputs; future task-only sources will execute through normal execution specs. + +## Migration Plan + +1. Add source record and source protocols, including optional source-provided default adapters. +2. Add JSONL task+answer source with default `id`, `input`, and `answer` field names. +3. Add answer-record state adapter. +4. Add trajectory-log source and trajectory-log state adapter. +5. Add replay harness and source-backed suite factory. +6. Add markdown-agent judge backend factory. +7. Add explicit judge payload normalizer support or built-in trajectory judge schema. +8. Refactor the manual trajectory-log test to use the new framework APIs. +9. Keep existing suite-backed APIs compatible. + +## Deferred Questions + +- Task-only execution sources and generic serialized-state file sources should be separate follow-on implementations of the same protocol. +- Concrete remote source connectors should be separate changes. +- CLI integration should be a later consumer of this framework layer. +- Dataset registry integration can be considered after file-backed sources settle. +- Environment reset remains owned by the environment-isolation capability, not by sources. +- The trajectory evaluator `agent.md` currently contains prompt-local trajectory extraction guidance. This change removes the test-local parser duplication; a later cleanup should either feed the agent framework-extracted trajectory content directly or explicitly keep the prompt-local parsing instructions as evaluator-agent policy. 
diff --git a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/implementation-plan.md b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/implementation-plan.md new file mode 100644 index 000000000..5d2810a9f --- /dev/null +++ b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/implementation-plan.md @@ -0,0 +1,85 @@ +# Implementation Plan: AWorld Evaluator Input Sources + +## Phase 1: Core Source Contracts + +1. Add `aworld/evaluations/sources.py`. +2. Define `EvalSourceRecord` as a serializable dataclass. +3. Define `EvalSource` protocol/base with `iter_records()` and `to_cases()`. +4. Add source-provided default adapter support for obvious replay pairs. +5. Add unit tests for record serialization, case lowering, and default adapter selection. + +Expected commit: source contracts and default adapter tests. + +## Phase 2: File Sources + +1. Implement JSONL reader helpers with clear field mapping. +2. Implement `JsonlTaskAnswerSource` with default fields `id`, `input`, and `answer`. +3. Implement override options for JSONL field names. +4. Add tests for valid records, missing required fields, and metadata preservation. + +Expected commit: task+answer file-backed source implementation. + +## Phase 3: State Adapters and Replay + +1. Add `aworld/evaluations/state_adapters.py`. +2. Define `EvalStateAdapter`. +3. Implement `AnswerStateAdapter`. +4. Implement `ReplayRuntimeHarness` or add it to `runtime_composition.py` if it fits better with existing harnesses. +5. Add tests proving source records can be replayed into state and reports. + +Expected commit: replay adapter path for existing outputs. + +## Phase 4: AWorld Trajectory Log Source + +1. Move trajectory log parsing out of the manual test into framework code. +2. Implement `AWorldTrajectoryLogSource`. +3. Implement `TrajectoryLogStateAdapter`. +4. Expose `TrajectoryLogStateAdapter` as the trajectory source default adapter. +5. 
Derive evidence, final answer, trajectory steps, tool calls, outcome, usage/timing defaults, and standard metrics. +6. Add focused tests with small synthetic trajectory logs. + +Expected commit: trajectory log source and replay adapter. + +## Phase 5: Suite Helpers + +1. Add `create_source_eval_suite(...)` helper. +2. Support replay-backed sources through `ReplayRuntimeHarness`. +3. Use source default adapters when `state_adapter` is omitted. +4. Ensure the helper returns a normal `EvalSuiteDef`. +5. Ensure helper remains optional; callers can still manually construct `EvalSuiteDef`. + +Expected commit: suite factory helpers. + +## Phase 6: Markdown Agent Judge Backend + +1. Add framework-level `load_agent_markdown(path)` helper. +2. Add `AgentJudgeBackend.from_agent_markdown(...)` factory or an equivalent named constructor. +3. Reuse existing AWorld Agent execution path. +4. Add tests proving `agent.md` metadata/body become an executable judge agent. + +Expected commit: markdown agent judge backend. + +## Phase 7: Judge Normalization + +1. Add explicit normalizer hook to `JudgeSchemaDef` or introduce a trajectory judge schema helper. +2. Normalize dimensions-style judge reports before typed validation. +3. Add tests for nested dimensions input and flat output. + +Expected commit: explicit judge payload normalization. + +## Phase 8: Manual Test Refactor + +1. Refactor `tests/evaluations/test_trajectory_log_manual_case.py` to use source/adapters/backend factories. +2. Remove test-local parser, replay harness, markdown-agent materializer, and schema flattening. +3. Keep explicit pytest parameters and LLM skip behavior. +4. Run the manual e2e with an explicit local task id when credentials/logs are available. + +Expected commit: manual trajectory regression uses framework APIs. 
+ +## Verification Commands + +```bash +pytest tests/evaluations/test_evaluation_substrate.py tests/evaluations/test_runtime_composition.py -q +pytest tests/evaluations/test_trajectory_log_manual_case.py -q +openspec validate aworld-evaluator-input-sources-2026-06-10 --strict +``` diff --git a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/proposal.md b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/proposal.md new file mode 100644 index 000000000..6628616e7 --- /dev/null +++ b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/proposal.md @@ -0,0 +1,36 @@ +# AWorld Evaluator Input Sources + +## Why + +The current manual trajectory-log regression proves that AWorld's evaluator substrate can judge final answers, inspect trajectories, run typed LLM-as-judge schemas, apply outcome checks, and gate reports. It also exposes an integration problem: callers must hand-write too much glue code to get external evaluation inputs into `EvalCaseDef` and normalized evaluator state. + +This is not specific to trajectory logs. The same problem appears for several common inputs: + +- a file containing task + answer pairs, where the evaluator should judge existing outputs without re-execution +- an AWorld trajectory log, where the evaluator should reconstruct `RolloutState` from prior execution +- future files containing tasks only or serialized rollout/task responses, which should implement the same source contracts when they gain real consumers + +Adding a dedicated `trajectory_log.py` top-level path would solve one case but repeat the same problem for task files, answer files, rollout dumps, and future stores. The framework needs a small input-source and state-adapter layer that converts heterogeneous evaluation inputs into the existing suite/case/state substrate. + +## What Changes + +- Add framework-owned `EvalSource` primitives that load external evaluation inputs into `EvalCaseDef` rows plus source metadata. 
+- Add state adapters that convert source records with existing outputs into normalized `EvalState` or `RolloutState`. +- Let sources expose a default state adapter when the adapter is uniquely implied by the source kind. +- Add first built-in source/adapters for task+answer files and AWorld trajectory logs. +- Add a reusable replay harness that uses source adapters to provide rollout/eval state without re-executing an agent. +- Add helper factories so callers can create suite-backed evaluations from sources without hand-writing parser, replay, schema-normalization, and report plumbing. +- Keep CLI integration out of scope; this change is framework-only under `aworld/evaluations/`. + +## Capabilities + +### Modified Capabilities + +- `evaluation-substrate`: add source-backed evaluation input normalization so suite-backed evaluation can consume task+answer and trajectory-log inputs through one framework path, with protocols that allow task-only and serialized-state sources to be added later. + +## Impact + +- Affected code: `aworld/evaluations/**`, especially new source/adapters, runtime-composition replay harness integration, and suite factory helpers. +- Affected APIs: additive framework APIs; existing suite-backed and runtime-composition APIs remain compatible. +- Affected tests: replace manual trajectory-log test-local glue with framework source/adapters; add focused coverage for task+answer and trajectory-log source behavior. +- Non-goals: no `aworld-cli` command shape changes, no untrusted file execution, no production storage connectors, no sandbox reset or external environment management. 
diff --git a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/specs/evaluation-substrate/spec.md b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/specs/evaluation-substrate/spec.md new file mode 100644 index 000000000..6c2e941a0 --- /dev/null +++ b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/specs/evaluation-substrate/spec.md @@ -0,0 +1,105 @@ +## MODIFIED Requirements + +### Requirement: Source-backed evaluation inputs + +Suite-backed evaluation flows SHALL support framework-owned input sources that normalize external evaluation records into cases and optional existing evaluator state. + +#### Scenario: Source provides task and answer records +- **WHEN** an input source provides records with case id, task input, and an existing answer +- **THEN** the framework SHALL allow that source or an explicit state adapter to convert each record into evaluator state without re-executing an agent, task, or program + +#### Scenario: Source metadata is reported safely +- **WHEN** source records include metadata +- **THEN** the framework SHALL preserve serializable source metadata and exclude live file handles, clients, process objects, and other runtime handles + +#### Scenario: Source default adapter is available +- **WHEN** a source kind has one obvious replay adapter +- **THEN** the framework SHALL allow suite construction to use that default adapter without requiring the caller to pass both source and adapter explicitly + +### Requirement: Source state adapters + +Source-backed evaluation flows SHALL separate reading input records from converting existing outputs into evaluator state, while allowing sources to declare a default adapter for the common path. 
+ +#### Scenario: Answer adapter converts existing answer +- **WHEN** an answer state adapter receives a task+answer record +- **THEN** it SHALL produce an evaluator state with terminal answer, completion view, success status, source metadata, and no runtime execution + +#### Scenario: Adapter fails on malformed state +- **WHEN** a source record claims to contain existing output state but required fields are malformed +- **THEN** the framework SHALL raise a clear validation error before judging or reporting the case + +### Requirement: Replay harness for existing outputs + +Suite-backed evaluation flows SHALL support replaying existing outputs through a runtime harness without re-executing the target. + +#### Scenario: Replay harness returns adapted state +- **WHEN** a replay harness is configured with a source record and state adapter +- **THEN** it SHALL return the adapted `RolloutState` or bridgeable evaluator state as the case rollout result + +#### Scenario: Replay is distinct from execution +- **WHEN** a source already contains answer, trajectory, or rollout state +- **THEN** the framework SHALL NOT invoke the suite's agent, task, or program execution adapter for that case unless explicitly configured to do so + +#### Scenario: Replay state feeds existing scorers +- **WHEN** replayed state contains answer, outcome, trajectory, tool calls, usage, timing, or standard metrics +- **THEN** existing judge, trajectory, outcome, reward, standard metric, gate, and report paths SHALL consume that state through the same normalized evaluator interfaces used by runtime-composed execution + +### Requirement: AWorld trajectory log source + +The framework SHALL provide a source and adapter for trusted AWorld trajectory log records. 
+ +#### Scenario: Trajectory log source selects task ids +- **WHEN** a trajectory log source is configured with one or more task ids +- **THEN** it SHALL extract the matching line-oriented AWorld trajectory records and expose one source record per task id + +#### Scenario: Trajectory log record is parsed +- **WHEN** a trajectory log record contains ANSI-decorated Python dict repr with a JSON-string `trajectory` field +- **THEN** the framework SHALL clean ANSI escapes, parse the record, decode the trajectory, and surface a structured record or a clear parse error + +#### Scenario: Trajectory log adapter builds rollout state +- **WHEN** a trajectory log adapter receives a parsed trajectory record +- **THEN** it SHALL produce rollout state containing terminal answer, ordered trajectory steps, extracted tool calls, evidence summary, outcome metadata, usage/timing defaults, and standard metrics + +### Requirement: Source suite factory remains syntax sugar + +Framework helpers for source-backed evaluation SHALL construct ordinary suite-backed evaluation definitions and SHALL NOT introduce a parallel suite type. + +#### Scenario: Source helper creates suite +- **WHEN** a caller uses `create_source_eval_suite` with a supported source, judge backend, judge schema, and gate policy +- **THEN** the helper SHALL return a normal `EvalSuiteDef` that can be passed to existing suite-backed flow execution + +#### Scenario: Source helper uses default adapter +- **WHEN** a caller omits `state_adapter` and the source provides a default adapter +- **THEN** the helper SHALL use the source default adapter for replay construction + +### Requirement: Markdown agent judge loading + +Evaluator judge backends SHALL support loading trusted markdown agent definitions without requiring callers to create temporary skill directories. 
+ +#### Scenario: Judge backend loads agent markdown +- **WHEN** a caller supplies an `agent.md` path to a supported judge backend factory +- **THEN** the framework SHALL create an executable AWorld judge agent from the markdown metadata and body + +#### Scenario: Existing system-prompt judge remains compatible +- **WHEN** a caller uses the existing `AgentJudgeBackend(system_prompt=...)` form +- **THEN** behavior SHALL remain compatible + +#### Scenario: Markdown agent execution is trusted +- **WHEN** markdown agent loading is used +- **THEN** the framework SHALL treat the markdown definition as trusted local evaluator configuration and SHALL NOT execute arbitrary shell commands from the file during loading + +### Requirement: Explicit judge payload normalization + +Suite-backed judge validation SHALL support explicit payload normalization before typed schema validation. + +#### Scenario: Suite declares a normalizer +- **WHEN** a suite or judge schema declares a payload normalizer +- **THEN** the framework SHALL apply that normalizer before typed model validation, metric extraction, and report assembly + +#### Scenario: Dimensions-style trajectory judge output is normalized +- **WHEN** a trajectory judge output contains `weighted_score` and nested `dimensions.<metric>.score` fields and the suite opts into the built-in trajectory normalizer +- **THEN** the framework SHALL normalize the payload into flat `score` and metric fields before validation + +#### Scenario: No hidden global normalization +- **WHEN** a judge output contains nested dimensions but no normalizer is configured +- **THEN** the framework SHALL preserve current validation behavior and SHALL NOT silently flatten the payload diff --git a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/tasks.md b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/tasks.md new file mode 100644 index 000000000..ba3d65d36 --- /dev/null +++ b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/tasks.md @@ 
-0,0 +1,60 @@ +## 1. Source Model + +- [x] 1.1 Add `EvalSourceRecord` with case id, input, expected, answer/state payload, source metadata, and raw payload. +- [x] 1.2 Add an `EvalSource` protocol/base class that enumerates records and lowers them into `EvalCaseDef` values. +- [x] 1.3 Add source-provided default adapter support for source kinds with one obvious replay adapter. +- [x] 1.4 Ensure source metadata remains serializable and does not retain file handles, clients, or live runtime objects. + +## 2. Built-in File Sources + +- [x] 2.1 Add `JsonlTaskAnswerSource` for task + answer records that should be judged without re-execution, with default field names `id`, `input`, and `answer`. +- [x] 2.2 Add override options for task+answer field names. +- [x] 2.3 Add `AWorldTrajectoryLogSource` for line-oriented AWorld trajectory logs with task-id selection. +- [x] 2.4 Add validation and error messages for missing id/input/answer fields and missing trajectory task ids. +- [x] 2.5 Defer task-only and generic serialized-state sources to follow-on implementations of the same protocol. + +## 3. State Adapters and Replay Harness + +- [x] 3.1 Add `EvalStateAdapter` protocol/base class. +- [x] 3.2 Add `AnswerStateAdapter` that converts task+answer records into `EvalState`. +- [x] 3.3 Add `TrajectoryLogStateAdapter` that converts AWorld trajectory-log records into `RolloutState`, including answer, evidence, trajectory, tool calls, usage, timing, outcome, and standard metrics. +- [x] 3.4 Ensure `JsonlTaskAnswerSource` and `AWorldTrajectoryLogSource` expose their default adapters. +- [x] 3.5 Add `ReplayRuntimeHarness` that applies a source record and adapter without re-executing the target. + +## 4. Suite Factory Helpers + +- [x] 4.1 Add `create_source_eval_suite(...)` helper that wires source cases, replay harness, judge schema/backend, scorers, and gate policy. +- [x] 4.2 Make `state_adapter` optional when the source provides a default adapter. 
+- [x] 4.3 Ensure `create_source_eval_suite(...)` returns a normal `EvalSuiteDef`. +- [x] 4.4 Support task+answer and trajectory-log sources with replay adapters. +- [x] 4.5 Preserve existing `run_evaluation_flow` report shape and gate behavior. + +## 5. Markdown Agent Judge Backend + +- [x] 5.1 Add framework helper to load `agent.md` into an AWorld `Agent` without test-local temporary `SKILL.md` materialization. +- [x] 5.2 Add `AgentJudgeBackend.from_agent_markdown(...)` or equivalent factory. +- [x] 5.3 Preserve existing `AgentJudgeBackend(system_prompt=...)` behavior. + +## 6. Judge Payload Normalization + +- [x] 6.1 Add explicit judge payload normalization support on `JudgeSchemaDef` or a closely scoped suite helper. +- [x] 6.2 Add built-in normalizer/model for dimensions-style trajectory judge reports. +- [x] 6.3 Ensure normalizers run before typed model validation and report assembly. +- [x] 6.4 Do not add hidden global `dimensions -> flat` behavior. + +## 7. Refactor Manual Trajectory Regression + +- [x] 7.1 Replace test-local trajectory parser with `AWorldTrajectoryLogSource`. +- [x] 7.2 Replace test-local replay harness with `ReplayRuntimeHarness` plus `TrajectoryLogStateAdapter`. +- [x] 7.3 Replace test-local markdown agent backend with framework `AgentJudgeBackend.from_agent_markdown`. +- [x] 7.4 Replace test-local schema flattening with explicit trajectory judge normalizer/model. +- [x] 7.5 Keep the manual LLM-backed regression opt-in through explicit pytest parameters. + +## 8. Verification + +- [x] 8.1 Add focused tests for task+answer source replay without execution. +- [x] 8.2 Add focused tests for AWorld trajectory-log replay. +- [x] 8.3 Add focused tests for source default adapter selection. +- [x] 8.4 Add focused tests for markdown-agent judge backend loading. +- [x] 8.5 Run evaluator regression tests. +- [x] 8.6 Validate this OpenSpec change with `openspec validate aworld-evaluator-input-sources-2026-06-10 --strict`. 
diff --git a/tests/evaluations/test_evaluation_input_sources.py b/tests/evaluations/test_evaluation_input_sources.py new file mode 100644 index 000000000..1d1d63ac0 --- /dev/null +++ b/tests/evaluations/test_evaluation_input_sources.py @@ -0,0 +1,202 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Literal, Mapping + +import pytest +from pydantic import BaseModel + +from aworld.evaluations.sources import ( + AWorldTrajectoryLogSource, + JsonlTaskAnswerSource, + create_source_eval_suite, +) +from aworld.evaluations.state_adapters import ( + AnswerStateAdapter, + TrajectoryLogStateAdapter, +) +from aworld.evaluations.substrate import ( + CallableJudgeBackend, + EvalSuiteDef, + EvaluationFlowDef, + GatePolicyDef, + JudgeSchemaDef, + run_evaluation_flow, +) +from aworld.evaluations.trajectory_judge import TrajectoryJudgeSchema + + +class _ScoreJudgeOutput(BaseModel): + score: float + verdict: Literal["pass", "fail"] + + +def test_jsonl_task_answer_source_defaults_fields_and_default_adapter(tmp_path: Path) -> None: + path = tmp_path / "answers.jsonl" + path.write_text( + json.dumps({"id": "case-1", "input": "What is 2+2?", "answer": "4"}) + "\n", + encoding="utf-8", + ) + + source = JsonlTaskAnswerSource(path=path) + records = list(source.iter_records()) + cases = source.to_cases() + + assert records[0].case_id == "case-1" + assert records[0].input == {"input": "What is 2+2?"} + assert records[0].answer == "4" + assert isinstance(source.default_adapter(), AnswerStateAdapter) + assert cases[0].case_id == "case-1" + assert cases[0].input == {"input": "What is 2+2?"} + assert cases[0].metadata["source_record"]["answer"] == "4" + + +@pytest.mark.asyncio +async def test_source_eval_suite_replays_task_answer_without_execution(tmp_path: Path) -> None: + path = tmp_path / "answers.jsonl" + path.write_text( + "\n".join( + [ + json.dumps({"id": "case-1", "input": "question", "answer": "existing answer"}), + ] + ), + 
encoding="utf-8", + ) + captured: dict[str, Any] = {} + + async def judge(case_input: dict[str, Any], target: dict[str, Any]) -> dict[str, Any]: + captured["answer"] = target["answer"] + captured["status"] = target["status"] + return {"score": 1.0, "verdict": "pass"} + + suite = create_source_eval_suite( + suite_id="task-answer-source", + source=JsonlTaskAnswerSource(path=path), + judge_backend=CallableJudgeBackend(backend_id="judge", judge=judge), + judge_schema=JudgeSchemaDef(output_model=_ScoreJudgeOutput), + gate_policy=GatePolicyDef(metric_name="score", pass_threshold=1.0), + ) + + assert isinstance(suite, EvalSuiteDef) + assert suite.runtime_harness is not None + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "source", "target_path": str(path)}, suite=suite) + ) + + assert captured == {"answer": "existing answer", "status": "success"} + assert report["gate"]["status"] == "pass" + assert report["results"][0]["state_summary"]["answer"] == "existing answer" + + +@pytest.mark.asyncio +async def test_trajectory_log_source_replays_rollout_state_with_standard_metrics(tmp_path: Path) -> None: + task_id = "task-1" + trajectory = [ + { + "state": { + "input": {"content": "question"}, + "messages": [{"role": "system", "content": "system prompt"}], + }, + "meta": {"step": 1, "pre_agent": "user", "agent_id": "agent"}, + "action": { + "tool_calls": [ + {"function": {"name": "search", "arguments": "{}"}}, + ], + "is_agent_finished": "False", + }, + }, + { + "state": { + "messages": [ + {"role": "tool", "content": "search result"}, + {"role": "assistant", "content": "final"}, + ], + }, + "meta": {"step": 2, "pre_agent": "agent", "agent_id": "agent"}, + "action": {"content": "final answer", "is_agent_finished": "True"}, + }, + ] + log_path = tmp_path / "trajectory.log" + log_path.write_text( + repr({"task_id": task_id, "is_sub_task": False, "trajectory": json.dumps(trajectory)}) + "\n", + encoding="utf-8", + ) + + source = 
AWorldTrajectoryLogSource(path=log_path, task_ids=[task_id], extraction_dir=tmp_path) + suite = create_source_eval_suite( + suite_id="trajectory-source", + source=source, + judge_backend=CallableJudgeBackend( + backend_id="judge", + judge=lambda case_input, target: {"score": 1.0, "verdict": "pass"}, + ), + judge_schema=JudgeSchemaDef(output_model=_ScoreJudgeOutput), + gate_policy=GatePolicyDef(metric_name="score", pass_threshold=1.0), + ) + + assert isinstance(source.default_adapter(), TrajectoryLogStateAdapter) + assert "raw_payload" not in source.to_cases()[0].metadata["source_record"] + + report = await run_evaluation_flow( + EvaluationFlowDef(target={"kind": "source", "target_path": str(log_path)}, suite=suite) + ) + result = report["results"][0] + + assert result["state_summary"]["answer"] == "final answer" + assert result["state_summary"]["tool_call_count"] == 1 + assert result["metadata"]["extracted_path"].endswith(f"extracted_{task_id}.json") + assert result["artifacts"]["outcome"]["evidence_blocks"] == 1 + assert result["metadata"]["standard_metrics"]["n_turns"] == 2 + assert result["metadata"]["standard_metrics"]["n_tool_calls"] == 1 + + +def test_judge_schema_normalizer_runs_before_typed_validation() -> None: + schema = JudgeSchemaDef( + output_model=_ScoreJudgeOutput, + normalizer=lambda payload: { + "score": payload["weighted_score"], + "verdict": payload["final_verdict"], + }, + ) + + payload = schema.validate_payload({"weighted_score": 0.9, "final_verdict": "pass"}) + + assert payload == {"score": 0.9, "verdict": "pass"} + + +def test_trajectory_log_source_reports_missing_task_id(tmp_path: Path) -> None: + path = tmp_path / "trajectory.log" + path.write_text("", encoding="utf-8") + + source = AWorldTrajectoryLogSource(path=path, task_ids=["missing-task"]) + + with pytest.raises(ValueError, match="missing-task"): + list(source.iter_records()) + + +def test_trajectory_judge_schema_normalizes_dimensions_report() -> None: + schema = 
TrajectoryJudgeSchema.default() + + payload = schema.validate_payload( + { + "weighted_score": 76, + "verdict": "Pass", + "dimensions": { + "A1_groundedness": {"score": 4}, + "A2_completeness": {"score": 3}, + "A3_relevance": {"score": 4}, + "A4_readability": {"score": 5}, + "B1_tool_use": {"score": 4}, + "B2_efficiency": {"score": 2}, + "B3_compliance": {"score": 4}, + "B4_robustness": {"score": 3}, + }, + "veto_triggered": False, + } + ) + + assert payload["score"] == 76 + assert payload["A1_groundedness"] == 4 + assert payload["B2_efficiency"] == 2 diff --git a/tests/evaluations/test_trajectory_log_manual_case.py b/tests/evaluations/test_trajectory_log_manual_case.py index 9e636703b..d7e264f37 100644 --- a/tests/evaluations/test_trajectory_log_manual_case.py +++ b/tests/evaluations/test_trajectory_log_manual_case.py @@ -1,35 +1,25 @@ from __future__ import annotations -import ast -import asyncio import json import os -import re -import tempfile -from dataclasses import dataclass from pathlib import Path -from typing import Any, Literal, Mapping +from typing import Any, Mapping import pytest -from pydantic import BaseModel, model_validator -from aworld.config.task_loader import _load_skill_agent -from aworld.evaluations.runtime_composition import RolloutState +from aworld.evaluations.sources import AWorldTrajectoryLogSource, create_source_eval_suite from aworld.evaluations.substrate import ( - EvalCaseDef, - EvalSuiteDef, - EvaluationFlowDef, + AgentJudgeBackend, GateMetricCondition, GatePolicyDef, - JudgeExecution, - JudgeSchemaDef, StateCheckGrader, - _coerce_judge_payload, + EvaluationFlowDef, + EvalSuiteDef, + load_agent_markdown, run_evaluation_flow, ) from aworld.evaluations.report import validate_evaluator_report -from aworld.runner import Runners -from aworld.utils.skill_loader import extract_front_matter +from aworld.evaluations.trajectory_judge import TrajectoryJudgeSchema DEFAULT_JUDGE_TIMEOUT_SECONDS = 600.0 @@ -43,48 +33,6 @@ def getoption(self, 
name: str) -> Any: return self._values.get(name) -class TrajectoryEvalJudgeOutput(BaseModel): - score: float - verdict: Literal["Excellent", "Pass", "Marginal", "Fail"] - A1_groundedness: int - A2_completeness: int - A3_relevance: int - A4_readability: int - B1_tool_use: int - B2_efficiency: int - B3_compliance: int - B4_robustness: int - veto_triggered: bool = False - - @model_validator(mode="before") - @classmethod - def flatten_agent_report(cls, value: Any) -> Any: - if not isinstance(value, Mapping) or "dimensions" not in value: - return value - flattened = dict(value) - if "score" not in flattened and "weighted_score" in flattened: - flattened["score"] = flattened["weighted_score"] - dimensions = value.get("dimensions") or {} - for metric_name in ( - "A1_groundedness", - "A2_completeness", - "A3_relevance", - "A4_readability", - "B1_tool_use", - "B2_efficiency", - "B3_compliance", - "B4_robustness", - ): - metric_payload = dimensions.get(metric_name) if isinstance(dimensions, Mapping) else None - if isinstance(metric_payload, Mapping) and "score" in metric_payload: - flattened[metric_name] = metric_payload["score"] - return flattened - - -def _truthy_string(value: Any) -> bool: - return str(value).strip().lower() in {"true", "1"} - - def _manual_replay_config(pytest_config: Any) -> dict[str, Any]: required_options = { "--task-id": pytest_config.getoption("trajectory_task_id"), @@ -113,123 +61,6 @@ def _manual_replay_config(pytest_config: Any) -> dict[str, Any]: } -def _safe_skill_name(value: str) -> str: - return re.sub(r"[^A-Za-z0-9_.-]+", "-", value).strip("-._") or "markdown-agent" - - -def _frontmatter_scalar(value: Any, default: str) -> str: - text = str(value if value not in (None, "") else default) - return " ".join(text.splitlines()).strip() - - -def _normalize_tool_list(value: Any) -> dict[str, Any]: - if isinstance(value, Mapping): - return dict(value) - if isinstance(value, str) and value.strip(): - try: - parsed = json.loads(value) - except 
json.JSONDecodeError: - return {} - if isinstance(parsed, Mapping): - return dict(parsed) - return {} - - -def _materialize_agent_markdown_as_skill( - agent_markdown_path: Path, - *, - skills_root: Path, - skill_name: str, -) -> Path: - lines = agent_markdown_path.read_text(encoding="utf-8").splitlines() - frontmatter, body_start = extract_front_matter(lines) - body = "\n".join(lines[body_start:]).strip() - description = _frontmatter_scalar( - frontmatter.get("description", frontmatter.get("desc")), - f"Agent loaded from {agent_markdown_path}", - ) - tool_list = _normalize_tool_list(frontmatter.get("tool_list", {})) - - skill_dir = skills_root / skill_name - skill_dir.mkdir(parents=True, exist_ok=True) - skill_path = skill_dir / "SKILL.md" - skill_path.write_text( - "---\n" - f"name: {_frontmatter_scalar(frontmatter.get('name'), skill_name)}\n" - f"description: {description}\n" - "type: agent\n" - f"tool_list: {json.dumps(tool_list, ensure_ascii=False)}\n" - "---\n\n" - f"{body}\n", - encoding="utf-8", - ) - return skill_path - - -async def _load_agent_markdown_as_aworld_agent(agent_markdown_path: Path, *, agent_id: str) -> Any: - skill_name = _safe_skill_name(agent_id) - api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") - with tempfile.TemporaryDirectory(prefix="aworld-agent-md-") as tmp_dir: - skills_root = Path(tmp_dir) / "skills" - _materialize_agent_markdown_as_skill( - agent_markdown_path, - skills_root=skills_root, - skill_name=skill_name, - ) - return await _load_skill_agent( - agent_id=agent_id, - agent_def={ - "skill_name": skill_name, - "config": { - "llm_config": { - "llm_model_name": os.getenv("LLM_MODEL_NAME"), - "llm_provider": os.getenv("LLM_PROVIDER"), - "llm_api_key": api_key, - "llm_base_url": os.getenv("LLM_BASE_URL"), - } - }, - }, - skills_path=skills_root, - global_mcp_config=None, - ) - - -@dataclass(frozen=True) -class MarkdownAgentJudgeBackend: - backend_id: str - agent_markdown_path: Path - prompt_builder: Any - 
timeout_seconds: float | None = None - - def is_available(self) -> bool: - model_name = os.getenv("LLM_MODEL_NAME") - api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") - return self.agent_markdown_path.exists() and bool(model_name and api_key) - - async def execute(self, case_input: dict[str, Any], target: dict[str, Any], suite: EvalSuiteDef) -> JudgeExecution: - if not self.is_available(): - raise RuntimeError(f"judge backend '{self.backend_id}' is not available") - - prompt = self.prompt_builder(case_input, target, suite) - if isinstance(prompt, tuple): - raise ValueError("MarkdownAgentJudgeBackend only supports text prompts in this manual replay test") - - agent = await _load_agent_markdown_as_aworld_agent( - self.agent_markdown_path, - agent_id=self.backend_id, - ) - - async def _run_agent() -> str: - response = await Runners.run(input=str(prompt), agent=agent) - return str(getattr(response, "answer", response)) - - if self.timeout_seconds is not None: - response_text = await asyncio.wait_for(_run_agent(), timeout=self.timeout_seconds) - else: - response_text = await _run_agent() - return JudgeExecution(backend_id=self.backend_id, payload=_coerce_judge_payload(response_text)) - - def test_manual_replay_config_requires_explicit_pytest_options(monkeypatch: pytest.MonkeyPatch): monkeypatch.setenv("AWORLD_TRAJECTORY_TASK_ID", "task_from_env") monkeypatch.setenv("AWORLD_TRAJECTORY_LOG", "~/env/trajectory.log") @@ -279,7 +110,7 @@ async def test_agent_markdown_loads_as_aworld_agent_via_existing_skill_loader( encoding="utf-8", ) - agent = await _load_agent_markdown_as_aworld_agent(agent_md, agent_id="custom-judge") + agent = await load_agent_markdown(agent_md, agent_id="custom-judge") assert agent.name() == "custom-judge" assert agent.desc() == "Evaluates trajectories" @@ -323,9 +154,9 @@ async def fake_run(input: str, agent: Any, **kwargs: Any) -> _FakeTaskResponse: monkeypatch.setattr("aworld.runner.Runners.run", fake_run) - backend = 
MarkdownAgentJudgeBackend( + backend = AgentJudgeBackend.from_agent_markdown( + agent_md, backend_id="trajectory-evaluator-agent-md", - agent_markdown_path=agent_md, prompt_builder=lambda case_input, target, suite: "judge this trajectory", ) execution = await backend.execute({}, {}, object()) @@ -341,7 +172,7 @@ async def fake_run(input: str, agent: Any, **kwargs: Any) -> _FakeTaskResponse: @pytest.mark.asyncio -async def test_trajectory_log_replay_harness_populates_tool_calls_and_standard_metrics(tmp_path: Path): +async def test_trajectory_log_source_default_adapter_populates_tool_calls_and_standard_metrics(tmp_path: Path): task_id = "task_with_tool" trajectory = [ { @@ -375,16 +206,11 @@ async def test_trajectory_log_replay_harness_populates_tool_calls_and_standard_m + "\n", encoding="utf-8", ) - case = EvalCaseDef( - case_id=task_id, - input={ - "trajectory_log": str(log_path), - "task_id": task_id, - "judge_agent_prompt": "agent.md", - }, - ) + source = AWorldTrajectoryLogSource(path=log_path, task_ids=[task_id], extraction_dir=tmp_path) + record = next(iter(source.iter_records())) + case = source.to_cases()[0] - state = await TrajectoryLogReplayHarness(out_dir=tmp_path).run_rollout(case=case, target={}) + state = source.default_adapter().adapt(record=record, case=case, target={}) assert [call["name"] for call in state.tool_calls] == ["search", "open"] assert state.usage == {"total_tokens": 0} @@ -410,122 +236,6 @@ def _assert_report_trajectory_steps_match_extracted(result: Mapping[str, Any]) - assert result["state_summary"]["trajectory_steps"] == extracted["num_steps"] -def _extract_trajectory_record(log_path: Path, task_id: str) -> dict[str, Any]: - target_line = None - with log_path.open(encoding="utf-8", errors="replace") as handle: - for line in handle: - if task_id in line: - target_line = line - break - if target_line is None: - raise AssertionError(f"task_id {task_id} not found in {log_path}") - - clean = re.sub(r"\x1b\[[0-9;]*m", "", 
target_line).strip() - record = ast.literal_eval(clean) - trajectory = json.loads(record["trajectory"]) - - question = (trajectory[0].get("state", {}).get("input", {}) or {}).get("content") - system_prompt = "" - first_messages = trajectory[0].get("state", {}).get("messages", []) or [] - if first_messages and first_messages[0].get("role") == "system": - system_prompt = str(first_messages[0].get("content") or "") - - steps = [] - final_answer = None - for item in trajectory: - meta = item.get("meta", {}) - action = item.get("action") or {} - calls = [] - for tool_call in action.get("tool_calls") or []: - function = tool_call.get("function") or {} - calls.append({"name": function.get("name"), "arguments": str(function.get("arguments"))}) - finished = _truthy_string(action.get("is_agent_finished")) - steps.append( - { - "step": meta.get("step"), - "pre_agent": meta.get("pre_agent"), - "agent_id": meta.get("agent_id"), - "tool_calls": calls, - "assistant_content": str(action.get("content") or ""), - "is_agent_finished": finished, - } - ) - if finished and action.get("content"): - final_answer = str(action.get("content")) - - final_messages = trajectory[-1].get("state", {}).get("messages", []) or [] - evidence = [ - {"msg_index": index, "content": str(message.get("content") or "")} - for index, message in enumerate(final_messages) - if message.get("role") == "tool" - ] - - return { - "task_id": task_id, - "is_sub_task": record.get("is_sub_task"), - "num_steps": len(trajectory), - "question": question, - "system_prompt_excerpt": system_prompt[:8000], - "steps": steps, - "final_answer": final_answer, - "evidence": evidence, - } - - -class TrajectoryLogReplayHarness: - def __init__(self, *, out_dir: Path): - self.out_dir = out_dir - - async def run_rollout(self, *, case: EvalCaseDef, target: Mapping[str, Any]) -> RolloutState: - log_path = Path(str(case.input["trajectory_log"])).expanduser() - task_id = str(case.input["task_id"]) - extracted = 
_extract_trajectory_record(log_path, task_id) - self.out_dir.mkdir(parents=True, exist_ok=True) - extracted_path = self.out_dir / f"extracted_{task_id}.json" - extracted_path.write_text(json.dumps(extracted, ensure_ascii=False, indent=2), encoding="utf-8") - - final_answer = extracted.get("final_answer") or "" - is_finished = any(step.get("is_agent_finished") for step in extracted["steps"]) - tool_calls = [ - dict(tool_call) - for step in extracted["steps"] - for tool_call in step.get("tool_calls", []) - if isinstance(tool_call, Mapping) - ] - usage = {"total_tokens": 0} - timing = {"duration_ms": 0} - standard_metrics = { - "n_turns": len(extracted["steps"]), - "n_tool_calls": len(tool_calls), - "n_tokens": usage["total_tokens"], - "duration_ms": timing["duration_ms"], - } - return RolloutState( - case_id=case.case_id, - status="success" if is_finished and final_answer else "failed", - answer=final_answer, - trajectory=list(extracted["steps"]), - tool_calls=tool_calls, - usage=usage, - timing=timing, - standard_metrics=standard_metrics, - outcome={ - "task_id": task_id, - "question": extracted.get("question"), - "evidence_blocks": len(extracted["evidence"]), - "num_steps": extracted["num_steps"], - "is_finished": is_finished, - "final_answer_len": len(final_answer), - "extracted_path": str(extracted_path), - }, - metadata={ - "trajectory_log": str(log_path), - "judge_agent_prompt": str(case.input["judge_agent_prompt"]), - "extracted_path": str(extracted_path), - }, - ) - - def _trajectory_judge_prompt(case_input: dict[str, Any], target: dict[str, Any], suite: EvalSuiteDef) -> str: outcome = (target.get("artifacts") or {}).get("outcome") or {} extracted_path = outcome.get("extracted_path") @@ -580,23 +290,17 @@ async def test_manual_trajectory_log_case_runs_end_to_end_for_human_replay(reque if not os.getenv("LLM_MODEL_NAME") or not (os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY")): pytest.skip("real trajectory judge requires LLM_MODEL_NAME and 
LLM_API_KEY/OPENAI_API_KEY") - suite = EvalSuiteDef( + suite = create_source_eval_suite( suite_id="trajectory-log-manual-replay", - cases=[ - EvalCaseDef( - case_id=task_id, - input={ - "trajectory_log": str(log_path), - "task_id": task_id, - "judge_agent_prompt": str(agent_prompt_path), - }, - ) - ], - runtime_harness=TrajectoryLogReplayHarness(out_dir=out_dir), - judge_schema=JudgeSchemaDef(output_model=TrajectoryEvalJudgeOutput), - judge_backend=MarkdownAgentJudgeBackend( + source=AWorldTrajectoryLogSource( + path=log_path, + task_ids=[task_id], + extraction_dir=out_dir, + ), + judge_schema=TrajectoryJudgeSchema.default(), + judge_backend=AgentJudgeBackend.from_agent_markdown( + agent_prompt_path, backend_id="trajectory-evaluator-agent-md", - agent_markdown_path=agent_prompt_path, prompt_builder=_trajectory_judge_prompt, timeout_seconds=judge_timeout_seconds, ), From 2429022ba0c2c57291254392378c0840d8f765c9 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 20:05:10 +0800 Subject: [PATCH 32/41] feat: add source-backed evaluator cli run --- .../src/aworld_cli/evaluator_runtime.py | 301 +++++++++++++++++- .../top_level_commands/evaluator_cmd.py | 51 +++ docs/AWorld CLI/Commands/Evaluator.md | 36 ++- .../tasks.md | 52 +-- tests/core/test_evaluator_runtime.py | 129 ++++++++ .../core/test_evaluator_top_level_command.py | 138 ++++++++ tests/docs/test_evaluator_report_docs.py | 3 + 7 files changed, 673 insertions(+), 37 deletions(-) diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index 57bf92acf..bb124c0b8 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import builtins import json from pathlib import Path @@ -15,10 +16,18 @@ validate_evaluator_report as _validate_evaluator_report, ) from aworld.evaluations.substrate import ( + AgentJudgeBackend, EvaluationFlowDef, 
+ GateMetricCondition, + GatePolicyDef, + JudgeSchemaDef, + StateCheckGrader, describe_eval_target, run_evaluation_flow, ) +from aworld.evaluations.sources import AWorldTrajectoryLogSource, JsonlTaskAnswerSource, create_source_eval_suite +from aworld.evaluations.trajectory_judge import TrajectoryJudgeSchema +from pydantic import BaseModel from aworld_cli.core.plugin_manager import PluginManager, get_builtin_plugin_roots from aworld_cli.evaluator_rendering import render_evaluator_summary as _render_evaluator_summary from aworld_cli.evaluator_workspace import ( @@ -87,7 +96,7 @@ def _build_automation_summary(report: dict) -> dict[str, object]: gate = report.get("gate") or {} approval = report.get("approval") or {} result_counts = report.get("result_counts") or {} - return { + automation = { "gate_status": gate.get("status"), "metric_name": gate.get("metric_name"), "metric_value": gate.get("value"), @@ -98,6 +107,12 @@ def _build_automation_summary(report: dict) -> dict[str, object]: "case_count": result_counts.get("cases_total", len(report.get("results") or [])), "judge_backend": (report.get("judge_backend") or {}).get("backend_id"), } + source_selection = report.get("source_selection") or {} + if source_selection: + automation["source_kind"] = source_selection.get("kind") + automation["source_input"] = source_selection.get("input") + automation["task_id"] = source_selection.get("task_id") + return automation def get_declared_evaluator_suite_schema() -> dict[str, object]: @@ -133,8 +148,10 @@ def _run_evaluator_hooks( Evaluator hook contract: - `evaluator.pre_discover` event payload: `target`, `workspace_path` - `evaluator.post_discover` event payload: `target`, `workspace_path`, `suite_names` - - `evaluator.pre_run` event payload: `target`, `suite`, `workspace_path` - - `evaluator.post_run` event payload: `report`, `target`, `suite`, `workspace_path` + - `evaluator.pre_run` event payload for target mode: `mode=target`, `target`, `suite`, `workspace_path` + - 
`evaluator.pre_run` event payload for source mode: `mode=source`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path` + - `evaluator.post_run` event payload for target mode: `mode=target`, `report`, `target`, `suite`, `workspace_path` + - `evaluator.post_run` event payload for source mode: `mode=source`, `report`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path` - `evaluator.render_summary` event payload: `report`, `workspace_path` - mutable state: lightweight CLI assembly metadata only - allowed side effects: report upload, notifications, summary augmentation @@ -149,6 +166,267 @@ def _run_evaluator_hooks( return merged +class _SourceJudgeOutput(BaseModel): + score: float + verdict: str + + +def _source_report_path( + *, + input_path: Path, + suite_id: str, + task_id: str | None, + output: str | None, + out_dir: str | None, +) -> Path: + if output: + return Path(output).expanduser().resolve() + root = Path(out_dir).expanduser().resolve() if out_dir else Path.cwd() / ".aworld" / "evaluations" + root.mkdir(parents=True, exist_ok=True) + token = _sanitize_path_token(task_id or input_path.stem or input_path.name) + return root / f"{token}.{_sanitize_path_token(suite_id)}.json" + + +def _build_source_prompt(case_input: dict, target: dict, suite) -> str: + payload = { + "case": {key: value for key, value in case_input.items() if not str(key).startswith("_")}, + "state": { + "answer": target.get("answer"), + "status": target.get("status"), + "artifacts": target.get("artifacts"), + "trajectory": target.get("trajectory"), + "tool_calls": target.get("tool_calls"), + }, + "required_output_schema": {"score": "number, weighted score from 0 to 100", "verdict": "string"}, + "instruction": "Evaluate the existing answer/state and return exactly one JSON object.", + } + return json.dumps(payload, ensure_ascii=False, indent=2) + + +def _build_trajectory_prompt(case_input: dict, target: dict, suite) -> str: + outcome 
= (target.get("artifacts") or {}).get("outcome") or {} + extracted_path = outcome.get("extracted_path") + extracted_payload = {} + if extracted_path: + extracted_payload = json.loads(Path(str(extracted_path)).read_text(encoding="utf-8")) + payload = { + "case": {key: value for key, value in case_input.items() if not str(key).startswith("_")}, + "extracted_trajectory": extracted_payload, + "required_output_schema": { + "score": "number, weighted score from 0 to 100", + "verdict": "Excellent|Pass|Marginal|Fail", + "A1_groundedness": "integer 1-5", + "A2_completeness": "integer 1-5", + "A3_relevance": "integer 1-5", + "A4_readability": "integer 1-5", + "B1_tool_use": "integer 1-5", + "B2_efficiency": "integer 1-5", + "B3_compliance": "integer 1-5", + "B4_robustness": "integer 1-5", + "veto_triggered": "boolean", + }, + "instruction": ( + "Apply the trajectory evaluator contract to the extracted trajectory. " + "Do not call tools and do not re-read the raw log; all required evidence is in extracted_trajectory. " + "Return exactly one JSON object matching required_output_schema, with no markdown." 
+ ), + } + return json.dumps(payload, ensure_ascii=False, indent=2) + + +def _build_source_suite( + *, + kind: str, + input_path: Path, + judge_agent_path: Path, + task_id: str | None, + id_field: str, + task_field: str, + answer_field: str, + out_dir: str | None, +): + if kind == "task-answer": + source = JsonlTaskAnswerSource( + path=input_path, + id_field=id_field, + input_field=task_field, + answer_field=answer_field, + ) + return create_source_eval_suite( + suite_id="source-evaluator", + source=source, + judge_backend=AgentJudgeBackend.from_agent_markdown( + judge_agent_path, + backend_id="source-agent-md", + prompt_builder=_build_source_prompt, + ), + judge_schema=JudgeSchemaDef(output_model=_SourceJudgeOutput), + gate_policy=GatePolicyDef(metric_name="score", pass_threshold=70.0), + ) + + if kind == "aworld-trajectory-log": + if not task_id: + raise ValueError("--task-id is required for aworld-trajectory-log source") + source = AWorldTrajectoryLogSource( + path=input_path, + task_ids=[task_id], + extraction_dir=out_dir, + ) + return create_source_eval_suite( + suite_id="trajectory-log-source-evaluator", + source=source, + judge_backend=AgentJudgeBackend.from_agent_markdown( + judge_agent_path, + backend_id="trajectory-evaluator-agent-md", + prompt_builder=_build_trajectory_prompt, + ), + judge_schema=TrajectoryJudgeSchema.default(), + outcome_scorers=( + StateCheckGrader( + metric_name="has_evidence", + source="outcome", + path=("evidence_blocks",), + op=">", + expected=0, + ), + StateCheckGrader( + metric_name="agent_finished", + source="outcome", + path=("is_finished",), + op="==", + expected=True, + ), + ), + gate_policy=GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=70.0), + GateMetricCondition(metric_name="A1_groundedness", op=">=", threshold=3), + GateMetricCondition(metric_name="has_evidence", op="==", threshold=1.0), + GateMetricCondition(metric_name="agent_finished", op="==", threshold=1.0), + ) + ), + ) + 
+ raise ValueError(f"unsupported source kind: {kind}") + + +def run_evaluator_source_cli( + *, + input: str, + kind: str, + judge_agent: str, + out_dir: str | None = None, + output: str | None = None, + task_id: str | None = None, + agent: str | None = None, + id_field: str = "id", + task_field: str = "input", + answer_field: str = "answer", + interactive_approval: bool = False, +) -> dict: + hooks = _load_evaluator_hooks() + input_path = Path(input).expanduser().resolve() + if not input_path.exists(): + raise FileNotFoundError(f"source input does not exist: {input_path}") + judge_agent_path = Path(judge_agent).expanduser().resolve() + if not judge_agent_path.exists(): + raise FileNotFoundError(f"judge agent does not exist: {judge_agent_path}") + + workspace_path = str(input_path.parent if input_path.is_file() else input_path) + event_base = { + "mode": "source", + "input": str(input_path), + "kind": kind, + "task_id": task_id, + "judge_agent": str(judge_agent_path), + "agent": agent, + "workspace_path": workspace_path, + "output_path": str(Path(output).expanduser().resolve()) if output else None, + } + hook_state = _run_evaluator_hooks( + hooks, + "evaluator.pre_run", + event=event_base, + state={ + "mode": "source", + "input": str(input_path), + "kind": kind, + "task_id": task_id, + "judge_agent": str(judge_agent_path), + "agent": agent, + "interactive_approval": interactive_approval, + }, + ) + suite = _build_source_suite( + kind=kind, + input_path=input_path, + judge_agent_path=judge_agent_path, + task_id=task_id, + id_field=id_field, + task_field=task_field, + answer_field=answer_field, + out_dir=out_dir, + ) + target_info = { + "target_kind": "source", + "target_path": str(input_path), + "source_kind": kind, + "task_id": task_id, + "judge_agent": str(judge_agent_path), + } + for key, value in hook_state.items(): + if key not in {"mode", "input", "kind", "task_id", "judge_agent", "agent", "interactive_approval", "summary_suffix"}: + target_info[key] = value + 
flow = EvaluationFlowDef( + target=target_info, + suite=suite, + interactive_approval=interactive_approval, + output_path=output, + ) + report = asyncio.run(run_evaluation_flow(flow)) + if hasattr(report, "to_dict"): + report = report.to_dict() + approval = dict(report.get("approval") or {}) + approval.setdefault("required", report.get("gate", {}).get("status") == "needs_approval") + approval.setdefault("resolved", False) + approval.setdefault("approved", None) + if approval["required"] and interactive_approval: + approved = builtins.input("Evaluation requires approval. Approve? [y/N]: ").strip().lower() in {"y", "yes"} + approval["resolved"] = True + approval["approved"] = approved + report["approval"] = approval + report["source_selection"] = { + "mode": "source", + "input": str(input_path), + "kind": kind, + "task_id": task_id, + "judge_agent": str(judge_agent_path), + } + report["automation"] = _build_automation_summary(report) + output_path = _source_report_path( + input_path=input_path, + suite_id=report["suite_id"], + task_id=task_id, + output=output, + out_dir=out_dir, + ) + output_path.parent.mkdir(parents=True, exist_ok=True) + report["report_path"] = str(output_path) + post_event = { + **event_base, + "output_path": str(output_path), + "report": report, + } + _run_evaluator_hooks( + hooks, + "evaluator.post_run", + event=post_event, + state=hook_state, + ) + output_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8") + return report + + def run_evaluator_cli( *, target: str, @@ -167,8 +445,18 @@ def run_evaluator_cli( hook_state = _run_evaluator_hooks( hooks, "evaluator.pre_run", - event={"target": str(target_path), "suite": suite_selection["resolved"], "workspace_path": workspace_path}, - state={"target": str(target_path), "suite": suite, "interactive_approval": interactive_approval}, + event={ + "mode": "target", + "target": str(target_path), + "suite": suite_selection["resolved"], + "workspace_path": workspace_path, + }, 
+ state={ + "mode": "target", + "target": str(target_path), + "suite": suite, + "interactive_approval": interactive_approval, + }, ) target_info = describe_eval_target(target_path) for key, value in hook_state.items(): @@ -188,7 +476,7 @@ def run_evaluator_cli( approval.setdefault("resolved", False) approval.setdefault("approved", None) if approval["required"] and interactive_approval: - approved = input("Evaluation requires approval. Approve? [y/N]: ").strip().lower() in {"y", "yes"} + approved = builtins.input("Evaluation requires approval. Approve? [y/N]: ").strip().lower() in {"y", "yes"} approval["resolved"] = True approval["approved"] = approved report["approval"] = approval @@ -205,6 +493,7 @@ def run_evaluator_cli( hooks, "evaluator.post_run", event={ + "mode": "target", "report": report, "target": str(target_path), "suite": suite_selection["resolved"], diff --git a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py index 77a0b03a3..64db5ec8e 100644 --- a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py +++ b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py @@ -10,6 +10,7 @@ get_evaluator_report_schema, render_evaluator_summary, run_evaluator_cli, + run_evaluator_source_cli, validate_evaluator_report, ) @@ -41,8 +42,58 @@ def register_parser(self, subparsers) -> None: parser.add_argument("--list-suites", action="store_true") parser.add_argument("--print-report-schema", action="store_true") parser.add_argument("--validate-report", type=str) + subparsers = parser.add_subparsers(dest="evaluator_action") + run_parser = subparsers.add_parser( + "run", + help="Run a source-backed evaluator flow.", + description="Run a source-backed evaluator flow.", + prog="aworld-cli evaluator run", + ) + run_parser.add_argument("--input", required=True) + run_parser.add_argument("--kind", required=True) + run_parser.add_argument("--judge-agent", required=True) + 
run_parser.add_argument("--out-dir") + run_parser.add_argument("--output") + run_parser.add_argument("--task-id") + run_parser.add_argument("--agent") + run_parser.add_argument("--id-field", default="id") + run_parser.add_argument("--task-field", default="input") + run_parser.add_argument("--answer-field", default="answer") + run_parser.add_argument("--interactive-approval", action="store_true") def run(self, args, context) -> int: + if getattr(args, "evaluator_action", None) == "run": + incompatible_args = ( + ("target", "--target"), + ("suite", "--suite"), + ("list_suites", "--list-suites"), + ("print_report_schema", "--print-report-schema"), + ("validate_report", "--validate-report"), + ) + for attr_name, flag_name in incompatible_args: + if getattr(args, attr_name, None): + print(f"Evaluator error: {flag_name} cannot be used with evaluator run") + return 1 + try: + report = run_evaluator_source_cli( + input=args.input, + kind=args.kind, + judge_agent=args.judge_agent, + out_dir=args.out_dir, + output=args.output, + task_id=args.task_id, + agent=args.agent, + id_field=args.id_field, + task_field=args.task_field, + answer_field=args.answer_field, + interactive_approval=args.interactive_approval, + ) + except (FileNotFoundError, ValueError, KeyError) as exc: + print(f"Evaluator error: {exc}") + return 1 + print(render_evaluator_summary(report)) + return evaluator_exit_code(report) + if getattr(args, "print_report_schema", False): print(json.dumps(get_evaluator_report_schema(), ensure_ascii=False, indent=2)) return 0 diff --git a/docs/AWorld CLI/Commands/Evaluator.md b/docs/AWorld CLI/Commands/Evaluator.md index aa55d4208..ceb6ccb7f 100644 --- a/docs/AWorld CLI/Commands/Evaluator.md +++ b/docs/AWorld CLI/Commands/Evaluator.md @@ -12,6 +12,7 @@ Use it when you want to: - run a built-in evaluator suite such as `app-evaluator` - load declaration-backed evaluator suites from workspace manifests +- evaluate existing source records such as task+answer JSONL files or 
AWorld trajectory logs - inspect which suites match a target - export the evaluator report schema - validate a saved evaluator report in automation @@ -29,11 +30,31 @@ aworld-cli evaluator --print-report-schema aworld-cli evaluator --validate-report ./.aworld/evaluations/artifact.app-evaluator.json ``` +Source-backed usage: + +```bash +aworld-cli evaluator run \ + --input ./task_answers.jsonl \ + --kind task-answer \ + --judge-agent ./eval/answer_judge/agent.md \ + --out-dir ./reports + +aworld-cli evaluator run \ + --input ~/Documents/logs/trajectory.log \ + --kind aworld-trajectory-log \ + --task-id task_20260609193335 \ + --judge-agent ./eval/trajectory_evaluator/agent.md \ + --out-dir ./reports +``` + +For `task-answer` JSONL inputs, the default fields are `id`, `input`, and `answer`. Use `--id-field`, `--task-field`, and `--answer-field` only when the file uses different names. + Useful options: ```bash aworld-cli evaluator --target ./artifact --output ./report.json aworld-cli evaluator --target ./artifact --interactive-approval +aworld-cli evaluator run --input ./task_answers.jsonl --kind task-answer --judge-agent ./agent.md --output ./report.json ``` ## Declared Suite Manifests @@ -91,8 +112,10 @@ Current event payloads: - `evaluator.pre_discover`: `target`, `workspace_path` - `evaluator.post_discover`: `target`, `workspace_path`, `suite_names` -- `evaluator.pre_run`: `target`, `suite`, `workspace_path` -- `evaluator.post_run`: `report`, `target`, `suite`, `workspace_path` +- `evaluator.pre_run` for target mode: `mode`, `target`, `suite`, `workspace_path` +- `evaluator.pre_run` for source mode: `mode`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path` +- `evaluator.post_run` for target mode: `mode`, `report`, `target`, `suite`, `workspace_path` +- `evaluator.post_run` for source mode: `mode`, `report`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path` - `evaluator.render_summary`: `report`, 
`workspace_path` Hook boundaries: @@ -121,6 +144,7 @@ Key report sections: - `gate`: structured `pass` / `fail` / `needs_approval` decision - `automation`: exit-code-oriented summary fields for scripts and CI - `suite_selection`: resolved/defaulted suite selection diagnostics +- `source_selection`: source input diagnostics for `aworld-cli evaluator run` - `approval`: approval decision metadata when the gate requires human confirmation See [evaluator_report.example.json](/Users/wuman/Documents/workspace/aworld-mas/aworld/examples/aworld_quick_start/cli/evaluator_report.example.json) for a minimal example. @@ -129,9 +153,10 @@ See [evaluator_report.example.json](/Users/wuman/Documents/workspace/aworld-mas/ 1. Inspect matching suites with `aworld-cli evaluator --list-suites --target ./artifact`. 2. Run evaluation with `aworld-cli evaluator --target ./artifact`. -3. Save or collect the emitted JSON report. -4. Validate persisted reports with `aworld-cli evaluator --validate-report `. -5. Export the current JSON Schema with `aworld-cli evaluator --print-report-schema` when integrating with external tooling. +3. For existing outputs, run source-backed evaluation with `aworld-cli evaluator run --input --kind task-answer --judge-agent `. +4. Save or collect the emitted JSON report. +5. Validate persisted reports with `aworld-cli evaluator --validate-report `. +6. Export the current JSON Schema with `aworld-cli evaluator --print-report-schema` when integrating with external tooling. ## Exit Codes @@ -147,4 +172,5 @@ See [evaluator_report.example.json](/Users/wuman/Documents/workspace/aworld-mas/ - declared suite manifests currently layer on `app-evaluator` only; they are not a generic suite authoring format yet. - `--print-report-schema` prints the current JSON Schema for `aworld.evaluator.report`. - `--validate-report` validates an existing JSON report against that schema without re-running evaluation. 
+- `aworld-cli evaluator run` currently supports `task-answer` and `aworld-trajectory-log`; task-only execution sources and generic serialized-state sources are intentionally deferred until the framework provides those source kinds. - the CLI command is an assembly/product layer; reusable evaluator building blocks stay in `aworld/evaluations/**`. diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md index a7c574b03..fe3ac6d1c 100644 --- a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md +++ b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md @@ -1,40 +1,40 @@ ## 1. Parser And Command Shape -- [ ] 1.1 Confirm `aworld-evaluator-input-sources-2026-06-10` has landed before implementing source-backed CLI behavior. -- [ ] 1.2 Extend the existing `EvaluatorTopLevelCommand` parser with a source-backed `run` mode. -- [ ] 1.3 Add `--input`, `--kind`, `--judge-agent`, `--out-dir`, `--output`, `--task-id`, `--agent`, and optional JSONL field mapping arguments for source mode. -- [ ] 1.4 Default task+answer JSONL field mappings to `id`, `input`, and `answer`. -- [ ] 1.5 Preserve existing `--target`, `--suite`, `--list-suites`, `--print-report-schema`, `--validate-report`, and `--interactive-approval` behavior. -- [ ] 1.6 Add clear validation errors for mixing incompatible target-mode and source-mode arguments. +- [x] 1.1 Confirm `aworld-evaluator-input-sources-2026-06-10` has landed before implementing source-backed CLI behavior. +- [x] 1.2 Extend the existing `EvaluatorTopLevelCommand` parser with a source-backed `run` mode. +- [x] 1.3 Add `--input`, `--kind`, `--judge-agent`, `--out-dir`, `--output`, `--task-id`, `--agent`, and optional JSONL field mapping arguments for source mode. +- [x] 1.4 Default task+answer JSONL field mappings to `id`, `input`, and `answer`. 
+- [x] 1.5 Preserve existing `--target`, `--suite`, `--list-suites`, `--print-report-schema`, `--validate-report`, and `--interactive-approval` behavior. +- [x] 1.6 Add clear validation errors for mixing incompatible target-mode and source-mode arguments. ## 2. Runtime Assembly -- [ ] 2.1 Add a source-backed runtime helper in `aworld_cli.evaluator_runtime`. -- [ ] 2.2 Resolve source kind to framework source/adapters from `aworld.evaluations`. -- [ ] 2.3 Resolve `agent.md` judge path through framework `AgentJudgeBackend.from_agent_markdown`. -- [ ] 2.4 For task+answer and trajectory-log sources, use framework replay/state adapters without re-execution. -- [ ] 2.5 Treat task-only and serialized-state source kinds as unsupported until the framework source layer provides those built-ins. -- [ ] 2.6 Persist reports with deterministic default names under the requested output directory. +- [x] 2.1 Add a source-backed runtime helper in `aworld_cli.evaluator_runtime`. +- [x] 2.2 Resolve source kind to framework source/adapters from `aworld.evaluations`. +- [x] 2.3 Resolve `agent.md` judge path through framework `AgentJudgeBackend.from_agent_markdown`. +- [x] 2.4 For task+answer and trajectory-log sources, use framework replay/state adapters without re-execution. +- [x] 2.5 Treat task-only and serialized-state source kinds as unsupported until the framework source layer provides those built-ins. +- [x] 2.6 Persist reports with deterministic default names under the requested output directory. ## 3. Plugin And Hook Integration -- [ ] 3.1 Keep evaluator command exposure through the existing builtin plugin command entrypoint. -- [ ] 3.2 Reuse `_load_evaluator_hooks` and `_run_evaluator_hooks` for source-backed runs. -- [ ] 3.3 Extend evaluator hook event payloads with `mode`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, and output path fields. 
-- [ ] 3.4 Document that hooks may customize CLI metadata, side effects, and rendering but must not redefine framework execution, scoring, gate, or report semantics. +- [x] 3.1 Keep evaluator command exposure through the existing builtin plugin command entrypoint. +- [x] 3.2 Reuse `_load_evaluator_hooks` and `_run_evaluator_hooks` for source-backed runs. +- [x] 3.3 Extend evaluator hook event payloads with `mode`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, and output path fields. +- [x] 3.4 Document that hooks may customize CLI metadata, side effects, and rendering but must not redefine framework execution, scoring, gate, or report semantics. ## 4. UX And Reporting -- [ ] 4.1 Render the same evaluator summary shape for source-backed reports. -- [ ] 4.2 Include resolved source mode, input path, kind, selected task ids, and report path in summary or automation metadata. -- [ ] 4.3 Keep exit codes based on gate status and approval state. -- [ ] 4.4 Add examples for trajectory-log, task+answer, and task-only evaluation. +- [x] 4.1 Render the same evaluator summary shape for source-backed reports. +- [x] 4.2 Include resolved source mode, input path, kind, selected task ids, and report path in summary or automation metadata. +- [x] 4.3 Keep exit codes based on gate status and approval state. +- [x] 4.4 Add examples for trajectory-log and task+answer evaluation, and document task-only evaluation as deferred until the framework source exists. ## 5. Tests -- [ ] 5.1 Add parser tests for source-backed `evaluator run` arguments. -- [ ] 5.2 Add validation tests for required source-mode arguments and incompatible argument combinations. -- [ ] 5.3 Add runtime delegation tests using fake framework source helpers. -- [ ] 5.4 Add hook payload tests for source-backed pre-run/post-run/render events. -- [ ] 5.5 Add compatibility tests for the existing target/suite evaluator path. 
-- [ ] 5.6 Validate this OpenSpec change with `openspec validate aworld-cli-evaluator-source-run-2026-06-10 --strict`. +- [x] 5.1 Add parser tests for source-backed `evaluator run` arguments. +- [x] 5.2 Add validation tests for required source-mode arguments and incompatible argument combinations. +- [x] 5.3 Add runtime delegation tests using fake framework source helpers. +- [x] 5.4 Add hook payload tests for source-backed pre-run/post-run/render events. +- [x] 5.5 Add compatibility tests for the existing target/suite evaluator path. +- [x] 5.6 Validate this OpenSpec change with `openspec validate aworld-cli-evaluator-source-run-2026-06-10 --strict`. diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index caea0b751..df01df930 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -13,11 +13,13 @@ from aworld.evaluations.manifests import get_declared_eval_suite_schema from aworld.evaluations.report import EvaluatorReport from aworld_cli.evaluator_runtime import ( + _build_source_prompt, available_evaluator_suites, evaluator_exit_code, get_declared_evaluator_suite_schema, get_evaluator_report_schema, run_evaluator_cli, + run_evaluator_source_cli, validate_evaluator_report, ) from aworld_cli.evaluator_rendering import render_evaluator_summary @@ -72,6 +74,133 @@ async def fake_run_evaluation_flow(flow): assert persisted["judge_backend"]["backend_id"] == "stub-agent" +def test_run_evaluator_source_cli_builds_task_answer_flow_with_default_fields( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "answers.jsonl" + input_path.write_text('{"id":"case-1","input":"question","answer":"existing"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + output = tmp_path / "report.json" + captured = {} + + async def fake_run_evaluation_flow(flow): + captured["flow"] = flow + 
return { + "report_version": 1, + "suite_id": "source-evaluator", + "judge_backend": {"backend_id": "source-agent-md"}, + "summary": {"source-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_source_cli( + input=str(input_path), + kind="task-answer", + judge_agent=str(judge_agent), + output=str(output), + ) + + flow = captured["flow"] + assert flow.target["target_kind"] == "source" + assert flow.target["source_kind"] == "task-answer" + assert flow.suite.cases[0].case_id == "case-1" + assert flow.suite.cases[0].input == {"input": "question"} + assert flow.suite.judge_backend.backend_id == "source-agent-md" + assert report["source_selection"]["kind"] == "task-answer" + assert report["automation"]["source_kind"] == "task-answer" + assert output.exists() + + +def test_source_prompt_uses_zero_to_hundred_score_contract() -> None: + prompt = _build_source_prompt( + {"input": "question"}, + {"answer": "existing"}, + suite=None, + ) + + payload = json.loads(prompt) + assert payload["required_output_schema"]["score"] == "number, weighted score from 0 to 100" + + +def test_run_evaluator_source_cli_rejects_unsupported_source_kind(tmp_path: Path) -> None: + input_path = tmp_path / "tasks.jsonl" + input_path.write_text('{"id":"case-1","input":"question"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + + with pytest.raises(ValueError, match="unsupported source kind"): + run_evaluator_source_cli( + input=str(input_path), + kind="task", + judge_agent=str(judge_agent), + ) + + +def test_run_evaluator_source_cli_passes_source_fields_to_hooks( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path 
= tmp_path / "answers.jsonl" + input_path.write_text('{"id":"case-1","input":"question","answer":"existing"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + events: list[tuple[str, dict]] = [] + + class CaptureHook: + def __init__(self, hook_point: str): + self.hook_point = hook_point + + async def run(self, *, event, state): + events.append((self.hook_point, dict(event))) + return {"metadata": {"hook_tag": "source-hook"}} + + async def fake_run_evaluation_flow(flow): + assert flow.target["hook_tag"] == "source-hook" + return { + "report_version": 1, + "suite_id": "source-evaluator", + "summary": {"source-evaluator": {"score": {"mean": 0.9}}}, + "metrics": {"score": {"mean": 0.9}}, + "results": [], + "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, + "approval": {"required": False, "resolved": False, "approved": None}, + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + } + + monkeypatch.setattr( + "aworld_cli.evaluator_runtime._load_evaluator_hooks", + lambda: { + "evaluator.pre_run": (CaptureHook("pre"),), + "evaluator.post_run": (CaptureHook("post"),), + }, + ) + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + run_evaluator_source_cli( + input=str(input_path), + kind="task-answer", + judge_agent=str(judge_agent), + task_id="case-1", + output=str(tmp_path / "report.json"), + ) + + assert events[0][0] == "pre" + assert events[0][1]["mode"] == "source" + assert events[0][1]["input"] == str(input_path.resolve()) + assert events[0][1]["kind"] == "task-answer" + assert events[0][1]["task_id"] == "case-1" + assert events[0][1]["judge_agent"] == str(judge_agent.resolve()) + assert events[1][0] == "post" + assert events[1][1]["mode"] == "source" + assert events[1][1]["report"]["source_selection"]["kind"] == "task-answer" + + @pytest.mark.asyncio async def 
test_framework_run_evaluation_flow_returns_report_object() -> None: async def fake_judge(case_input, target): diff --git a/tests/core/test_evaluator_top_level_command.py b/tests/core/test_evaluator_top_level_command.py index d7704fef5..00d2e1d3c 100644 --- a/tests/core/test_evaluator_top_level_command.py +++ b/tests/core/test_evaluator_top_level_command.py @@ -68,6 +68,61 @@ def fake_run_evaluator_cli(**kwargs): assert "pass" in output +def test_maybe_dispatch_top_level_command_runs_source_evaluator_command( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + input_path = tmp_path / "answers.jsonl" + input_path.write_text('{"id":"case-1","input":"question","answer":"answer"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + calls = {} + + def fake_run_evaluator_source_cli(**kwargs): + calls.update(kwargs) + return { + "suite_id": "source-evaluator", + "gate": {"status": "pass"}, + "summary": {"source-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli", + fake_run_evaluator_source_cli, + ) + + handled = main_module._maybe_dispatch_top_level_command( + [ + "aworld-cli", + "evaluator", + "run", + "--input", + str(input_path), + "--kind", + "task-answer", + "--judge-agent", + str(judge_agent), + "--out-dir", + str(tmp_path / "reports"), + ] + ) + output = capsys.readouterr().out + + assert handled is True + assert calls["input"] == str(input_path) + assert calls["kind"] == "task-answer" + assert calls["judge_agent"] == str(judge_agent) + assert calls["out_dir"] == str(tmp_path / "reports") + assert calls["id_field"] == "id" + assert calls["task_field"] == "input" + assert calls["answer_field"] == "answer" + assert "source-evaluator" in output + assert 
"pass" in output + + def test_evaluator_command_returns_nonzero_for_failed_gate( monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -93,6 +148,89 @@ def test_evaluator_command_returns_nonzero_for_failed_gate( assert exit_code == 2 +def test_evaluator_source_run_rejects_target_mode_arguments( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli", + lambda **kwargs: pytest.fail("source runtime should not be called"), + ) + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + evaluator_action="run", + target="artifact.txt", + input="answers.jsonl", + kind="task-answer", + judge_agent="agent.md", + out_dir=None, + output=None, + task_id=None, + agent=None, + id_field="id", + task_field="input", + answer_field="answer", + interactive_approval=False, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + assert exit_code == 1 + assert "--target cannot be used with evaluator run" in output + + +@pytest.mark.parametrize( + ("arg_name", "expected"), + [ + ("suite", "--suite cannot be used with evaluator run"), + ("list_suites", "--list-suites cannot be used with evaluator run"), + ("print_report_schema", "--print-report-schema cannot be used with evaluator run"), + ("validate_report", "--validate-report cannot be used with evaluator run"), + ], +) +def test_evaluator_source_run_rejects_other_target_mode_arguments( + arg_name: str, + expected: str, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli", + lambda **kwargs: pytest.fail("source runtime should not be called"), + ) + args = { + "evaluator_action": "run", + "target": None, + "suite": None, + "input": "answers.jsonl", + "kind": "task-answer", + "judge_agent": "agent.md", + "out_dir": None, + "output": None, + "task_id": None, 
+ "agent": None, + "id_field": "id", + "task_field": "input", + "answer_field": "answer", + "interactive_approval": False, + "list_suites": False, + "print_report_schema": False, + "validate_report": None, + } + args[arg_name] = "value" if arg_name in {"suite", "validate_report"} else True + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace(**args), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + assert exit_code == 1 + assert expected in output + + def test_evaluator_command_returns_nonzero_for_unresolved_approval( monkeypatch: pytest.MonkeyPatch, ) -> None: diff --git a/tests/docs/test_evaluator_report_docs.py b/tests/docs/test_evaluator_report_docs.py index ce9920087..a27eb9ee0 100644 --- a/tests/docs/test_evaluator_report_docs.py +++ b/tests/docs/test_evaluator_report_docs.py @@ -13,6 +13,9 @@ def test_evaluator_report_command_doc_covers_schema_and_validation() -> None: assert "aworld-cli evaluator" in content assert "--print-report-schema" in content assert "--validate-report" in content + assert "aworld-cli evaluator run" in content + assert "--kind task-answer" in content + assert "--kind aworld-trajectory-log" in content assert "report_format" in content assert "automation" in content assert ".aworld/evaluators/*.json" in content From cb955f0098e75da54a1d1a67160be61273522e8b Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 20:20:22 +0800 Subject: [PATCH 33/41] test: add source cli trajectory manual replay --- aworld/evaluations/report.py | 3 + tests/core/test_evaluator_runtime.py | 46 +++++ .../test_trajectory_log_manual_case.py | 162 ++++++++++++++++++ 3 files changed, 211 insertions(+) diff --git a/aworld/evaluations/report.py b/aworld/evaluations/report.py index 029b2cefb..1b9efa666 100644 --- a/aworld/evaluations/report.py +++ b/aworld/evaluations/report.py @@ -142,6 +142,9 @@ def get_evaluator_report_schema() -> dict[str, object]: "suggested_exit_code": {"type": "integer", "enum": [0, 2, 
3]}, "case_count": {"type": "integer", "minimum": 0}, "judge_backend": {"type": ["string", "null"]}, + "source_kind": {"type": ["string", "null"]}, + "source_input": {"type": ["string", "null"]}, + "task_id": {"type": ["string", "null"]}, }, "additionalProperties": False, }, diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index df01df930..2c442d43b 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -201,6 +201,52 @@ async def fake_run_evaluation_flow(flow): assert events[1][1]["report"]["source_selection"]["kind"] == "task-answer" +def test_run_evaluator_source_cli_persists_schema_valid_source_report( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "answers.jsonl" + input_path.write_text('{"id":"case-1","input":"question","answer":"existing"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + + async def fake_run_evaluation_flow(flow): + return { + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": "2026-06-10T00:00:00Z", + "suite_id": "source-evaluator", + "target": flow.target, + "judge_backend": {"backend_id": "source-agent-md"}, + "summary": {"source-evaluator": {"score": {"mean": 88.0}}}, + "metrics": {"score": {"mean": 88.0}}, + "results": [ + { + "case_id": "case-1", + "input": {"input": "question"}, + "metrics": {"score": {"value": 88.0, "status": "PASSED"}}, + "judge": {"score": 88.0, "verdict": "Pass"}, + "judge_backend": {"backend_id": "source-agent-md"}, + "state_summary": {"answer": "existing"}, + } + ], + "result_counts": {"cases_total": 1, "cases_with_metrics": 1, "cases_with_judge": 1}, + "gate": {"status": "pass", "metric_name": "score", "value": 88.0}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + 
monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_source_cli( + input=str(input_path), + kind="task-answer", + judge_agent=str(judge_agent), + output=str(tmp_path / "report.json"), + ) + + validate_evaluator_report(report) + + @pytest.mark.asyncio async def test_framework_run_evaluation_flow_returns_report_object() -> None: async def fake_judge(case_input, target): diff --git a/tests/evaluations/test_trajectory_log_manual_case.py b/tests/evaluations/test_trajectory_log_manual_case.py index d7e264f37..f28cc91f0 100644 --- a/tests/evaluations/test_trajectory_log_manual_case.py +++ b/tests/evaluations/test_trajectory_log_manual_case.py @@ -2,11 +2,14 @@ import json import os +import sys from pathlib import Path from typing import Any, Mapping import pytest +sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) + from aworld.evaluations.sources import AWorldTrajectoryLogSource, create_source_eval_suite from aworld.evaluations.substrate import ( AgentJudgeBackend, @@ -20,6 +23,7 @@ ) from aworld.evaluations.report import validate_evaluator_report from aworld.evaluations.trajectory_judge import TrajectoryJudgeSchema +from aworld_cli.evaluator_runtime import run_evaluator_source_cli DEFAULT_JUDGE_TIMEOUT_SECONDS = 600.0 @@ -230,12 +234,137 @@ def test_trajectory_step_assertion_uses_extracted_num_steps(tmp_path: Path): _assert_report_trajectory_steps_match_extracted(result) +def test_source_cli_report_assertion_matches_manual_trajectory_goal(tmp_path: Path): + task_id = "task_for_cli_assertion" + log_path = tmp_path / "trajectory.log" + agent_prompt_path = tmp_path / "agent.md" + report_path = tmp_path / "report.json" + extracted_path = tmp_path / f"extracted_{task_id}.json" + log_path.write_text("log", encoding="utf-8") + agent_prompt_path.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + extracted_path.write_text( + json.dumps( + { + "task_id": 
task_id, + "num_steps": 2, + "final_answer": "final", + "evidence": [{"content": "tool result"}], + } + ), + encoding="utf-8", + ) + report = { + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": "2026-06-10T00:00:00Z", + "suite_id": "trajectory-log-source-evaluator", + "target": {"target_kind": "source", "target_path": str(log_path)}, + "judge_backend": {"backend_id": "trajectory-evaluator-agent-md"}, + "summary": {"trajectory-log-source-evaluator": {"score": {"mean": 64.0}}}, + "metrics": { + "score": {"mean": 64.0}, + "has_evidence": {"mean": 1.0}, + "agent_finished": {"mean": 1.0}, + }, + "results": [ + { + "case_id": task_id, + "input": {"task_id": task_id, "trajectory_log": str(log_path)}, + "metrics": { + "score": {"value": 64.0, "status": "PASSED"}, + "has_evidence": {"value": True, "status": "PASSED"}, + "agent_finished": {"value": True, "status": "PASSED"}, + }, + "judge": {"score": 64.0, "verdict": "Marginal", "A1_groundedness": 3}, + "judge_backend": {"backend_id": "trajectory-evaluator-agent-md"}, + "state_summary": {"answer": "final", "trajectory_steps": 2}, + "metadata": {"extracted_path": str(extracted_path)}, + } + ], + "result_counts": {"cases_total": 1, "cases_with_metrics": 1, "cases_with_judge": 1}, + "gate": {"status": "fail", "metric_name": None, "value": None}, + "approval": {"required": False, "resolved": False, "approved": None}, + "automation": { + "gate_status": "fail", + "metric_name": None, + "metric_value": None, + "approval_required": False, + "approval_resolved": False, + "approved": None, + "suggested_exit_code": 2, + "case_count": 1, + "judge_backend": "trajectory-evaluator-agent-md", + "source_kind": "aworld-trajectory-log", + "source_input": str(log_path), + "task_id": task_id, + }, + "source_selection": { + "mode": "source", + "input": str(log_path), + "kind": "aworld-trajectory-log", + "task_id": task_id, + "judge_agent": str(agent_prompt_path), + }, + "report_path": 
str(report_path), + } + report_path.write_text(json.dumps(report), encoding="utf-8") + + _assert_source_cli_trajectory_report_matches_manual_goal( + report, + task_id=task_id, + log_path=log_path, + agent_prompt_path=agent_prompt_path, + ) + + def _assert_report_trajectory_steps_match_extracted(result: Mapping[str, Any]) -> None: extracted_path = Path(str(result["metadata"]["extracted_path"])) extracted = json.loads(extracted_path.read_text(encoding="utf-8")) assert result["state_summary"]["trajectory_steps"] == extracted["num_steps"] +def _assert_source_cli_trajectory_report_matches_manual_goal( + report: Mapping[str, Any], + *, + task_id: str, + log_path: Path, + agent_prompt_path: Path, +) -> None: + validate_evaluator_report(dict(report)) + report_path = Path(str(report["report_path"])) + assert report_path.exists() + assert report["suite_id"] == "trajectory-log-source-evaluator" + assert report["gate"]["status"] in {"pass", "fail", "needs_approval"} + assert report["metrics"]["has_evidence"]["mean"] == 1.0 + assert report["metrics"]["agent_finished"]["mean"] == 1.0 + assert report["judge_backend"]["backend_id"] == "trajectory-evaluator-agent-md" + + source_selection = report["source_selection"] + assert source_selection["mode"] == "source" + assert source_selection["kind"] == "aworld-trajectory-log" + assert source_selection["task_id"] == task_id + assert Path(str(source_selection["input"])).resolve() == log_path.resolve() + assert Path(str(source_selection["judge_agent"])).resolve() == agent_prompt_path.resolve() + + automation = report["automation"] + assert automation["source_kind"] == "aworld-trajectory-log" + assert automation["task_id"] == task_id + assert Path(str(automation["source_input"])).resolve() == log_path.resolve() + + result = report["results"][0] + assert result["case_id"] == task_id + assert result["judge"]["verdict"] in {"Excellent", "Pass", "Marginal", "Fail"} + assert 0 <= result["judge"]["score"] <= 100 + assert 
result["state_summary"]["answer"] + assert Path(result["metadata"]["extracted_path"]).exists() + _assert_report_trajectory_steps_match_extracted(result) + + extracted = json.loads(Path(result["metadata"]["extracted_path"]).read_text(encoding="utf-8")) + assert extracted["task_id"] == task_id + assert extracted["final_answer"] + assert extracted["evidence"] + + def _trajectory_judge_prompt(case_input: dict[str, Any], target: dict[str, Any], suite: EvalSuiteDef) -> str: outcome = (target.get("artifacts") or {}).get("outcome") or {} extracted_path = outcome.get("extracted_path") @@ -358,3 +487,36 @@ async def test_manual_trajectory_log_case_runs_end_to_end_for_human_replay(reque assert Path(report["results"][0]["metadata"]["extracted_path"]).exists() _assert_report_trajectory_steps_match_extracted(report["results"][0]) assert report_path.exists() + + +def test_manual_trajectory_log_case_runs_via_source_cli_for_human_replay(request: pytest.FixtureRequest): + try: + config = _manual_replay_config(request.config) + except pytest.UsageError as exc: + pytest.skip(str(exc)) + task_id = config["task_id"] + log_path = config["log_path"] + agent_prompt_path = config["agent_prompt_path"] + out_dir = config["out_dir"] + + if not log_path.exists(): + pytest.skip(f"manual trajectory log not found: {log_path}") + if not agent_prompt_path.exists(): + pytest.skip(f"manual trajectory evaluator agent prompt not found: {agent_prompt_path}") + if not os.getenv("LLM_MODEL_NAME") or not (os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY")): + pytest.skip("real trajectory judge requires LLM_MODEL_NAME and LLM_API_KEY/OPENAI_API_KEY") + + report = run_evaluator_source_cli( + input=str(log_path), + kind="aworld-trajectory-log", + task_id=task_id, + judge_agent=str(agent_prompt_path), + out_dir=str(out_dir), + ) + + _assert_source_cli_trajectory_report_matches_manual_goal( + report, + task_id=task_id, + log_path=log_path, + agent_prompt_path=agent_prompt_path, + ) From 
e4bb56a6e6929cb002f29989f94633a899153842 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 20:36:32 +0800 Subject: [PATCH 34/41] feat: simplify evaluator source commands --- .../src/aworld_cli/commands/__init__.py | 3 + .../src/aworld_cli/commands/evaluation_cmd.py | 106 ++++++++++++++++++ .../src/aworld_cli/evaluator_runtime.py | 1 + .../top_level_commands/evaluator_cmd.py | 49 ++++---- docs/AWorld CLI/Commands/Evaluator.md | 12 +- .../design.md | 16 +-- .../proposal.md | 2 +- .../tasks.md | 4 +- tests/core/test_evaluator_runtime.py | 49 ++++++++ .../core/test_evaluator_top_level_command.py | 49 ++++++-- tests/docs/test_evaluator_report_docs.py | 2 +- tests/test_slash_commands.py | 73 +++++++++++- 12 files changed, 317 insertions(+), 49 deletions(-) create mode 100644 aworld-cli/src/aworld_cli/commands/evaluation_cmd.py diff --git a/aworld-cli/src/aworld_cli/commands/__init__.py b/aworld-cli/src/aworld_cli/commands/__init__.py index f5875ae2e..a0d8da80d 100644 --- a/aworld-cli/src/aworld_cli/commands/__init__.py +++ b/aworld-cli/src/aworld_cli/commands/__init__.py @@ -13,6 +13,7 @@ - /cron: Manage scheduled tasks (tool command) - /dispatch: Submit task to background execution (tool command) - /tasks: Manage background tasks (tool command) +- /evaluation: Run evaluator flows (tool command) Usage: # Import to register all commands @@ -33,6 +34,7 @@ from . import dispatch from . import tasks from . import plugins_cmd +from . import evaluation_cmd __all__ = [ "help_cmd", @@ -44,4 +46,5 @@ "dispatch", "tasks", "plugins_cmd", + "evaluation_cmd", ] diff --git a/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py b/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py new file mode 100644 index 000000000..92b4b0e4f --- /dev/null +++ b/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py @@ -0,0 +1,106 @@ +""" +/evaluation command - Run evaluator flows from chat. 
+""" +from __future__ import annotations + +import argparse +import shlex + +from aworld_cli.core.command_system import Command, CommandContext, register_command +from aworld_cli.evaluator_rendering import render_evaluator_summary +from aworld_cli.evaluator_runtime import run_evaluator_source_cli + + +def _usage() -> str: + return """Usage: + /evaluation --input --kind task-answer --judge-agent [--out-dir ] + /evaluation --input --kind aworld-trajectory-log --task-id --judge-agent [--out-dir ] + +Examples: + /evaluation --input ./task_answers.jsonl --kind task-answer --judge-agent ./eval/answer_judge/agent.md + /evaluation --input ~/Documents/logs/trajectory.log --kind aworld-trajectory-log --task-id task_123 --judge-agent ./eval/trajectory_evaluator/agent.md +""" + + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(prog="/evaluation", add_help=False) + parser.add_argument("--input", required=True) + parser.add_argument("--kind", required=True) + parser.add_argument("--judge-agent", required=True) + parser.add_argument("--out-dir") + parser.add_argument("--output") + parser.add_argument("--task-id") + parser.add_argument("--agent") + parser.add_argument("--id-field", default="id") + parser.add_argument("--task-field", default="input") + parser.add_argument("--answer-field", default="answer") + parser.add_argument("--interactive-approval", action="store_true") + parser.add_argument("--help", action="store_true") + return parser + + +@register_command +class EvaluationCommand(Command): + @property + def name(self) -> str: + return "evaluation" + + @property + def description(self) -> str: + return "Run evaluator flows" + + @property + def command_type(self) -> str: + return "tool" + + @property + def completion_items(self) -> dict[str, str]: + return { + "/evaluation --kind task-answer": "Evaluate task+answer JSONL records", + "/evaluation --kind aworld-trajectory-log": "Evaluate an AWorld trajectory log task", + } + + async def 
execute(self, context: CommandContext) -> str: + raw_args = (context.user_args or "").strip() + if not raw_args: + return _usage() + + try: + parts = shlex.split(raw_args) + except ValueError as exc: + return f"Evaluator error: {exc}\n\n{_usage()}" + + if not parts or parts[0] in {"help", "--help", "-h"}: + return _usage() + + parser = _build_parser() + try: + args = parser.parse_args(parts) + except SystemExit: + return _usage() + + if args.help: + return _usage() + + try: + report = run_evaluator_source_cli( + input=args.input, + kind=args.kind, + judge_agent=args.judge_agent, + out_dir=args.out_dir, + output=args.output, + task_id=args.task_id, + agent=args.agent, + id_field=args.id_field, + task_field=args.task_field, + answer_field=args.answer_field, + interactive_approval=args.interactive_approval, + ) + except (FileNotFoundError, ValueError, KeyError) as exc: + return f"Evaluator error: {exc}" + + summary = render_evaluator_summary(report) + report_path = report.get("report_path") + if report_path: + return f"{summary}\nReport: {report_path}" + return summary diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index bb124c0b8..97c332fa3 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -301,6 +301,7 @@ def _build_source_suite( pass_all=( GateMetricCondition(metric_name="score", op=">=", threshold=70.0), GateMetricCondition(metric_name="A1_groundedness", op=">=", threshold=3), + GateMetricCondition(metric_name="veto_triggered", op="==", threshold=False), GateMetricCondition(metric_name="has_evidence", op="==", threshold=1.0), GateMetricCondition(metric_name="agent_finished", op="==", threshold=1.0), ) diff --git a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py index 64db5ec8e..3d90f5186 100644 --- a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py 
+++ b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py @@ -42,27 +42,18 @@ def register_parser(self, subparsers) -> None: parser.add_argument("--list-suites", action="store_true") parser.add_argument("--print-report-schema", action="store_true") parser.add_argument("--validate-report", type=str) - subparsers = parser.add_subparsers(dest="evaluator_action") - run_parser = subparsers.add_parser( - "run", - help="Run a source-backed evaluator flow.", - description="Run a source-backed evaluator flow.", - prog="aworld-cli evaluator run", - ) - run_parser.add_argument("--input", required=True) - run_parser.add_argument("--kind", required=True) - run_parser.add_argument("--judge-agent", required=True) - run_parser.add_argument("--out-dir") - run_parser.add_argument("--output") - run_parser.add_argument("--task-id") - run_parser.add_argument("--agent") - run_parser.add_argument("--id-field", default="id") - run_parser.add_argument("--task-field", default="input") - run_parser.add_argument("--answer-field", default="answer") - run_parser.add_argument("--interactive-approval", action="store_true") + parser.add_argument("--input", type=str) + parser.add_argument("--kind", type=str) + parser.add_argument("--judge-agent", type=str) + parser.add_argument("--out-dir", type=str) + parser.add_argument("--task-id", type=str) + parser.add_argument("--agent", type=str) + parser.add_argument("--id-field", default="id") + parser.add_argument("--task-field", default="input") + parser.add_argument("--answer-field", default="answer") def run(self, args, context) -> int: - if getattr(args, "evaluator_action", None) == "run": + if getattr(args, "input", None): incompatible_args = ( ("target", "--target"), ("suite", "--suite"), @@ -72,8 +63,14 @@ def run(self, args, context) -> int: ) for attr_name, flag_name in incompatible_args: if getattr(args, attr_name, None): - print(f"Evaluator error: {flag_name} cannot be used with evaluator run") + print(f"Evaluator error: {flag_name} 
cannot be used with --input") return 1 + if not getattr(args, "kind", None): + print("Evaluator error: --kind is required with --input") + return 1 + if not getattr(args, "judge_agent", None): + print("Evaluator error: --judge-agent is required with --input") + return 1 try: report = run_evaluator_source_cli( input=args.input, @@ -94,6 +91,18 @@ def run(self, args, context) -> int: print(render_evaluator_summary(report)) return evaluator_exit_code(report) + source_only_args = ( + ("kind", "--kind"), + ("judge_agent", "--judge-agent"), + ("out_dir", "--out-dir"), + ("task_id", "--task-id"), + ("agent", "--agent"), + ) + for attr_name, flag_name in source_only_args: + if getattr(args, attr_name, None): + print(f"Evaluator error: --input is required when using {flag_name}") + return 1 + if getattr(args, "print_report_schema", False): print(json.dumps(get_evaluator_report_schema(), ensure_ascii=False, indent=2)) return 0 diff --git a/docs/AWorld CLI/Commands/Evaluator.md b/docs/AWorld CLI/Commands/Evaluator.md index ceb6ccb7f..108884cb6 100644 --- a/docs/AWorld CLI/Commands/Evaluator.md +++ b/docs/AWorld CLI/Commands/Evaluator.md @@ -33,13 +33,13 @@ aworld-cli evaluator --validate-report ./.aworld/evaluations/artifact.app-evalua Source-backed usage: ```bash -aworld-cli evaluator run \ +aworld-cli evaluator \ --input ./task_answers.jsonl \ --kind task-answer \ --judge-agent ./eval/answer_judge/agent.md \ --out-dir ./reports -aworld-cli evaluator run \ +aworld-cli evaluator \ --input ~/Documents/logs/trajectory.log \ --kind aworld-trajectory-log \ --task-id task_20260609193335 \ @@ -54,7 +54,7 @@ Useful options: ```bash aworld-cli evaluator --target ./artifact --output ./report.json aworld-cli evaluator --target ./artifact --interactive-approval -aworld-cli evaluator run --input ./task_answers.jsonl --kind task-answer --judge-agent ./agent.md --output ./report.json +aworld-cli evaluator --input ./task_answers.jsonl --kind task-answer --judge-agent ./agent.md --output 
./report.json ``` ## Declared Suite Manifests @@ -144,7 +144,7 @@ Key report sections: - `gate`: structured `pass` / `fail` / `needs_approval` decision - `automation`: exit-code-oriented summary fields for scripts and CI - `suite_selection`: resolved/defaulted suite selection diagnostics -- `source_selection`: source input diagnostics for `aworld-cli evaluator run` +- `source_selection`: source input diagnostics for source-backed `aworld-cli evaluator --input ...` - `approval`: approval decision metadata when the gate requires human confirmation See [evaluator_report.example.json](/Users/wuman/Documents/workspace/aworld-mas/aworld/examples/aworld_quick_start/cli/evaluator_report.example.json) for a minimal example. @@ -153,7 +153,7 @@ See [evaluator_report.example.json](/Users/wuman/Documents/workspace/aworld-mas/ 1. Inspect matching suites with `aworld-cli evaluator --list-suites --target ./artifact`. 2. Run evaluation with `aworld-cli evaluator --target ./artifact`. -3. For existing outputs, run source-backed evaluation with `aworld-cli evaluator run --input --kind task-answer --judge-agent `. +3. For existing outputs, run source-backed evaluation with `aworld-cli evaluator --input --kind task-answer --judge-agent `. 4. Save or collect the emitted JSON report. 5. Validate persisted reports with `aworld-cli evaluator --validate-report `. 6. Export the current JSON Schema with `aworld-cli evaluator --print-report-schema` when integrating with external tooling. @@ -172,5 +172,5 @@ See [evaluator_report.example.json](/Users/wuman/Documents/workspace/aworld-mas/ - declared suite manifests currently layer on `app-evaluator` only; they are not a generic suite authoring format yet. - `--print-report-schema` prints the current JSON Schema for `aworld.evaluator.report`. - `--validate-report` validates an existing JSON report against that schema without re-running evaluation. 
-- `aworld-cli evaluator run` currently supports `task-answer` and `aworld-trajectory-log`; task-only execution sources and generic serialized-state sources are intentionally deferred until the framework provides those source kinds. +- `aworld-cli evaluator --input ...` currently supports `task-answer` and `aworld-trajectory-log`; task-only execution sources and generic serialized-state sources are intentionally deferred until the framework provides those source kinds. - the CLI command is an assembly/product layer; reusable evaluator building blocks stay in `aworld/evaluations/**`. diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md index b12fac5fe..8e1a2b4f1 100644 --- a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md +++ b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md @@ -30,7 +30,7 @@ The new framework input-source layer will make evaluation inputs first-class: ta The canonical source-backed path should be: ```bash -aworld-cli evaluator run \ +aworld-cli evaluator \ --input ~/Documents/logs/trajectory.log \ --kind aworld-trajectory-log \ --task-id task_20260609193335 \ @@ -41,7 +41,7 @@ aworld-cli evaluator run \ Task+answer files: ```bash -aworld-cli evaluator run \ +aworld-cli evaluator \ --input task_answers.jsonl \ --kind task-answer \ --judge-agent eval/answer_judge/agent.md \ @@ -53,7 +53,7 @@ The default JSONL fields are `id`, `input`, and `answer`. 
`--id-field`, `--task- Task-only files are a follow-on source kind once the framework input-source layer adds task-only source support: ```bash -aworld-cli evaluator run \ +aworld-cli evaluator \ --input tasks.jsonl \ --kind task \ --id-field task_id \ @@ -94,7 +94,7 @@ The evaluator command does not own: The implementation should follow existing CLI conventions: - keep `EvaluatorTopLevelCommand` as the command object exposed through the builtin evaluator plugin entrypoint -- add `run` as a subparser under `evaluator`, or otherwise route source-backed arguments through the same command object without creating a new top-level command +- route source-backed `--input` arguments through the same command object without creating a new top-level command - keep source-backed flow assembly in `aworld_cli.evaluator_runtime` - use `PluginManager`, `get_builtin_plugin_roots`, `load_plugin_hooks`, and `_run_evaluator_hooks` as the hook path @@ -147,19 +147,19 @@ Existing usage remains valid: aworld-cli evaluator --target ./some-target --suite app-evaluator ``` -The new `evaluator run` source path should not break `--list-suites`, `--print-report-schema`, `--validate-report`, or interactive approval behavior. +The new `evaluator --input ...` source path should not break `--list-suites`, `--print-report-schema`, `--validate-report`, or interactive approval behavior. ## Risks / Trade-offs -- [Command ambiguity] `evaluator --target` and `evaluator run --input` can coexist, but parser errors must clearly explain which mode is active. +- [Command ambiguity] `evaluator --target` and `evaluator --input` are mutually exclusive, so parser errors must clearly explain which mode is active. - [Too many flags] Field mappings are necessary for generic JSONL. Presets can reduce repeated arguments later. -- [Case-specific drift] Avoid canonical `evaluator trajectory-log`; if aliases are added later, they should delegate to `evaluator run --kind aworld-trajectory-log`. 
+- [Case-specific drift] Avoid canonical `evaluator trajectory-log`; if aliases are added later, they should delegate to `evaluator --input ... --kind aworld-trajectory-log`. - [Plugin overreach] Hook contracts must state that plugins customize CLI assembly and side effects only. ## Migration Plan 1. Land framework input sources first, including the manual test refactor. -2. Add source-run parser mode to the existing evaluator command. +2. Add source-backed `--input` parser mode to the existing evaluator command. 3. Add source-run runtime helper that calls framework APIs. 4. Extend evaluator hook event payloads with source mode fields. 5. Add CLI tests for argument validation and runtime delegation. diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/proposal.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/proposal.md index 331b8ecf8..9ce836577 100644 --- a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/proposal.md +++ b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/proposal.md @@ -10,7 +10,7 @@ The existing CLI already has an official evaluator command implemented through t ## What Changes -- Add a source-backed `aworld-cli evaluator run` mode to the existing evaluator command. +- Add a source-backed `aworld-cli evaluator --input ...` mode to the existing evaluator command. - Support source-oriented arguments: `--input`, `--kind`, optional field mappings, optional `--task-id`, `--agent`, `--judge-agent`, and output options. - Use conventional JSONL field defaults (`id`, `input`, `answer`) so simple task+answer files do not require field-mapping flags. - Keep the canonical command source-oriented rather than case-specific; trajectory-log, task-only, and task+answer are input kinds, not separate evaluator stacks. 
diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md index fe3ac6d1c..7cc91c5f0 100644 --- a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md +++ b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md @@ -1,7 +1,7 @@ ## 1. Parser And Command Shape - [x] 1.1 Confirm `aworld-evaluator-input-sources-2026-06-10` has landed before implementing source-backed CLI behavior. -- [x] 1.2 Extend the existing `EvaluatorTopLevelCommand` parser with a source-backed `run` mode. +- [x] 1.2 Extend the existing `EvaluatorTopLevelCommand` parser with source-backed `--input` mode. - [x] 1.3 Add `--input`, `--kind`, `--judge-agent`, `--out-dir`, `--output`, `--task-id`, `--agent`, and optional JSONL field mapping arguments for source mode. - [x] 1.4 Default task+answer JSONL field mappings to `id`, `input`, and `answer`. - [x] 1.5 Preserve existing `--target`, `--suite`, `--list-suites`, `--print-report-schema`, `--validate-report`, and `--interactive-approval` behavior. @@ -32,7 +32,7 @@ ## 5. Tests -- [x] 5.1 Add parser tests for source-backed `evaluator run` arguments. +- [x] 5.1 Add parser tests for source-backed `evaluator --input` arguments. - [x] 5.2 Add validation tests for required source-mode arguments and incompatible argument combinations. - [x] 5.3 Add runtime delegation tests using fake framework source helpers. - [x] 5.4 Add hook payload tests for source-backed pre-run/post-run/render events. 
diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index 2c442d43b..5d3767b6c 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -13,6 +13,7 @@ from aworld.evaluations.manifests import get_declared_eval_suite_schema from aworld.evaluations.report import EvaluatorReport from aworld_cli.evaluator_runtime import ( + _build_source_suite, _build_source_prompt, available_evaluator_suites, evaluator_exit_code, @@ -142,6 +143,54 @@ def test_run_evaluator_source_cli_rejects_unsupported_source_kind(tmp_path: Path ) +def test_trajectory_source_gate_consumes_veto_signal(tmp_path: Path) -> None: + task_id = "task-with-veto" + trajectory = [ + { + "state": {"input": {"content": "question"}, "messages": []}, + "meta": {"step": 1}, + "action": {"content": "final", "is_agent_finished": "True"}, + } + ] + input_path = tmp_path / "trajectory.log" + input_path.write_text( + repr({"task_id": task_id, "is_sub_task": False, "trajectory": json.dumps(trajectory)}) + "\n", + encoding="utf-8", + ) + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + + suite = _build_source_suite( + kind="aworld-trajectory-log", + input_path=input_path, + judge_agent_path=judge_agent, + task_id=task_id, + id_field="id", + task_field="input", + answer_field="answer", + out_dir=str(tmp_path), + ) + + pass_conditions = suite.gate_policy.normalized_conditions()[0] + assert any( + condition.metric_name == "veto_triggered" + and condition.op == "==" + and condition.threshold is False + for condition in pass_conditions + ) + decision = suite.gate_policy.evaluate( + { + "score": 95.0, + "A1_groundedness": 5, + "has_evidence": 1.0, + "agent_finished": 1.0, + "veto_triggered": True, + } + ) + assert decision.status == "fail" + assert any(condition["metric_name"] == "veto_triggered" for condition in decision.failed_conditions) + + def 
test_run_evaluator_source_cli_passes_source_fields_to_hooks( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, diff --git a/tests/core/test_evaluator_top_level_command.py b/tests/core/test_evaluator_top_level_command.py index 00d2e1d3c..acbf8b9f4 100644 --- a/tests/core/test_evaluator_top_level_command.py +++ b/tests/core/test_evaluator_top_level_command.py @@ -98,7 +98,6 @@ def fake_run_evaluator_source_cli(**kwargs): [ "aworld-cli", "evaluator", - "run", "--input", str(input_path), "--kind", @@ -159,7 +158,6 @@ def test_evaluator_source_run_rejects_target_mode_arguments( exit_code = EvaluatorTopLevelCommand().run( SimpleNamespace( - evaluator_action="run", target="artifact.txt", input="answers.jsonl", kind="task-answer", @@ -178,16 +176,16 @@ def test_evaluator_source_run_rejects_target_mode_arguments( output = capsys.readouterr().out assert exit_code == 1 - assert "--target cannot be used with evaluator run" in output + assert "--target cannot be used with --input" in output @pytest.mark.parametrize( ("arg_name", "expected"), [ - ("suite", "--suite cannot be used with evaluator run"), - ("list_suites", "--list-suites cannot be used with evaluator run"), - ("print_report_schema", "--print-report-schema cannot be used with evaluator run"), - ("validate_report", "--validate-report cannot be used with evaluator run"), + ("suite", "--suite cannot be used with --input"), + ("list_suites", "--list-suites cannot be used with --input"), + ("print_report_schema", "--print-report-schema cannot be used with --input"), + ("validate_report", "--validate-report cannot be used with --input"), ], ) def test_evaluator_source_run_rejects_other_target_mode_arguments( @@ -201,7 +199,6 @@ def test_evaluator_source_run_rejects_other_target_mode_arguments( lambda **kwargs: pytest.fail("source runtime should not be called"), ) args = { - "evaluator_action": "run", "target": None, "suite": None, "input": "answers.jsonl", @@ -231,6 +228,42 @@ def 
test_evaluator_source_run_rejects_other_target_mode_arguments( assert expected in output +def test_evaluator_source_mode_requires_kind_and_judge_agent( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli", + lambda **kwargs: pytest.fail("source runtime should not be called"), + ) + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=None, + suite=None, + input="answers.jsonl", + kind=None, + judge_agent=None, + out_dir=None, + output=None, + task_id=None, + agent=None, + id_field="id", + task_field="input", + answer_field="answer", + interactive_approval=False, + list_suites=False, + print_report_schema=False, + validate_report=None, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + assert exit_code == 1 + assert "--kind is required with --input" in output + + def test_evaluator_command_returns_nonzero_for_unresolved_approval( monkeypatch: pytest.MonkeyPatch, ) -> None: diff --git a/tests/docs/test_evaluator_report_docs.py b/tests/docs/test_evaluator_report_docs.py index a27eb9ee0..b35668b1e 100644 --- a/tests/docs/test_evaluator_report_docs.py +++ b/tests/docs/test_evaluator_report_docs.py @@ -13,7 +13,7 @@ def test_evaluator_report_command_doc_covers_schema_and_validation() -> None: assert "aworld-cli evaluator" in content assert "--print-report-schema" in content assert "--validate-report" in content - assert "aworld-cli evaluator run" in content + assert "aworld-cli evaluator --input" in content assert "--kind task-answer" in content assert "--kind aworld-trajectory-log" in content assert "report_format" in content diff --git a/tests/test_slash_commands.py b/tests/test_slash_commands.py index 28ffdb771..3036459ef 100644 --- a/tests/test_slash_commands.py +++ b/tests/test_slash_commands.py @@ -18,7 +18,7 @@ from aworld_cli.core.command_system import CommandRegistry, 
CommandContext from aworld.plugins.discovery import discover_plugins from aworld_cli.plugin_capabilities.commands import register_plugin_commands -from aworld_cli.commands import help_cmd, commit, review, diff, cron_cmd, plugins_cmd +from aworld_cli.commands import help_cmd, commit, review, diff, cron_cmd, plugins_cmd, evaluation_cmd from aworld_cli.console import AWorldCLI @@ -27,7 +27,7 @@ class TestCommandRegistration: def test_commands_registered(self): """Verify all commands are registered.""" - expected_commands = ['help', 'commit', 'review', 'diff', 'cron', 'plugins'] + expected_commands = ['help', 'commit', 'review', 'diff', 'cron', 'plugins', 'evaluation'] for cmd_name in expected_commands: cmd = CommandRegistry.get(cmd_name) assert cmd is not None, f"Command /{cmd_name} not registered" @@ -44,7 +44,7 @@ def test_command_types(self): cmd = CommandRegistry.get(cmd_name) assert cmd.command_type == 'prompt', f"/{cmd_name} should be prompt command" - for cmd_name in ['cron', 'plugins']: + for cmd_name in ['cron', 'plugins', 'evaluation']: tool_cmd = CommandRegistry.get(cmd_name) assert tool_cmd.command_type == 'tool' @@ -59,6 +59,7 @@ def test_list_commands(self): assert 'diff' in command_names assert 'cron' in command_names assert 'plugins' in command_names + assert 'evaluation' in command_names class TestHelpCommand: @@ -79,6 +80,7 @@ async def test_help_command_execution(self): assert '/review' in result assert '/diff' in result assert '/plugins' in result + assert '/evaluation' in result @pytest.mark.asyncio async def test_help_command_with_args(self): @@ -550,9 +552,74 @@ async def _drain_notifications(self, job_id=None): assert remaining[0].job_id == "job-2" +class TestEvaluationCommand: + """Test /evaluation command direct execution.""" + + @pytest.mark.asyncio + async def test_evaluation_without_args_shows_usage(self): + cmd = CommandRegistry.get("evaluation") + + result = await cmd.execute(CommandContext(cwd=os.getcwd(), user_args="")) + + assert 
"Usage:" in result + assert "/evaluation --input" in result + assert "--kind aworld-trajectory-log" in result + + @pytest.mark.asyncio + async def test_evaluation_delegates_to_source_runtime(self, monkeypatch, tmp_path): + cmd = CommandRegistry.get("evaluation") + input_path = tmp_path / "trajectory.log" + agent_path = tmp_path / "agent.md" + calls = {} + + def fake_run_evaluator_source_cli(**kwargs): + calls.update(kwargs) + return { + "suite_id": "trajectory-log-source-evaluator", + "gate": {"status": "pass"}, + "summary": {"trajectory-log-source-evaluator": {"score": {"mean": 88.0}}}, + "results": [], + "approval": {"required": False, "resolved": False, "approved": None}, + "report_path": str(tmp_path / "report.json"), + } + + monkeypatch.setattr( + "aworld_cli.commands.evaluation_cmd.run_evaluator_source_cli", + fake_run_evaluator_source_cli, + ) + + result = await cmd.execute( + CommandContext( + cwd=os.getcwd(), + user_args=( + f"--input {input_path} --kind aworld-trajectory-log " + f"--task-id task-1 --judge-agent {agent_path} --out-dir {tmp_path}" + ), + ) + ) + + assert calls["input"] == str(input_path) + assert calls["kind"] == "aworld-trajectory-log" + assert calls["task_id"] == "task-1" + assert calls["judge_agent"] == str(agent_path) + assert calls["out_dir"] == str(tmp_path) + assert "trajectory-log-source-evaluator" in result + assert "Report:" in result + + class TestSlashCommandCompletion: """Test slash command completion sources.""" + def test_console_completion_entries_include_evaluation_command(self): + cli = AWorldCLI() + + words, meta = cli._build_completion_entries(agent_names=[]) + + assert "/evaluation" in words + assert "/evaluation --kind task-answer" in words + assert "/evaluation --kind aworld-trajectory-log" in words + assert meta["/evaluation"] == "Run evaluator flows" + def test_console_completion_entries_include_cron_subcommands(self): """Typing /cron should expose concrete cron subcommands in the completer source.""" cli = 
AWorldCLI() From cfaaaf59f246ba5f3b85aae7bb1e84875342fa20 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 20:42:11 +0800 Subject: [PATCH 35/41] fix: avoid nested loop in evaluation slash command --- .../src/aworld_cli/commands/evaluation_cmd.py | 4 +- tests/test_slash_commands.py | 40 +++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py b/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py index 92b4b0e4f..1af5b5b3b 100644 --- a/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py +++ b/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py @@ -4,6 +4,7 @@ from __future__ import annotations import argparse +import asyncio import shlex from aworld_cli.core.command_system import Command, CommandContext, register_command @@ -83,7 +84,8 @@ async def execute(self, context: CommandContext) -> str: return _usage() try: - report = run_evaluator_source_cli( + report = await asyncio.to_thread( + run_evaluator_source_cli, input=args.input, kind=args.kind, judge_agent=args.judge_agent, diff --git a/tests/test_slash_commands.py b/tests/test_slash_commands.py index 3036459ef..704811c71 100644 --- a/tests/test_slash_commands.py +++ b/tests/test_slash_commands.py @@ -606,6 +606,46 @@ def fake_run_evaluator_source_cli(**kwargs): assert "trajectory-log-source-evaluator" in result assert "Report:" in result + @pytest.mark.asyncio + async def test_evaluation_runs_source_runtime_without_nested_event_loop(self, monkeypatch, tmp_path): + cmd = CommandRegistry.get("evaluation") + input_path = tmp_path / "answers.jsonl" + input_path.write_text('{"id":"case-1","input":"question","answer":"answer"}\n', encoding="utf-8") + agent_path = tmp_path / "agent.md" + agent_path.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + + async def fake_run_evaluation_flow(flow): + return { + "report_version": 1, + "report_format": {"id": "aworld.evaluator.report", "version": 1}, + "generated_at": 
"2026-06-10T00:00:00Z", + "suite_id": "source-evaluator", + "target": flow.target, + "judge_backend": {"backend_id": "source-agent-md"}, + "summary": {"source-evaluator": {"score": {"mean": 88.0}}}, + "metrics": {"score": {"mean": 88.0}}, + "results": [], + "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, + "gate": {"status": "pass", "metric_name": "score", "value": 88.0}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime._load_evaluator_hooks", lambda: {}) + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + result = await cmd.execute( + CommandContext( + cwd=os.getcwd(), + user_args=( + f"--input {input_path} --kind task-answer " + f"--judge-agent {agent_path} --output {tmp_path / 'report.json'}" + ), + ) + ) + + assert "source-evaluator" in result + assert "Report:" in result + class TestSlashCommandCompletion: """Test slash command completion sources.""" From 3aad4705616136d2340d3f14f5dcb0af07cab737 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 21:06:39 +0800 Subject: [PATCH 36/41] feat: add answer quality evaluator agent --- .../answer_quality_agent.md | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 eval/trajectory_evaluator/answer_quality_agent.md diff --git a/eval/trajectory_evaluator/answer_quality_agent.md b/eval/trajectory_evaluator/answer_quality_agent.md new file mode 100644 index 000000000..ea85ef4f2 --- /dev/null +++ b/eval/trajectory_evaluator/answer_quality_agent.md @@ -0,0 +1,116 @@ +--- +name: answer-quality-judge +description: 使用 LLM-as-judge 对「问题 ↔ 答案」对做答案质量评估(reference-free,可选参考答案)。仅评估最终答案本身的正确性、完整性、贴合度、可读性与忠实度,不评估执行过程或工具使用。输入由 evaluator 框架以 JSON 注入(case + state.answer),无需读日志、无需调用工具。 +tools: Read, Write +model: opus +--- + +# Answer Quality Evaluator(LLM-as-Judge) + +你是一名严格、以证据为准的 **答案质量评审员**。你的职责是对一条「问题 ↔ 
答案」对做可复现、可量化的评估,**只覆盖最终答案本身的质量**,不评估 agent 的执行过程、工具使用或轨迹。 + +你**就是**这里的 LLM judge:所有打分由你完成,不调用外部模型,也**不需要读取任何文件或运行任何命令**。 + +## 评估输入(由框架注入) + +evaluator 框架会以**单个 JSON 对象**作为你的输入消息,结构形如: + +```json +{ + "case": { "task_id": "...", "input": "用户的原始问题/任务" }, + "state": { "answer": "待评估的最终答案", "status": "...", "artifacts": {}, "trajectory": [], "tool_calls": [] }, + "required_output_schema": { "score": "number 0-100", "verdict": "string" }, + "instruction": "Evaluate the existing answer/state and return exactly one JSON object." +} +``` + +判据来源(按优先级): + +1. **`case.input`**:用户实际想要什么——这是判断「相关性/贴合度」和「完整性」的标尺。 +2. **`state.answer`**:被评估的答案——所有评分的对象。 +3. **参考答案(若存在)**:若 `case.input` 或 `state.artifacts` 中显式给出了 reference / 标准答案 / 验收要点,则以其为「正确性」基准;否则按 **reference-free** 处理,仅凭答案的内在一致性与常识可验证性判断。 + +> 若 `state.answer` 为空或缺失,直接判 `Fail`、`score=0`、`Q1=1`,并在 `notes` 中说明。 + +--- + +## 阶段 1 · 评分(五维,1–5 分,带锚点) + +对每个维度给出 1–5 的整数分,并**引用答案中的具体片段或问题中的具体要求**作为依据。严禁仅凭印象打分。 + +锚点统一含义:**5=优秀无明显问题 / 4=良好有小瑕疵 / 3=合格但有明确缺陷 / 2=较差影响可用性 / 1=不合格**。 + +| 维度 | 权重 | 评什么 | 扣分信号 | +|---|---|---|---| +| Q1 正确性 / Correctness | 30% | 答案中的事实性断言、计算、结论是否正确;有参考时是否与参考一致 | 与参考矛盾;事实错误;计算/逻辑错误;编造数字、引述、专有名词 | +| Q2 完整性 / Completeness | 25% | 是否覆盖问题的全部子诉求与关键要点,无关键遗漏 | 漏答子问题;只答一半;缺少必要前提/边界 | +| Q3 贴合度 / Relevance & Instruction-following | 20% | 是否回答了**实际被问的问题**,是否遵守问题中的显式约束(格式、语言、长度、口吻等) | 答非所问;主题漂移;违反明确的格式/语言/长度要求 | +| Q4 可读性 / Clarity | 15% | 组织、清晰度、长度适配、表达是否凝练无歧义 | 冗长堆砌;结构混乱;表述含混;语言与提问不一致 | +| Q5 忠实度 / Faithfulness | 10% | 是否不臆造、不过度自信;不确定处是否如实标注;有参考时是否不超出参考范围杜撰 | 把猜测当事实;编造来源;无依据的绝对化断言 | + +--- + +## 阶段 2 · 汇总与判定 + +1. 计算加权总分(百分制):`score = Σ(dim_score / 5 × weight) × 100`,四舍五入到整数。 +2. 给出等级:`≥85 Excellent / 70–84 Pass / 55–69 Marginal / <55 Fail`。 +3. **一票否决项**:若 Q1 正确性 ≤2(存在实质性事实/逻辑错误,或与参考答案直接矛盾),则置 `veto_triggered=true`,且最终 `verdict` 不得高于 `Marginal`,无论加权总分多少。 +4. 
列出 **Top-3 优点** 与 **Top-3 待改进项**,每条附答案中的证据指针与可执行的改进建议。 + +### 评判纪律(消除 judge 偏差) + +- 不因答案**更长 / 更华丽 / 更自信**而加分;只认正确性与目标贴合度。 +- 不被答案的自述(「我已确认…」「显然…」)影响——这类措辞需用问题约束与内在一致性核实。 +- 无参考时不确定真伪的事实,按「未证实」处理:可影响 Q5,但不要据此武断判 Q1 错误,除非违背常识或自相矛盾。 +- 打分**先写推理(引证),后给分数**,避免先入为主。 +- 语言中立:答案语言与问题不一致时,扣 Q3,而非据此曲解内容。 + +--- + +## 阶段 3 · 产出(返回严格 JSON) + +**你的最终回复必须是且仅是一个 JSON 对象,不要包裹 markdown 代码块、不要前后缀说明文字。** 框架会直接解析它。`score` 与 `verdict` 为框架必需字段,其余字段供报告与诊断使用: + +```json +{ + "task_id": "string", + "score": 0, + "verdict": "Excellent|Pass|Marginal|Fail", + "veto_triggered": false, + "Q1_correctness": 0, + "Q2_completeness": 0, + "Q3_relevance": 0, + "Q4_clarity": 0, + "Q5_faithfulness": 0, + "dimensions": { + "Q1_correctness": {"score": 0, "weight": 0.30, "evidence": ["..."], "rationale": "..."}, + "Q2_completeness": {"score": 0, "weight": 0.25, "evidence": ["..."], "rationale": "..."}, + "Q3_relevance": {"score": 0, "weight": 0.20, "evidence": ["..."], "rationale": "..."}, + "Q4_clarity": {"score": 0, "weight": 0.15, "evidence": ["..."], "rationale": "..."}, + "Q5_faithfulness": {"score": 0, "weight": 0.10, "evidence": ["..."], "rationale": "..."} + }, + "errors": [{"claim": "...", "why_wrong": "..."}], + "top_strengths": ["..."], + "top_improvements": [{"issue": "...", "evidence": "...", "suggestion": "..."}], + "notes": "可选:边界情况说明(如答案缺失、无参考、语言不一致等)" +} +``` + +字段约束: +- `score`:0–100 的整数,**等于**阶段 2 第 1 步算出的加权总分。 +- `verdict`:四档之一,且与 `score` 区间一致;触发一票否决时不得高于 `Marginal`。 +- `Q1..Q5` 顶层字段:与 `dimensions` 内对应 `score` 相同,便于框架直接读取。 +- `evidence`/`rationale`:引用答案或问题中的具体片段,不可空泛。 +- 报告语言与被评估答案保持一致。 + +> 仅当用户在 directive 中显式提供了 `OUT_DIR` 时,才额外用 `Write` 落一份人类可读的 `OUT_DIR/answer_eval_.md`;默认情况下**只返回上面的 JSON**,不写文件、不调工具。 + +--- + +## 执行清单(按序) + +- [ ] 解析注入的 JSON,定位 `case.input`、`state.answer`、可选参考答案 +- [ ] 答案缺失 → 直接 Fail 并返回(见输入说明) +- [ ] 五维逐项打分(先证据后分数) +- [ ] 加权汇总 + 一票否决检查 + 优缺点 +- [ ] 返回严格 JSON(score/verdict 必填,无 markdown 包裹) From a098b646641eabb321794537fcdffd06ae11313f Mon Sep 17 00:00:00 2001 
From: "wuman.wyf" Date: Wed, 10 Jun 2026 22:02:26 +0800 Subject: [PATCH 37/41] Add source-backed evaluator task flows --- .../src/aworld_cli/commands/evaluation_cmd.py | 17 +- .../src/aworld_cli/evaluator_runtime.py | 312 ++++++++++++++-- aworld/evaluations/report.py | 1 + aworld/evaluations/sources.py | 132 +++++-- docs/AWorld CLI/Commands/Evaluator.md | 34 +- .../design.md | 13 +- .../specs/cli-evaluator-flow/spec.md | 20 +- tests/core/test_evaluator_runtime.py | 351 +++++++++++++++++- .../core/test_evaluator_top_level_command.py | 63 +++- tests/docs/test_evaluator_report_docs.py | 5 +- .../test_evaluation_input_sources.py | 58 ++- .../test_trajectory_log_manual_case.py | 16 +- tests/test_slash_commands.py | 24 +- 13 files changed, 915 insertions(+), 131 deletions(-) diff --git a/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py b/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py index 1af5b5b3b..1813e3135 100644 --- a/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py +++ b/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py @@ -14,12 +14,16 @@ def _usage() -> str: return """Usage: - /evaluation --input --kind task-answer --judge-agent [--out-dir ] - /evaluation --input --kind aworld-trajectory-log --task-id --judge-agent [--out-dir ] + /evaluation --input --kind task --judge-agent [--agent ] [--out-dir ] + /evaluation --input --kind answer --judge-agent [--out-dir ] + /evaluation --input --kind trajectory --judge-agent [--agent ] [--out-dir ] + /evaluation --input --kind trajectory --task-id --judge-agent [--out-dir ] Examples: - /evaluation --input ./task_answers.jsonl --kind task-answer --judge-agent ./eval/answer_judge/agent.md - /evaluation --input ~/Documents/logs/trajectory.log --kind aworld-trajectory-log --task-id task_123 --judge-agent ./eval/trajectory_evaluator/agent.md + /evaluation --input ./tasks.jsonl --kind task --judge-agent ./eval/answer_judge/agent.md + /evaluation --input ./task_answers.jsonl --kind answer --judge-agent 
./eval/answer_judge/agent.md + /evaluation --input ./tasks.jsonl --kind trajectory --judge-agent ./eval/trajectory_evaluator/agent.md + /evaluation --input ~/Documents/logs/trajectory.log --kind trajectory --task-id task_123 --judge-agent ./eval/trajectory_evaluator/agent.md """ @@ -57,8 +61,9 @@ def command_type(self) -> str: @property def completion_items(self) -> dict[str, str]: return { - "/evaluation --kind task-answer": "Evaluate task+answer JSONL records", - "/evaluation --kind aworld-trajectory-log": "Evaluate an AWorld trajectory log task", + "/evaluation --kind task": "Run tasks with the default agent, then evaluate the produced state", + "/evaluation --kind answer": "Evaluate existing task+answer JSONL records", + "/evaluation --kind trajectory": "Evaluate generated or replayed trajectories", } async def execute(self, context: CommandContext) -> str: diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index 97c332fa3..1f6b1f704 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -3,9 +3,12 @@ import asyncio import builtins import json +import time from pathlib import Path +from typing import Any, Mapping from aworld.plugins.discovery import discover_plugins +from aworld.evaluations.execution import normalize_task_response_to_eval_state from aworld.evaluations.manifests import ( get_declared_eval_suite_schema as _get_declared_eval_suite_schema, ) @@ -25,8 +28,16 @@ describe_eval_target, run_evaluation_flow, ) -from aworld.evaluations.sources import AWorldTrajectoryLogSource, JsonlTaskAnswerSource, create_source_eval_suite +from aworld.evaluations.runtime_composition import RolloutState, RolloutTurn, derive_standard_metrics +from aworld.evaluations.sources import ( + AWorldTrajectoryLogSource, + JsonlTaskAnswerSource, + JsonlTaskSource, + create_source_eval_suite, + extract_aworld_trajectory_payload, +) from 
aworld.evaluations.trajectory_judge import TrajectoryJudgeSchema +from aworld.runner import Runners from pydantic import BaseModel from aworld_cli.core.plugin_manager import PluginManager, get_builtin_plugin_roots from aworld_cli.evaluator_rendering import render_evaluator_summary as _render_evaluator_summary @@ -38,6 +49,10 @@ from aworld_cli.plugin_capabilities.hooks import PluginHookResult, load_plugin_hooks +_CLI_AGENT_RUNTIME_BOOTSTRAPPED = False +_SUPPORTED_SOURCE_KINDS = ("task", "answer", "trajectory") + + def _sanitize_path_token(value: str) -> str: return "".join(ch if ch.isalnum() or ch in {"-", "_", "."} else "-" for ch in value).strip("-") or "target" @@ -112,6 +127,7 @@ def _build_automation_summary(report: dict) -> dict[str, object]: automation["source_kind"] = source_selection.get("kind") automation["source_input"] = source_selection.get("input") automation["task_id"] = source_selection.get("task_id") + automation["agent"] = source_selection.get("agent") return automation @@ -171,6 +187,19 @@ class _SourceJudgeOutput(BaseModel): verdict: str +def _looks_like_aworld_trajectory_log(path: Path) -> bool: + try: + with path.open(encoding="utf-8", errors="replace") as handle: + for line in handle: + stripped = line.strip() + if not stripped: + continue + return stripped.startswith("{") and "'trajectory'" in stripped and "'task_id'" in stripped + except OSError: + return False + return False + + def _source_report_path( *, input_path: Path, @@ -203,12 +232,180 @@ def _build_source_prompt(case_input: dict, target: dict, suite) -> str: return json.dumps(payload, ensure_ascii=False, indent=2) +def _case_query(case) -> str: + case_input = getattr(case, "input", {}) or {} + for key in ("input", "query", "prompt"): + if key in case_input and case_input[key] is not None: + return str(case_input[key]) + raise ValueError("task source case is missing input/query/prompt") + + +def _case_source_metadata(case) -> dict[str, Any]: + metadata = getattr(case, "metadata", 
{}) or {} + source_record = metadata.get("source_record") + if isinstance(source_record, Mapping) and isinstance(source_record.get("metadata"), Mapping): + return dict(source_record["metadata"]) + return {} + + +class _CliAgentRuntimeHarness: + def __init__(self, *, agent_name: str): + self.agent_name = agent_name + self._executor = None + + async def run_rollout(self, *, case, target: Mapping[str, Any]) -> RolloutState: + query = _case_query(case) + started_at = time.monotonic() + source_metadata = _case_source_metadata(case) + turns = [RolloutTurn(role="user", content=query)] + executor = await self._get_executor() + try: + swarm = getattr(executor, "swarm", None) + if swarm is not None: + answer = await Runners.run(input=query, swarm=swarm) + else: + answer = await executor.chat(query) + except Exception as exc: + duration_ms = int((time.monotonic() - started_at) * 1000) + state = RolloutState( + case_id=str(getattr(case, "case_id", "case")), + status="failed", + turns=turns, + trajectory=[turn.to_dict() for turn in turns], + timing={"duration_ms": duration_ms}, + error={"type": exc.__class__.__name__, "message": str(exc)}, + outcome={"has_answer": False, "agent": self.agent_name}, + metadata={**source_metadata, "agent": self.agent_name}, + ) + state.standard_metrics.update(derive_standard_metrics(state)) + return state + + duration_ms = int((time.monotonic() - started_at) * 1000) + eval_state = normalize_task_response_to_eval_state( + case_id=str(getattr(case, "case_id", "case")), + response=answer, + target=target, + metadata={**source_metadata, "agent": self.agent_name}, + ) + assistant_turn = RolloutTurn(role="assistant", content=eval_state.answer) + turns.append(assistant_turn) + trajectory = list(eval_state.trajectory) or [turn.to_dict() for turn in turns] + extracted_trajectory = {} + if trajectory: + try: + extracted_trajectory = extract_aworld_trajectory_payload( + trajectory, + task_id=eval_state.case_id, + is_sub_task=False, + ) + except Exception: + 
extracted_trajectory = {} + evidence_blocks = len(extracted_trajectory.get("evidence") or []) + is_finished = any( + bool(step.get("is_agent_finished")) + for step in extracted_trajectory.get("steps", []) + if isinstance(step, Mapping) + ) + state = RolloutState( + case_id=eval_state.case_id, + status=eval_state.status, + answer=eval_state.answer, + turns=turns, + trajectory=trajectory, + tool_calls=list(eval_state.tool_calls), + usage=dict(eval_state.usage), + timing={**dict(eval_state.timing), "duration_ms": duration_ms}, + error=eval_state.error, + outcome={ + "has_answer": eval_state.answer is not None, + "agent": self.agent_name, + "task_id": eval_state.case_id, + "question": query, + "evidence_blocks": evidence_blocks, + "num_steps": len(trajectory), + "is_finished": is_finished or eval_state.status == "success", + "final_answer_len": len(str(eval_state.answer or "")), + }, + metadata=dict(eval_state.metadata), + ) + state.standard_metrics.update(derive_standard_metrics(state)) + return state + + async def _get_executor(self): + if self._executor is None: + self._executor = await _load_cli_agent_executor(self.agent_name) + return self._executor + + +def _build_cli_agent_runtime_harness(*, agent_name: str): + return _CliAgentRuntimeHarness(agent_name=agent_name) + + +async def _load_cli_agent_executor(agent_name: str): + from aworld.core.scheduler import get_scheduler + from aworld_cli.main import _resolve_agent_dirs + from aworld_cli.runtime.cli import CliRuntime + + _ensure_cli_agent_runtime_bootstrapped() + runtime = CliRuntime( + agent_name=agent_name, + local_dirs=_resolve_agent_dirs(None), + disable_live_display=True, + ) + all_agents = await runtime._load_agents() + selected_agent = next((item for item in all_agents if item.name == agent_name), None) + if selected_agent is None: + available = ", ".join(sorted(item.name for item in all_agents)) or "none" + raise ValueError(f"agent '{agent_name}' not found; available agents: {available}") + + 
runtime._scheduler = get_scheduler() + runtime._bind_scheduler_default_agent(selected_agent.name) + executor = await runtime._create_executor(selected_agent) + if executor is None: + raise ValueError(f"failed to create executor for agent '{agent_name}'") + executor._base_runtime = runtime + executor._suppress_interactive_loading_status = True + return executor + + +def _ensure_cli_agent_runtime_bootstrapped() -> None: + global _CLI_AGENT_RUNTIME_BOOTSTRAPPED + if _CLI_AGENT_RUNTIME_BOOTSTRAPPED: + return + from aworld_cli.main import _show_banner, init_middlewares + from aworld_cli.runtime_bootstrap import RuntimeBootstrapError, bootstrap_runtime + + try: + bootstrap_runtime( + env_file=".env", + skill_paths=None, + show_banner=False, + init_middlewares_fn=init_middlewares, + show_banner_fn=_show_banner, + ) + except RuntimeBootstrapError as exc: + raise ValueError(str(exc)) from exc + _CLI_AGENT_RUNTIME_BOOTSTRAPPED = True + + def _build_trajectory_prompt(case_input: dict, target: dict, suite) -> str: outcome = (target.get("artifacts") or {}).get("outcome") or {} extracted_path = outcome.get("extracted_path") extracted_payload = {} if extracted_path: extracted_payload = json.loads(Path(str(extracted_path)).read_text(encoding="utf-8")) + elif isinstance(target.get("trajectory"), list) and target.get("trajectory"): + task_id = str(target.get("case_id") or case_input.get("id") or case_input.get("input_id") or case_input.get("_case_id") or "case") + extracted_payload = extract_aworld_trajectory_payload( + target["trajectory"], + task_id=task_id, + is_sub_task=False, + ) + if not extracted_payload.get("final_answer") and target.get("answer") is not None: + extracted_payload["final_answer"] = target.get("answer") + case_value = case_input.get("input") or case_input.get("query") or case_input.get("prompt") + if not extracted_payload.get("question") and case_value is not None: + extracted_payload["question"] = str(case_value) payload = { "case": {key: value for key, value 
in case_input.items() if not str(key).startswith("_")}, "extracted_trajectory": extracted_payload, @@ -244,17 +441,44 @@ def _build_source_suite( task_field: str, answer_field: str, out_dir: str | None, + agent: str | None = None, ): - if kind == "task-answer": - source = JsonlTaskAnswerSource( + agent_name = agent or "Aworld" + trajectory_gate = GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=70.0), + GateMetricCondition(metric_name="A1_groundedness", op=">=", threshold=3), + GateMetricCondition(metric_name="veto_triggered", op="==", threshold=False), + GateMetricCondition(metric_name="has_evidence", op="==", threshold=1.0), + GateMetricCondition(metric_name="agent_finished", op="==", threshold=1.0), + ) + ) + trajectory_outcome_scorers = ( + StateCheckGrader( + metric_name="has_evidence", + source="outcome", + path=("evidence_blocks",), + op=">", + expected=0, + ), + StateCheckGrader( + metric_name="agent_finished", + source="outcome", + path=("is_finished",), + op="==", + expected=True, + ), + ) + if kind == "task": + source = JsonlTaskSource( path=input_path, id_field=id_field, input_field=task_field, - answer_field=answer_field, ) return create_source_eval_suite( - suite_id="source-evaluator", + suite_id="task-source-evaluator", source=source, + runtime_harness=_build_cli_agent_runtime_harness(agent_name=agent_name), judge_backend=AgentJudgeBackend.from_agent_markdown( judge_agent_path, backend_id="source-agent-md", @@ -262,53 +486,59 @@ def _build_source_suite( ), judge_schema=JudgeSchemaDef(output_model=_SourceJudgeOutput), gate_policy=GatePolicyDef(metric_name="score", pass_threshold=70.0), + metadata={"agent": agent_name}, ) - if kind == "aworld-trajectory-log": - if not task_id: - raise ValueError("--task-id is required for aworld-trajectory-log source") - source = AWorldTrajectoryLogSource( + if kind == "answer": + source = JsonlTaskAnswerSource( path=input_path, - task_ids=[task_id], - extraction_dir=out_dir, + 
id_field=id_field, + input_field=task_field, + answer_field=answer_field, ) return create_source_eval_suite( - suite_id="trajectory-log-source-evaluator", + suite_id="answer-source-evaluator", source=source, + judge_backend=AgentJudgeBackend.from_agent_markdown( + judge_agent_path, + backend_id="source-agent-md", + prompt_builder=_build_source_prompt, + ), + judge_schema=JudgeSchemaDef(output_model=_SourceJudgeOutput), + gate_policy=GatePolicyDef(metric_name="score", pass_threshold=70.0), + ) + + if kind == "trajectory": + if task_id or _looks_like_aworld_trajectory_log(input_path): + source = AWorldTrajectoryLogSource( + path=input_path, + task_ids=[task_id] if task_id else None, + extraction_dir=out_dir, + ) + runtime_harness = None + else: + source = JsonlTaskSource( + path=input_path, + id_field=id_field, + input_field=task_field, + ) + runtime_harness = _build_cli_agent_runtime_harness(agent_name=agent_name) + return create_source_eval_suite( + suite_id="trajectory-source-evaluator", + source=source, + runtime_harness=runtime_harness, judge_backend=AgentJudgeBackend.from_agent_markdown( judge_agent_path, backend_id="trajectory-evaluator-agent-md", prompt_builder=_build_trajectory_prompt, ), judge_schema=TrajectoryJudgeSchema.default(), - outcome_scorers=( - StateCheckGrader( - metric_name="has_evidence", - source="outcome", - path=("evidence_blocks",), - op=">", - expected=0, - ), - StateCheckGrader( - metric_name="agent_finished", - source="outcome", - path=("is_finished",), - op="==", - expected=True, - ), - ), - gate_policy=GatePolicyDef( - pass_all=( - GateMetricCondition(metric_name="score", op=">=", threshold=70.0), - GateMetricCondition(metric_name="A1_groundedness", op=">=", threshold=3), - GateMetricCondition(metric_name="veto_triggered", op="==", threshold=False), - GateMetricCondition(metric_name="has_evidence", op="==", threshold=1.0), - GateMetricCondition(metric_name="agent_finished", op="==", threshold=1.0), - ) - ), + 
outcome_scorers=trajectory_outcome_scorers, + gate_policy=trajectory_gate, + metadata={"agent": agent_name} if not task_id else None, ) - raise ValueError(f"unsupported source kind: {kind}") + raise ValueError(f"unsupported source kind: {kind}; expected one of: {', '.join(_SUPPORTED_SOURCE_KINDS)}") def run_evaluator_source_cli( @@ -326,6 +556,7 @@ def run_evaluator_source_cli( interactive_approval: bool = False, ) -> dict: hooks = _load_evaluator_hooks() + kind = (kind or "").strip().lower() input_path = Path(input).expanduser().resolve() if not input_path.exists(): raise FileNotFoundError(f"source input does not exist: {input_path}") @@ -367,13 +598,17 @@ def run_evaluator_source_cli( task_field=task_field, answer_field=answer_field, out_dir=out_dir, + agent=agent, ) + agent_name = agent or "Aworld" + executes_agent = kind == "task" or (kind == "trajectory" and not task_id) target_info = { "target_kind": "source", "target_path": str(input_path), "source_kind": kind, "task_id": task_id, "judge_agent": str(judge_agent_path), + "agent": agent_name if executes_agent else agent, } for key, value in hook_state.items(): if key not in {"mode", "input", "kind", "task_id", "judge_agent", "agent", "interactive_approval", "summary_suffix"}: @@ -402,6 +637,7 @@ def run_evaluator_source_cli( "kind": kind, "task_id": task_id, "judge_agent": str(judge_agent_path), + "agent": agent_name if executes_agent else agent, } report["automation"] = _build_automation_summary(report) output_path = _source_report_path( diff --git a/aworld/evaluations/report.py b/aworld/evaluations/report.py index 1b9efa666..49749784f 100644 --- a/aworld/evaluations/report.py +++ b/aworld/evaluations/report.py @@ -145,6 +145,7 @@ def get_evaluator_report_schema() -> dict[str, object]: "source_kind": {"type": ["string", "null"]}, "source_input": {"type": ["string", "null"]}, "task_id": {"type": ["string", "null"]}, + "agent": {"type": ["string", "null"]}, }, "additionalProperties": False, }, diff --git 
a/aworld/evaluations/sources.py b/aworld/evaluations/sources.py index 67eaddfa4..23d2c16c5 100644 --- a/aworld/evaluations/sources.py +++ b/aworld/evaluations/sources.py @@ -122,7 +122,7 @@ def iter_records(self) -> Iterable[EvalSourceRecord]: metadata = {} if self.metadata_field is not None and isinstance(payload.get(self.metadata_field), Mapping): metadata.update(dict(payload[self.metadata_field])) - metadata.update({"source_kind": "task-answer", "source_path": str(path), "line_number": line_number}) + metadata.update({"source_kind": "answer", "source_path": str(path), "line_number": line_number}) expected = payload.get(self.expected_field) if self.expected_field else None yield EvalSourceRecord( case_id=str(payload[self.id_field]), @@ -139,6 +139,44 @@ def default_adapter(self): return AnswerStateAdapter() +@dataclass(frozen=True) +class JsonlTaskSource(_BaseEvalSource): + path: str | Path + id_field: str = "id" + input_field: str = "input" + expected_field: str | None = None + metadata_field: str | None = None + + def iter_records(self) -> Iterable[EvalSourceRecord]: + path = Path(self.path).expanduser() + with path.open(encoding="utf-8") as handle: + for line_number, line in enumerate(handle, start=1): + stripped = line.strip() + if not stripped: + continue + payload = json.loads(stripped) + if not isinstance(payload, Mapping): + raise ValueError(f"{path}:{line_number} must contain a JSON object") + for field_name in (self.id_field, self.input_field): + if field_name not in payload: + raise ValueError(f"{path}:{line_number} missing required field: {field_name}") + metadata = {} + if self.metadata_field is not None and isinstance(payload.get(self.metadata_field), Mapping): + metadata.update(dict(payload[self.metadata_field])) + metadata.update({"source_kind": "task", "source_path": str(path), "line_number": line_number}) + expected = payload.get(self.expected_field) if self.expected_field else None + yield EvalSourceRecord( + 
case_id=str(payload[self.id_field]), + input={"input": payload[self.input_field]}, + expected=expected, + metadata=metadata, + raw_payload=dict(payload), + ) + + def default_adapter(self): + raise ValueError("task source requires a runtime_harness") + + def _truthy_string(value: Any) -> bool: return str(value).strip().lower() in {"true", "1", "yes"} @@ -154,20 +192,13 @@ def _tool_calls_from_action(action: Mapping[str, Any]) -> list[dict[str, Any]]: return calls -def extract_aworld_trajectory_record(log_path: str | Path, task_id: str) -> dict[str, Any]: - path = Path(log_path).expanduser() - target_line = None - with path.open(encoding="utf-8", errors="replace") as handle: - for line in handle: - if task_id in line: - target_line = line - break - if target_line is None: - raise ValueError(f"task_id {task_id} not found in {path}") - - clean = re.sub(r"\x1b\[[0-9;]*m", "", target_line).strip() - record = ast.literal_eval(clean) - trajectory = json.loads(record["trajectory"]) +def extract_aworld_trajectory_payload( + trajectory: Iterable[Mapping[str, Any]], + *, + task_id: str, + is_sub_task: Any | None = None, +) -> dict[str, Any]: + trajectory = list(trajectory) if not isinstance(trajectory, list): raise ValueError(f"task_id {task_id} trajectory must be a list") @@ -215,7 +246,7 @@ def extract_aworld_trajectory_record(log_path: str | Path, task_id: str) -> dict return { "task_id": task_id, - "is_sub_task": record.get("is_sub_task"), + "is_sub_task": is_sub_task, "num_steps": len(trajectory), "question": question, "system_prompt_excerpt": system_prompt[:8000], @@ -225,23 +256,70 @@ def extract_aworld_trajectory_record(log_path: str | Path, task_id: str) -> dict } +def _parse_aworld_trajectory_log_line(line: str) -> Mapping[str, Any]: + clean = re.sub(r"\x1b\[[0-9;]*m", "", line).strip() + record = ast.literal_eval(clean) + if not isinstance(record, Mapping): + raise ValueError("trajectory log line must contain a mapping") + return record + + +def 
_extract_aworld_trajectory_record_payload(record: Mapping[str, Any], *, task_id: str) -> dict[str, Any]: + trajectory = json.loads(record["trajectory"]) + return extract_aworld_trajectory_payload( + trajectory, + task_id=task_id, + is_sub_task=record.get("is_sub_task"), + ) + + +def iter_aworld_trajectory_records(log_path: str | Path) -> Iterable[tuple[str, dict[str, Any]]]: + path = Path(log_path).expanduser() + with path.open(encoding="utf-8", errors="replace") as handle: + for line_number, line in enumerate(handle, start=1): + if not line.strip(): + continue + try: + record = _parse_aworld_trajectory_log_line(line) + except (SyntaxError, ValueError) as exc: + raise ValueError(f"{path}:{line_number} is not a valid AWorld trajectory log record") from exc + task_id = record.get("task_id") + if task_id is None: + raise ValueError(f"{path}:{line_number} missing required field: task_id") + yield str(task_id), _extract_aworld_trajectory_record_payload(record, task_id=str(task_id)) + + +def extract_aworld_trajectory_record(log_path: str | Path, task_id: str) -> dict[str, Any]: + path = Path(log_path).expanduser() + with path.open(encoding="utf-8", errors="replace") as handle: + for line in handle: + if task_id not in line: + continue + record = _parse_aworld_trajectory_log_line(line) + if str(record.get("task_id")) == str(task_id): + return _extract_aworld_trajectory_record_payload(record, task_id=str(task_id)) + raise ValueError(f"task_id {task_id} not found in {path}") + + @dataclass(frozen=True) class AWorldTrajectoryLogSource(_BaseEvalSource): path: str | Path - task_ids: Iterable[str] + task_ids: Iterable[str] | None extraction_dir: str | Path | None = None def iter_records(self) -> Iterable[EvalSourceRecord]: path = Path(self.path).expanduser() - for task_id in self.task_ids: - task_id = str(task_id) - extracted = extract_aworld_trajectory_record(path, task_id) + items = iter_aworld_trajectory_records(path) if self.task_ids is None else ( + (str(task_id), 
extract_aworld_trajectory_record(path, str(task_id))) + for task_id in self.task_ids + ) + for task_id, extracted in items: yield EvalSourceRecord( case_id=task_id, input={"task_id": task_id, "trajectory_log": str(path)}, answer=extracted.get("final_answer"), metadata={ - "source_kind": "aworld-trajectory-log", + "source_kind": "trajectory", "source_path": str(path), "extraction_dir": str(Path(self.extraction_dir).expanduser()) if self.extraction_dir else None, }, @@ -262,6 +340,7 @@ def create_source_eval_suite( judge_schema, gate_policy=None, state_adapter=None, + runtime_harness=None, outcome_scorers=tuple(), reward_metrics=tuple(), standard_metrics=tuple(), @@ -272,13 +351,16 @@ def create_source_eval_suite( from aworld.evaluations.substrate import EvalSuiteDef records = list(source.iter_records()) - adapter = state_adapter - if adapter is None: - adapter = source.default_adapter() + harness = runtime_harness + if harness is None: + adapter = state_adapter + if adapter is None: + adapter = source.default_adapter() + harness = ReplayRuntimeHarness(adapter=adapter, records=tuple(records)) return EvalSuiteDef( suite_id=suite_id, cases=[record.to_case() for record in records], - runtime_harness=ReplayRuntimeHarness(adapter=adapter, records=tuple(records)), + runtime_harness=harness, judge_backend=judge_backend, judge_schema=judge_schema, gate_policy=gate_policy, diff --git a/docs/AWorld CLI/Commands/Evaluator.md b/docs/AWorld CLI/Commands/Evaluator.md index 108884cb6..e508342e7 100644 --- a/docs/AWorld CLI/Commands/Evaluator.md +++ b/docs/AWorld CLI/Commands/Evaluator.md @@ -12,7 +12,7 @@ Use it when you want to: - run a built-in evaluator suite such as `app-evaluator` - load declaration-backed evaluator suites from workspace manifests -- evaluate existing source records such as task+answer JSONL files or AWorld trajectory logs +- evaluate task JSONL files, existing task+answer JSONL files, or AWorld trajectory logs - inspect which suites match a target - export 
the evaluator report schema - validate a saved evaluator report in automation @@ -33,28 +33,48 @@ aworld-cli evaluator --validate-report ./.aworld/evaluations/artifact.app-evalua Source-backed usage: ```bash +aworld-cli evaluator \ + --input ./tasks.jsonl \ + --kind task \ + --judge-agent ./eval/answer_judge/agent.md \ + --out-dir ./reports + aworld-cli evaluator \ --input ./task_answers.jsonl \ - --kind task-answer \ + --kind answer \ --judge-agent ./eval/answer_judge/agent.md \ --out-dir ./reports aworld-cli evaluator \ --input ~/Documents/logs/trajectory.log \ - --kind aworld-trajectory-log \ + --kind trajectory \ --task-id task_20260609193335 \ --judge-agent ./eval/trajectory_evaluator/agent.md \ --out-dir ./reports + +aworld-cli evaluator \ + --input ~/Documents/logs/trajectory.log \ + --kind trajectory \ + --judge-agent ./eval/trajectory_evaluator/agent.md \ + --out-dir ./reports + +aworld-cli evaluator \ + --input ./tasks.jsonl \ + --kind trajectory \ + --judge-agent ./eval/trajectory_evaluator/agent.md \ + --out-dir ./reports ``` -For `task-answer` JSONL inputs, the default fields are `id`, `input`, and `answer`. Use `--id-field`, `--task-field`, and `--answer-field` only when the file uses different names. +For `task` JSONL inputs, the default fields are `id` and `input`; the evaluator runs each task through the CLI default `Aworld` agent unless `--agent` is supplied. For `trajectory`, passing `--task-id` replays one task from an existing AWorld trajectory log, omitting `--task-id` with a trajectory log replays all tasks in that log, and omitting `--task-id` with task JSONL runs the main agent, extracts the response trajectory, and evaluates that generated trajectory. For `answer` JSONL inputs, the default fields are `id`, `input`, and `answer`. Use `--id-field`, `--task-field`, and `--answer-field` only when the file uses different names. 
Useful options: ```bash aworld-cli evaluator --target ./artifact --output ./report.json aworld-cli evaluator --target ./artifact --interactive-approval -aworld-cli evaluator --input ./task_answers.jsonl --kind task-answer --judge-agent ./agent.md --output ./report.json +aworld-cli evaluator --input ./tasks.jsonl --kind task --judge-agent ./agent.md --agent Aworld --output ./report.json +aworld-cli evaluator --input ./tasks.jsonl --kind trajectory --judge-agent ./trajectory_agent.md --output ./report.json +aworld-cli evaluator --input ./task_answers.jsonl --kind answer --judge-agent ./agent.md --output ./report.json ``` ## Declared Suite Manifests @@ -153,7 +173,7 @@ See [evaluator_report.example.json](/Users/wuman/Documents/workspace/aworld-mas/ 1. Inspect matching suites with `aworld-cli evaluator --list-suites --target ./artifact`. 2. Run evaluation with `aworld-cli evaluator --target ./artifact`. -3. For existing outputs, run source-backed evaluation with `aworld-cli evaluator --input --kind task-answer --judge-agent `. +3. For task-only inputs, run source-backed execution and evaluation with `aworld-cli evaluator --input --kind task --judge-agent `. 4. Save or collect the emitted JSON report. 5. Validate persisted reports with `aworld-cli evaluator --validate-report `. 6. Export the current JSON Schema with `aworld-cli evaluator --print-report-schema` when integrating with external tooling. @@ -172,5 +192,5 @@ See [evaluator_report.example.json](/Users/wuman/Documents/workspace/aworld-mas/ - declared suite manifests currently layer on `app-evaluator` only; they are not a generic suite authoring format yet. - `--print-report-schema` prints the current JSON Schema for `aworld.evaluator.report`. - `--validate-report` validates an existing JSON report against that schema without re-running evaluation. 
-- `aworld-cli evaluator --input ...` currently supports `task-answer` and `aworld-trajectory-log`; task-only execution sources and generic serialized-state sources are intentionally deferred until the framework provides those source kinds. +- `aworld-cli evaluator --input ...` currently supports `task`, `answer`, and `trajectory`; generic serialized-state sources are intentionally deferred until the framework provides those source kinds. - the CLI command is an assembly/product layer; reusable evaluator building blocks stay in `aworld/evaluations/**`. diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md index 8e1a2b4f1..dd9bc2b03 100644 --- a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md +++ b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md @@ -32,7 +32,7 @@ The canonical source-backed path should be: ```bash aworld-cli evaluator \ --input ~/Documents/logs/trajectory.log \ - --kind aworld-trajectory-log \ + --kind trajectory \ --task-id task_20260609193335 \ --judge-agent eval/trajectory_evaluator/agent.md \ --out-dir eval/trajectory_evaluator/reports @@ -43,14 +43,14 @@ Task+answer files: ```bash aworld-cli evaluator \ --input task_answers.jsonl \ - --kind task-answer \ + --kind answer \ --judge-agent eval/answer_judge/agent.md \ --out-dir reports ``` The default JSONL fields are `id`, `input`, and `answer`. `--id-field`, `--task-field`, and `--answer-field` are override flags for files that do not follow that convention. 
-Task-only files are a follow-on source kind once the framework input-source layer adds task-only source support: +Task-only files are supported once the framework input-source layer exposes `JsonlTaskSource`; the CLI wires them to the default `Aworld` agent unless `--agent` is supplied: ```bash aworld-cli evaluator \ @@ -58,13 +58,16 @@ aworld-cli evaluator \ --kind task \ --id-field task_id \ --task-field task \ - --agent ./agent.md \ + --agent Aworld \ --judge-agent eval/answer_judge/agent.md \ --out-dir reports ``` `--kind auto` can be added once detection is reliable, but the first version should require explicit `--kind` to keep failures predictable. +For `--kind trajectory`, `--task-id` selects the existing-log replay path. When `--task-id` is omitted, the input is treated as task JSONL: the CLI runs the task through the default or specified main agent, extracts the AWorld response trajectory, and feeds that generated trajectory into the trajectory judge prompt. +If the input is an AWorld trajectory log and `--task-id` is omitted, the CLI uses the framework trajectory-log source to replay all task records in that log. This makes all-task replay explicit through the `trajectory` kind while keeping trajectory parsing in the framework source layer. + ## CLI Boundary The evaluator command owns: @@ -153,7 +156,7 @@ The new `evaluator --input ...` source path should not break `--list-suites`, `- - [Command ambiguity] `evaluator --target` and `evaluator --input` are mutually exclusive, so parser errors must clearly explain which mode is active. - [Too many flags] Field mappings are necessary for generic JSONL. Presets can reduce repeated arguments later. -- [Case-specific drift] Avoid canonical `evaluator trajectory-log`; if aliases are added later, they should delegate to `evaluator --input ... --kind aworld-trajectory-log`. 
+- [Case-specific drift] Avoid canonical `evaluator trajectory-log`; if aliases are added later, they should delegate to `evaluator --input ... --kind trajectory`. - [Plugin overreach] Hook contracts must state that plugins customize CLI assembly and side effects only. ## Migration Plan diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/specs/cli-evaluator-flow/spec.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/specs/cli-evaluator-flow/spec.md index d50dacc06..aaba466eb 100644 --- a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/specs/cli-evaluator-flow/spec.md +++ b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/specs/cli-evaluator-flow/spec.md @@ -59,15 +59,27 @@ The evaluator command SHALL document the event payloads, mutable state surface, The evaluator command SHALL provide a source-backed run mode that accepts an input path, source kind, optional field mappings, optional task filters, optional execution agent, and judge agent configuration. 
#### Scenario: User evaluates an AWorld trajectory log -- **WHEN** a user runs the evaluator with `--input`, `--kind aworld-trajectory-log`, `--task-id`, and `--judge-agent` +- **WHEN** a user runs the evaluator with `--input`, `--kind trajectory`, `--task-id`, and `--judge-agent` - **THEN** the CLI SHALL use framework trajectory-log source and replay adapters to evaluate the selected task without implementing trajectory parsing in CLI code +#### Scenario: User evaluates every task in an AWorld trajectory log +- **WHEN** a user runs the evaluator with AWorld trajectory log `--input`, `--kind trajectory`, and `--judge-agent` without `--task-id` +- **THEN** the CLI SHALL use framework trajectory-log source and replay adapters to evaluate every task record in that log without executing the main agent + +#### Scenario: User evaluates generated trajectory from task input +- **WHEN** a user runs the evaluator with task JSONL `--input`, `--kind trajectory`, and `--judge-agent` without `--task-id` +- **THEN** the CLI SHALL run each task through the CLI default `Aworld` agent unless `--agent` is provided, extract the trajectory from the AWorld response, and evaluate that generated trajectory with the trajectory judge flow + #### Scenario: User evaluates task and answer records -- **WHEN** a user runs the evaluator with `--input`, `--kind task-answer`, and `--judge-agent` +- **WHEN** a user runs the evaluator with `--input`, `--kind answer`, and `--judge-agent` - **THEN** the CLI SHALL use framework task+answer source and answer-state adapters to evaluate existing answers without re-executing the target +#### Scenario: User evaluates task-only records through the default agent +- **WHEN** a user runs the evaluator with `--input`, `--kind task`, and `--judge-agent` and omits `--agent` +- **THEN** the CLI SHALL use the framework task source, execute each task through the CLI default `Aworld` agent, convert the produced output into evaluation state, and evaluate that state with 
the judge agent + #### Scenario: User overrides task and answer field names -- **WHEN** a user runs the evaluator with `--kind task-answer` and custom field mapping flags +- **WHEN** a user runs the evaluator with `--kind answer` and custom field mapping flags - **THEN** the CLI SHALL pass those mappings to the framework source while defaulting omitted mappings to `id`, `input`, and `answer` #### Scenario: User requests a deferred source kind @@ -79,7 +91,7 @@ The evaluator command SHALL provide a source-backed run mode that accepts an inp The evaluator CLI SHALL treat source kinds as input adapters under a single canonical source-backed command path rather than creating independent evaluator stacks for each source format. #### Scenario: Source kind selects adapter -- **WHEN** a user specifies a supported source kind such as `task-answer` or `aworld-trajectory-log` +- **WHEN** a user specifies a supported source kind such as `task`, `answer`, or `trajectory` - **THEN** the CLI SHALL select the matching framework source adapter while preserving the same evaluation flow and report semantics #### Scenario: Source kind is not yet supported by framework diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index 5d3767b6c..9c2cecd3d 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -4,6 +4,7 @@ import json import sys from pathlib import Path +from types import SimpleNamespace import pytest @@ -13,8 +14,10 @@ from aworld.evaluations.manifests import get_declared_eval_suite_schema from aworld.evaluations.report import EvaluatorReport from aworld_cli.evaluator_runtime import ( + _CliAgentRuntimeHarness, _build_source_suite, _build_source_prompt, + _build_trajectory_prompt, available_evaluator_suites, evaluator_exit_code, get_declared_evaluator_suite_schema, @@ -90,9 +93,9 @@ async def fake_run_evaluation_flow(flow): captured["flow"] = flow return { "report_version": 1, - "suite_id": 
"source-evaluator", + "suite_id": "answer-source-evaluator", "judge_backend": {"backend_id": "source-agent-md"}, - "summary": {"source-evaluator": {"score": {"mean": 0.9}}}, + "summary": {"answer-source-evaluator": {"score": {"mean": 0.9}}}, "results": [], "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, "approval": {"required": False, "resolved": False, "approved": None}, @@ -102,22 +105,211 @@ async def fake_run_evaluation_flow(flow): report = run_evaluator_source_cli( input=str(input_path), - kind="task-answer", + kind="answer", judge_agent=str(judge_agent), output=str(output), ) flow = captured["flow"] assert flow.target["target_kind"] == "source" - assert flow.target["source_kind"] == "task-answer" + assert flow.target["source_kind"] == "answer" assert flow.suite.cases[0].case_id == "case-1" assert flow.suite.cases[0].input == {"input": "question"} assert flow.suite.judge_backend.backend_id == "source-agent-md" - assert report["source_selection"]["kind"] == "task-answer" - assert report["automation"]["source_kind"] == "task-answer" + assert report["source_selection"]["kind"] == "answer" + assert report["automation"]["source_kind"] == "answer" assert output.exists() +def test_run_evaluator_source_cli_builds_task_flow_with_default_agent( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "tasks.jsonl" + input_path.write_text('{"id":"case-1","input":"question"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + captured = {} + + class FakeHarness: + pass + + def fake_build_cli_agent_runtime_harness(*, agent_name): + captured["agent_name"] = agent_name + return FakeHarness() + + async def fake_run_evaluation_flow(flow): + captured["flow"] = flow + return { + "report_version": 1, + "suite_id": "task-source-evaluator", + "judge_backend": {"backend_id": "source-agent-md"}, + "summary": {"task-source-evaluator": {"score": 
{"mean": 0.9}}}, + "results": [], + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr( + "aworld_cli.evaluator_runtime._build_cli_agent_runtime_harness", + fake_build_cli_agent_runtime_harness, + ) + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_source_cli( + input=str(input_path), + kind="task", + judge_agent=str(judge_agent), + output=str(tmp_path / "report.json"), + ) + + flow = captured["flow"] + assert captured["agent_name"] == "Aworld" + assert flow.target["source_kind"] == "task" + assert flow.target["agent"] == "Aworld" + assert flow.suite.cases[0].case_id == "case-1" + assert flow.suite.cases[0].input == {"input": "question"} + assert flow.suite.runtime_harness is not None + assert report["source_selection"]["kind"] == "task" + assert report["source_selection"]["agent"] == "Aworld" + assert report["automation"]["source_kind"] == "task" + + +def test_run_evaluator_source_cli_builds_generated_trajectory_flow_with_default_agent( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "tasks.jsonl" + input_path.write_text('{"id":"case-1","input":"question"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + captured = {} + + class FakeHarness: + pass + + def fake_build_cli_agent_runtime_harness(*, agent_name): + captured["agent_name"] = agent_name + return FakeHarness() + + async def fake_run_evaluation_flow(flow): + captured["flow"] = flow + return { + "report_version": 1, + "suite_id": "trajectory-source-evaluator", + "judge_backend": {"backend_id": "trajectory-evaluator-agent-md"}, + "summary": {"trajectory-source-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + "gate": {"status": "pass", "metric_name": "score", "value": 0.9}, + 
"approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr( + "aworld_cli.evaluator_runtime._build_cli_agent_runtime_harness", + fake_build_cli_agent_runtime_harness, + ) + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_source_cli( + input=str(input_path), + kind="trajectory", + judge_agent=str(judge_agent), + output=str(tmp_path / "report.json"), + ) + + flow = captured["flow"] + assert captured["agent_name"] == "Aworld" + assert flow.target["source_kind"] == "trajectory" + assert flow.target["agent"] == "Aworld" + assert flow.suite.cases[0].case_id == "case-1" + assert flow.suite.cases[0].input == {"input": "question"} + assert report["source_selection"]["kind"] == "trajectory" + assert report["source_selection"]["agent"] == "Aworld" + + +@pytest.mark.asyncio +async def test_cli_agent_runtime_harness_returns_rollout_state( + monkeypatch: pytest.MonkeyPatch, +) -> None: + class FakeExecutor: + async def chat(self, query): + return f"answer for {query}" + + async def fake_load_cli_agent_executor(agent_name): + assert agent_name == "Aworld" + return FakeExecutor() + + monkeypatch.setattr( + "aworld_cli.evaluator_runtime._load_cli_agent_executor", + fake_load_cli_agent_executor, + ) + + case = SimpleNamespace( + case_id="case-1", + input={"input": "question"}, + metadata={ + "source_record": { + "metadata": {"source_kind": "task", "source_path": "tasks.jsonl"}, + }, + }, + ) + state = await _CliAgentRuntimeHarness(agent_name="Aworld").run_rollout( + case=case, + target={"source_kind": "task"}, + ) + + assert state.status == "success" + assert state.answer == "answer for question" + assert state.outcome["has_answer"] is True + assert state.metadata["agent"] == "Aworld" + assert state.metadata["source_kind"] == "task" + assert state.standard_metrics["n_turns"] == 2 + + +@pytest.mark.asyncio +async def 
test_cli_agent_runtime_harness_prefers_swarm_task_response( + monkeypatch: pytest.MonkeyPatch, +) -> None: + class FakeSwarm: + pass + + class FakeExecutor: + swarm = FakeSwarm() + + async def chat(self, query): + raise AssertionError("chat fallback should not be used for local swarm executors") + + async def fake_load_cli_agent_executor(agent_name): + return FakeExecutor() + + async def fake_run(*, input, swarm): + assert input == "question" + assert isinstance(swarm, FakeSwarm) + return { + "answer": "answer with tools", + "trajectory": [{"tool_calls": [{"name": "search"}]}], + "usage": {"total_tokens": 12}, + } + + monkeypatch.setattr( + "aworld_cli.evaluator_runtime._load_cli_agent_executor", + fake_load_cli_agent_executor, + ) + monkeypatch.setattr("aworld_cli.evaluator_runtime.Runners.run", fake_run) + + case = SimpleNamespace(case_id="case-1", input={"input": "question"}, metadata={}) + state = await _CliAgentRuntimeHarness(agent_name="Aworld").run_rollout( + case=case, + target={"source_kind": "task"}, + ) + + assert state.answer == "answer with tools" + assert state.tool_calls == [{"name": "search"}] + assert state.trajectory == [{"tool_calls": [{"name": "search"}]}] + assert state.standard_metrics["n_tool_calls"] == 1 + assert state.standard_metrics["n_tokens"] == 12 + + def test_source_prompt_uses_zero_to_hundred_score_contract() -> None: prompt = _build_source_prompt( {"input": "question"}, @@ -138,7 +330,7 @@ def test_run_evaluator_source_cli_rejects_unsupported_source_kind(tmp_path: Path with pytest.raises(ValueError, match="unsupported source kind"): run_evaluator_source_cli( input=str(input_path), - kind="task", + kind="task-only", judge_agent=str(judge_agent), ) @@ -161,7 +353,7 @@ def test_trajectory_source_gate_consumes_veto_signal(tmp_path: Path) -> None: judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") suite = _build_source_suite( - kind="aworld-trajectory-log", + kind="trajectory", input_path=input_path, 
judge_agent_path=judge_agent, task_id=task_id, @@ -191,6 +383,133 @@ def test_trajectory_source_gate_consumes_veto_signal(tmp_path: Path) -> None: assert any(condition["metric_name"] == "veto_triggered" for condition in decision.failed_conditions) +def test_aworld_trajectory_log_without_task_id_builds_task_execution_suite( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "tasks.jsonl" + input_path.write_text('{"id":"case-1","input":"question"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + captured = {} + + class FakeHarness: + pass + + def fake_build_cli_agent_runtime_harness(*, agent_name): + captured["agent_name"] = agent_name + return FakeHarness() + + monkeypatch.setattr( + "aworld_cli.evaluator_runtime._build_cli_agent_runtime_harness", + fake_build_cli_agent_runtime_harness, + ) + + suite = _build_source_suite( + kind="trajectory", + input_path=input_path, + judge_agent_path=judge_agent, + task_id=None, + id_field="id", + task_field="input", + answer_field="answer", + out_dir=str(tmp_path), + ) + + assert captured["agent_name"] == "Aworld" + assert suite.suite_id == "trajectory-source-evaluator" + assert suite.cases[0].case_id == "case-1" + assert suite.cases[0].input == {"input": "question"} + assert suite.runtime_harness is not None + assert suite.judge_backend.backend_id == "trajectory-evaluator-agent-md" + pass_conditions = suite.gate_policy.normalized_conditions()[0] + assert any(condition.metric_name == "A1_groundedness" for condition in pass_conditions) + assert any(condition.metric_name == "veto_triggered" for condition in pass_conditions) + + +def test_trajectory_log_without_task_id_builds_replay_suite_for_all_tasks( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "trajectory.log" + trajectory = [ + { + "state": {"input": {"content": "question"}, "messages": []}, + "meta": 
{"step": 1}, + "action": {"content": "final", "is_agent_finished": "True"}, + } + ] + input_path.write_text( + "\n".join( + [ + repr({"task_id": "task-1", "is_sub_task": False, "trajectory": json.dumps(trajectory)}), + repr({"task_id": "task-2", "is_sub_task": False, "trajectory": json.dumps(trajectory)}), + ] + ) + + "\n", + encoding="utf-8", + ) + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + + def fake_build_cli_agent_runtime_harness(*, agent_name): + raise AssertionError("trajectory log replay must not execute the main agent") + + monkeypatch.setattr( + "aworld_cli.evaluator_runtime._build_cli_agent_runtime_harness", + fake_build_cli_agent_runtime_harness, + ) + + suite = _build_source_suite( + kind="trajectory", + input_path=input_path, + judge_agent_path=judge_agent, + task_id=None, + id_field="id", + task_field="input", + answer_field="answer", + out_dir=str(tmp_path), + ) + + assert suite.suite_id == "trajectory-source-evaluator" + assert [case.case_id for case in suite.cases] == ["task-1", "task-2"] + assert suite.runtime_harness is not None + + +def test_trajectory_prompt_can_use_generated_runtime_trajectory() -> None: + prompt = json.loads( + _build_trajectory_prompt( + {"input": "question", "_case_metadata": {}}, + { + "case_id": "case-1", + "answer": "final answer", + "trajectory": [ + { + "state": { + "input": {"content": "question"}, + "messages": [{"role": "tool", "content": "evidence"}], + }, + "meta": {"step": 1, "agent_id": "Aworld"}, + "action": { + "content": "final answer", + "is_agent_finished": "True", + "tool_calls": [{"function": {"name": "search", "arguments": "{}"}}], + }, + } + ], + }, + suite=None, + ) + ) + + extracted = prompt["extracted_trajectory"] + assert extracted["task_id"] == "case-1" + assert extracted["question"] == "question" + assert extracted["final_answer"] == "final answer" + assert extracted["evidence"][0]["content"] == "evidence" + assert 
extracted["steps"][0]["tool_calls"] == [{"name": "search", "arguments": "{}"}] + + def test_run_evaluator_source_cli_passes_source_fields_to_hooks( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, @@ -213,8 +532,8 @@ async def fake_run_evaluation_flow(flow): assert flow.target["hook_tag"] == "source-hook" return { "report_version": 1, - "suite_id": "source-evaluator", - "summary": {"source-evaluator": {"score": {"mean": 0.9}}}, + "suite_id": "answer-source-evaluator", + "summary": {"answer-source-evaluator": {"score": {"mean": 0.9}}}, "metrics": {"score": {"mean": 0.9}}, "results": [], "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, @@ -233,7 +552,7 @@ async def fake_run_evaluation_flow(flow): run_evaluator_source_cli( input=str(input_path), - kind="task-answer", + kind="answer", judge_agent=str(judge_agent), task_id="case-1", output=str(tmp_path / "report.json"), @@ -242,12 +561,12 @@ async def fake_run_evaluation_flow(flow): assert events[0][0] == "pre" assert events[0][1]["mode"] == "source" assert events[0][1]["input"] == str(input_path.resolve()) - assert events[0][1]["kind"] == "task-answer" + assert events[0][1]["kind"] == "answer" assert events[0][1]["task_id"] == "case-1" assert events[0][1]["judge_agent"] == str(judge_agent.resolve()) assert events[1][0] == "post" assert events[1][1]["mode"] == "source" - assert events[1][1]["report"]["source_selection"]["kind"] == "task-answer" + assert events[1][1]["report"]["source_selection"]["kind"] == "answer" def test_run_evaluator_source_cli_persists_schema_valid_source_report( @@ -264,10 +583,10 @@ async def fake_run_evaluation_flow(flow): "report_version": 1, "report_format": {"id": "aworld.evaluator.report", "version": 1}, "generated_at": "2026-06-10T00:00:00Z", - "suite_id": "source-evaluator", + "suite_id": "answer-source-evaluator", "target": flow.target, "judge_backend": {"backend_id": "source-agent-md"}, - "summary": {"source-evaluator": {"score": {"mean": 88.0}}}, + 
"summary": {"answer-source-evaluator": {"score": {"mean": 88.0}}}, "metrics": {"score": {"mean": 88.0}}, "results": [ { @@ -288,7 +607,7 @@ async def fake_run_evaluation_flow(flow): report = run_evaluator_source_cli( input=str(input_path), - kind="task-answer", + kind="answer", judge_agent=str(judge_agent), output=str(tmp_path / "report.json"), ) diff --git a/tests/core/test_evaluator_top_level_command.py b/tests/core/test_evaluator_top_level_command.py index acbf8b9f4..41af8bbe9 100644 --- a/tests/core/test_evaluator_top_level_command.py +++ b/tests/core/test_evaluator_top_level_command.py @@ -82,9 +82,9 @@ def test_maybe_dispatch_top_level_command_runs_source_evaluator_command( def fake_run_evaluator_source_cli(**kwargs): calls.update(kwargs) return { - "suite_id": "source-evaluator", + "suite_id": "answer-source-evaluator", "gate": {"status": "pass"}, - "summary": {"source-evaluator": {"score": {"mean": 0.9}}}, + "summary": {"answer-source-evaluator": {"score": {"mean": 0.9}}}, "results": [], "approval": {"required": False, "resolved": False, "approved": None}, } @@ -101,7 +101,7 @@ def fake_run_evaluator_source_cli(**kwargs): "--input", str(input_path), "--kind", - "task-answer", + "answer", "--judge-agent", str(judge_agent), "--out-dir", @@ -112,13 +112,62 @@ def fake_run_evaluator_source_cli(**kwargs): assert handled is True assert calls["input"] == str(input_path) - assert calls["kind"] == "task-answer" + assert calls["kind"] == "answer" assert calls["judge_agent"] == str(judge_agent) assert calls["out_dir"] == str(tmp_path / "reports") assert calls["id_field"] == "id" assert calls["task_field"] == "input" assert calls["answer_field"] == "answer" - assert "source-evaluator" in output + assert "answer-source-evaluator" in output + assert "pass" in output + + +def test_maybe_dispatch_top_level_command_runs_task_source_with_default_agent( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + input_path = 
tmp_path / "tasks.jsonl" + input_path.write_text('{"id":"case-1","input":"question"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + calls = {} + + def fake_run_evaluator_source_cli(**kwargs): + calls.update(kwargs) + return { + "suite_id": "task-source-evaluator", + "gate": {"status": "pass"}, + "summary": {"task-source-evaluator": {"score": {"mean": 0.9}}}, + "results": [{"state_summary": {"answer": "answer"}}], + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli", + fake_run_evaluator_source_cli, + ) + + handled = main_module._maybe_dispatch_top_level_command( + [ + "aworld-cli", + "evaluator", + "--input", + str(input_path), + "--kind", + "task", + "--judge-agent", + str(judge_agent), + "--out-dir", + str(tmp_path / "reports"), + ] + ) + output = capsys.readouterr().out + + assert handled is True + assert calls["kind"] == "task" + assert calls["agent"] is None + assert "task-source-evaluator" in output assert "pass" in output @@ -160,7 +209,7 @@ def test_evaluator_source_run_rejects_target_mode_arguments( SimpleNamespace( target="artifact.txt", input="answers.jsonl", - kind="task-answer", + kind="answer", judge_agent="agent.md", out_dir=None, output=None, @@ -202,7 +251,7 @@ def test_evaluator_source_run_rejects_other_target_mode_arguments( "target": None, "suite": None, "input": "answers.jsonl", - "kind": "task-answer", + "kind": "answer", "judge_agent": "agent.md", "out_dir": None, "output": None, diff --git a/tests/docs/test_evaluator_report_docs.py b/tests/docs/test_evaluator_report_docs.py index b35668b1e..801aab593 100644 --- a/tests/docs/test_evaluator_report_docs.py +++ b/tests/docs/test_evaluator_report_docs.py @@ -14,8 +14,9 @@ def test_evaluator_report_command_doc_covers_schema_and_validation() -> None: assert "--print-report-schema" in 
content assert "--validate-report" in content assert "aworld-cli evaluator --input" in content - assert "--kind task-answer" in content - assert "--kind aworld-trajectory-log" in content + assert "--kind task" in content + assert "--kind answer" in content + assert "--kind trajectory" in content assert "report_format" in content assert "automation" in content assert ".aworld/evaluators/*.json" in content diff --git a/tests/evaluations/test_evaluation_input_sources.py b/tests/evaluations/test_evaluation_input_sources.py index 1d1d63ac0..c2cb97690 100644 --- a/tests/evaluations/test_evaluation_input_sources.py +++ b/tests/evaluations/test_evaluation_input_sources.py @@ -9,6 +9,7 @@ from aworld.evaluations.sources import ( AWorldTrajectoryLogSource, + JsonlTaskSource, JsonlTaskAnswerSource, create_source_eval_suite, ) @@ -52,6 +53,26 @@ def test_jsonl_task_answer_source_defaults_fields_and_default_adapter(tmp_path: assert cases[0].metadata["source_record"]["answer"] == "4" +def test_jsonl_task_source_defaults_fields_without_answer(tmp_path: Path) -> None: + path = tmp_path / "tasks.jsonl" + path.write_text( + json.dumps({"id": "case-1", "input": "What is 2+2?"}) + "\n", + encoding="utf-8", + ) + + source = JsonlTaskSource(path=path) + records = list(source.iter_records()) + cases = source.to_cases() + + assert records[0].case_id == "case-1" + assert records[0].input == {"input": "What is 2+2?"} + assert records[0].answer is None + assert records[0].metadata["source_kind"] == "task" + assert cases[0].case_id == "case-1" + assert cases[0].input == {"input": "What is 2+2?"} + assert "answer" not in cases[0].metadata["source_record"] + + @pytest.mark.asyncio async def test_source_eval_suite_replays_task_answer_without_execution(tmp_path: Path) -> None: path = tmp_path / "answers.jsonl" @@ -71,7 +92,7 @@ async def judge(case_input: dict[str, Any], target: dict[str, Any]) -> dict[str, return {"score": 1.0, "verdict": "pass"} suite = create_source_eval_suite( - 
suite_id="task-answer-source", + suite_id="answer-source", source=JsonlTaskAnswerSource(path=path), judge_backend=CallableJudgeBackend(backend_id="judge", judge=judge), judge_schema=JudgeSchemaDef(output_model=_ScoreJudgeOutput), @@ -176,6 +197,41 @@ def test_trajectory_log_source_reports_missing_task_id(tmp_path: Path) -> None: list(source.iter_records()) +def test_trajectory_log_source_can_iterate_all_tasks(tmp_path: Path) -> None: + path = tmp_path / "trajectory.log" + first = [ + { + "state": {"input": {"content": "first"}, "messages": []}, + "meta": {"step": 1}, + "action": {"content": "first answer", "is_agent_finished": "True"}, + } + ] + second = [ + { + "state": {"input": {"content": "second"}, "messages": []}, + "meta": {"step": 1}, + "action": {"content": "second answer", "is_agent_finished": "True"}, + } + ] + path.write_text( + "\n".join( + [ + repr({"task_id": "task-1", "is_sub_task": False, "trajectory": json.dumps(first)}), + repr({"task_id": "task-2", "is_sub_task": False, "trajectory": json.dumps(second)}), + ] + ) + + "\n", + encoding="utf-8", + ) + + records = list(AWorldTrajectoryLogSource(path=path, task_ids=None).iter_records()) + + assert [record.case_id for record in records] == ["task-1", "task-2"] + assert records[0].answer == "first answer" + assert records[1].answer == "second answer" + assert records[0].metadata["source_kind"] == "trajectory" + + def test_trajectory_judge_schema_normalizes_dimensions_report() -> None: schema = TrajectoryJudgeSchema.default() diff --git a/tests/evaluations/test_trajectory_log_manual_case.py b/tests/evaluations/test_trajectory_log_manual_case.py index f28cc91f0..a4253ed0d 100644 --- a/tests/evaluations/test_trajectory_log_manual_case.py +++ b/tests/evaluations/test_trajectory_log_manual_case.py @@ -257,10 +257,10 @@ def test_source_cli_report_assertion_matches_manual_trajectory_goal(tmp_path: Pa "report_version": 1, "report_format": {"id": "aworld.evaluator.report", "version": 1}, "generated_at": 
"2026-06-10T00:00:00Z", - "suite_id": "trajectory-log-source-evaluator", + "suite_id": "trajectory-source-evaluator", "target": {"target_kind": "source", "target_path": str(log_path)}, "judge_backend": {"backend_id": "trajectory-evaluator-agent-md"}, - "summary": {"trajectory-log-source-evaluator": {"score": {"mean": 64.0}}}, + "summary": {"trajectory-source-evaluator": {"score": {"mean": 64.0}}}, "metrics": { "score": {"mean": 64.0}, "has_evidence": {"mean": 1.0}, @@ -294,14 +294,14 @@ def test_source_cli_report_assertion_matches_manual_trajectory_goal(tmp_path: Pa "suggested_exit_code": 2, "case_count": 1, "judge_backend": "trajectory-evaluator-agent-md", - "source_kind": "aworld-trajectory-log", + "source_kind": "trajectory", "source_input": str(log_path), "task_id": task_id, }, "source_selection": { "mode": "source", "input": str(log_path), - "kind": "aworld-trajectory-log", + "kind": "trajectory", "task_id": task_id, "judge_agent": str(agent_prompt_path), }, @@ -333,7 +333,7 @@ def _assert_source_cli_trajectory_report_matches_manual_goal( validate_evaluator_report(dict(report)) report_path = Path(str(report["report_path"])) assert report_path.exists() - assert report["suite_id"] == "trajectory-log-source-evaluator" + assert report["suite_id"] == "trajectory-source-evaluator" assert report["gate"]["status"] in {"pass", "fail", "needs_approval"} assert report["metrics"]["has_evidence"]["mean"] == 1.0 assert report["metrics"]["agent_finished"]["mean"] == 1.0 @@ -341,13 +341,13 @@ def _assert_source_cli_trajectory_report_matches_manual_goal( source_selection = report["source_selection"] assert source_selection["mode"] == "source" - assert source_selection["kind"] == "aworld-trajectory-log" + assert source_selection["kind"] == "trajectory" assert source_selection["task_id"] == task_id assert Path(str(source_selection["input"])).resolve() == log_path.resolve() assert Path(str(source_selection["judge_agent"])).resolve() == agent_prompt_path.resolve() automation = 
report["automation"] - assert automation["source_kind"] == "aworld-trajectory-log" + assert automation["source_kind"] == "trajectory" assert automation["task_id"] == task_id assert Path(str(automation["source_input"])).resolve() == log_path.resolve() @@ -508,7 +508,7 @@ def test_manual_trajectory_log_case_runs_via_source_cli_for_human_replay(request report = run_evaluator_source_cli( input=str(log_path), - kind="aworld-trajectory-log", + kind="trajectory", task_id=task_id, judge_agent=str(agent_prompt_path), out_dir=str(out_dir), diff --git a/tests/test_slash_commands.py b/tests/test_slash_commands.py index 704811c71..a230d89cb 100644 --- a/tests/test_slash_commands.py +++ b/tests/test_slash_commands.py @@ -563,7 +563,7 @@ async def test_evaluation_without_args_shows_usage(self): assert "Usage:" in result assert "/evaluation --input" in result - assert "--kind aworld-trajectory-log" in result + assert "--kind trajectory" in result @pytest.mark.asyncio async def test_evaluation_delegates_to_source_runtime(self, monkeypatch, tmp_path): @@ -575,9 +575,9 @@ async def test_evaluation_delegates_to_source_runtime(self, monkeypatch, tmp_pat def fake_run_evaluator_source_cli(**kwargs): calls.update(kwargs) return { - "suite_id": "trajectory-log-source-evaluator", + "suite_id": "trajectory-source-evaluator", "gate": {"status": "pass"}, - "summary": {"trajectory-log-source-evaluator": {"score": {"mean": 88.0}}}, + "summary": {"trajectory-source-evaluator": {"score": {"mean": 88.0}}}, "results": [], "approval": {"required": False, "resolved": False, "approved": None}, "report_path": str(tmp_path / "report.json"), @@ -592,18 +592,18 @@ def fake_run_evaluator_source_cli(**kwargs): CommandContext( cwd=os.getcwd(), user_args=( - f"--input {input_path} --kind aworld-trajectory-log " + f"--input {input_path} --kind trajectory " f"--task-id task-1 --judge-agent {agent_path} --out-dir {tmp_path}" ), ) ) assert calls["input"] == str(input_path) - assert calls["kind"] == 
"aworld-trajectory-log" + assert calls["kind"] == "trajectory" assert calls["task_id"] == "task-1" assert calls["judge_agent"] == str(agent_path) assert calls["out_dir"] == str(tmp_path) - assert "trajectory-log-source-evaluator" in result + assert "trajectory-source-evaluator" in result assert "Report:" in result @pytest.mark.asyncio @@ -619,10 +619,10 @@ async def fake_run_evaluation_flow(flow): "report_version": 1, "report_format": {"id": "aworld.evaluator.report", "version": 1}, "generated_at": "2026-06-10T00:00:00Z", - "suite_id": "source-evaluator", + "suite_id": "answer-source-evaluator", "target": flow.target, "judge_backend": {"backend_id": "source-agent-md"}, - "summary": {"source-evaluator": {"score": {"mean": 88.0}}}, + "summary": {"answer-source-evaluator": {"score": {"mean": 88.0}}}, "metrics": {"score": {"mean": 88.0}}, "results": [], "result_counts": {"cases_total": 0, "cases_with_metrics": 0, "cases_with_judge": 0}, @@ -637,13 +637,13 @@ async def fake_run_evaluation_flow(flow): CommandContext( cwd=os.getcwd(), user_args=( - f"--input {input_path} --kind task-answer " + f"--input {input_path} --kind answer " f"--judge-agent {agent_path} --output {tmp_path / 'report.json'}" ), ) ) - assert "source-evaluator" in result + assert "answer-source-evaluator" in result assert "Report:" in result @@ -656,8 +656,8 @@ def test_console_completion_entries_include_evaluation_command(self): words, meta = cli._build_completion_entries(agent_names=[]) assert "/evaluation" in words - assert "/evaluation --kind task-answer" in words - assert "/evaluation --kind aworld-trajectory-log" in words + assert "/evaluation --kind answer" in words + assert "/evaluation --kind trajectory" in words assert meta["/evaluation"] == "Run evaluator flows" def test_console_completion_entries_include_cron_subcommands(self): From 1502cef31312c25314e450c185b5f844c596aae9 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 22:06:44 +0800 Subject: [PATCH 38/41] Ignore local 
evaluator artifacts --- .gitignore | 2 +- .../src/aworld_cli/commands/evaluation_cmd.py | 8 +- docs/AWorld CLI/Commands/Evaluator.md | 10 +- eval/trajectory_evaluator/agent.md | 217 ------------------ .../answer_quality_agent.md | 116 ---------- .../design.md | 8 +- 6 files changed, 14 insertions(+), 347 deletions(-) delete mode 100644 eval/trajectory_evaluator/agent.md delete mode 100644 eval/trajectory_evaluator/answer_quality_agent.md diff --git a/.gitignore b/.gitignore index 7ed1b977d..36459346e 100644 --- a/.gitignore +++ b/.gitignore @@ -165,6 +165,7 @@ team_implementation_analysis.md # Temporary AI-generated artifacts ai_news_today.* survey/ +eval/ # OpenSpec design docs (not runtime) openspec/ @@ -177,4 +178,3 @@ openspec/ *.tmp __pycache__/ *.pyc - diff --git a/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py b/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py index 1813e3135..74e8cb9ec 100644 --- a/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py +++ b/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py @@ -20,10 +20,10 @@ def _usage() -> str: /evaluation --input --kind trajectory --task-id --judge-agent [--out-dir ] Examples: - /evaluation --input ./tasks.jsonl --kind task --judge-agent ./eval/answer_judge/agent.md - /evaluation --input ./task_answers.jsonl --kind answer --judge-agent ./eval/answer_judge/agent.md - /evaluation --input ./tasks.jsonl --kind trajectory --judge-agent ./eval/trajectory_evaluator/agent.md - /evaluation --input ~/Documents/logs/trajectory.log --kind trajectory --task-id task_123 --judge-agent ./eval/trajectory_evaluator/agent.md + /evaluation --input ./tasks.jsonl --kind task --judge-agent ./judge_agents/answer_judge.md + /evaluation --input ./task_answers.jsonl --kind answer --judge-agent ./judge_agents/answer_judge.md + /evaluation --input ./tasks.jsonl --kind trajectory --judge-agent ./judge_agents/trajectory_judge.md + /evaluation --input ~/Documents/logs/trajectory.log --kind trajectory --task-id task_123 
--judge-agent ./judge_agents/trajectory_judge.md """ diff --git a/docs/AWorld CLI/Commands/Evaluator.md b/docs/AWorld CLI/Commands/Evaluator.md index e508342e7..5d86f73c9 100644 --- a/docs/AWorld CLI/Commands/Evaluator.md +++ b/docs/AWorld CLI/Commands/Evaluator.md @@ -36,32 +36,32 @@ Source-backed usage: aworld-cli evaluator \ --input ./tasks.jsonl \ --kind task \ - --judge-agent ./eval/answer_judge/agent.md \ + --judge-agent ./judge_agents/answer_judge.md \ --out-dir ./reports aworld-cli evaluator \ --input ./task_answers.jsonl \ --kind answer \ - --judge-agent ./eval/answer_judge/agent.md \ + --judge-agent ./judge_agents/answer_judge.md \ --out-dir ./reports aworld-cli evaluator \ --input ~/Documents/logs/trajectory.log \ --kind trajectory \ --task-id task_20260609193335 \ - --judge-agent ./eval/trajectory_evaluator/agent.md \ + --judge-agent ./judge_agents/trajectory_judge.md \ --out-dir ./reports aworld-cli evaluator \ --input ~/Documents/logs/trajectory.log \ --kind trajectory \ - --judge-agent ./eval/trajectory_evaluator/agent.md \ + --judge-agent ./judge_agents/trajectory_judge.md \ --out-dir ./reports aworld-cli evaluator \ --input ./tasks.jsonl \ --kind trajectory \ - --judge-agent ./eval/trajectory_evaluator/agent.md \ + --judge-agent ./judge_agents/trajectory_judge.md \ --out-dir ./reports ``` diff --git a/eval/trajectory_evaluator/agent.md b/eval/trajectory_evaluator/agent.md deleted file mode 100644 index 832216f0c..000000000 --- a/eval/trajectory_evaluator/agent.md +++ /dev/null @@ -1,217 +0,0 @@ ---- -name: trajectory-evaluator -description: 使用 LLM-as-judge 对 AWorld agent 的单条 trajectory 做「输出质量 + 执行过程」双维评估。无参考基准(reference-free):以轨迹内实际抓取到的源内容作为 groundedness 依据。必须显式传入 task_id、trajectory log 与输出目录。 -tools: Bash, Read, Write -model: opus ---- - -# Trajectory Evaluator(LLM-as-Judge) - -你是一名严格、以证据为准的 **AI agent 轨迹评审员**。你的职责是对一条 AWorld trajectory 做可复现、可量化的评估,覆盖**最终输出质量**与**执行过程质量**两个范围,并产出结构化评估报告。 - -你**就是**这里的 LLM judge:所有打分由你基于抽取出的证据完成,不调用外部模型。 - -## 
评估输入(参数) - -- `TRAJECTORY_LOG`:轨迹日志路径,必须显式提供 -- `TASK_ID`:待评估任务 id,必须显式提供 -- `OUT_DIR`:报告输出目录,必须显式提供 - -若用户在 directive 中给出了不同的值,以用户提供的为准。 - ---- - -## 阶段 0 · 解析与抽取(确定性,必须先做) - -日志为「每行一个 Python dict repr」格式,且尾部可能带 ANSI 转义码;`trajectory` 字段是一个 **JSON 字符串**。**禁止**用 Read 直接读整行(单行可达数百 KB,会污染上下文)。必须用下面这段已验证的脚本抽取,把干净的结构化数据落盘后再读: - -```bash -mkdir -p "${OUT_DIR:?OUT_DIR is required}" -python3 - "$@" << 'PYEOF' -import ast, json, re, os, sys, glob - -LOG = os.path.expanduser(os.environ["TRAJECTORY_LOG"]) -TASK_ID = os.environ["TASK_ID"] -OUT_DIR = os.environ["OUT_DIR"] -os.makedirs(OUT_DIR, exist_ok=True) - -# 1) 定位 task_id 所在行(每行一条记录) -target = None -with open(LOG, encoding="utf-8", errors="replace") as f: - for line in f: - if TASK_ID in line: - target = line - break -if target is None: - sys.exit(f"[FATAL] task_id {TASK_ID} not found in {LOG}") - -# 2) 去 ANSI + 去首尾空白,再用 literal_eval 解析 Python dict repr -clean = re.sub(r'\x1b\[[0-9;]*m', '', target).strip() -rec = ast.literal_eval(clean) -traj = json.loads(rec["trajectory"]) # trajectory 是 JSON 字符串 - -def first_str(x): # is_agent_finished 在该数据里是字符串 "True"/"False" - return str(x).strip().lower() in ("true", "1") - -# 3) 抽取关键字段 -question = (traj[0].get("state", {}).get("input", {}) or {}).get("content") -system_prompt = "" -msgs0 = traj[0].get("state", {}).get("messages", []) or [] -if msgs0 and msgs0[0].get("role") == "system": - system_prompt = str(msgs0[0].get("content") or "") - -steps = [] -final_answer = None -for s in traj: - meta = s.get("meta", {}) - act = s.get("action") or {} - tcs = act.get("tool_calls") or [] - calls = [] - for tc in tcs: - fn = tc.get("function") or {} - calls.append({"name": fn.get("name"), "arguments": str(fn.get("arguments"))}) - finished = first_str(act.get("is_agent_finished")) - steps.append({ - "step": meta.get("step"), - "pre_agent": meta.get("pre_agent"), - "agent_id": meta.get("agent_id"), - "tool_calls": calls, - "assistant_content": str(act.get("content") or ""), - 
"is_agent_finished": finished, - }) - if finished and act.get("content"): - final_answer = str(act.get("content")) - -# 4) 抽取「源证据」= 最终对话里的所有 tool 结果(groundedness 依据) -final_msgs = traj[-1].get("state", {}).get("messages", []) or [] -evidence = [] -for i, m in enumerate(final_msgs): - if m.get("role") == "tool": - evidence.append({"msg_index": i, "content": str(m.get("content") or "")}) - -extract = { - "task_id": TASK_ID, - "is_sub_task": rec.get("is_sub_task"), - "num_steps": len(traj), - "question": question, - "system_prompt_excerpt": system_prompt[:8000], # 仅用于约束合规检查,截断以省 token - "steps": steps, - "final_answer": final_answer, - "evidence": evidence, -} -out = os.path.join(OUT_DIR, f"extracted_{TASK_ID}.json") -with open(out, "w", encoding="utf-8") as f: - json.dump(extract, f, ensure_ascii=False, indent=2) - -# 控制台打印一份紧凑摘要,便于你快速判断 -print(f"[OK] task_id={TASK_ID} steps={len(traj)} evidence_blocks={len(evidence)}") -print(f"[OK] question: {question}") -print(f"[OK] final_answer_len: {len(final_answer or '')}") -print(f"[OK] extracted -> {out}") -for st in steps: - names = [c['name'] for c in st['tool_calls']] - print(f" step{st['step']}: tools={names} finished={st['is_agent_finished']}") -PYEOF -``` - -> 运行前用环境变量传参:`TRAJECTORY_LOG=... TASK_ID=... OUT_DIR=... python3 ...`。 -> 运行后用 `Read` 读取 `OUT_DIR/extracted_.json`,再进入评估阶段。**只读这个抽取文件**,不要回头读原始日志行。 - ---- - -## 阶段 1 · 构建证据集(groundedness 基线) - -从 `evidence[]`(所有 tool 结果)中归纳出「本次运行实际获取到的事实集合」——即 agent 真正从外部(网页 Show Notes、`innerText`、命令输出等)拿到的内容。这是判断**忠实度/幻觉**的唯一基准。 - -关键判据: -- 系统提示中明确「模型知识截止 2024」。若 `final_answer` 中出现具体的、**证据集里不存在**的事实性断言(人名、数字、专有名词、引述金句、章节结构),默认按**潜在幻觉**处理,除非能在 evidence 中找到出处。 -- 区分「可被证据支撑的断言」与「模型基于先验/常识的合理推断」——后者也要标注为「未经证据证实」。 - ---- - -## 阶段 2 · 评分(八维,1–5 分,带锚点) - -对每个维度给出 1–5 的整数分,并**引用证据**(步骤号 / `msg_index` / final_answer 中的具体句子)作为依据。严禁仅凭印象打分。 - -锚点统一含义:**5=优秀无明显问题 / 4=良好有小瑕疵 / 3=合格但有明确缺陷 / 2=较差影响可用性 / 1=不合格**。 - -### A. 
输出质量(权重合计 60%) - -| 维度 | 权重 | 评什么 | 扣分信号 | -|---|---|---|---| -| A1 忠实度 / Groundedness | 25% | 每条事实性断言是否被证据集支撑,是否有幻觉 | 出现证据集外的具体事实;把先验当事实;编造引述/数字 | -| A2 覆盖度 / Completeness | 15% | 是否同时覆盖「核心内容」与「关键洞察」(本任务的双重诉求) | 只复述梗概无洞察;漏掉主线 | -| A3 相关性 / 目标贴合 | 10% | 是否回答了实际问题、是否锁定了正确的对象(该 episode),无主题漂移 | 答非所问;张冠李戴;偷换目标 | -| A4 结构与可读性 | 10% | 组织、清晰度、长度适配、语言与提问一致 | 冗长堆砌;无结构;语言不一致 | - -### B. 执行过程质量(权重合计 40%) - -| 维度 | 权重 | 评什么 | 扣分信号 | -|---|---|---|---| -| B1 工具使用恰当性 | 12% | 工具选择与参数是否合理、是否达成目的 | 错用工具;参数错误;无效调用 | -| B2 效率 | 10% | 步数 / 调用数相对必要工作量是否经济 | 冗余探测、重复弯路、无谓的全量重试 | -| B3 约束合规 | 10% | 是否遵守 system_prompt 的硬性约束(工作目录、不 rm -rf、不写 /tmp、完成校验、不臆造) | 违反明确禁令;越权 | -| B4 鲁棒性 / 错误恢复 | 8% | 失败后是否定位并转向有效路径 | 反复撞同一墙;放弃;忽略错误 | - -> 本条轨迹的已知特征(供参考,不要照抄结论,须自行用证据复核):steps 1–3 尝试 curl+grep+regex 抽取网页失败;steps 4–7 在探测 `kimi-webbridge`/`agent-browser` 工具(疑似弯路,计入 B2/B4);steps 8–9 改用 `agent-browser` 复用 CDP 9222 成功抓到 Show Notes 与 `innerText`;step 10 输出。请据此核实 B2(效率)与 B4(恢复)的真实表现。 - ---- - -## 阶段 3 · 汇总与判定 - -1. 计算加权总分(百分制):`score = Σ(dim_score/5 × weight) × 100`。 -2. 给出等级:`≥85 优秀(Excellent) / 70–84 合格(Pass) / 55–69 需改进(Marginal) / <55 不合格(Fail)`。 -3. **一票否决项**:若 A1 忠实度 ≤2(存在实质性幻觉),无论总分多少,最终判定不得高于「需改进」,并在报告中显著标红。 -4. 列出 **Top-3 优点** 与 **Top-3 待改进项**,每条附证据指针与可执行的改进建议。 - -### 评判纪律(消除 judge 偏差) - -- 不因答案**更长/更华丽**而加分;只认证据与目标贴合度。 -- 不被 agent 的自信措辞影响——「已成功拿到完整 Show Notes」这类自述必须用 evidence 核实。 -- 不确定是否有出处时,标注为「未证实」而非默认正确。 -- 打分先写推理(引证),后给分数,避免先入为主。 - ---- - -## 阶段 4 · 产出报告(两份) - -用 `Write` 写出: - -1. 
`OUT_DIR/eval_report_.json` —— 机器可读,严格遵循以下 schema: - -```json -{ - "task_id": "string", - "question": "string", - "verdict": "Excellent|Pass|Marginal|Fail", - "weighted_score": 0, - "veto_triggered": false, - "dimensions": { - "A1_groundedness": {"score": 0, "weight": 0.25, "evidence": ["..."], "rationale": "..."}, - "A2_completeness": {"score": 0, "weight": 0.15, "evidence": ["..."], "rationale": "..."}, - "A3_relevance": {"score": 0, "weight": 0.10, "evidence": ["..."], "rationale": "..."}, - "A4_readability": {"score": 0, "weight": 0.10, "evidence": ["..."], "rationale": "..."}, - "B1_tool_use": {"score": 0, "weight": 0.12, "evidence": ["..."], "rationale": "..."}, - "B2_efficiency": {"score": 0, "weight": 0.10, "evidence": ["..."], "rationale": "..."}, - "B3_compliance": {"score": 0, "weight": 0.10, "evidence": ["..."], "rationale": "..."}, - "B4_robustness": {"score": 0, "weight": 0.08, "evidence": ["..."], "rationale": "..."} - }, - "hallucinations": [{"claim": "...", "why_unsupported": "..."}], - "top_strengths": ["..."], - "top_improvements": [{"issue": "...", "evidence": "...", "suggestion": "..."}] -} -``` - -2. 
`OUT_DIR/eval_report_.md` —— 人类可读报告,包含:评估对象与问题、判定与总分、八维评分表(分数+证据+理由)、幻觉清单、Top-3 优点、Top-3 改进建议。语言与被评估答案保持一致(本任务为中文)。 - -最后在对话中回复一段 ≤8 行的高信号摘要:判定 + 总分 + 最关键的 1–2 个发现 + 两份报告的路径。 - ---- - -## 执行清单(按序) - -- [ ] 阶段 0:运行解析脚本,落盘 `extracted_.json`,Read 之 -- [ ] 阶段 1:构建证据集,标出无出处的断言 -- [ ] 阶段 2:八维逐项打分(先证据后分数) -- [ ] 阶段 3:加权汇总 + 一票否决检查 + 优缺点 -- [ ] 阶段 4:写 JSON + MD 报告,回复摘要 diff --git a/eval/trajectory_evaluator/answer_quality_agent.md b/eval/trajectory_evaluator/answer_quality_agent.md deleted file mode 100644 index ea85ef4f2..000000000 --- a/eval/trajectory_evaluator/answer_quality_agent.md +++ /dev/null @@ -1,116 +0,0 @@ ---- -name: answer-quality-judge -description: 使用 LLM-as-judge 对「问题 ↔ 答案」对做答案质量评估(reference-free,可选参考答案)。仅评估最终答案本身的正确性、完整性、贴合度、可读性与忠实度,不评估执行过程或工具使用。输入由 evaluator 框架以 JSON 注入(case + state.answer),无需读日志、无需调用工具。 -tools: Read, Write -model: opus ---- - -# Answer Quality Evaluator(LLM-as-Judge) - -你是一名严格、以证据为准的 **答案质量评审员**。你的职责是对一条「问题 ↔ 答案」对做可复现、可量化的评估,**只覆盖最终答案本身的质量**,不评估 agent 的执行过程、工具使用或轨迹。 - -你**就是**这里的 LLM judge:所有打分由你完成,不调用外部模型,也**不需要读取任何文件或运行任何命令**。 - -## 评估输入(由框架注入) - -evaluator 框架会以**单个 JSON 对象**作为你的输入消息,结构形如: - -```json -{ - "case": { "task_id": "...", "input": "用户的原始问题/任务" }, - "state": { "answer": "待评估的最终答案", "status": "...", "artifacts": {}, "trajectory": [], "tool_calls": [] }, - "required_output_schema": { "score": "number 0-100", "verdict": "string" }, - "instruction": "Evaluate the existing answer/state and return exactly one JSON object." -} -``` - -判据来源(按优先级): - -1. **`case.input`**:用户实际想要什么——这是判断「相关性/贴合度」和「完整性」的标尺。 -2. **`state.answer`**:被评估的答案——所有评分的对象。 -3. 
**参考答案(若存在)**:若 `case.input` 或 `state.artifacts` 中显式给出了 reference / 标准答案 / 验收要点,则以其为「正确性」基准;否则按 **reference-free** 处理,仅凭答案的内在一致性与常识可验证性判断。 - -> 若 `state.answer` 为空或缺失,直接判 `Fail`、`score=0`、`Q1=1`,并在 `notes` 中说明。 - ---- - -## 阶段 1 · 评分(五维,1–5 分,带锚点) - -对每个维度给出 1–5 的整数分,并**引用答案中的具体片段或问题中的具体要求**作为依据。严禁仅凭印象打分。 - -锚点统一含义:**5=优秀无明显问题 / 4=良好有小瑕疵 / 3=合格但有明确缺陷 / 2=较差影响可用性 / 1=不合格**。 - -| 维度 | 权重 | 评什么 | 扣分信号 | -|---|---|---|---| -| Q1 正确性 / Correctness | 30% | 答案中的事实性断言、计算、结论是否正确;有参考时是否与参考一致 | 与参考矛盾;事实错误;计算/逻辑错误;编造数字、引述、专有名词 | -| Q2 完整性 / Completeness | 25% | 是否覆盖问题的全部子诉求与关键要点,无关键遗漏 | 漏答子问题;只答一半;缺少必要前提/边界 | -| Q3 贴合度 / Relevance & Instruction-following | 20% | 是否回答了**实际被问的问题**,是否遵守问题中的显式约束(格式、语言、长度、口吻等) | 答非所问;主题漂移;违反明确的格式/语言/长度要求 | -| Q4 可读性 / Clarity | 15% | 组织、清晰度、长度适配、表达是否凝练无歧义 | 冗长堆砌;结构混乱;表述含混;语言与提问不一致 | -| Q5 忠实度 / Faithfulness | 10% | 是否不臆造、不过度自信;不确定处是否如实标注;有参考时是否不超出参考范围杜撰 | 把猜测当事实;编造来源;无依据的绝对化断言 | - ---- - -## 阶段 2 · 汇总与判定 - -1. 计算加权总分(百分制):`score = Σ(dim_score / 5 × weight) × 100`,四舍五入到整数。 -2. 给出等级:`≥85 Excellent / 70–84 Pass / 55–69 Marginal / <55 Fail`。 -3. **一票否决项**:若 Q1 正确性 ≤2(存在实质性事实/逻辑错误,或与参考答案直接矛盾),则置 `veto_triggered=true`,且最终 `verdict` 不得高于 `Marginal`,无论加权总分多少。 -4. 
列出 **Top-3 优点** 与 **Top-3 待改进项**,每条附答案中的证据指针与可执行的改进建议。 - -### 评判纪律(消除 judge 偏差) - -- 不因答案**更长 / 更华丽 / 更自信**而加分;只认正确性与目标贴合度。 -- 不被答案的自述(「我已确认…」「显然…」)影响——这类措辞需用问题约束与内在一致性核实。 -- 无参考时不确定真伪的事实,按「未证实」处理:可影响 Q5,但不要据此武断判 Q1 错误,除非违背常识或自相矛盾。 -- 打分**先写推理(引证),后给分数**,避免先入为主。 -- 语言中立:答案语言与问题不一致时,扣 Q3,而非据此曲解内容。 - ---- - -## 阶段 3 · 产出(返回严格 JSON) - -**你的最终回复必须是且仅是一个 JSON 对象,不要包裹 markdown 代码块、不要前后缀说明文字。** 框架会直接解析它。`score` 与 `verdict` 为框架必需字段,其余字段供报告与诊断使用: - -```json -{ - "task_id": "string", - "score": 0, - "verdict": "Excellent|Pass|Marginal|Fail", - "veto_triggered": false, - "Q1_correctness": 0, - "Q2_completeness": 0, - "Q3_relevance": 0, - "Q4_clarity": 0, - "Q5_faithfulness": 0, - "dimensions": { - "Q1_correctness": {"score": 0, "weight": 0.30, "evidence": ["..."], "rationale": "..."}, - "Q2_completeness": {"score": 0, "weight": 0.25, "evidence": ["..."], "rationale": "..."}, - "Q3_relevance": {"score": 0, "weight": 0.20, "evidence": ["..."], "rationale": "..."}, - "Q4_clarity": {"score": 0, "weight": 0.15, "evidence": ["..."], "rationale": "..."}, - "Q5_faithfulness": {"score": 0, "weight": 0.10, "evidence": ["..."], "rationale": "..."} - }, - "errors": [{"claim": "...", "why_wrong": "..."}], - "top_strengths": ["..."], - "top_improvements": [{"issue": "...", "evidence": "...", "suggestion": "..."}], - "notes": "可选:边界情况说明(如答案缺失、无参考、语言不一致等)" -} -``` - -字段约束: -- `score`:0–100 的整数,**等于**阶段 2 第 1 步算出的加权总分。 -- `verdict`:四档之一,且与 `score` 区间一致;触发一票否决时不得高于 `Marginal`。 -- `Q1..Q5` 顶层字段:与 `dimensions` 内对应 `score` 相同,便于框架直接读取。 -- `evidence`/`rationale`:引用答案或问题中的具体片段,不可空泛。 -- 报告语言与被评估答案保持一致。 - -> 仅当用户在 directive 中显式提供了 `OUT_DIR` 时,才额外用 `Write` 落一份人类可读的 `OUT_DIR/answer_eval_.md`;默认情况下**只返回上面的 JSON**,不写文件、不调工具。 - ---- - -## 执行清单(按序) - -- [ ] 解析注入的 JSON,定位 `case.input`、`state.answer`、可选参考答案 -- [ ] 答案缺失 → 直接 Fail 并返回(见输入说明) -- [ ] 五维逐项打分(先证据后分数) -- [ ] 加权汇总 + 一票否决检查 + 优缺点 -- [ ] 返回严格 JSON(score/verdict 必填,无 markdown 包裹) diff --git 
a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md index dd9bc2b03..824095f75 100644 --- a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md +++ b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md @@ -34,8 +34,8 @@ aworld-cli evaluator \ --input ~/Documents/logs/trajectory.log \ --kind trajectory \ --task-id task_20260609193335 \ - --judge-agent eval/trajectory_evaluator/agent.md \ - --out-dir eval/trajectory_evaluator/reports + --judge-agent judge_agents/trajectory_judge.md \ + --out-dir reports ``` Task+answer files: @@ -44,7 +44,7 @@ Task+answer files: aworld-cli evaluator \ --input task_answers.jsonl \ --kind answer \ - --judge-agent eval/answer_judge/agent.md \ + --judge-agent judge_agents/answer_judge.md \ --out-dir reports ``` @@ -59,7 +59,7 @@ aworld-cli evaluator \ --id-field task_id \ --task-field task \ --agent Aworld \ - --judge-agent eval/answer_judge/agent.md \ + --judge-agent judge_agents/answer_judge.md \ --out-dir reports ``` From 512c9b4f3dd65456998cab2dbd9ab4c0327b5766 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Wed, 10 Jun 2026 22:10:30 +0800 Subject: [PATCH 39/41] Gate answer evaluations on veto signal --- .../src/aworld_cli/evaluator_runtime.py | 17 +++++-- tests/core/test_evaluator_runtime.py | 45 +++++++++++++++++++ 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index 1f6b1f704..863381b00 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -185,6 +185,7 @@ def _run_evaluator_hooks( class _SourceJudgeOutput(BaseModel): score: float verdict: str + veto_triggered: bool = False def _looks_like_aworld_trajectory_log(path: Path) -> bool: @@ -226,7 +227,11 @@ def _build_source_prompt(case_input: dict, target: dict, suite) -> str: 
"trajectory": target.get("trajectory"), "tool_calls": target.get("tool_calls"), }, - "required_output_schema": {"score": "number, weighted score from 0 to 100", "verdict": "string"}, + "required_output_schema": { + "score": "number, weighted score from 0 to 100", + "verdict": "string", + "veto_triggered": "boolean, true only for one-vote veto failures", + }, "instruction": "Evaluate the existing answer/state and return exactly one JSON object.", } return json.dumps(payload, ensure_ascii=False, indent=2) @@ -469,6 +474,12 @@ def _build_source_suite( expected=True, ), ) + answer_gate = GatePolicyDef( + pass_all=( + GateMetricCondition(metric_name="score", op=">=", threshold=70.0), + GateMetricCondition(metric_name="veto_triggered", op="==", threshold=False), + ) + ) if kind == "task": source = JsonlTaskSource( path=input_path, @@ -485,7 +496,7 @@ def _build_source_suite( prompt_builder=_build_source_prompt, ), judge_schema=JudgeSchemaDef(output_model=_SourceJudgeOutput), - gate_policy=GatePolicyDef(metric_name="score", pass_threshold=70.0), + gate_policy=answer_gate, metadata={"agent": agent_name}, ) @@ -505,7 +516,7 @@ def _build_source_suite( prompt_builder=_build_source_prompt, ), judge_schema=JudgeSchemaDef(output_model=_SourceJudgeOutput), - gate_policy=GatePolicyDef(metric_name="score", pass_threshold=70.0), + gate_policy=answer_gate, ) if kind == "trajectory": diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index 9c2cecd3d..65ffd1a8c 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -175,6 +175,50 @@ async def fake_run_evaluation_flow(flow): assert report["automation"]["source_kind"] == "task" +def test_task_source_gate_consumes_answer_veto_signal( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "tasks.jsonl" + input_path.write_text('{"id":"case-1","input":"question"}\n', encoding="utf-8") + judge_agent = tmp_path / "agent.md" + 
judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + + class FakeHarness: + pass + + monkeypatch.setattr( + "aworld_cli.evaluator_runtime._build_cli_agent_runtime_harness", + lambda *, agent_name: FakeHarness(), + ) + + suite = _build_source_suite( + kind="task", + input_path=input_path, + judge_agent_path=judge_agent, + task_id=None, + id_field="id", + task_field="input", + answer_field="answer", + out_dir=str(tmp_path), + ) + + payload = suite.judge_schema.validate_payload( + {"score": 95.0, "verdict": "Excellent", "veto_triggered": True} + ) + assert payload["veto_triggered"] is True + pass_conditions = suite.gate_policy.normalized_conditions()[0] + assert any( + condition.metric_name == "veto_triggered" + and condition.op == "==" + and condition.threshold is False + for condition in pass_conditions + ) + decision = suite.gate_policy.evaluate({"score": 95.0, "veto_triggered": True}) + assert decision.status == "fail" + assert any(condition["metric_name"] == "veto_triggered" for condition in decision.failed_conditions) + + def test_run_evaluator_source_cli_builds_generated_trajectory_flow_with_default_agent( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, @@ -319,6 +363,7 @@ def test_source_prompt_uses_zero_to_hundred_score_contract() -> None: payload = json.loads(prompt) assert payload["required_output_schema"]["score"] == "number, weighted score from 0 to 100" + assert payload["required_output_schema"]["veto_triggered"] == "boolean, true only for one-vote veto failures" def test_run_evaluator_source_cli_rejects_unsupported_source_kind(tmp_path: Path) -> None: From f9c40d5f4dc4c33d7ad6e33fc877ebf391c867e1 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Thu, 11 Jun 2026 09:54:13 +0800 Subject: [PATCH 40/41] Handle non-numeric state check comparisons --- aworld/evaluations/runtime_composition.py | 12 +++++- tests/evaluations/test_runtime_composition.py | 39 +++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git 
a/aworld/evaluations/runtime_composition.py b/aworld/evaluations/runtime_composition.py index 56c290823..be59bacdc 100644 --- a/aworld/evaluations/runtime_composition.py +++ b/aworld/evaluations/runtime_composition.py @@ -159,12 +159,20 @@ def grade(self, *, state: RolloutState, case: Any, target: Mapping[str, Any]) -> raise ValueError(f"unsupported state-check source: {self.source}") try: actual = _resolve_path(sources[self.source], self.path) - passed = _compare_values(actual, self.op, self.expected) - reason = "matched" if passed else f"expected {self.expected!r}, got {actual!r}" except KeyError: actual = None passed = False reason = f"missing path: {'.'.join(self.path)}" + else: + try: + passed = _compare_values(actual, self.op, self.expected) + except (TypeError, ValueError) as exc: + if isinstance(exc, ValueError) and str(exc).startswith("unsupported state-check operator"): + raise + passed = False + reason = f"not comparable: expected {self.expected!r}, got {actual!r} ({exc})" + else: + reason = "matched" if passed else f"expected {self.expected!r}, got {actual!r}" return OutcomeCheckResult( metric_name=self.metric_name, value=1.0 if passed else 0.0, diff --git a/tests/evaluations/test_runtime_composition.py b/tests/evaluations/test_runtime_composition.py index d93cc0b45..8db02fe3a 100644 --- a/tests/evaluations/test_runtime_composition.py +++ b/tests/evaluations/test_runtime_composition.py @@ -76,6 +76,45 @@ def test_state_check_grader_emits_outcome_metric(): assert result.passed is True +def test_state_check_grader_fails_non_numeric_comparison_without_crashing(): + state = RolloutState( + case_id="case-1", + status="success", + outcome={"latency_ms": "not-a-number"}, + ) + grader = StateCheckGrader( + metric_name="latency_ok", + path=("latency_ms",), + op="<=", + expected=1000, + ) + + result = grader.grade(state=state, case=None, target={}) + + assert result.metric_name == "latency_ok" + assert result.value == 0.0 + assert result.passed is False + assert 
"not comparable" in result.reason + assert result.metadata["actual"] == "not-a-number" + + +def test_state_check_grader_rejects_unsupported_operator(): + state = RolloutState( + case_id="case-1", + status="success", + outcome={"latency_ms": 10}, + ) + grader = StateCheckGrader( + metric_name="latency_ok", + path=("latency_ms",), + op="between", + expected=1000, + ) + + with pytest.raises(ValueError, match="unsupported state-check operator"): + grader.grade(state=state, case=None, target={}) + + def test_scripted_user_simulator_emits_turns_in_order(): simulator = ScriptedUserSimulator() state = RolloutState(case_id="case-1") From 84998aaaa1968ebabff9553f683f781583a47308 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Thu, 11 Jun 2026 09:57:53 +0800 Subject: [PATCH 41/41] Remove openspec changes from PR --- .../.openspec.yaml | 2 - .../design.md | 170 ----- .../implementation-plan.md | 37 - .../proposal.md | 32 - .../specs/cli-evaluator-flow/spec.md | 103 --- .../tasks.md | 40 - .../.openspec.yaml | 2 - .../design.md | 96 --- .../implementation-plan.md | 150 ---- .../proposal.md | 19 - .../specs/evaluation-substrate/spec.md | 58 -- .../tasks.md | 35 - .../.openspec.yaml | 2 - .../design.md | 238 ------ .../implementation-plan.md | 85 --- .../proposal.md | 36 - .../specs/evaluation-substrate/spec.md | 105 --- .../tasks.md | 60 -- .../.openspec.yaml | 2 - .../design.md | 77 -- .../implementation-plan.md | 94 --- .../proposal.md | 19 - .../specs/evaluation-substrate/spec.md | 34 - .../tasks.md | 17 - .../.openspec.yaml | 2 - .../design.md | 247 ------- .../implementation-plan.md | 285 -------- .../proposal.md | 37 - .../specs/evaluation-substrate/spec.md | 157 ---- .../tasks.md | 53 -- .../.openspec.yaml | 2 - .../design.md | 97 --- .../implementation-plan.md | 200 ----- .../proposal.md | 34 - .../specs/evaluation-substrate/spec.md | 65 -- .../tasks.md | 40 - .../design.md | 183 ----- .../implementation-plan.md | 681 ------------------ .../proposal.md | 34 - 
.../specs/evaluation-substrate/spec.md | 101 --- .../tasks.md | 37 - 41 files changed, 3768 deletions(-) delete mode 100644 openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/.openspec.yaml delete mode 100644 openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md delete mode 100644 openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/implementation-plan.md delete mode 100644 openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/proposal.md delete mode 100644 openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/specs/cli-evaluator-flow/spec.md delete mode 100644 openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md delete mode 100644 openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/.openspec.yaml delete mode 100644 openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/design.md delete mode 100644 openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/implementation-plan.md delete mode 100644 openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/proposal.md delete mode 100644 openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/specs/evaluation-substrate/spec.md delete mode 100644 openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/tasks.md delete mode 100644 openspec/changes/aworld-evaluator-input-sources-2026-06-10/.openspec.yaml delete mode 100644 openspec/changes/aworld-evaluator-input-sources-2026-06-10/design.md delete mode 100644 openspec/changes/aworld-evaluator-input-sources-2026-06-10/implementation-plan.md delete mode 100644 openspec/changes/aworld-evaluator-input-sources-2026-06-10/proposal.md delete mode 100644 openspec/changes/aworld-evaluator-input-sources-2026-06-10/specs/evaluation-substrate/spec.md delete mode 100644 openspec/changes/aworld-evaluator-input-sources-2026-06-10/tasks.md delete mode 100644 openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/.openspec.yaml delete mode 100644 
openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/design.md delete mode 100644 openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/implementation-plan.md delete mode 100644 openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/proposal.md delete mode 100644 openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/specs/evaluation-substrate/spec.md delete mode 100644 openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/tasks.md delete mode 100644 openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/.openspec.yaml delete mode 100644 openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md delete mode 100644 openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md delete mode 100644 openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/proposal.md delete mode 100644 openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md delete mode 100644 openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md delete mode 100644 openspec/changes/aworld-evaluator-trials-passk-2026-06-10/.openspec.yaml delete mode 100644 openspec/changes/aworld-evaluator-trials-passk-2026-06-10/design.md delete mode 100644 openspec/changes/aworld-evaluator-trials-passk-2026-06-10/implementation-plan.md delete mode 100644 openspec/changes/aworld-evaluator-trials-passk-2026-06-10/proposal.md delete mode 100644 openspec/changes/aworld-evaluator-trials-passk-2026-06-10/specs/evaluation-substrate/spec.md delete mode 100644 openspec/changes/aworld-evaluator-trials-passk-2026-06-10/tasks.md delete mode 100644 openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/design.md delete mode 100644 openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/implementation-plan.md delete mode 100644 openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/proposal.md delete mode 100644 
openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/specs/evaluation-substrate/spec.md delete mode 100644 openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/tasks.md diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/.openspec.yaml b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/.openspec.yaml deleted file mode 100644 index 2cb80411e..000000000 --- a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/.openspec.yaml +++ /dev/null @@ -1,2 +0,0 @@ -schema: spec-driven -created: 2026-06-10 diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md deleted file mode 100644 index 824095f75..000000000 --- a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/design.md +++ /dev/null @@ -1,170 +0,0 @@ -# AWorld CLI Evaluator Source Run - -## Context - -`aworld-cli evaluator` currently runs suite-backed evaluations for local targets. It is already exposed through the builtin plugin command model and uses evaluator hooks around discovery, pre-run, post-run, and summary rendering. That is the right extension surface for CLI-level concerns. - -The new framework input-source layer will make evaluation inputs first-class: task-only files, task+answer files, serialized states, and AWorld trajectory logs all normalize into source records and framework state adapters. The CLI should not duplicate parsing or replay logic. Its job is to assemble a source-backed flow from user-facing arguments and then call the same `run_evaluation_flow` substrate used by code callers. - -## Goals / Non-Goals - -**Goals:** - -- Provide a simple CLI path for evaluating files/logs without writing a Python test harness. -- Keep the evaluator command plugin-backed and compatible with existing top-level command registration. -- Reuse existing evaluator hooks and extend their payloads for source-backed runs. 
-- Make common source kinds usable with a small argument set. -- Keep framework evaluation semantics in `aworld.evaluations`, not in CLI handlers. -- Preserve existing `--target` / `--suite` evaluator behavior. - -**Non-Goals:** - -- Adding a separate `aworld-cli trajectory-eval` command. -- Making trajectory logs a special CLI-only feature. -- Implementing source parsing, state replay, judge normalization, scoring, or gate logic in `aworld-cli`. -- Replacing the plugin command system or hook infrastructure. -- Adding remote storage connectors, sandbox lifecycle management, or training/optimizer flows. - -## Command Shape - -The canonical source-backed path should be: - -```bash -aworld-cli evaluator \ - --input ~/Documents/logs/trajectory.log \ - --kind trajectory \ - --task-id task_20260609193335 \ - --judge-agent judge_agents/trajectory_judge.md \ - --out-dir reports -``` - -Task+answer files: - -```bash -aworld-cli evaluator \ - --input task_answers.jsonl \ - --kind answer \ - --judge-agent judge_agents/answer_judge.md \ - --out-dir reports -``` - -The default JSONL fields are `id`, `input`, and `answer`. `--id-field`, `--task-field`, and `--answer-field` are override flags for files that do not follow that convention. - -Task-only files are supported once the framework input-source layer exposes `JsonlTaskSource`; the CLI wires them to the default `Aworld` agent unless `--agent` is supplied: - -```bash -aworld-cli evaluator \ - --input tasks.jsonl \ - --kind task \ - --id-field task_id \ - --task-field task \ - --agent Aworld \ - --judge-agent judge_agents/answer_judge.md \ - --out-dir reports -``` - -`--kind auto` can be added once detection is reliable, but the first version should require explicit `--kind` to keep failures predictable. - -For `--kind trajectory`, `--task-id` selects the existing-log replay path. 
When `--task-id` is omitted, the input is treated as task JSONL: the CLI runs the task through the default or specified main agent, extracts the AWorld response trajectory, and feeds that generated trajectory into the trajectory judge prompt. -If the input is an AWorld trajectory log and `--task-id` is omitted, the CLI uses the framework trajectory-log source to replay all task records in that log. This makes all-task replay explicit through the `trajectory` kind while keeping trajectory parsing in the framework source layer. - -## CLI Boundary - -The evaluator command owns: - -- argument parsing and validation -- path normalization -- selecting a framework source class by `--kind` -- passing field mappings and task filters to the source -- selecting a framework state adapter or execution spec -- loading a judge agent through framework helpers -- invoking framework flow execution -- writing the report and rendering a summary -- invoking evaluator hooks with source-aware payloads - -The evaluator command does not own: - -- parsing trajectory internals -- converting source records into `EvalState` or `RolloutState` -- judge payload normalization -- scorer implementation -- gate implementation -- report schema semantics -- trial, sandbox, or simulator semantics - -## Plugin And Hook Integration - -The implementation should follow existing CLI conventions: - -- keep `EvaluatorTopLevelCommand` as the command object exposed through the builtin evaluator plugin entrypoint -- route source-backed `--input` arguments through the same command object without creating a new top-level command -- keep source-backed flow assembly in `aworld_cli.evaluator_runtime` -- use `PluginManager`, `get_builtin_plugin_roots`, `load_plugin_hooks`, and `_run_evaluator_hooks` as the hook path - -Hook payloads should gain source-aware fields while preserving existing keys: - -- `mode`: `target` or `source` -- `input`: resolved input path for source mode -- `kind`: source kind for source mode -- 
`task_id` or `task_ids` when provided -- `judge_agent`: resolved judge-agent path when provided -- `agent`: resolved execution agent path/name when provided -- `workspace_path` -- `output_path` or report path after resolution - -Allowed hook behavior remains CLI-scoped: - -- pre-discover/pre-run hooks may add metadata or override summary fields -- post-run hooks may upload, notify, or record report metadata -- render hooks may append summary text -- hooks must not replace framework execution, scoring, gate decisions, or report contracts - -## Data Flow - -```text -CLI args - -> EvaluatorTopLevelCommand parser - -> aworld_cli.evaluator_runtime source runner - -> framework EvalSource + EvalStateAdapter / execution spec - -> create source-backed EvalSuiteDef / EvaluationFlowDef - -> run_evaluation_flow - -> report write + render summary + hooks -``` - -For the trajectory-log manual case, the CLI path should be equivalent to the current pytest invocation but without test-local glue: - -```text ---input trajectory.log - -> AWorldTrajectoryLogSource - -> TrajectoryLogStateAdapter - -> ReplayRuntimeHarness - -> AgentJudgeBackend.from_agent_markdown(agent.md) - -> typed schema + gate + report -``` - -## Compatibility - -Existing usage remains valid: - -```bash -aworld-cli evaluator --target ./some-target --suite app-evaluator -``` - -The new `evaluator --input ...` source path should not break `--list-suites`, `--print-report-schema`, `--validate-report`, or interactive approval behavior. - -## Risks / Trade-offs - -- [Command ambiguity] `evaluator --target` and `evaluator --input` are mutually exclusive, so parser errors must clearly explain which mode is active. -- [Too many flags] Field mappings are necessary for generic JSONL. Presets can reduce repeated arguments later. -- [Case-specific drift] Avoid canonical `evaluator trajectory-log`; if aliases are added later, they should delegate to `evaluator --input ... --kind trajectory`. 
-- [Plugin overreach] Hook contracts must state that plugins customize CLI assembly and side effects only. - -## Migration Plan - -1. Land framework input sources first, including the manual test refactor. -2. Add source-backed `--input` parser mode to the existing evaluator command. -3. Add a source-run runtime helper that calls framework APIs. -4. Extend evaluator hook event payloads with source mode fields. -5. Add CLI tests for argument validation and runtime delegation. -6. Add one opt-in manual command example for the trajectory evaluator case. -7. Keep old target/suite command behavior unchanged. diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/implementation-plan.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/implementation-plan.md deleted file mode 100644 index 15b5feae5..000000000 --- a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/implementation-plan.md +++ /dev/null @@ -1,37 +0,0 @@ -# Implementation Plan - -## Commit 1: Parser Shape - -- Confirm the framework input-source change has landed. -- Add source-backed `--input` parsing to `EvaluatorTopLevelCommand`. -- Add JSONL field defaults for task+answer sources. -- Keep the builtin evaluator plugin command as the registration path. -- Add tests for command parsing and incompatible argument combinations. - -## Commit 2: Runtime Delegation - -- Add a `run_evaluator_source_cli(...)` helper in `aworld_cli.evaluator_runtime`. -- Map initially supported `--kind` values to framework input-source APIs. -- Return clear unsupported-kind errors for source kinds not yet implemented by the framework layer. -- Build source-backed flows through framework helpers only. -- Add runtime delegation tests with monkeypatched framework helpers. - -## Commit 3: Hooks And Reporting - -- Extend evaluator hook payloads for source-backed mode. -- Preserve existing target-mode hook payloads. -- Add automation/report metadata for source input, kind, task ids, and output path. 
-- Add hook payload and summary tests. - -## Commit 4: Examples And Manual Regression - -- Document the trajectory-log command that replaces the pytest-specific manual invocation. -- Add task+answer examples. -- Mention task-only examples only after the framework source layer supports task-only sources. -- Keep the existing pytest manual regression as a lower-level framework e2e until the source API is fully adopted. - -## Verification - -- `pytest` for evaluator CLI tests. -- Evaluator framework tests from the input-source change. -- `openspec validate aworld-cli-evaluator-source-run-2026-06-10 --strict`. diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/proposal.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/proposal.md deleted file mode 100644 index 9ce836577..000000000 --- a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/proposal.md +++ /dev/null @@ -1,32 +0,0 @@ -# AWorld CLI Evaluator Source Run - -## Why - -The manual trajectory evaluator regression proved that AWorld can evaluate real task outputs, trajectories, outcome checks, typed judge schemas, and composite gates. It also showed that a user-facing CLI must not expose the full substrate assembly surface just to run a simple evaluation. - -The framework input-source change is responsible for normalizing task files, task+answer files, serialized states, and AWorld trajectory logs into framework-owned evaluation records and replay state. The CLI should be a thin consumer of that layer: parse user intent, select a source adapter and judge agent, run the suite-backed flow, and write a report. - -The existing CLI already has an official evaluator command implemented through the builtin plugin command path, with evaluator lifecycle hooks for discovery, pre-run, post-run, and rendering. This change extends that command shape instead of adding an ad hoc script or a separate evaluator CLI. 
- -## What Changes - -- Add a source-backed `aworld-cli evaluator --input ...` mode to the existing evaluator command. -- Support source-oriented arguments: `--input`, `--kind`, optional field mappings, optional `--task-id`, `--agent`, `--judge-agent`, and output options. -- Use conventional JSONL field defaults (`id`, `input`, `answer`) so simple task+answer files do not require field-mapping flags. -- Keep the canonical command source-oriented rather than case-specific; trajectory-log, task-only, and task+answer are input kinds, not separate evaluator stacks. -- Build source-backed evaluation flows by calling the framework input-source APIs from `aworld.evaluations`. -- Preserve the existing target/suite evaluator path for current users. -- Integrate through the existing builtin plugin command and evaluator hook model; plugins may observe or customize CLI assembly metadata, but they may not redefine framework execution, scoring, gate, or report semantics. - -## Capabilities - -### Modified Capabilities - -- `cli-evaluator-flow`: add a source-backed run path for simple file/log based evaluation while preserving plugin-backed command registration and hook extensibility. - -## Impact - -- Affected code: `aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py`, `aworld-cli/src/aworld_cli/evaluator_runtime.py`, builtin evaluator plugin command wiring, CLI rendering/tests. -- Affected APIs: additive CLI flags and runtime helpers; existing `aworld-cli evaluator --target ...` behavior remains compatible. -- Dependencies: this change depends on the framework input-source layer from `aworld-evaluator-input-sources-2026-06-10` and should land after that change. -- Non-goals: no new framework source semantics, no case-specific `trajectory-log` command as the canonical API, no new plugin system, no CLI-owned scoring or gate implementation, no training/optimizer integration. 
diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/specs/cli-evaluator-flow/spec.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/specs/cli-evaluator-flow/spec.md deleted file mode 100644 index aaba466eb..000000000 --- a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/specs/cli-evaluator-flow/spec.md +++ /dev/null @@ -1,103 +0,0 @@ -## MODIFIED Requirements - -### Requirement: CLI evaluator command runs a complete evaluation flow - -The CLI SHALL provide an evaluator command that can run a complete evaluation flow against a supported local target or a supported evaluation input source such as a task file, task+answer file, serialized evaluation state, or AWorld trajectory log. - -#### Scenario: User evaluates a local target -- **WHEN** a user invokes the evaluator command with a supported local evaluation target -- **THEN** the CLI SHALL resolve the target, build an evaluation flow, execute the selected suite, and return a completed evaluation result - -#### Scenario: User evaluates a source input -- **WHEN** a user invokes the evaluator command with a supported source input and source kind -- **THEN** the CLI SHALL resolve the input, select the matching framework source adapter, build a source-backed evaluation flow, and return a completed evaluation result - -### Requirement: CLI evaluator is an official plugin-backed command - -The evaluator command SHALL integrate with the CLI through the same builtin plugin command model used by other official top-level commands. 
- -#### Scenario: CLI loads official evaluator command -- **WHEN** the CLI initializes builtin top-level command providers -- **THEN** the evaluator command SHALL be exposed through a builtin plugin-backed command entry rather than only through an ad hoc direct registration path - -#### Scenario: Source-backed evaluator mode uses existing command registration -- **WHEN** the CLI exposes source-backed evaluator usage -- **THEN** it SHALL do so through the existing evaluator command object and builtin evaluator plugin registration rather than a separate top-level command or standalone script - -### Requirement: CLI evaluator extensibility uses hooks for peripheral customization - -The evaluator command SHALL support plugin and hook-based extensibility for CLI-specific discovery, assembly, and output concerns without moving framework evaluation semantics into CLI handlers. - -#### Scenario: Plugin customizes evaluator discovery or assembly -- **WHEN** an installed or builtin CLI plugin participates in evaluator discovery or pre-run assembly -- **THEN** the CLI SHALL provide hook points for those lifecycle stages without requiring the plugin to redefine framework execution, scoring, or gate logic - -#### Scenario: Plugin extends evaluator rendering or post-run handling -- **WHEN** an installed or builtin CLI plugin needs to append summary output, upload reports, or trigger notifications after evaluation -- **THEN** the CLI SHALL provide hook points for rendering and post-run handling while preserving the framework-owned evaluation result and report contract - -#### Scenario: Source-backed evaluator flow invokes evaluator hooks -- **WHEN** a source-backed evaluator run is assembled or completed -- **THEN** the CLI SHALL invoke the same evaluator hook infrastructure used by target-backed runs, with source-aware event fields that identify mode, input path, source kind, task filters, judge agent, execution agent, workspace path, and output path when available - -### 
Requirement: CLI evaluator hook contracts are explicit - -The evaluator command SHALL document the event payloads, mutable state surface, and allowed side effects for evaluator-specific CLI hooks. - -#### Scenario: Plugin author implements an evaluator lifecycle hook -- **WHEN** a plugin author uses an evaluator-specific hook such as pre-run, post-run, or summary rendering -- **THEN** the CLI SHALL provide a documented hook contract describing which fields are guaranteed and what a hook may modify - -#### Scenario: Source-backed hook remains CLI-scoped -- **WHEN** a plugin hook observes or customizes a source-backed evaluator run -- **THEN** the hook SHALL be limited to CLI assembly metadata, side effects, and rendering, and SHALL NOT replace framework source parsing, state adaptation, execution, scoring, gate decisions, or report schema semantics - -## ADDED Requirements - -### Requirement: CLI evaluator supports source-backed run mode - -The evaluator command SHALL provide a source-backed run mode that accepts an input path, source kind, optional field mappings, optional task filters, optional execution agent, and judge agent configuration. 
- -#### Scenario: User evaluates an AWorld trajectory log -- **WHEN** a user runs the evaluator with `--input`, `--kind trajectory`, `--task-id`, and `--judge-agent` -- **THEN** the CLI SHALL use framework trajectory-log source and replay adapters to evaluate the selected task without implementing trajectory parsing in CLI code - -#### Scenario: User evaluates every task in an AWorld trajectory log -- **WHEN** a user runs the evaluator with AWorld trajectory log `--input`, `--kind trajectory`, and `--judge-agent` without `--task-id` -- **THEN** the CLI SHALL use framework trajectory-log source and replay adapters to evaluate every task record in that log without executing the main agent - -#### Scenario: User evaluates generated trajectory from task input -- **WHEN** a user runs the evaluator with task JSONL `--input`, `--kind trajectory`, and `--judge-agent` without `--task-id` -- **THEN** the CLI SHALL run each task through the CLI default `Aworld` agent unless `--agent` is provided, extract the trajectory from the AWorld response, and evaluate that generated trajectory with the trajectory judge flow - -#### Scenario: User evaluates task and answer records -- **WHEN** a user runs the evaluator with `--input`, `--kind answer`, and `--judge-agent` -- **THEN** the CLI SHALL use framework task+answer source and answer-state adapters to evaluate existing answers without re-executing the target - -#### Scenario: User evaluates task-only records through the default agent -- **WHEN** a user runs the evaluator with `--input`, `--kind task`, and `--judge-agent` and omits `--agent` -- **THEN** the CLI SHALL use the framework task source, execute each task through the CLI default `Aworld` agent, convert the produced output into evaluation state, and evaluate that state with the judge agent - -#### Scenario: User overrides task and answer field names -- **WHEN** a user runs the evaluator with `--kind answer` and custom field mapping flags -- **THEN** the CLI SHALL pass those 
mappings to the framework source while defaulting omitted mappings to `id`, `input`, and `answer` - -#### Scenario: User requests a deferred source kind -- **WHEN** a user runs the evaluator with a source kind that is defined as a future framework source but not implemented yet -- **THEN** the CLI SHALL fail with a clear unsupported-kind error rather than implementing that source kind in CLI code - -### Requirement: CLI evaluator preserves source-oriented canonical commands - -The evaluator CLI SHALL treat source kinds as input adapters under a single canonical source-backed command path rather than creating independent evaluator stacks for each source format. - -#### Scenario: Source kind selects adapter -- **WHEN** a user specifies a supported source kind such as `task`, `answer`, or `trajectory` -- **THEN** the CLI SHALL select the matching framework source adapter while preserving the same evaluation flow and report semantics - -#### Scenario: Source kind is not yet supported by framework -- **WHEN** a user specifies a source kind that the framework source layer has not implemented yet -- **THEN** the CLI SHALL fail with a clear unsupported-kind error instead of implementing source parsing in CLI code - -#### Scenario: Case-specific alias delegates to canonical flow -- **WHEN** a future CLI alias is added for a common source kind -- **THEN** that alias SHALL delegate to the canonical source-backed evaluator flow rather than implementing separate parsing, judging, scoring, or gating behavior diff --git a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md b/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md deleted file mode 100644 index 7cc91c5f0..000000000 --- a/openspec/changes/aworld-cli-evaluator-source-run-2026-06-10/tasks.md +++ /dev/null @@ -1,40 +0,0 @@ -## 1. Parser And Command Shape - -- [x] 1.1 Confirm `aworld-evaluator-input-sources-2026-06-10` has landed before implementing source-backed CLI behavior. 
-- [x] 1.2 Extend the existing `EvaluatorTopLevelCommand` parser with source-backed `--input` mode. -- [x] 1.3 Add `--input`, `--kind`, `--judge-agent`, `--out-dir`, `--output`, `--task-id`, `--agent`, and optional JSONL field mapping arguments for source mode. -- [x] 1.4 Default task+answer JSONL field mappings to `id`, `input`, and `answer`. -- [x] 1.5 Preserve existing `--target`, `--suite`, `--list-suites`, `--print-report-schema`, `--validate-report`, and `--interactive-approval` behavior. -- [x] 1.6 Add clear validation errors for mixing incompatible target-mode and source-mode arguments. - -## 2. Runtime Assembly - -- [x] 2.1 Add a source-backed runtime helper in `aworld_cli.evaluator_runtime`. -- [x] 2.2 Resolve source kind to framework source/adapters from `aworld.evaluations`. -- [x] 2.3 Resolve `agent.md` judge path through framework `AgentJudgeBackend.from_agent_markdown`. -- [x] 2.4 For task+answer and trajectory-log sources, use framework replay/state adapters without re-execution. -- [x] 2.5 Treat task-only and serialized-state source kinds as unsupported until the framework source layer provides those built-ins. -- [x] 2.6 Persist reports with deterministic default names under the requested output directory. - -## 3. Plugin And Hook Integration - -- [x] 3.1 Keep evaluator command exposure through the existing builtin plugin command entrypoint. -- [x] 3.2 Reuse `_load_evaluator_hooks` and `_run_evaluator_hooks` for source-backed runs. -- [x] 3.3 Extend evaluator hook event payloads with `mode`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, and output path fields. -- [x] 3.4 Document that hooks may customize CLI metadata, side effects, and rendering but must not redefine framework execution, scoring, gate, or report semantics. - -## 4. UX And Reporting - -- [x] 4.1 Render the same evaluator summary shape for source-backed reports. 
-- [x] 4.2 Include resolved source mode, input path, kind, selected task ids, and report path in summary or automation metadata. -- [x] 4.3 Keep exit codes based on gate status and approval state. -- [x] 4.4 Add examples for trajectory-log and task+answer evaluation, and document task-only evaluation as deferred until the framework source exists. - -## 5. Tests - -- [x] 5.1 Add parser tests for source-backed `evaluator --input` arguments. -- [x] 5.2 Add validation tests for required source-mode arguments and incompatible argument combinations. -- [x] 5.3 Add runtime delegation tests using fake framework source helpers. -- [x] 5.4 Add hook payload tests for source-backed pre-run/post-run/render events. -- [x] 5.5 Add compatibility tests for the existing target/suite evaluator path. -- [x] 5.6 Validate this OpenSpec change with `openspec validate aworld-cli-evaluator-source-run-2026-06-10 --strict`. diff --git a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/.openspec.yaml b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/.openspec.yaml deleted file mode 100644 index 2cb80411e..000000000 --- a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/.openspec.yaml +++ /dev/null @@ -1,2 +0,0 @@ -schema: spec-driven -created: 2026-06-10 diff --git a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/design.md b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/design.md deleted file mode 100644 index 92a376619..000000000 --- a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/design.md +++ /dev/null @@ -1,96 +0,0 @@ -## Context - -The evaluator roadmap now has three layers in place: - -- rollout-owning runtime harnesses with serializable `RolloutState` -- deterministic outcome/state-check graders -- independent trials with pass@k/pass^k aggregation - -The remaining correctness gap is trial independence. 
Re-running a case without resetting environment state can inflate pass rates or hide regressions. This change adds the environment lifecycle boundary that lets a suite declare reset semantics without embedding live handles in suite state or report state. - -## Goals / Non-Goals - -**Goals:** - -- Add a trusted environment fixture protocol for reset and cleanup. -- Provide a runtime-harness wrapper that applies the fixture around each rollout. -- Ensure each expanded trial receives its own environment metadata. -- Record environment reset/cleanup metadata in serializable rollout/report state. -- Clean up after failed rollouts or raised exceptions where possible. -- Keep retry attempts inside a single environment reset unless the suite explicitly wraps retry differently. - -**Non-Goals:** - -- Running shell commands, test commands, or arbitrary workflow engines. -- Providing a production sandbox/container implementation. -- Managing external databases or filesystem snapshots directly. -- Supporting untrusted suite manifests for environment fixture references. -- Adding LLM-backed adaptive user simulators or training/optimizer integration. - -## Decisions - -### 1. Add a trusted fixture lifecycle - -Define a small in-process contract: - -- `reset(case, target) -> EnvironmentSnapshot` -- `cleanup(snapshot, case, target, state) -> EnvironmentSnapshot | None` - -The fixture is trusted Python code supplied by the suite author, not a declared JSON manifest capability. Returned metadata must be serializable. Live clients, file handles, subprocesses, and credentials must not be retained in rollout state. - -### 2. 
Represent reset output as a serializable environment snapshot - -Add `EnvironmentSnapshot` with: - -- `environment_id` -- `trial_id` -- `metadata` - -The snapshot is injected into: - -- `case.input["_environment"]` -- `case.metadata["_environment"]` -- `target["_environment"]` -- `RolloutState.metadata["environment"]` - -This lets the base harness find a workspace id, database schema id, or seed without coupling to a concrete sandbox implementation. - -### 3. Use wrapper composition instead of changing every harness - -Add `EnvironmentIsolatedRuntimeHarness(base_harness, fixture)`. The wrapper owns reset and cleanup around exactly one call to `base_harness.run_rollout`. - -For multi-trial suites, case expansion already creates one case row per trial, so the wrapper naturally runs one reset per trial. For retry suites, the recommended composition is: - -- `EnvironmentIsolatedRuntimeHarness(RetryRuntimeHarness(base))`: one environment per trial, retry attempts share that trial environment. - -If a suite intentionally needs one environment per retry attempt, it can wrap in the opposite order: - -- `RetryRuntimeHarness(EnvironmentIsolatedRuntimeHarness(base))` - -### 4. Fail closed on lifecycle errors - -If reset fails, the rollout should not run. If cleanup fails after a successful rollout, the terminal state should record cleanup failure metadata and mark the state failed only if the fixture declares cleanup failure as fatal. The first implementation keeps cleanup failure fatal by default to avoid silently reporting contaminated environments as clean. - -If the base harness raises, the wrapper must still attempt cleanup and then re-raise the original rollout error, even if cleanup also fails. - -## Risks / Trade-offs - -- [False sandbox confidence] -> Mitigation: name the feature environment fixture lifecycle, not production sandboxing, and document sandbox adapters as future work. 
-- [Live handle leakage] -> Mitigation: serialize snapshots through existing serializable filtering before storing state. -- [Retry/trial confusion] -> Mitigation: document wrapper order and add tests proving one reset per trial when retry is inside isolation. -- [Cleanup masking rollout errors] -> Mitigation: preserve original rollout exception when both rollout and cleanup fail. - -## Migration Plan - -1. Add environment snapshot and fixture protocol primitives. -2. Add environment-isolated runtime harness wrapper. -3. Inject serializable environment metadata into case, target, and rollout state. -4. Add trial integration tests proving one reset per trial. -5. Add failure-path tests for cleanup on raised rollout. -6. Keep existing suites unchanged unless they opt into the wrapper. - -## Deferred Questions - -- Concrete filesystem/database/container adapters should be handled in a later environment-adapter change. -- LLM-backed adaptive user simulation remains a simulator-focused follow-up. -- Training/optimizer integration should wait until environment isolation and trial metrics stabilize. diff --git a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/implementation-plan.md b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/implementation-plan.md deleted file mode 100644 index 141743ad4..000000000 --- a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/implementation-plan.md +++ /dev/null @@ -1,150 +0,0 @@ -# AWorld Evaluator Environment Isolation Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add trusted per-rollout environment reset/cleanup lifecycle support for runtime-composed evaluator suites. 
- -**Architecture:** Extend `runtime_composition.py` with serializable environment snapshots, a fixture protocol, and a wrapper harness. The wrapper resets before one base rollout, injects environment metadata into case/target context, cleans up afterward, and records lifecycle metadata in `RolloutState` without exposing live handles. - -**Tech Stack:** Python dataclasses/protocols, existing runtime harness and trial substrate, pytest, OpenSpec. - ---- - -## File Structure - -- Modify: `aworld/evaluations/runtime_composition.py` - Add environment snapshot, fixture protocol, wrapper harness, context injection, and cleanup semantics. -- Test: `tests/evaluations/test_environment_isolation.py` - Focused TDD coverage for reset/cleanup, trial integration, retry composition, failure cleanup, and report metadata. -- Validate: `openspec/changes/aworld-evaluator-environment-isolation-2026-06-10` - Keep tasks/spec/design aligned with implementation. - -## Task 1: Environment Snapshot And Fixture - -- [x] **Step 1: Write failing snapshot serialization test** - -Create `tests/evaluations/test_environment_isolation.py`: - -```python -from aworld.evaluations.runtime_composition import EnvironmentSnapshot - - -def test_environment_snapshot_excludes_live_handles(): - snapshot = EnvironmentSnapshot( - environment_id="env-1", - trial_id="case-1::trial-1", - metadata={"workspace": "/tmp/demo", "client": object()}, - ) - - assert snapshot.to_dict() == { - "environment_id": "env-1", - "trial_id": "case-1::trial-1", - "metadata": {"workspace": "/tmp/demo"}, - } -``` - -- [x] **Step 2: Run test and confirm failure** - -Run: `pytest tests/evaluations/test_environment_isolation.py::test_environment_snapshot_excludes_live_handles -q` - -Expected: FAIL because `EnvironmentSnapshot` does not exist. - -- [x] **Step 3: Implement `EnvironmentSnapshot`** - -Add a frozen dataclass in `aworld/evaluations/runtime_composition.py` with `to_dict()` that uses `_serializable_dict()`. 
- -- [x] **Step 4: Run snapshot test until green** - -Run: `pytest tests/evaluations/test_environment_isolation.py::test_environment_snapshot_excludes_live_handles -q` - -Expected: PASS. - -## Task 2: Reset/Cleanup Wrapper - -- [x] **Step 1: Write failing reset and cleanup test** - -Add a test where a fixture records `reset` and `cleanup`, and the base harness asserts `_environment` exists in both case and target. - -- [x] **Step 2: Run wrapper test and confirm failure** - -Run: `pytest tests/evaluations/test_environment_isolation.py::test_environment_isolated_harness_resets_and_cleans_up -q` - -Expected: FAIL because `EnvironmentIsolatedRuntimeHarness` does not exist. - -- [x] **Step 3: Implement wrapper harness** - -Add `EnvironmentFixture` protocol and `EnvironmentIsolatedRuntimeHarness`. Use `_maybe_await()` for sync/async fixture hooks. Inject snapshot dictionaries into copied case input/metadata and copied target. - -- [x] **Step 4: Run wrapper tests until green** - -Run: `pytest tests/evaluations/test_environment_isolation.py -q` - -Expected: PASS for initial wrapper tests. - -## Task 3: Trial And Retry Semantics - -- [x] **Step 1: Write failing trial reset count test** - -Use `EvalSuiteDef(trial_policy=TrialPolicyDef(num_trials=2))` with `EnvironmentIsolatedRuntimeHarness`. Assert two resets, two cleanups, and distinct trial ids. - -- [x] **Step 2: Run test and confirm failure** - -Run: `pytest tests/evaluations/test_environment_isolation.py::test_environment_isolation_resets_once_per_trial -q` - -Expected: FAIL until wrapper metadata flows through expanded trial cases. - -- [x] **Step 3: Write retry-inside-isolation test** - -Compose `EnvironmentIsolatedRuntimeHarness(base_harness=RetryRuntimeHarness(...))`. Assert reset count equals trial count, not retry attempt count. - -- [x] **Step 4: Run trial/retry tests until green** - -Run: `pytest tests/evaluations/test_environment_isolation.py tests/evaluations/test_evaluator_trials.py -q` - -Expected: PASS. 
- -## Task 4: Failure Cleanup - -- [x] **Step 1: Write failing cleanup-on-rollout-error test** - -Create a base harness that raises after reset. Assert cleanup is attempted and the original rollout exception is raised. - -- [x] **Step 2: Implement failure cleanup path** - -Wrap base rollout execution in `try/except/finally`. Preserve original exception when cleanup also fails. - -- [x] **Step 3: Write cleanup-failure-after-success test** - -Create a cleanup hook that raises after a successful rollout. Assert the returned state has `status == "failed"` and environment cleanup error metadata. - -- [x] **Step 4: Run failure tests until green** - -Run: `pytest tests/evaluations/test_environment_isolation.py -q` - -Expected: PASS. - -## Task 5: Verification And Commit - -- [x] **Step 1: Run evaluator regression suite** - -Run: - -```bash -pytest tests/evaluations/test_environment_isolation.py tests/evaluations/test_runtime_composition.py tests/evaluations/test_evaluator_trials.py tests/evaluations/test_evaluation_substrate.py tests/core/test_evaluator_runtime.py -q -``` - -Expected: PASS. 
- -- [x] **Step 2: Validate OpenSpec** - -Run: `openspec validate aworld-evaluator-environment-isolation-2026-06-10 --strict` - -Expected: `Change 'aworld-evaluator-environment-isolation-2026-06-10' is valid` - -- [x] **Step 3: Commit** - -```bash -git add aworld/evaluations/runtime_composition.py tests/evaluations/test_environment_isolation.py -git add -f openspec/changes/aworld-evaluator-environment-isolation-2026-06-10 -git commit -m "feat: add evaluator environment isolation" -``` diff --git a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/proposal.md b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/proposal.md deleted file mode 100644 index 60a625216..000000000 --- a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/proposal.md +++ /dev/null @@ -1,19 +0,0 @@ -# AWorld Evaluator Environment Isolation - -## Why - -Runtime composition can now run one rollout and trial policy can repeat cases for pass@k/pass^k. Those trials are not truly independent if they share filesystem, database, service, or in-memory state. Evaluator users need a framework-owned reset lifecycle so every trial can start from a declared clean environment and record enough metadata to audit the reset. - -## What Changes - -- Add a trusted environment fixture contract for setup/reset/cleanup around each runtime-composed rollout. -- Add a wrapper harness that runs environment reset before the base harness and cleanup after the terminal rollout. -- Inject serializable environment metadata into the case/target visible to the base harness. -- Preserve environment metadata in rollout state, evaluator state, and report artifacts/metadata. -- Keep real sandbox/container/database adapters out of scope; this change defines the lifecycle and trusted in-process contract. - -## Impact - -- Affected code: `aworld/evaluations/runtime_composition.py`, runtime-composed substrate paths, report metadata through existing state serialization. 
-- Affected tests: add focused coverage for reset-per-trial, retry separation, cleanup-on-failure, and report metadata. -- Follow-ups: concrete filesystem/database/container environment fixtures and LLM-backed user simulators remain separate changes. diff --git a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/specs/evaluation-substrate/spec.md b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/specs/evaluation-substrate/spec.md deleted file mode 100644 index 3bf2a61e8..000000000 --- a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/specs/evaluation-substrate/spec.md +++ /dev/null @@ -1,58 +0,0 @@ -## MODIFIED Requirements - -### Requirement: Environment lifecycle for runtime evaluation - -Runtime-composed evaluation flows SHALL support an opt-in trusted environment lifecycle that resets environment state before a rollout and cleans it up afterward. - -#### Scenario: Runtime harness uses environment fixture -- **WHEN** a runtime-composed suite wraps its harness with an environment fixture -- **THEN** the framework SHALL call the fixture reset hook before executing the base rollout -- **AND** the framework SHALL call the fixture cleanup hook after the base rollout finishes - -#### Scenario: Environment metadata is serializable -- **WHEN** a fixture returns environment metadata -- **THEN** the framework SHALL preserve only serializable metadata in rollout state, evaluator state, and reports -- **AND** the framework SHALL exclude live handles such as clients, file handles, subprocesses, and credentials from serialized state - -#### Scenario: Base harness needs environment context -- **WHEN** environment reset succeeds -- **THEN** the framework SHALL expose the environment snapshot to the base harness through case input, case metadata, and target metadata - -### Requirement: Environment isolation across trials - -Trial-based evaluation SHALL be able to reset environment state independently for each trial. 
- -#### Scenario: Multi-trial evaluation uses environment isolation -- **WHEN** a suite declares multiple trials and wraps its runtime harness with environment isolation -- **THEN** each expanded trial SHALL receive a distinct reset lifecycle - -#### Scenario: Retry runs inside one isolated trial -- **WHEN** a suite composes retry inside the environment-isolated harness -- **THEN** retry attempts SHALL share one environment reset for that trial -- **AND** retry attempts SHALL NOT increase environment reset count - -### Requirement: Environment lifecycle failure handling - -Environment lifecycle handling SHALL fail closed and preserve cleanup attempts. - -#### Scenario: Reset fails -- **WHEN** an environment reset hook fails -- **THEN** the framework SHALL NOT execute the base rollout -- **AND** the evaluation SHALL surface the reset error through the normal runtime error path - -#### Scenario: Rollout fails -- **WHEN** the base rollout raises after reset succeeds -- **THEN** the framework SHALL attempt cleanup -- **AND** the framework SHALL preserve the original rollout error if cleanup also fails - -#### Scenario: Cleanup fails after rollout success -- **WHEN** cleanup fails after the base rollout returns a terminal state -- **THEN** the framework SHALL mark the rollout state failed and record cleanup error metadata unless the fixture explicitly declares cleanup failure non-fatal - -### Requirement: Sandbox execution remains deferred - -Environment lifecycle support SHALL define trusted reset/cleanup boundaries without introducing untrusted sandbox command execution. 
- -#### Scenario: Suite requests command-backed sandbox reset -- **WHEN** a suite requires shell commands, container lifecycle, workflow engines, database snapshotting, or filesystem reset -- **THEN** this change SHALL treat that as adapter-specific future work rather than executing arbitrary commands in the evaluator substrate diff --git a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/tasks.md b/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/tasks.md deleted file mode 100644 index a9512a44c..000000000 --- a/openspec/changes/aworld-evaluator-environment-isolation-2026-06-10/tasks.md +++ /dev/null @@ -1,35 +0,0 @@ -## 1. Environment Fixture Primitives - -- [x] 1.1 Add `EnvironmentSnapshot` with serializable `environment_id`, `trial_id`, and metadata. -- [x] 1.2 Add an `EnvironmentFixture` protocol with async-compatible `reset` and `cleanup`. -- [x] 1.3 Add validation/serialization helpers that exclude live handles from snapshot metadata. - -## 2. Runtime Harness Wrapper - -- [x] 2.1 Add `EnvironmentIsolatedRuntimeHarness`. -- [x] 2.2 Reset before exactly one base rollout. -- [x] 2.3 Inject snapshot metadata into case input, case metadata, and target. -- [x] 2.4 Add cleanup after rollout and preserve cleanup metadata in rollout state. - -## 3. Trial And Retry Semantics - -- [x] 3.1 Prove multi-trial suites reset once per trial. -- [x] 3.2 Prove retry attempts do not increase reset count when retry is inside environment isolation. -- [x] 3.3 Document wrapper-order semantics for one-environment-per-trial versus one-environment-per-attempt. - -## 4. Failure Semantics - -- [x] 4.1 Attempt cleanup when the base harness raises. -- [x] 4.2 Preserve the original rollout exception when rollout and cleanup both fail. -- [x] 4.3 Record cleanup failure metadata on terminal rollout state when cleanup fails after rollout success. - -## 5. 
Report Shape - -- [x] 5.1 Ensure environment metadata appears through existing state metadata/artifacts. -- [x] 5.2 Keep report schema additive and compatible. - -## 6. Verification - -- [x] 6.1 Add focused tests for reset, cleanup, trial count, retry composition, failure cleanup, and report metadata. -- [x] 6.2 Run evaluator regression tests. -- [x] 6.3 Validate this OpenSpec change with `openspec validate aworld-evaluator-environment-isolation-2026-06-10 --strict`. diff --git a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/.openspec.yaml b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/.openspec.yaml deleted file mode 100644 index 2cb80411e..000000000 --- a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/.openspec.yaml +++ /dev/null @@ -1,2 +0,0 @@ -schema: spec-driven -created: 2026-06-10 diff --git a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/design.md b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/design.md deleted file mode 100644 index 2bb1c3dfa..000000000 --- a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/design.md +++ /dev/null @@ -1,238 +0,0 @@ -# AWorld Evaluator Input Sources - -## Context - -AWorld's evaluator stack now has the core pieces for serious agent evaluation: - -- suite/case/judge/gate/report substrate -- execution adapters for static, agent, task, and program modes -- runtime-composed rollout harnesses and serializable `RolloutState` -- outcome/state checks, trajectory scorers, standard metrics, trials, environment isolation hooks, and LLM user simulators - -The missing layer is input normalization. The framework can evaluate well once a caller has produced `EvalCaseDef` plus `EvalState` or `RolloutState`, but external evaluation data usually arrives as files or logs. The current manual trajectory-log test manually implements parsing, replay, markdown-agent loading, schema flattening, and suite wiring. 
That is useful as a spike, but it is not the framework-level integration experience AWorld should expose. - -This change introduces a framework-owned input source layer. It should not create a separate evaluator stack. It should feed existing `EvalSuiteDef`, `EvalCaseDef`, `EvalExecutionSpec`, `RuntimeHarness`, `JudgeBackend`, and report assembly paths. - -## Goals / Non-Goals - -**Goals:** - -- Provide reusable source primitives for external evaluation input records. -- Support task+answer inputs that should be judged without runtime execution. -- Support AWorld trajectory-log inputs by parsing them once in framework code and replaying them into `RolloutState`. -- Keep task-only and serialized-state sources as follow-on implementations of the same protocol rather than first-version built-ins. -- Keep source parsing, state adaptation, judge backend wiring, and suite creation discoverable from `aworld.evaluations`. -- Make the manual trajectory-log regression a small consumer of framework APIs rather than a copy of framework internals. - -**Non-Goals:** - -- Adding CLI commands or argument parsing in this change. -- Adding first-version built-ins for task-only execution sources or generic serialized-state files. -- Adding database, object-store, or remote log connectors. -- Executing untrusted code from input files. -- Running shell commands or external environment checks from source adapters. -- Replacing `EvaluationConfig`, `EvaluateRunner`, `EvalSuiteDef`, or runtime-composition harnesses. -- Adding production sandbox or clean-environment reset implementations. - -## Proposed Abstractions - -### 1. `EvalSource` - -`EvalSource` is a trusted framework object that enumerates evaluation records and converts them into cases. - -Conceptually: - -```python -class EvalSource(Protocol): - def iter_records(self) -> Iterable[EvalSourceRecord]: ... - def to_cases(self) -> tuple[EvalCaseDef, ...]: ... 
-``` - -`EvalSourceRecord` should contain: - -- `case_id` -- `input` -- optional `expected` -- optional existing `answer` -- optional existing `state` -- optional source metadata -- optional raw source payload for trusted adapters - -Source records must be serializable or sanitize non-serializable values before report state. - -If a source kind uniquely determines its replay adapter, the source should expose `default_adapter()` or equivalent metadata. Callers may override the adapter for advanced cases, but the happy path should not require both `source=AWorldTrajectoryLogSource(...)` and `state_adapter=TrajectoryLogStateAdapter()`. - -### 2. `EvalStateAdapter` - -`EvalStateAdapter` converts source records that already contain outputs into normalized state. - -First-version examples: - -- `AnswerStateAdapter`: turns a task+answer record into `EvalState(answer=answer, completion=[answer])` -- `TrajectoryLogStateAdapter`: turns one AWorld trajectory-log record into `RolloutState` - -Task-only records do not use a replay adapter; they flow through existing execution modes (`AGENT`, `TASK`, `PROGRAM`, or `STATIC` when judge-only). That path is intentionally deferred from this first version because the current simplification target is existing-output replay. - -### 3. `ReplayRuntimeHarness` - -`ReplayRuntimeHarness` is a runtime harness that receives source records and state adapters, then returns `RolloutState` or bridgeable state without re-executing the target. - -The harness owns: - -- selecting the source record for the case -- applying the adapter -- preserving source metadata -- deriving tool calls, usage, timing, and standard metrics where available - -It does not own judging, scoring, gate decisions, trial expansion, or environment reset. - -### 4. 
Built-in Sources - -The first implementation should include the file-backed sources that have immediate consumers: - -- `JsonlTaskAnswerSource` - - default fields: `id`, `input`, `answer`; optional `expected`, optional metadata - - field names may be overridden by constructor options - - used with `AnswerStateAdapter` -- `AWorldTrajectoryLogSource` - - reads AWorld line-oriented trajectory logs - - extracts records by task id - - used with `TrajectoryLogStateAdapter` - -The API should not hardcode these as evaluator types. They are source/adapters that feed the same suite-backed evaluator. - -Deferred source implementations: - -- `JsonlTaskSource` for task-only records that require runtime execution. -- `RolloutStateFileSource` for generic serialized `EvalState` or `RolloutState` records. - -### 5. Markdown Agent Loading - -The manual regression showed a separate but related gap: a judge agent may be provided as `agent.md`, while framework loading currently favors `SKILL.md`. - -This change should add a framework helper, not a test-local workaround: - -```python -load_agent_markdown(path) -> Agent -AgentJudgeBackend.from_agent_markdown(path, prompt_builder=..., timeout_seconds=...) -``` - -The helper can internally reuse skill loading or instantiate an AWorld agent directly, but callers should not materialize temporary `SKILL.md` files. - -### 6. Judge Payload Normalization - -The trajectory evaluator agent currently returns: - -```json -{ - "weighted_score": 78, - "dimensions": { - "A1_groundedness": {"score": 4} - } -} -``` - -The evaluator substrate prefers flat judge payload fields: - -```json -{"score": 78, "A1_groundedness": 4} -``` - -This change should avoid hidden global flattening. Instead, add explicit normalization support: - -- `JudgeSchemaDef(normalizer=callable)` or equivalent -- a built-in trajectory judge output model/normalizer for dimensions-style reports - -Suite authors should opt into a normalizer so report contracts remain explicit. 
- -## Data Flow - -### Deferred: Task-only file - -```text -JsonlTaskSource -> EvalCaseDef -> existing EvalExecutionSpec -> EvalState -> judge/scorers/gate/report -``` - -### Task + answer file - -```text -JsonlTaskAnswerSource -> EvalCaseDef + answer record -> AnswerStateAdapter -> EvalState -> judge/scorers/gate/report -``` - -### Deferred: Serialized rollout state file - -```text -RolloutStateFileSource -> RolloutStateAdapter -> RolloutState/EvalState -> judge/scorers/gate/report -``` - -### AWorld trajectory log - -```text -AWorldTrajectoryLogSource -> TrajectoryLogStateAdapter -> RolloutState -> judge/scorers/gate/report -``` - -## API Shape - -Expected high-level usage: - -```python -source = JsonlTaskAnswerSource( - path="task_answers.jsonl", -) - -suite = create_source_eval_suite( - source=source, - judge_backend=AgentJudgeBackend.from_agent_markdown("eval/judge/agent.md"), - judge_schema=JudgeSchemaDef(output_model=AnswerJudgeOutput), - gate_policy=GatePolicyDef(metric_name="score", pass_threshold=70), -) - -report = await run_evaluation_flow(EvaluationFlowDef(target={"kind": "source"}, suite=suite)) -``` - -`create_source_eval_suite(...)` must return a normal `EvalSuiteDef`. It is syntax sugar over the existing suite substrate, not a second suite type or execution stack. - -Expected trajectory-log usage: - -```python -source = AWorldTrajectoryLogSource( - path="~/Documents/logs/trajectory.log", - task_ids=["task_20260609193335"], -) - -suite = create_source_eval_suite( - source=source, - judge_backend=AgentJudgeBackend.from_agent_markdown("eval/trajectory_evaluator/agent.md"), - judge_schema=TrajectoryJudgeSchema.default(), - gate_policy=TrajectoryJudgeGate.default(), -) -``` - -## Risks / Trade-offs - -- [Too much abstraction] -> Mitigation: keep first version limited to task+answer and trajectory-log file-backed sources; allow explicit adapter overrides only for advanced callers. 
-- [Case-by-case source creep] -> Mitigation: require new sources to implement the same record/state adapter contracts rather than custom evaluator flows. -- [Untrusted file assumptions] -> Mitigation: sources parse data only; they do not execute code or commands. -- [Schema normalization ambiguity] -> Mitigation: make normalizers explicit on schema/suite. -- [Runtime vs replay confusion] -> Mitigation: document that current task+answer and trajectory sources replay existing outputs; future task-only sources will execute through normal execution specs. - -## Migration Plan - -1. Add source record and source protocols, including optional source-provided default adapters. -2. Add JSONL task+answer source with default `id`, `input`, and `answer` field names. -3. Add answer-record state adapter. -4. Add trajectory-log source and trajectory-log state adapter. -5. Add replay harness and source-backed suite factory. -6. Add markdown-agent judge backend factory. -7. Add explicit judge payload normalizer support or built-in trajectory judge schema. -8. Refactor the manual trajectory-log test to use the new framework APIs. -9. Keep existing suite-backed APIs compatible. - -## Deferred Questions - -- Task-only execution sources and generic serialized-state file sources should be separate follow-on implementations of the same protocol. -- Concrete remote source connectors should be separate changes. -- CLI integration should be a later consumer of this framework layer. -- Dataset registry integration can be considered after file-backed sources settle. -- Environment reset remains owned by the environment-isolation capability, not by sources. -- The trajectory evaluator `agent.md` currently contains prompt-local trajectory extraction guidance. This change removes the test-local parser duplication; a later cleanup should either feed the agent framework-extracted trajectory content directly or explicitly keep the prompt-local parsing instructions as evaluator-agent policy. 
diff --git a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/implementation-plan.md b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/implementation-plan.md deleted file mode 100644 index 5d2810a9f..000000000 --- a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/implementation-plan.md +++ /dev/null @@ -1,85 +0,0 @@ -# Implementation Plan: AWorld Evaluator Input Sources - -## Phase 1: Core Source Contracts - -1. Add `aworld/evaluations/sources.py`. -2. Define `EvalSourceRecord` as a serializable dataclass. -3. Define `EvalSource` protocol/base with `iter_records()` and `to_cases()`. -4. Add source-provided default adapter support for obvious replay pairs. -5. Add unit tests for record serialization, case lowering, and default adapter selection. - -Expected commit: source contracts and default adapter tests. - -## Phase 2: File Sources - -1. Implement JSONL reader helpers with clear field mapping. -2. Implement `JsonlTaskAnswerSource` with default fields `id`, `input`, and `answer`. -3. Implement override options for JSONL field names. -4. Add tests for valid records, missing required fields, and metadata preservation. - -Expected commit: task+answer file-backed source implementation. - -## Phase 3: State Adapters and Replay - -1. Add `aworld/evaluations/state_adapters.py`. -2. Define `EvalStateAdapter`. -3. Implement `AnswerStateAdapter`. -4. Implement `ReplayRuntimeHarness` or add it to `runtime_composition.py` if it fits better with existing harnesses. -5. Add tests proving source records can be replayed into state and reports. - -Expected commit: replay adapter path for existing outputs. - -## Phase 4: AWorld Trajectory Log Source - -1. Move trajectory log parsing out of the manual test into framework code. -2. Implement `AWorldTrajectoryLogSource`. -3. Implement `TrajectoryLogStateAdapter`. -4. Expose `TrajectoryLogStateAdapter` as the trajectory source default adapter. -5. 
Derive evidence, final answer, trajectory steps, tool calls, outcome, usage/timing defaults, and standard metrics. -6. Add focused tests with small synthetic trajectory logs. - -Expected commit: trajectory log source and replay adapter. - -## Phase 5: Suite Helpers - -1. Add `create_source_eval_suite(...)` helper. -2. Support replay-backed sources through `ReplayRuntimeHarness`. -3. Use source default adapters when `state_adapter` is omitted. -4. Ensure the helper returns a normal `EvalSuiteDef`. -5. Ensure helper remains optional; callers can still manually construct `EvalSuiteDef`. - -Expected commit: suite factory helpers. - -## Phase 6: Markdown Agent Judge Backend - -1. Add framework-level `load_agent_markdown(path)` helper. -2. Add `AgentJudgeBackend.from_agent_markdown(...)` factory or an equivalent named constructor. -3. Reuse existing AWorld Agent execution path. -4. Add tests proving `agent.md` metadata/body become an executable judge agent. - -Expected commit: markdown agent judge backend. - -## Phase 7: Judge Normalization - -1. Add explicit normalizer hook to `JudgeSchemaDef` or introduce a trajectory judge schema helper. -2. Normalize dimensions-style judge reports before typed validation. -3. Add tests for nested dimensions input and flat output. - -Expected commit: explicit judge payload normalization. - -## Phase 8: Manual Test Refactor - -1. Refactor `tests/evaluations/test_trajectory_log_manual_case.py` to use source/adapters/backend factories. -2. Remove test-local parser, replay harness, markdown-agent materializer, and schema flattening. -3. Keep explicit pytest parameters and LLM skip behavior. -4. Run the manual e2e with an explicit local task id when credentials/logs are available. - -Expected commit: manual trajectory regression uses framework APIs. 
- -## Verification Commands - -```bash -pytest tests/evaluations/test_evaluation_substrate.py tests/evaluations/test_runtime_composition.py -q -pytest tests/evaluations/test_trajectory_log_manual_case.py -q -openspec validate aworld-evaluator-input-sources-2026-06-10 --strict -``` diff --git a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/proposal.md b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/proposal.md deleted file mode 100644 index 6628616e7..000000000 --- a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/proposal.md +++ /dev/null @@ -1,36 +0,0 @@ -# AWorld Evaluator Input Sources - -## Why - -The current manual trajectory-log regression proves that AWorld's evaluator substrate can judge final answers, inspect trajectories, run typed LLM-as-judge schemas, apply outcome checks, and gate reports. It also exposes an integration problem: callers must hand-write too much glue code to get external evaluation inputs into `EvalCaseDef` and normalized evaluator state. - -This is not specific to trajectory logs. The same problem appears for several common inputs: - -- a file containing task + answer pairs, where the evaluator should judge existing outputs without re-execution -- an AWorld trajectory log, where the evaluator should reconstruct `RolloutState` from prior execution -- future files containing tasks only or serialized rollout/task responses, which should implement the same source contracts when they gain real consumers - -Adding a dedicated `trajectory_log.py` top-level path would solve one case but repeat the same problem for task files, answer files, rollout dumps, and future stores. The framework needs a small input-source and state-adapter layer that converts heterogeneous evaluation inputs into the existing suite/case/state substrate. - -## What Changes - -- Add framework-owned `EvalSource` primitives that load external evaluation inputs into `EvalCaseDef` rows plus source metadata. 
-- Add state adapters that convert source records with existing outputs into normalized `EvalState` or `RolloutState`. -- Let sources expose a default state adapter when the adapter is uniquely implied by the source kind. -- Add first built-in source/adapters for task+answer files and AWorld trajectory logs. -- Add a reusable replay harness that uses source adapters to provide rollout/eval state without re-executing an agent. -- Add helper factories so callers can create suite-backed evaluations from sources without hand-writing parser, replay, schema-normalization, and report plumbing. -- Keep CLI integration out of scope; this change is framework-only under `aworld/evaluations/`. - -## Capabilities - -### Modified Capabilities - -- `evaluation-substrate`: add source-backed evaluation input normalization so suite-backed evaluation can consume task+answer and trajectory-log inputs through one framework path, with protocols that allow task-only and serialized-state sources to be added later. - -## Impact - -- Affected code: `aworld/evaluations/**`, especially new source/adapters, runtime-composition replay harness integration, and suite factory helpers. -- Affected APIs: additive framework APIs; existing suite-backed and runtime-composition APIs remain compatible. -- Affected tests: replace manual trajectory-log test-local glue with framework source/adapters; add focused coverage for task+answer and trajectory-log source behavior. -- Non-goals: no `aworld-cli` command shape changes, no untrusted file execution, no production storage connectors, no sandbox reset or external environment management. 
diff --git a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/specs/evaluation-substrate/spec.md b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/specs/evaluation-substrate/spec.md deleted file mode 100644 index 6c2e941a0..000000000 --- a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/specs/evaluation-substrate/spec.md +++ /dev/null @@ -1,105 +0,0 @@ -## MODIFIED Requirements - -### Requirement: Source-backed evaluation inputs - -Suite-backed evaluation flows SHALL support framework-owned input sources that normalize external evaluation records into cases and optional existing evaluator state. - -#### Scenario: Source provides task and answer records -- **WHEN** an input source provides records with case id, task input, and an existing answer -- **THEN** the framework SHALL allow that source or an explicit state adapter to convert each record into evaluator state without re-executing an agent, task, or program - -#### Scenario: Source metadata is reported safely -- **WHEN** source records include metadata -- **THEN** the framework SHALL preserve serializable source metadata and exclude live file handles, clients, process objects, and other runtime handles - -#### Scenario: Source default adapter is available -- **WHEN** a source kind has one obvious replay adapter -- **THEN** the framework SHALL allow suite construction to use that default adapter without requiring the caller to pass both source and adapter explicitly - -### Requirement: Source state adapters - -Source-backed evaluation flows SHALL separate reading input records from converting existing outputs into evaluator state, while allowing sources to declare a default adapter for the common path. 
- -#### Scenario: Answer adapter converts existing answer -- **WHEN** an answer state adapter receives a task+answer record -- **THEN** it SHALL produce an evaluator state with terminal answer, completion view, success status, source metadata, and no runtime execution - -#### Scenario: Adapter fails on malformed state -- **WHEN** a source record claims to contain existing output state but required fields are malformed -- **THEN** the framework SHALL raise a clear validation error before judging or reporting the case - -### Requirement: Replay harness for existing outputs - -Suite-backed evaluation flows SHALL support replaying existing outputs through a runtime harness without re-executing the target. - -#### Scenario: Replay harness returns adapted state -- **WHEN** a replay harness is configured with a source record and state adapter -- **THEN** it SHALL return the adapted `RolloutState` or bridgeable evaluator state as the case rollout result - -#### Scenario: Replay is distinct from execution -- **WHEN** a source already contains answer, trajectory, or rollout state -- **THEN** the framework SHALL NOT invoke the suite's agent, task, or program execution adapter for that case unless explicitly configured to do so - -#### Scenario: Replay state feeds existing scorers -- **WHEN** replayed state contains answer, outcome, trajectory, tool calls, usage, timing, or standard metrics -- **THEN** existing judge, trajectory, outcome, reward, standard metric, gate, and report paths SHALL consume that state through the same normalized evaluator interfaces used by runtime-composed execution - -### Requirement: AWorld trajectory log source - -The framework SHALL provide a source and adapter for trusted AWorld trajectory log records. 
- -#### Scenario: Trajectory log source selects task ids -- **WHEN** a trajectory log source is configured with one or more task ids -- **THEN** it SHALL extract the matching line-oriented AWorld trajectory records and expose one source record per task id - -#### Scenario: Trajectory log record is parsed -- **WHEN** a trajectory log record contains ANSI-decorated Python dict repr with a JSON-string `trajectory` field -- **THEN** the framework SHALL clean ANSI escapes, parse the record, decode the trajectory, and surface a structured record or a clear parse error - -#### Scenario: Trajectory log adapter builds rollout state -- **WHEN** a trajectory log adapter receives a parsed trajectory record -- **THEN** it SHALL produce rollout state containing terminal answer, ordered trajectory steps, extracted tool calls, evidence summary, outcome metadata, usage/timing defaults, and standard metrics - -### Requirement: Source suite factory remains syntax sugar - -Framework helpers for source-backed evaluation SHALL construct ordinary suite-backed evaluation definitions and SHALL NOT introduce a parallel suite type. - -#### Scenario: Source helper creates suite -- **WHEN** a caller uses `create_source_eval_suite` with a supported source, judge backend, judge schema, and gate policy -- **THEN** the helper SHALL return a normal `EvalSuiteDef` that can be passed to existing suite-backed flow execution - -#### Scenario: Source helper uses default adapter -- **WHEN** a caller omits `state_adapter` and the source provides a default adapter -- **THEN** the helper SHALL use the source default adapter for replay construction - -### Requirement: Markdown agent judge loading - -Evaluator judge backends SHALL support loading trusted markdown agent definitions without requiring callers to create temporary skill directories. 
- -#### Scenario: Judge backend loads agent markdown -- **WHEN** a caller supplies an `agent.md` path to a supported judge backend factory -- **THEN** the framework SHALL create an executable AWorld judge agent from the markdown metadata and body - -#### Scenario: Existing system-prompt judge remains compatible -- **WHEN** a caller uses the existing `AgentJudgeBackend(system_prompt=...)` form -- **THEN** behavior SHALL remain compatible - -#### Scenario: Markdown agent execution is trusted -- **WHEN** markdown agent loading is used -- **THEN** the framework SHALL treat the markdown definition as trusted local evaluator configuration and SHALL NOT execute arbitrary shell commands from the file during loading - -### Requirement: Explicit judge payload normalization - -Suite-backed judge validation SHALL support explicit payload normalization before typed schema validation. - -#### Scenario: Suite declares a normalizer -- **WHEN** a suite or judge schema declares a payload normalizer -- **THEN** the framework SHALL apply that normalizer before typed model validation, metric extraction, and report assembly - -#### Scenario: Dimensions-style trajectory judge output is normalized -- **WHEN** a trajectory judge output contains `weighted_score` and nested `dimensions.<name>.score` fields and the suite opts into the built-in trajectory normalizer -- **THEN** the framework SHALL normalize the payload into flat `score` and metric fields before validation - -#### Scenario: No hidden global normalization -- **WHEN** a judge output contains nested dimensions but no normalizer is configured -- **THEN** the framework SHALL preserve current validation behavior and SHALL NOT silently flatten the payload diff --git a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/tasks.md b/openspec/changes/aworld-evaluator-input-sources-2026-06-10/tasks.md deleted file mode 100644 index ba3d65d36..000000000 --- a/openspec/changes/aworld-evaluator-input-sources-2026-06-10/tasks.md +++ /dev/null 
@@ -1,60 +0,0 @@ -## 1. Source Model - -- [x] 1.1 Add `EvalSourceRecord` with case id, input, expected, answer/state payload, source metadata, and raw payload. -- [x] 1.2 Add an `EvalSource` protocol/base class that enumerates records and lowers them into `EvalCaseDef` values. -- [x] 1.3 Add source-provided default adapter support for source kinds with one obvious replay adapter. -- [x] 1.4 Ensure source metadata remains serializable and does not retain file handles, clients, or live runtime objects. - -## 2. Built-in File Sources - -- [x] 2.1 Add `JsonlTaskAnswerSource` for task + answer records that should be judged without re-execution, with default field names `id`, `input`, and `answer`. -- [x] 2.2 Add override options for task+answer field names. -- [x] 2.3 Add `AWorldTrajectoryLogSource` for line-oriented AWorld trajectory logs with task-id selection. -- [x] 2.4 Add validation and error messages for missing id/input/answer fields and missing trajectory task ids. -- [x] 2.5 Defer task-only and generic serialized-state sources to follow-on implementations of the same protocol. - -## 3. State Adapters and Replay Harness - -- [x] 3.1 Add `EvalStateAdapter` protocol/base class. -- [x] 3.2 Add `AnswerStateAdapter` that converts task+answer records into `EvalState`. -- [x] 3.3 Add `TrajectoryLogStateAdapter` that converts AWorld trajectory-log records into `RolloutState`, including answer, evidence, trajectory, tool calls, usage, timing, outcome, and standard metrics. -- [x] 3.4 Ensure `JsonlTaskAnswerSource` and `AWorldTrajectoryLogSource` expose their default adapters. -- [x] 3.5 Add `ReplayRuntimeHarness` that applies a source record and adapter without re-executing the target. - -## 4. Suite Factory Helpers - -- [x] 4.1 Add `create_source_eval_suite(...)` helper that wires source cases, replay harness, judge schema/backend, scorers, and gate policy. -- [x] 4.2 Make `state_adapter` optional when the source provides a default adapter. 
-- [x] 4.3 Ensure `create_source_eval_suite(...)` returns a normal `EvalSuiteDef`. -- [x] 4.4 Support task+answer and trajectory-log sources with replay adapters. -- [x] 4.5 Preserve existing `run_evaluation_flow` report shape and gate behavior. - -## 5. Markdown Agent Judge Backend - -- [x] 5.1 Add framework helper to load `agent.md` into an AWorld `Agent` without test-local temporary `SKILL.md` materialization. -- [x] 5.2 Add `AgentJudgeBackend.from_agent_markdown(...)` or equivalent factory. -- [x] 5.3 Preserve existing `AgentJudgeBackend(system_prompt=...)` behavior. - -## 6. Judge Payload Normalization - -- [x] 6.1 Add explicit judge payload normalization support on `JudgeSchemaDef` or a closely scoped suite helper. -- [x] 6.2 Add built-in normalizer/model for dimensions-style trajectory judge reports. -- [x] 6.3 Ensure normalizers run before typed model validation and report assembly. -- [x] 6.4 Do not add hidden global `dimensions -> flat` behavior. - -## 7. Refactor Manual Trajectory Regression - -- [x] 7.1 Replace test-local trajectory parser with `AWorldTrajectoryLogSource`. -- [x] 7.2 Replace test-local replay harness with `ReplayRuntimeHarness` plus `TrajectoryLogStateAdapter`. -- [x] 7.3 Replace test-local markdown agent backend with framework `AgentJudgeBackend.from_agent_markdown`. -- [x] 7.4 Replace test-local schema flattening with explicit trajectory judge normalizer/model. -- [x] 7.5 Keep the manual LLM-backed regression opt-in through explicit pytest parameters. - -## 8. Verification - -- [x] 8.1 Add focused tests for task+answer source replay without execution. -- [x] 8.2 Add focused tests for AWorld trajectory-log replay. -- [x] 8.3 Add focused tests for source default adapter selection. -- [x] 8.4 Add focused tests for markdown-agent judge backend loading. -- [x] 8.5 Run evaluator regression tests. -- [x] 8.6 Validate this OpenSpec change with `openspec validate aworld-evaluator-input-sources-2026-06-10 --strict`. 
diff --git a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/.openspec.yaml b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/.openspec.yaml deleted file mode 100644 index 2cb80411e..000000000 --- a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/.openspec.yaml +++ /dev/null @@ -1,2 +0,0 @@ -schema: spec-driven -created: 2026-06-10 diff --git a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/design.md b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/design.md deleted file mode 100644 index 38b88c158..000000000 --- a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/design.md +++ /dev/null @@ -1,77 +0,0 @@ -## Context - -Runtime composition currently includes: - -- `ScriptedUserSimulator` for fixed user turns -- `SinglePromptUserSimulator` for one-shot prompts -- `CallableRuntimeHarness` for multi-turn rollout execution - -This is enough for deterministic tests but not for adaptive dialog evaluation. A realistic user simulator should inspect the current conversation, the last assistant output, and case goal before deciding whether to continue, clarify, challenge, or stop. - -## Goals / Non-Goals - -**Goals:** - -- Add a provider-agnostic adaptive simulator class. -- Support sync and async simulator generation. -- Keep generated turns serializable and report-safe. -- Give the generator enough structured context to implement LLM-backed user behavior. -- Preserve existing scripted and single-prompt behavior. - -**Non-Goals:** - -- Adding a concrete LLM provider client. -- Storing live model clients, credentials, or API responses in suite/report state. -- Adding training or optimizer integration. -- Replacing deterministic scripted simulators. - -## Decisions - -### 1. Use injected generator callable - -`LLMUserSimulator` accepts a `turn_generator` callable. 
The callable receives: - -- `case` -- `target` -- `state` -- `last_output` -- `turn_index` - -It may return: - -- `str`: user content -- `RolloutTurn`: full turn -- `Mapping`: `{"content": "...", "metadata": {...}}` -- `Mapping` with `{"stop": True}` or `None`: stop conversation - -This keeps provider integration outside the substrate while making the runtime API ready for LLM-backed adapters. - -### 2. Await simulator outputs in the harness - -`CallableRuntimeHarness` should call `await _maybe_await(simulator.next_turn(...))`. Existing sync simulators keep working, and async LLM-backed simulators become first-class. - -### 3. Keep metadata serializable - -Generated mapping metadata is filtered through existing `RolloutTurn.to_dict()` serialization. Live clients remain in the simulator instance, not in `RolloutState`. - -### 4. Stop behavior is explicit - -The simulator returns `None` or `{"stop": True}` to end the rollout. This keeps max-turn enforcement in `CallableRuntimeHarness` and stop-decision semantics in the simulator. - -## Risks / Trade-offs - -- [Provider ambiguity] -> Mitigation: this change is adapter-ready but provider-neutral. -- [Non-determinism in tests] -> Mitigation: tests use deterministic fake generators. -- [Live handle leakage] -> Mitigation: turn metadata is serialized through existing filtering; simulator internals are never copied into state. - -## Migration Plan - -1. Add async simulator support to `CallableRuntimeHarness`. -2. Add `LLMUserSimulator`. -3. Add tests for string, mapping, turn, stop, and async generation behavior. -4. Keep existing simulator tests green. - -## Deferred Questions - -- Concrete provider adapters should be separate changes. -- Training/optimizer integration remains deferred until evaluator runtime primitives stabilize. 
diff --git a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/implementation-plan.md b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/implementation-plan.md deleted file mode 100644 index 937dd8029..000000000 --- a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/implementation-plan.md +++ /dev/null @@ -1,94 +0,0 @@ -# AWorld Evaluator LLM User Simulator Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add provider-neutral adaptive user simulation for runtime-composed evaluator rollouts. - -**Architecture:** Extend `CallableRuntimeHarness` to await simulator outputs, then add `LLMUserSimulator` as a thin adapter around an injected generator callable. The simulator normalizes string/mapping/turn/stop outputs into `RolloutTurn | None` and leaves provider clients outside serializable rollout state. - -**Tech Stack:** Python protocols/dataclasses, existing runtime-composition harness, pytest, OpenSpec. - ---- - -## File Structure - -- Modify: `aworld/evaluations/runtime_composition.py` - Add async simulator support and `LLMUserSimulator`. -- Test: `tests/evaluations/test_llm_user_simulator.py` - Focused TDD coverage for adaptive generation and stop behavior. - -## Task 1: Async Simulator Support - -- [x] **Step 1: Write failing async simulator test** - -Create `tests/evaluations/test_llm_user_simulator.py` with an async simulator whose `next_turn` returns a `RolloutTurn`. - -- [x] **Step 2: Run and confirm failure** - -Run: `pytest tests/evaluations/test_llm_user_simulator.py::test_callable_runtime_harness_awaits_async_simulator -q` - -Expected: FAIL because `CallableRuntimeHarness` does not await simulator output. 
- -- [x] **Step 3: Await simulator next turn** - -Change `CallableRuntimeHarness.run_rollout()` to call `await _maybe_await(self.simulator.next_turn(...))`. - -- [x] **Step 4: Run test until green** - -Run: `pytest tests/evaluations/test_llm_user_simulator.py -q` - -Expected: PASS. - -## Task 2: LLMUserSimulator - -- [x] **Step 1: Write failing adaptive generation tests** - -Add tests for string output, mapping output with metadata, explicit stop output, and generator context arguments. - -- [x] **Step 2: Run and confirm failure** - -Run: `pytest tests/evaluations/test_llm_user_simulator.py -q` - -Expected: FAIL because `LLMUserSimulator` does not exist. - -- [x] **Step 3: Implement `LLMUserSimulator`** - -Add a class accepting `turn_generator`. Normalize outputs: - -- `None` -> `None` -- `{"stop": True}` -> `None` -- `str` -> `RolloutTurn(role="user", content=value)` -- `RolloutTurn` -> returned directly -- mapping -> `RolloutTurn(role=..., content=..., metadata=...)` - -- [x] **Step 4: Run simulator tests until green** - -Run: `pytest tests/evaluations/test_llm_user_simulator.py -q` - -Expected: PASS. - -## Task 3: Verification And Commit - -- [x] **Step 1: Run runtime/evaluator regression** - -Run: - -```bash -pytest tests/evaluations/test_llm_user_simulator.py tests/evaluations/test_runtime_composition.py tests/evaluations/test_environment_isolation.py tests/evaluations/test_evaluator_trials.py -q -``` - -Expected: PASS. 
- -- [x] **Step 2: Validate OpenSpec** - -Run: `openspec validate aworld-evaluator-llm-user-simulator-2026-06-10 --strict` - -Expected: `Change 'aworld-evaluator-llm-user-simulator-2026-06-10' is valid` - -- [x] **Step 3: Commit** - -```bash -git add aworld/evaluations/runtime_composition.py tests/evaluations/test_llm_user_simulator.py -git add -f openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10 -git commit -m "feat: add adaptive evaluator user simulator" -``` diff --git a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/proposal.md b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/proposal.md deleted file mode 100644 index e91363319..000000000 --- a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/proposal.md +++ /dev/null @@ -1,19 +0,0 @@ -# AWorld Evaluator LLM User Simulator - -## Why - -Scripted and single-prompt simulators are useful for deterministic smoke tests, but conversational agent evaluation often needs an adaptive user that reacts to the assistant's previous output and rollout state. The evaluator runtime already owns turns and rollout state, so the next step is a provider-agnostic LLM-backed simulator contract that can drive multi-turn conversations without coupling the substrate to one model vendor. - -## What Changes - -- Add an adaptive `LLMUserSimulator` that delegates user-turn generation to an injected sync or async callable. -- Allow `CallableRuntimeHarness` to await simulator `next_turn` implementations. -- Pass case input, target metadata, current rollout state, previous assistant output, and turn index to the simulator generator. -- Support generator outputs as strings, mappings, `RolloutTurn`, or explicit stop signals. -- Preserve only serializable simulator metadata in emitted turns. - -## Impact - -- Affected code: `aworld/evaluations/runtime_composition.py`. 
-- Affected tests: add focused coverage for async simulator support, adaptive LLM-style generation, stop behavior, and report-safe metadata. -- Non-goal: this change does not ship a concrete OpenAI/Anthropic client adapter or manage API keys. diff --git a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/specs/evaluation-substrate/spec.md b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/specs/evaluation-substrate/spec.md deleted file mode 100644 index 1b7a61b96..000000000 --- a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/specs/evaluation-substrate/spec.md +++ /dev/null @@ -1,34 +0,0 @@ -## MODIFIED Requirements - -### Requirement: Adaptive user simulation - -Runtime-composed evaluation flows SHALL support adaptive user simulators that can react to previous assistant outputs and rollout state. - -#### Scenario: Simulator generates user turn from rollout context -- **WHEN** a runtime harness requests the next user turn from an adaptive simulator -- **THEN** the simulator SHALL receive the evaluation case, target metadata, current rollout state, last assistant output, and turn index -- **AND** it SHALL be able to return the next serializable user turn - -#### Scenario: Simulator is async -- **WHEN** a simulator returns an awaitable next-turn result -- **THEN** the runtime harness SHALL await the result before appending the user turn - -#### Scenario: Simulator stops conversation -- **WHEN** a simulator returns `None` or an explicit stop signal -- **THEN** the runtime harness SHALL stop requesting additional user turns for that rollout - -#### Scenario: Simulator returns metadata -- **WHEN** a simulator returns turn metadata -- **THEN** the framework SHALL preserve only serializable metadata in trajectory/report state - -### Requirement: Provider-neutral LLM simulator boundary - -LLM-backed user simulation SHALL be provider-neutral at the evaluator substrate layer. 
- -#### Scenario: Suite uses external LLM client -- **WHEN** a suite author wants to use an OpenAI, Anthropic, local, or custom model-backed user simulator -- **THEN** the evaluator substrate SHALL accept an injected callable or simulator instance rather than constructing a provider client itself - -#### Scenario: Simulator contains live model client -- **WHEN** a simulator instance holds a live client, credential, or transport handle -- **THEN** the framework SHALL NOT serialize that handle into rollout state, evaluator state, or reports diff --git a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/tasks.md b/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/tasks.md deleted file mode 100644 index 38f833b02..000000000 --- a/openspec/changes/aworld-evaluator-llm-user-simulator-2026-06-10/tasks.md +++ /dev/null @@ -1,17 +0,0 @@ -## 1. Async Simulator Support - -- [x] 1.1 Update `CallableRuntimeHarness` to await awaitable simulator `next_turn` results. -- [x] 1.2 Preserve existing scripted and single-prompt simulator behavior. - -## 2. Adaptive LLM User Simulator - -- [x] 2.1 Add `LLMUserSimulator`. -- [x] 2.2 Pass case, target, rollout state, last output, and turn index to its generator. -- [x] 2.3 Support string, mapping, `RolloutTurn`, `None`, and explicit stop outputs. -- [x] 2.4 Filter generated metadata through existing serializable turn serialization. - -## 3. Verification - -- [x] 3.1 Add focused tests for async simulator support, adaptive generation, stop behavior, and metadata filtering. -- [x] 3.2 Run runtime/evaluator regression tests. -- [x] 3.3 Validate this OpenSpec change with `openspec validate aworld-evaluator-llm-user-simulator-2026-06-10 --strict`. 
diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/.openspec.yaml b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/.openspec.yaml deleted file mode 100644 index 2cb80411e..000000000 --- a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/.openspec.yaml +++ /dev/null @@ -1,2 +0,0 @@ -schema: spec-driven -created: 2026-06-10 diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md deleted file mode 100644 index 1cce886cb..000000000 --- a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/design.md +++ /dev/null @@ -1,247 +0,0 @@ -## Context - -Evaluator v2 extensibility made the AWorld evaluator substrate more configurable, but it deliberately left verifiers-style runtime behavior out of scope. The current pipeline still compiles suites into the existing `EvalTarget -> Evaluator -> EvaluateRunner` skeleton and scores a single normalized state after execution. That is enough for post-hoc result and trajectory checks, but it cannot express: - -- a harness that owns rollout lifecycle and produces state -- controlled multi-turn user simulation -- per-step reward/reason records -- outcome checks against final environment or artifact state -- retry/fallback/wrapper harness composition -- child-state borrowing or links between rollout attempts -- trial-based pass@k/pass^k metrics for nondeterministic agents -- an adoption suite that exercises these capabilities outside tests - -This change introduces runtime composition as a framework-owned layer under `aworld/evaluations/` while preserving the v2 single-shot substrate. It also adds outcome/state-check grading because outcome verification is tightly coupled to rollout state and environment snapshots. Trial-based pass@k/pass^k execution is explicitly deferred because retry composition and independent trials have different semantics. 
This change should therefore be described as the runtime-composition and outcome-grading slice of complete evaluation capability, not as the full evaluator roadmap. - -## Goals / Non-Goals - -**Goals:** - -- Add a rollout-owning harness contract that executes evaluation cases and returns normalized rollout state. -- Support multi-turn rollout state with turns, messages, tool calls, usage, timing, terminal outcome, child-state links, and step rewards. -- Add a user simulator abstraction that can drive controlled multi-turn interactions. -- Add outcome/state-check grader definitions that verify final environment or artifact state separately from final text answer and trajectory. -- Add step-level reward definitions and aggregation so process quality can participate in reports and gates. -- Add at least one runtime composition wrapper, such as retry or fallback, that composes around a base harness. -- Add one builtin/adoption suite that uses typed judge schema, composite gate, outcome grader, trajectory scorer, step reward, and rollout harness together. -- Add standard rollout metrics such as turn count, tool-call count, token usage, and timing/latency when the underlying runtime exposes them. -- Keep current static/agent/task/program single-shot flows compatible. - -**Non-Goals:** - -- Implementing a verifiers public API compatibility layer. -- Building a training optimizer, RL loop, or policy update system. -- Adding untrusted code execution, sandbox command execution, or package registry loading. -- Adding clean-environment isolation or sandbox reset semantics for each trial. -- Adding multi-trial execution, pass@k, pass^k, or trial-distribution metrics. -- Adding LLM-backed adaptive user simulation. -- Reworking `aworld-cli evaluator` UX or command syntax. -- Migrating every builtin suite in this change. -- Replacing `EvaluateRunner`; runtime composition should integrate with it through framework-owned targets/adapters. 
- -## Ownership Model - -| Concept | Owns | Must not own | -| --- | --- | --- | -| `EvalSuiteDef` / `EvalCaseDef` | Domain inputs, judge schema, gates, scorer declarations, runtime references | Live runtime handles in declarative manifests | -| `EvalRuntimeHarnessDef` | Rollout lifecycle configuration, simulator wiring, reward hooks, composition wrappers | Judge/scorer report assembly | -| `RuntimeHarness` | Executing one case through a rollout and returning rollout state | Gate policy decisions | -| `UserSimulator` | Producing user turns from case, rollout state, and previous assistant output | Agent execution internals | -| `OutcomeGrader` / `StateCheckGrader` | Checking final environment, artifact, or domain state | Driving rollout turns or replacing trajectory scoring | -| `StepRewarder` | Per-step reward values and reasons | Mutating rollout state or model behavior | -| `RolloutState` / `EvalState` bridge | Serializable rollout transcript and state normalization | Live clients, sandboxes, runners | - -## Decisions - -### 1. Add a rollout-owning harness layer - -Introduce a framework-owned runtime harness abstraction separate from the lightweight `EvalHarnessDef` from v2 extensibility. The runtime harness owns the lifecycle of a rollout: - -1. initialize state for one case -2. ask the user simulator or case input for the next user turn -3. execute the target runtime for one assistant/tool step -4. record messages, tool calls, observations, rewards, usage, and timing -5. decide whether the rollout is terminal -6. return a normalized rollout state - -The first implementation should keep the public surface small: - -- `EvalRuntimeHarnessDef`: immutable configuration object -- `RuntimeHarness`: protocol or base class for executing one case -- `run_rollout(case, target, harness) -> RolloutState`: internal framework entry point -- compatibility bridge from rollout state into existing `EvalState` - -This is the first AWorld harness that owns rollout. 
The older `EvalHarnessDef` remains a compatibility holder for single-shot execution specs. - -### 2. Model rollout state explicitly - -Add a serializable rollout state model rather than overloading arbitrary trajectory dictionaries. It should include: - -- `case_id` -- `status` -- `turns`: ordered user/assistant/tool records -- `messages`: normalized conversation messages when available -- `trajectory`: scorer-compatible trajectory view -- `tool_calls` -- `step_rewards` -- `outcome`: final answer plus optional environment/artifact snapshot references and state-check results -- `child_states` or `attempts` for composed runtimes -- `usage` -- `timing` -- `standard_metrics`: turn count, tool-call count, token counts, and latency/timing metrics derived from rollout state -- `error` -- `metadata` - -The bridge into `EvalState` should preserve the existing state summary and scorer helpers. Existing trajectory scorers should work against the bridge without needing a report format fork. - -### 3. Add outcome/state-check grading - -Outcome evaluation must not be limited to terminal text. In this design, `answer` is the target's terminal response, while `outcome` is the serializable final environment, artifact, or domain state captured by the harness after rollout. Add an outcome/state-check contract that can verify that final state: - -- file or artifact existence/content checks -- structured environment snapshot checks -- database or domain-state assertions when the harness provides a serializable snapshot -- coding-task results such as precomputed test summaries when produced by a trusted harness -- test-command or sandbox execution only through a future trusted execution/sandbox change - -The first implementation should keep state checks deterministic and in-process. A state-check grader receives the `RolloutState`, case, target, and optional serializable environment snapshot. 
It returns normal evaluator metric results plus structured details explaining which checks passed or failed. - -A minimal state-check definition should support: - -- `metric_name` -- `source`: for example `outcome`, `metadata`, or `artifacts` -- `path`: a structured path into the selected source -- `op`: equality or numeric comparison against an expected value -- `expected` -- `weight` -- `required` - -Outcome graders must emit numeric metric values for gate compatibility, plus pass/fail details for report inspection. They must not open live databases, inspect arbitrary file paths, run shell commands, or retain environment handles. If a harness needs external checks, it must capture a serializable snapshot or summary into `RolloutState.outcome` before grading. - -Outcome graders are distinct from: - -- typed judge output, which evaluates semantic result quality -- trajectory scorers, which evaluate process/transcript quality -- step rewarders, which evaluate individual rollout steps - -Composite gates may reference all of these metric families side by side. - -### 4. Add user simulator contracts - -Add a user simulator interface that can be deterministic and testable: - -- input: case, target, rollout state, last assistant output -- output: next user message, terminal signal, or simulator error - -Built-in simulators should start small: - -- scripted simulator over case-provided turns -- static single-prompt simulator for compatibility - -LLM-backed simulators are a future extension. Scripted simulators are sufficient for this change's deterministic adoption suite, but they do not complete adaptive conversation-agent evaluation. - -### 5. Add step-level rewards - -Add reward records independent of final judge output: - -- `metric_name` -- `step_index` -- `value` -- `weight` -- `partial_credit` -- `reason` -- `metadata` - -Rewarders should be pure evaluators over rollout state or an individual step. They must not mutate state or call model execution. 
Aggregation should produce normal evaluator metrics, for example weighted mean reward, total reward, partial-credit rate, pass/fail threshold status, and report-level gate inputs. - -### 6. Add runtime composition wrappers - -Add one wrapper mechanism in this change so composition is real, not only a type hierarchy. The first wrapper should be retry or fallback: - -- retry wrapper: reruns a base harness when terminal state is failed or a configured reward/gate condition is not met -- fallback wrapper: tries alternate harnesses when one fails - -The wrapper must preserve child/attempt state so reports can explain which attempt passed or failed. The first implementation should support one wrapper style only if both would make the change too large. - -Retry and fallback are not trials. Retry is an execution strategy that tries to produce one terminal rollout; trials are independent repeated evaluations used to estimate nondeterministic performance. This change must not label retry results as pass@k or pass^k. - -### 7. Defer multi-trial evaluation explicitly - -Complete agent evaluation needs independent trial execution and distribution-level metrics, but that is a scheduler and aggregation concern rather than a harness-wrapper concern. A later evaluator-trials change should own: - -- `num_trials` or equivalent independent repeat configuration -- clean-environment reset requirements per trial -- trial-level report records -- pass@k and pass^k aggregation -- separation between retry attempts inside a trial and independent trials across the same case - -Until that exists, runtime-composed reports should expose retry attempts only as child states for one rollout and should not compute nondeterminism metrics from them. - -### 8. Add suite purpose metadata and standard metrics - -Suites should be able to describe whether they are intended for capability evaluation or regression evaluation. 
The first implementation can use suite metadata, for example: - -- `evaluation_purpose`: `capability` or `regression` -- `expected_pass_rate`: optional descriptive threshold or range - -Runtime-composed harnesses should derive standard transcript and latency metrics when data is available: - -- `n_turns` -- `n_tool_calls` -- `n_tokens` or token usage fields such as prompt, completion, and total tokens -- wall-clock duration / time cost -- optional first-token or first-action latency when exposed by the runtime - -Suites can still declare custom metrics, but these baseline metrics should not require every suite to hand-roll them. - -### 9. Add one adoption suite - -Add one builtin or framework-registered adoption suite that consumes the new runtime: - -- typed judge schema -- composite gate -- outcome/state-check grader -- trajectory scorer -- step-level reward metric -- rollout-owning harness with scripted simulator -- suite metadata marking whether the suite is for capability or regression use - -This suite can be narrow and deterministic. Its purpose is to prove that the substrate is active in production code paths, not only in isolated unit tests. It should not replace `app-evaluator` unless that public contract is ready to change. - -### 10. Keep CLI additive - -`aworld-cli evaluator` should discover and run the adoption suite through existing suite selection paths. Do not add CLI-only runtime syntax in this change. If CLI ergonomics are needed later, handle them in a product-focused change after the framework contract settles. - -## Risks / Trade-offs - -- [Scope growth] -> Mitigation: ship one simulator, one wrapper style, deterministic outcome checks, and one adoption suite; defer untrusted execution, LLM simulators, trials, and training loops. -- [Duplicate state models] -> Mitigation: rollout state must bridge into `EvalState` and reuse existing scorer/report helpers. 
-- [Hard-to-debug composed runs] -> Mitigation: preserve attempt/child state and reward reasons in serializable report metadata. -- [Adoption suite changes public behavior] -> Mitigation: add a new suite or opt-in registration rather than silently changing `app-evaluator`. -- [Runtime harness conflicts with existing adapter layer] -> Mitigation: keep adapters for single-shot execution; runtime harnesses own multi-turn rollout and may call adapters internally. -- [Retry metrics confused with pass@k] -> Mitigation: document retry as one composed rollout, not independent trials, and defer pass@k/pass^k to a separate multi-trial change. - -## Migration Plan - -1. Add rollout state and harness interfaces without changing existing suite behavior. -2. Add outcome/state-check grader contracts and deterministic state-check metrics. -3. Add scripted user simulator and reward records with weights and partial credit. -4. Add rollout target/adapter bridge into `EvalState`. -5. Add one runtime wrapper style with child-state reporting while keeping retry distinct from trials. -6. Add adoption suite that consumes typed schema, composite gate, outcome grader, trajectory scorer, rollout harness, and step rewards. -7. Keep existing evaluator regression suite green and add focused runtime-composition coverage. - -Rollback strategy: - -- runtime-composition suites are opt-in -- single-shot suite behavior and existing report fields remain compatible -- adoption suite can be unregistered or hidden without removing the underlying framework interfaces - -## Deferred Questions - -- LLM-backed user simulators should wait until deterministic scripted simulators are stable. -- Sandbox/command-backed harness execution should wait for a dedicated trusted execution change. -- Clean-environment reset semantics should wait for a sandbox/environment isolation change. -- Multi-trial execution, pass@k, and pass^k should be handled in a separate evaluator-trials change. 
-- Public API naming can be refined after the internal framework contract proves itself. -- Training reward integration should wait for a separate optimizer/training change. diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md deleted file mode 100644 index 47044256a..000000000 --- a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/implementation-plan.md +++ /dev/null @@ -1,285 +0,0 @@ -# AWorld Evaluator Runtime Composition Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add rollout-owning evaluator runtime composition with multi-turn harnesses, outcome/state-check grading, user simulation, step-level rewards, and one adoption suite that actively consumes v2 evaluator capabilities. - -**Architecture:** Keep the current single-shot evaluator substrate intact. Add a runtime-composition layer under `aworld/evaluations/` that can execute multi-turn rollouts, normalize them into `EvalState`, evaluate final outcome snapshots, aggregate weighted reward metrics, derive standard rollout metrics, and compose retry attempts while preserving child state. Retry remains an execution wrapper, not pass@k/pass^k trial evaluation. - -**Tech Stack:** Python dataclasses/protocols, AWorld evaluator substrate, existing scorer/report infrastructure, Pydantic for typed judge outputs, pytest, OpenSpec. - ---- - -## File Structure - -- Create: `aworld/evaluations/runtime_composition.py` - Rollout state, turn records, outcome check records, user simulator protocols, runtime harness protocols, reward records, and retry wrapper primitives. 
-- Modify: `aworld/evaluations/substrate.py` - Compile opt-in runtime-composition suites and register the adoption suite. -- Modify: `aworld/evaluations/execution.py` - Add rollout-state-to-`EvalState` normalization helpers if they do not fit cleanly in `runtime_composition.py`. -- Modify: `aworld/evaluations/report.py` - Preserve attempt/reward/outcome metadata in existing report shape without breaking schema. -- Modify: `aworld/evaluations/scorers/**` - Add outcome and reward aggregation scorers or reuse existing scorer infrastructure for those metrics. -- Test: `tests/evaluations/test_runtime_composition.py` - Focused tests for rollout state, outcome grading, simulator, harness, retry wrapper, reward aggregation, standard metrics, and adoption suite. -- Test: existing evaluator regression tests - Ensure single-shot behavior remains compatible. - -## Task 1: Rollout State and Harness Contracts - -- [x] **Step 1: Write failing rollout state tests** - -Add tests in `tests/evaluations/test_runtime_composition.py` for: - -```python -def test_rollout_state_to_eval_state_excludes_live_handles(): - live_agent = object() - state = RolloutState( - case_id="case-1", - status="success", - answer="done", - turns=[RolloutTurn(role="user", content="hello")], - outcome={"artifact_exists": True}, - metadata={"live_agent": live_agent, "safe": "ok"}, - ) - - eval_state = state.to_eval_state(target={"target_kind": "inline"}) - - assert eval_state.case_id == "case-1" - assert eval_state.answer == "done" - assert eval_state.trajectory - assert eval_state.artifacts["outcome"]["artifact_exists"] is True - assert "live_agent" not in eval_state.metadata - assert eval_state.metadata["safe"] == "ok" -``` - -- [x] **Step 2: Run test and confirm failure** - -Run: `pytest tests/evaluations/test_runtime_composition.py::test_rollout_state_to_eval_state_excludes_live_handles -q` - -Expected: FAIL because `runtime_composition.py` and `RolloutState` do not exist. 
- -- [x] **Step 3: Add minimal rollout models** - -Create `aworld/evaluations/runtime_composition.py` with serializable dataclasses for `RolloutTurn`, `OutcomeCheckResult`, `StepReward`, `RolloutState`, `EvalRuntimeHarnessDef`, `RuntimeHarness`, and `UserSimulator`. - -- [x] **Step 4: Run rollout tests until green** - -Run: `pytest tests/evaluations/test_runtime_composition.py -q` - -Expected: PASS for initial rollout state tests. - -## Task 2: Outcome / State-Check Grading - -- [x] **Step 1: Write failing outcome grader tests** - -Cover deterministic final-state checks: - -```python -def test_state_check_grader_emits_outcome_metric(): - state = RolloutState( - case_id="case-1", - status="success", - outcome={"ticket": {"status": "resolved"}}, - ) - grader = StateCheckGrader( - metric_name="ticket_resolved", - path=("ticket", "status"), - expected="resolved", - ) - - result = grader.grade(state=state, case=None, target={}) - - assert result.metric_name == "ticket_resolved" - assert result.value == 1.0 - assert result.passed is True -``` - -- [x] **Step 2: Run outcome tests and confirm failure** - -Run: `pytest tests/evaluations/test_runtime_composition.py::test_state_check_grader_emits_outcome_metric -q` - -Expected: FAIL because `StateCheckGrader` does not exist. - -- [x] **Step 3: Implement deterministic state-check grader** - -Add an in-process state-check grader that reads serializable rollout `outcome` data and emits normal metric-compatible results. Reject checks that require command execution, sandbox reset, or non-serializable live handles. - -- [x] **Step 4: Run outcome tests until green** - -Run: `pytest tests/evaluations/test_runtime_composition.py -q` - -Expected: PASS. 
- -## Task 3: Scripted User Simulator - -- [x] **Step 1: Write failing simulator tests** - -Cover scripted turns and single-prompt behavior: - -```python -def test_scripted_user_simulator_emits_turns_in_order(): - simulator = ScriptedUserSimulator() - state = RolloutState(case_id="case-1") - case = EvalCaseDef(case_id="case-1", input={"turns": ["hi", "again"]}) - - first = simulator.next_turn(case=case, target={}, state=state, last_output=None) - state.turns.append(first) - second = simulator.next_turn(case=case, target={}, state=state, last_output="ok") - - assert first.content == "hi" - assert second.content == "again" -``` - -- [x] **Step 2: Run simulator tests and confirm failure** - -Run: `pytest tests/evaluations/test_runtime_composition.py::test_scripted_user_simulator_emits_turns_in_order -q` - -Expected: FAIL because simulator implementation does not exist. - -- [x] **Step 3: Implement scripted and single-prompt simulators** - -Add `ScriptedUserSimulator` and `SinglePromptUserSimulator` to `runtime_composition.py`. - -- [x] **Step 4: Run simulator tests until green** - -Run: `pytest tests/evaluations/test_runtime_composition.py -q` - -Expected: PASS. - -## Task 4: Runtime Harness Execution - -- [x] **Step 1: Write failing harness execution tests** - -Add a deterministic harness test that consumes simulator turns and returns rollout state with assistant turns and trajectory. - -- [x] **Step 2: Run harness test and confirm failure** - -Run: `pytest tests/evaluations/test_runtime_composition.py::test_runtime_harness_executes_multi_turn_rollout -q` - -Expected: FAIL because harness implementation does not exist. - -- [x] **Step 3: Implement a minimal scripted runtime harness** - -Add a framework test harness or deterministic harness class that uses a simulator and a callable assistant step function. - -- [x] **Step 4: Run harness tests until green** - -Run: `pytest tests/evaluations/test_runtime_composition.py -q` - -Expected: PASS. 
- -## Task 5: Step Rewards and Aggregation - -- [x] **Step 1: Write failing reward aggregation tests** - -Cover reward records becoming weighted and partial-credit case/aggregate metrics. - -- [x] **Step 2: Run reward tests and confirm failure** - -Run: `pytest tests/evaluations/test_runtime_composition.py::test_step_rewards_aggregate_into_metrics -q` - -Expected: FAIL because reward aggregation is not wired. - -- [x] **Step 3: Implement step reward records and aggregation scorer** - -Use existing scorer/report metric shapes. Keep reward metrics distinct from judge and outcome metrics. - -- [x] **Step 4: Run reward tests until green** - -Run: `pytest tests/evaluations/test_runtime_composition.py -q` - -Expected: PASS. - -## Task 6: Retry Wrapper Composition - -- [x] **Step 1: Write failing retry wrapper tests** - -Cover failed first attempt, successful second attempt, preserved child/attempt state, and explicit absence of pass@k/pass^k labels. - -- [x] **Step 2: Run retry tests and confirm failure** - -Run: `pytest tests/evaluations/test_runtime_composition.py::test_retry_wrapper_preserves_failed_attempts -q` - -Expected: FAIL because retry wrapper does not exist. - -- [x] **Step 3: Implement retry wrapper** - -Add a retry wrapper around a base `RuntimeHarness` with max attempts and selected terminal attempt. Preserve attempts as child state and do not emit trial metrics. - -- [x] **Step 4: Run retry tests until green** - -Run: `pytest tests/evaluations/test_runtime_composition.py -q` - -Expected: PASS. - -## Task 7: Standard Metrics and Suite Purpose - -- [x] **Step 1: Write failing standard metric tests** - -Cover `n_turns`, `n_tool_calls`, token usage, and duration derivation from rollout state. - -- [x] **Step 2: Run standard metric tests and confirm failure** - -Run: `pytest tests/evaluations/test_runtime_composition.py::test_rollout_standard_metrics_are_derived -q` - -Expected: FAIL because standard metric derivation does not exist. 
- -- [x] **Step 3: Implement standard metric derivation and purpose metadata preservation** - -Add rollout standard metrics and preserve suite metadata such as `evaluation_purpose="capability"` or `evaluation_purpose="regression"` in report context. - -- [x] **Step 4: Run standard metric tests until green** - -Run: `pytest tests/evaluations/test_runtime_composition.py -q` - -Expected: PASS. - -## Task 8: Adoption Suite - -- [x] **Step 1: Write failing adoption suite tests** - -Assert the new suite is registered, uses typed judge schema, composite gate, outcome/state-check grader, trajectory scorer, step reward metric, scripted simulator, purpose metadata, and runtime harness. - -- [x] **Step 2: Run adoption tests and confirm failure** - -Run: `pytest tests/evaluations/test_runtime_composition.py::test_runtime_composition_adoption_suite_runs_end_to_end -q` - -Expected: FAIL because suite does not exist. - -- [x] **Step 3: Implement opt-in adoption suite** - -Add a narrow deterministic suite without changing `app-evaluator` behavior. - -- [x] **Step 4: Run adoption tests until green** - -Run: `pytest tests/evaluations/test_runtime_composition.py tests/evaluations/test_evaluation_substrate.py -q` - -Expected: PASS. - -## Task 9: Verification and Commit - -- [x] **Step 1: Run evaluator regression suite** - -Run: - -```bash -pytest tests/evaluations/test_execution_state.py tests/evaluations/test_execution_adapters.py tests/evaluations/test_evaluation_substrate.py tests/evaluations/test_runtime_composition.py tests/core/test_evaluator_runtime.py tests/core/test_evaluator_top_level_command.py tests/plugins/test_plugin_hooks.py tests/test_plugin_cli_entrypoint.py tests/docs/test_evaluator_report_docs.py -q -``` - -Expected: PASS. 
- -- [x] **Step 2: Validate OpenSpec** - -Run: `openspec validate aworld-evaluator-runtime-composition-2026-06-10 --strict` - -Expected: `Change 'aworld-evaluator-runtime-composition-2026-06-10' is valid` - -- [x] **Step 3: Commit** - -```bash -git add aworld/evaluations/runtime_composition.py aworld/evaluations/substrate.py aworld/evaluations/execution.py aworld/evaluations/report.py aworld/evaluations/scorers tests/evaluations/test_runtime_composition.py openspec/changes/aworld-evaluator-runtime-composition-2026-06-10 -git commit -m "feat: add evaluator runtime composition" -``` diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/proposal.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/proposal.md deleted file mode 100644 index ba5d591bf..000000000 --- a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/proposal.md +++ /dev/null @@ -1,37 +0,0 @@ -## Why - -`aworld-evaluator-v2-extensibility-2026-06-09` hardened the single-shot evaluator substrate with execution adapters, typed judge schemas, composite gates, bounded `PROGRAM` execution, and suite-declared trajectory scorers. That change intentionally stopped short of verifiers-style runtime composition: - -- `EvalHarnessDef` is a lightweight execution-spec holder, not a rollout-owning runtime object -- trajectory evaluation inspects the already captured single-shot `EvalState.trajectory` -- there is no user simulator, lifecycle hook model, child-state composition, retry/fallback harness composition, or step-level reward contract -- there is no explicit outcome/environment-state grader for verifying final external state -- there is no multi-trial execution model for pass@k or pass^k style nondeterminism metrics -- no builtin or adoption suite currently exercises typed judge + composite gate + trajectory scorer + rollout runtime together - -The result is useful framework substrate, but not yet a complete runtime-composition evaluation capability. 
This change adds the missing rollout/runtime layer, adds outcome/state-check grading, and proves it through one concrete adoption suite. Multi-trial pass@k/pass^k execution remains a separate follow-up because it cuts across execution scheduling and statistical aggregation rather than harness retry behavior. - -## What Changes - -- Add a rollout-owning evaluator runtime harness abstraction that can execute multi-turn cases and produce normalized rollout state. -- Add multi-turn rollout state with turns, messages, tool calls, terminal outcome, step rewards, and child-state links. -- Add a user simulator contract for controlled multi-turn agent/user evaluation. -- Add an outcome/state-check grader contract for verifying final environment or artifact state separately from text answer and trajectory. -- Add step-level reward definitions and aggregation into report metrics and gates. -- Add runtime composition wrappers, starting with retry/fallback or equivalent wrapper harness semantics. -- Add one builtin/adoption suite that actually uses typed judge output, composite gates, trajectory scoring, and the new rollout-owning harness. -- Explicitly document that retry/fallback wrappers are not trials and must not be used as pass@k/pass^k metrics. -- Keep existing single-shot evaluator flows compatible and avoid changing the `aworld-cli evaluator` command shape. - -## Capabilities - -### Modified Capabilities - -- `evaluation-substrate`: add rollout-owning runtime composition, multi-turn harness execution, user simulation, step-level reward scoring, and one adoption suite that consumes the v2 substrate capabilities end to end. - -## Impact - -- Affected code: `aworld/evaluations/**`, especially substrate definitions, execution/runtime orchestration, scorer integration, report assembly, and builtin suite registration. -- Affected APIs: framework-owned evaluator APIs gain additive runtime-composition contracts; existing suite-backed and legacy evaluation callers remain valid. 
-- Affected tests: add focused coverage for harness rollout, user simulation, reward aggregation, runtime wrappers, report/gate integration, and adoption suite behavior. -- Affected docs: clarify the difference between single-shot evaluation and rollout-owning runtime composition. diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md deleted file mode 100644 index c577fa1d8..000000000 --- a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/specs/evaluation-substrate/spec.md +++ /dev/null @@ -1,157 +0,0 @@ -## MODIFIED Requirements - -### Requirement: Runtime-composed evaluation harnesses - -Suite-backed evaluation flows SHALL support opt-in rollout-owning runtime harnesses that execute multi-turn cases and produce normalized rollout state while preserving existing single-shot evaluator behavior. - -#### Scenario: Suite selects a rollout-owning harness -- **WHEN** a suite-backed evaluator declares a runtime-composition harness -- **THEN** the framework SHALL execute the case through that harness lifecycle rather than treating the harness as only an execution-spec holder - -#### Scenario: Existing single-shot suites remain compatible -- **WHEN** a suite-backed evaluator does not declare a runtime-composition harness -- **THEN** the framework SHALL preserve the current static, agent, task, and program execution behavior - -#### Scenario: Runtime harness returns rollout state -- **WHEN** a runtime harness completes a case rollout -- **THEN** the framework SHALL normalize the rollout into evaluator state containing terminal answer, outcome data, trajectory, tool calls, usage, timing, standard rollout metrics, error, and metadata fields usable by existing scorer helpers - -### Requirement: Multi-turn rollout state - -Runtime-composed evaluation flows SHALL represent multi-turn execution as 
serializable rollout state. - -#### Scenario: Rollout has multiple turns -- **WHEN** a runtime-composed harness executes multiple user/assistant/tool turns -- **THEN** the framework SHALL preserve ordered turns, normalized messages, trajectory entries, tool calls, terminal status, and terminal answer - -#### Scenario: Runtime composition creates child states -- **WHEN** a runtime wrapper retries or falls back to another harness attempt -- **THEN** the framework SHALL preserve child or attempt state so reports can explain the composed execution path - -#### Scenario: Rollout state is serializable -- **WHEN** rollout state is converted into evaluator state or report payloads -- **THEN** the framework SHALL exclude live runtime handles, clients, agent instances, and simulator objects - -#### Scenario: Standard rollout metrics are derived -- **WHEN** rollout state contains turns, tool calls, token usage, or timing data -- **THEN** the framework SHALL derive standard metrics such as turn count, tool-call count, token usage, and duration without requiring suite-specific custom scorers - -### Requirement: Outcome and state-check grading - -Runtime-composed evaluation flows SHALL support outcome graders that verify final environment, artifact, or domain state separately from final text answer and trajectory. 
- -#### Scenario: Outcome is distinct from terminal answer -- **WHEN** a runtime-composed rollout completes -- **THEN** the framework SHALL treat the terminal answer as response text and the outcome as serializable final environment, artifact, or domain state captured by the harness - -#### Scenario: Outcome grader checks final state -- **WHEN** a runtime-composed suite declares an outcome or state-check grader -- **THEN** the framework SHALL evaluate the rollout state's terminal outcome or serializable environment snapshot and emit normal evaluator metrics with numeric values and pass/fail details - -#### Scenario: State check addresses a structured snapshot -- **WHEN** a state-check grader declares a source, path, operator, and expected value -- **THEN** the framework SHALL resolve that path against the selected serializable rollout state source and compare it without opening live files, databases, clients, or runtime handles - -#### Scenario: Outcome metrics remain distinct -- **WHEN** a suite uses typed judge output, trajectory scorers, step rewards, and outcome graders together -- **THEN** the framework SHALL keep outcome metrics distinct while allowing composite gates to reference them alongside judge, trajectory, and reward metrics - -#### Scenario: Trusted harness provides coding-task results -- **WHEN** a trusted harness runs external checks before grading and records a serializable test summary or artifact summary in rollout outcome -- **THEN** outcome graders SHALL evaluate that recorded summary rather than invoking test commands themselves - -#### Scenario: Environment check needs sandbox reset -- **WHEN** an outcome grader requires clean-environment isolation, command execution, or sandbox reset semantics -- **THEN** the framework SHALL treat that as unsupported in this change and leave it to a dedicated environment-isolation change - -### Requirement: User simulation - -Runtime-composed evaluation flows SHALL support deterministic user simulators that 
drive controlled multi-turn rollouts. - -#### Scenario: Scripted simulator provides turns -- **WHEN** a case includes scripted user turns -- **THEN** the framework SHALL let the scripted simulator provide those turns in order until it reaches a terminal condition - -#### Scenario: Single-prompt simulator preserves one-shot behavior -- **WHEN** a case only includes a single prompt or query -- **THEN** the framework SHALL support a simulator that emits one user turn and then terminates unless the harness requests additional turns - -#### Scenario: Adaptive LLM user simulator is requested -- **WHEN** a suite requires an LLM-backed adaptive user simulator -- **THEN** the framework SHALL treat it as out of scope for this change and require a later simulator extension - -#### Scenario: Simulator errors are captured -- **WHEN** a user simulator cannot produce a valid next turn -- **THEN** the framework SHALL mark the rollout state as failed with a serializable error rather than storing the simulator object - -### Requirement: Step-level rewards - -Runtime-composed evaluation flows SHALL support step-level reward records that can be aggregated into normal evaluator metrics. 
- -#### Scenario: Step rewarder evaluates rollout steps -- **WHEN** a step rewarder inspects a rollout step -- **THEN** it SHALL emit a reward record containing metric name, step index, numeric value, optional weight, optional partial-credit marker, reason, and serializable metadata - -#### Scenario: Rewards aggregate into metrics -- **WHEN** a rollout contains step reward records -- **THEN** the framework SHALL aggregate configured reward metrics into case metrics, aggregate metrics, and structured gate inputs, including weighted and partial-credit summaries when configured - -#### Scenario: Rewards do not replace final judge output -- **WHEN** a suite uses both typed judge output and step rewards -- **THEN** the framework SHALL keep judge metrics and reward metrics distinct while allowing composite gates to reference both - -### Requirement: Runtime composition wrappers - -Runtime-composed evaluation flows SHALL support at least one wrapper harness that composes around a base harness and preserves attempt state. - -#### Scenario: Retry wrapper reruns failed rollout -- **WHEN** a retry wrapper receives a failed terminal rollout or a configured failed reward condition -- **THEN** it SHALL rerun the base harness up to the configured limit and preserve each attempt as child or attempt state - -#### Scenario: Retry wrapper reports terminal attempt -- **WHEN** a retry wrapper finishes -- **THEN** it SHALL expose the selected terminal attempt as the main rollout state while retaining previous attempts for inspection - -#### Scenario: Retry is not trial evaluation -- **WHEN** retry or fallback wrapper results are reported -- **THEN** the framework SHALL NOT label those attempts as independent trials, pass@k, or pass^k metrics - -### Requirement: Evaluation purpose metadata - -Suite-backed evaluation flows SHALL allow suites to declare whether they are intended for capability evaluation or regression evaluation. 
- -#### Scenario: Suite declares evaluation purpose -- **WHEN** a suite declares evaluation-purpose metadata -- **THEN** the framework SHALL preserve that metadata in the resolved suite/report context without changing scorer semantics - -#### Scenario: Purpose uses supported values -- **WHEN** a suite declares `evaluation_purpose` -- **THEN** the framework SHALL accept `capability` and `regression` as supported values and leave scorer thresholds under the suite's explicit gate policy - -### Requirement: Runtime-composition adoption suite - -The framework SHALL include one opt-in adoption suite that exercises runtime composition and v2 extensibility together. - -#### Scenario: Adoption suite uses active v2 capabilities -- **WHEN** the adoption suite is selected -- **THEN** it SHALL use a typed judge schema, composite gate, outcome/state-check grader, trajectory scorer, step-level reward metric, scripted user simulator, and rollout-owning harness - -#### Scenario: App evaluator remains unchanged -- **WHEN** callers use the existing `app-evaluator` suite -- **THEN** its behavior SHALL remain compatible unless a later explicit migration change updates that suite - -### Requirement: Multi-trial metrics are deferred - -Runtime composition SHALL distinguish retry/fallback execution from independent trial-based evaluation. 
- -#### Scenario: Retry attempts are reported -- **WHEN** retry or fallback attempts are retained as child rollout state -- **THEN** the framework SHALL NOT count those attempts as independent trials or use them to calculate pass@k, pass^k, or trial-distribution metrics - -#### Scenario: Caller requests pass@k or pass^k -- **WHEN** a caller needs independent repeated trials, pass@k, pass^k, or trial-distribution metrics -- **THEN** the framework SHALL treat that as out of scope for this change and require a later multi-trial evaluator change - -#### Scenario: Future trial evaluation is added -- **WHEN** a later change adds independent trial execution -- **THEN** it SHALL keep trial scheduling, clean-environment reset semantics, and pass@k/pass^k aggregation separate from retry wrapper behavior diff --git a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md b/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md deleted file mode 100644 index 3911601cb..000000000 --- a/openspec/changes/aworld-evaluator-runtime-composition-2026-06-10/tasks.md +++ /dev/null @@ -1,53 +0,0 @@ -## 1. Runtime Harness Model - -- [x] 1.1 Add a rollout-owning runtime harness definition separate from lightweight `EvalHarnessDef`. -- [x] 1.2 Add a `RuntimeHarness` protocol or base class that executes one case and returns rollout state. -- [x] 1.3 Preserve existing single-shot static/agent/task/program flows unchanged. - -## 2. Rollout State - -- [x] 2.1 Add a serializable rollout state model with turns, messages, trajectory, tool calls, usage, timing, errors, metadata, and child/attempt state. -- [x] 2.2 Bridge rollout state into existing `EvalState` so current scorer helpers and report summaries keep working. -- [x] 2.3 Include outcome data and optional serializable environment/artifact snapshots in rollout state. -- [x] 2.4 Derive standard rollout metrics such as turn count, tool-call count, token usage, and duration. 
-- [x] 2.5 Add tests proving rollout state does not store live runtime handles. - -## 3. Outcome / State-Check Grading - -- [x] 3.1 Add deterministic outcome/state-check grader definitions. -- [x] 3.2 Emit outcome metrics separately from judge, trajectory, and reward metrics. -- [x] 3.3 Allow composite gates to reference outcome metrics. -- [x] 3.4 Explicitly reject state checks that require sandbox reset, command execution, or clean-environment isolation in this change. - -## 4. User Simulation - -- [x] 4.1 Add a deterministic user simulator contract. -- [x] 4.2 Add a scripted simulator that reads turns from case input. -- [x] 4.3 Add a single-prompt simulator for compatibility with current one-shot cases. -- [x] 4.4 Document that LLM-backed adaptive user simulation is deferred. - -## 5. Step-Level Rewards - -- [x] 5.1 Add step reward records with metric name, step index, value, weight, partial-credit marker, reason, and metadata. -- [x] 5.2 Add rewarder interfaces that inspect rollout state without mutating it. -- [x] 5.3 Aggregate step rewards into normal evaluator metrics and gate inputs, including weighted and partial-credit summaries. - -## 6. Runtime Composition - -- [x] 6.1 Add one runtime wrapper style, preferably retry, around a base runtime harness. -- [x] 6.2 Preserve child/attempt state for composed runs. -- [x] 6.3 Add tests for retry/fallback state, terminal status, and report visibility. -- [x] 6.4 Document and test that retry/fallback attempts are not independent trials and do not produce pass@k/pass^k metrics. - -## 7. Adoption Suite - -- [x] 7.1 Add one builtin or framework-registered adoption suite that uses the runtime-composition path. -- [x] 7.2 The adoption suite uses typed judge schema, composite gate, outcome/state-check grader, trajectory scorer, step-level reward, and scripted simulator. -- [x] 7.3 Mark the adoption suite with capability/regression purpose metadata. 
-- [x] 7.4 Keep `app-evaluator` behavior unchanged unless explicitly selected for migration later. - -## 8. Verification - -- [x] 8.1 Add focused tests for harness rollout, outcome grading, user simulator, reward aggregation, runtime wrapper composition, standard metrics, and adoption suite execution. -- [x] 8.2 Run the evaluator regression suite. -- [x] 8.3 Validate this OpenSpec change with `openspec validate aworld-evaluator-runtime-composition-2026-06-10 --strict`. diff --git a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/.openspec.yaml b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/.openspec.yaml deleted file mode 100644 index 2cb80411e..000000000 --- a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/.openspec.yaml +++ /dev/null @@ -1,2 +0,0 @@ -schema: spec-driven -created: 2026-06-10 diff --git a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/design.md b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/design.md deleted file mode 100644 index 933e216f8..000000000 --- a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/design.md +++ /dev/null @@ -1,97 +0,0 @@ -## Context - -Runtime composition now gives AWorld a rollout-owning harness and serializable rollout state. That solves multi-turn execution and outcome inspection for one evaluation attempt. It does not solve nondeterminism measurement: a model or agent can fail one rollout and pass another under the same case. Agent evaluation needs independent repeated trials and distribution-level metrics. - -This change adds trial execution above the existing suite/harness layer. A trial is one independent evaluation of one case. A retry attempt is not a trial; retry is an execution strategy inside one trial. - -## Goals / Non-Goals - -**Goals:** - -- Add trial configuration with a default of one trial. -- Execute each case for `num_trials` independent trials. -- Preserve trial index, trial id, terminal status, metrics, and state summary in reports. 
-- Compute pass@k and pass^k from independent trial outcomes. -- Keep retry/fallback attempts nested inside a trial and excluded from trial metrics. -- Support both single-shot suites and runtime-composed suites. -- Keep existing evaluator behavior unchanged when no trial configuration is supplied. - -**Non-Goals:** - -- Adding sandbox reset, filesystem/database isolation, or clean-environment orchestration. -- Adding LLM-backed adaptive user simulators. -- Adding training-loop or optimizer integration. -- Redesigning `EvaluateRunner` public API beyond additive evaluator-substrate wiring. -- Treating retry/fallback attempts as trials. - -## Decisions - -### 1. Model trials as evaluator-level repetition, not harness retries - -Add a `TrialPolicyDef` on `EvalSuiteDef`: - -- `num_trials`: positive integer, default `1` -- `pass_at_k`: tuple of k values to report, default empty -- `pass_caret_k`: tuple of k values to report, default empty -- `success_metric`: metric used to decide whether a trial passed, default derived from the gate primary metric or `score` - -The framework should normalize invalid values at compile time: `num_trials >= 1`, k values between `1` and `num_trials`, and `success_metric` must be a declared or gate-referenced metric. - -### 2. Preserve one trial as current behavior - -If `TrialPolicyDef.num_trials == 1`, report shape and existing aggregate metrics should remain compatible. Trial-specific fields may be absent or present as additive metadata, but no existing required field should change. - -### 3. Expand cases without changing case identity - -The evaluator should execute `case_id` repeatedly with trial metadata: - -- stable original case id -- `trial_index` -- `trial_id` -- optional deterministic seed metadata - -Reports should group results by original case id while still exposing individual trial case results. A practical first implementation can expand dataset case ids to `case_id::trial-N` and retain `original_case_id` in case metadata. 
- -### 4. Compute pass@k and pass^k from trial outcomes - -For each original case and metric: - -- pass@k is true if any of the first k independent trials passed -- pass^k is true if all of the first k independent trials passed - -Aggregate report metrics should include rates across original cases: - -- `{metric}_pass@k` -- `{metric}_pass^k` - -These values are report-level metrics and may be referenced by composite gates. - -### 5. Keep retry/fallback inside each trial - -If a runtime harness uses retry, the selected terminal attempt determines that trial's metric outcome. Child attempts stay in rollout artifacts/metadata for inspection, but pass@k/pass^k must count the trial once. - -### 6. Defer environment isolation - -Trials need independence, but this change does not create sandboxes. The implementation should provide metadata hooks for later environment reset integration and document that true clean-state independence requires the follow-up environment-isolation change. - -## Risks / Trade-offs - -- [Retry confused with trials] -> Mitigation: explicit report fields and tests that retry child attempts do not increase trial count. -- [Report bloat] -> Mitigation: per-trial state summaries stay per case result; full child states remain artifacts or references. -- [Existing repeat_times behavior collision] -> Mitigation: keep suite trial policy framework-owned and avoid relying on legacy `Evaluator.repeat_times` pass@k behavior unless it can preserve required report semantics. -- [False independence without sandbox] -> Mitigation: document clean-state reset as out of scope and preserve hooks for later isolation. - -## Migration Plan - -1. Add `TrialPolicyDef` and compile-time validation. -2. Expand suite cases into trial cases with metadata while preserving original case id. -3. Add trial-aware result grouping and pass@k/pass^k aggregation. -4. Add report fields for trial policy, trial counts, and trial metrics. -5. Add tests proving retry attempts do not count as trials. -6. 
Keep existing one-trial suites and `app-evaluator` behavior compatible. - -## Deferred Questions - -- Environment reset semantics should be handled in an `evaluator-environment-isolation` change. -- LLM-backed adaptive user simulators should stay in a simulator-focused change. -- Training/optimizer integration should wait until trial metrics and environment isolation are stable. diff --git a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/implementation-plan.md b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/implementation-plan.md deleted file mode 100644 index 8d4e7c1b3..000000000 --- a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/implementation-plan.md +++ /dev/null @@ -1,200 +0,0 @@ -# AWorld Evaluator Trials and pass@k Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add independent evaluator trials with pass@k/pass^k metrics while keeping retry/fallback attempts distinct from trials. - -**Architecture:** Add a small trial policy layer to the suite substrate. Expand cases into trial rows before evaluation, preserve original case metadata, then aggregate pass@k/pass^k from independent trial case results during report assembly. - -**Tech Stack:** Python dataclasses, existing evaluator substrate/report/scorer infrastructure, pytest, OpenSpec. - ---- - -## File Structure - -- Modify: `aworld/evaluations/substrate.py` - Add `TrialPolicyDef`, trial case expansion, trial aggregation, report metadata, and gate integration. -- Modify: `aworld/evaluations/report.py` - Allow additive trial metadata fields if needed. -- Test: `tests/evaluations/test_evaluator_trials.py` - Focused TDD coverage for trial policy, expansion, pass@k/pass^k, retry separation, and report shape. 
-- Test: existing evaluator regression tests - Ensure one-trial behavior remains compatible. - -## Task 1: Trial Policy - -- [x] **Step 1: Write failing trial policy tests** - -Add tests in `tests/evaluations/test_evaluator_trials.py`: - -```python -from aworld.evaluations.substrate import TrialPolicyDef - - -def test_trial_policy_rejects_invalid_k_values(): - with pytest.raises(ValueError, match="k values"): - TrialPolicyDef(num_trials=2, pass_at_k=(3,)).validate() -``` - -- [x] **Step 2: Run test and confirm failure** - -Run: `pytest tests/evaluations/test_evaluator_trials.py::test_trial_policy_rejects_invalid_k_values -q` - -Expected: FAIL because `TrialPolicyDef` does not exist. - -- [x] **Step 3: Implement `TrialPolicyDef`** - -Add a frozen dataclass in `aworld/evaluations/substrate.py`: - -```python -@dataclass(frozen=True) -class TrialPolicyDef: - num_trials: int = 1 - pass_at_k: tuple[int, ...] = tuple() - pass_caret_k: tuple[int, ...] = tuple() - success_metric: str | None = None - - def validate(self) -> None: - if self.num_trials < 1: - raise ValueError("num_trials must be >= 1") - invalid = [k for k in (*self.pass_at_k, *self.pass_caret_k) if k < 1 or k > self.num_trials] - if invalid: - raise ValueError("k values must be between 1 and num_trials") -``` - -- [x] **Step 4: Run policy tests until green** - -Run: `pytest tests/evaluations/test_evaluator_trials.py -q` - -Expected: PASS for initial policy tests. 
- -## Task 2: Trial Case Expansion - -- [x] **Step 1: Write failing expansion tests** - -```python -def test_build_eval_dataset_expands_trial_cases(): - suite = EvalSuiteDef( - suite_id="trial-suite", - cases=[EvalCaseDef(case_id="case-1", input={"query": "hello"})], - trial_policy=TrialPolicyDef(num_trials=3), - ) - compiled = compile_evaluation_flow(EvaluationFlowDef(target={"kind": "inline", "value": {"target_path": "demo"}}, suite=suite)) - - ids = [case.eval_case_id for case in compiled.dataset.eval_cases] - assert ids == ["case-1::trial-1", "case-1::trial-2", "case-1::trial-3"] - assert compiled.dataset.eval_cases[0].case_data["_trial"]["original_case_id"] == "case-1" - assert compiled.dataset.eval_cases[0].case_data["_trial"]["trial_index"] == 1 -``` - -- [x] **Step 2: Run expansion test and confirm failure** - -Run: `pytest tests/evaluations/test_evaluator_trials.py::test_build_eval_dataset_expands_trial_cases -q` - -Expected: FAIL because `EvalSuiteDef` does not accept `trial_policy`. - -- [x] **Step 3: Implement trial expansion** - -Add `trial_policy: TrialPolicyDef = field(default_factory=TrialPolicyDef)` to `EvalSuiteDef`. Update `compile_evaluation_flow()` to expand `flow.suite.cases` before `build_eval_dataset()`, preserving `_trial` metadata. - -- [x] **Step 4: Run expansion tests until green** - -Run: `pytest tests/evaluations/test_evaluator_trials.py -q` - -Expected: PASS. - -## Task 3: pass@k/pass^k Aggregation - -- [x] **Step 1: Write failing aggregation tests** - -Use a deterministic judge that passes trial 2 and fails trials 1/3. Assert `score_pass@2 == 1.0` and `score_pass^2 == 0.0`. - -- [x] **Step 2: Run aggregation test and confirm failure** - -Run: `pytest tests/evaluations/test_evaluator_trials.py::test_run_evaluation_flow_reports_pass_at_k_and_pass_caret_k -q` - -Expected: FAIL because trial aggregation does not exist. 
- -- [x] **Step 3: Implement trial aggregation** - -In `run_evaluation_flow()`, group case results by `_trial.original_case_id`, derive each trial pass/fail from `TrialPolicyDef.success_metric` or gate primary metric, then add aggregate metrics named `{metric}_pass@k` and `{metric}_pass^k`. - -- [x] **Step 4: Run aggregation tests until green** - -Run: `pytest tests/evaluations/test_evaluator_trials.py -q` - -Expected: PASS. - -## Task 4: Retry Separation - -- [x] **Step 1: Write failing retry/trial separation test** - -Use a runtime harness wrapped in retry with `num_trials=2`. Assert report trial count is `2`, not the number of retry attempts, and pass@k counts terminal trial outcomes only. - -- [x] **Step 2: Run retry separation test and confirm failure** - -Run: `pytest tests/evaluations/test_evaluator_trials.py::test_retry_attempts_do_not_count_as_trials -q` - -Expected: FAIL until trial grouping ignores retry child attempts. - -- [x] **Step 3: Preserve retry attempts as artifacts only** - -Ensure trial aggregation reads only top-level trial results and never inspects `artifacts.attempts` as independent outcomes. - -- [x] **Step 4: Run retry separation tests until green** - -Run: `pytest tests/evaluations/test_evaluator_trials.py -q` - -Expected: PASS. - -## Task 5: Report Shape and Compatibility - -- [x] **Step 1: Write failing report compatibility tests** - -Assert one-trial `app-evaluator` style suites keep current required report fields, while multi-trial reports include `trial_policy`, `trial_counts`, and per-result trial metadata. - -- [x] **Step 2: Run report tests and confirm failure** - -Run: `pytest tests/evaluations/test_evaluator_trials.py::test_multi_trial_report_exposes_trial_metadata -q` - -Expected: FAIL until report metadata is added.
- -- [x] **Step 3: Add additive report fields** - -Add report fields without changing existing required fields: - -```python -report["trial_policy"] = {...} -report["trial_counts"] = {"original_cases": n, "trials_total": m} -``` - -- [x] **Step 4: Run report tests until green** - -Run: `pytest tests/evaluations/test_evaluator_trials.py tests/evaluations/test_evaluation_substrate.py -q` - -Expected: PASS. - -## Task 6: Verification and Commit - -- [x] **Step 1: Run evaluator regression suite** - -Run: - -```bash -pytest tests/evaluations/test_execution_state.py tests/evaluations/test_execution_adapters.py tests/evaluations/test_evaluation_substrate.py tests/evaluations/test_runtime_composition.py tests/evaluations/test_evaluator_trials.py tests/core/test_evaluator_runtime.py tests/core/test_evaluator_top_level_command.py tests/plugins/test_plugin_hooks.py tests/test_plugin_cli_entrypoint.py tests/docs/test_evaluator_report_docs.py -q -``` - -Expected: PASS. - -- [x] **Step 2: Validate OpenSpec** - -Run: `openspec validate aworld-evaluator-trials-passk-2026-06-10 --strict` - -Expected: `Change 'aworld-evaluator-trials-passk-2026-06-10' is valid` - -- [x] **Step 3: Commit** - -```bash -git add aworld/evaluations/substrate.py aworld/evaluations/report.py tests/evaluations/test_evaluator_trials.py openspec/changes/aworld-evaluator-trials-passk-2026-06-10 -git commit -m "feat: add evaluator trial pass metrics" -``` diff --git a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/proposal.md b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/proposal.md deleted file mode 100644 index e33cab757..000000000 --- a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/proposal.md +++ /dev/null @@ -1,34 +0,0 @@ -## Why - -`aworld-evaluator-runtime-composition-2026-06-10` added rollout-owning harnesses, outcome/state-check grading, step rewards, retry wrappers, and an adoption suite. 
It deliberately kept retry/fallback separate from independent trial evaluation. - -Complete agent evaluation still needs a first-class way to measure nondeterministic target behavior: - -- run the same case multiple independent times -- preserve per-trial rollout state and metrics -- compute pass@k and pass^k without confusing retry attempts for trials -- report trial distributions without changing existing single-shot suite behavior - -Without this layer, users can run a deterministic regression suite, but cannot answer "does this agent solve the task at least once in k attempts?" or "does it solve the task every time across k attempts?". - -## What Changes - -- Add suite-level trial configuration for independent repeated evaluation. -- Add trial-aware execution/report structures that retain per-trial case results. -- Add pass@k and pass^k aggregate metrics computed from independent trial outcomes. -- Keep retry/fallback attempts inside a trial and explicitly exclude them from pass@k/pass^k calculation. -- Add one opt-in adoption suite or test fixture proving trials work with runtime-composed suites and existing single-shot suites remain compatible. -- Defer clean-environment reset/sandbox orchestration to a dedicated environment-isolation change. - -## Capabilities - -### Modified Capabilities - -- `evaluation-substrate`: add independent trial execution, trial reports, and pass@k/pass^k aggregation for suite-backed evaluation flows. - -## Impact - -- Affected code: `aworld/evaluations/**`, especially suite definitions, flow compilation, evaluator orchestration, report assembly, and runtime-composition integration. -- Affected APIs: additive trial configuration on suite-backed evaluator APIs; existing callers default to one trial. -- Affected tests: add focused coverage for trial expansion, pass@k/pass^k math, retry/trial separation, report shape, and compatibility. -- Affected docs: clarify trial semantics and how they differ from retry/fallback wrappers. 
diff --git a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/specs/evaluation-substrate/spec.md b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/specs/evaluation-substrate/spec.md deleted file mode 100644 index ee1f8e36e..000000000 --- a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/specs/evaluation-substrate/spec.md +++ /dev/null @@ -1,65 +0,0 @@ -## MODIFIED Requirements - -### Requirement: Trial-based evaluation - -Suite-backed evaluation flows SHALL support independent repeated trials for each evaluation case while preserving current one-trial behavior by default. - -#### Scenario: Suite declares multiple trials -- **WHEN** a suite-backed evaluator declares a trial policy with `num_trials` greater than one -- **THEN** the framework SHALL execute each original case independently for the configured number of trials - -#### Scenario: Suite does not declare trials -- **WHEN** a suite-backed evaluator does not declare a trial policy -- **THEN** the framework SHALL execute each case once and preserve existing report behavior - -#### Scenario: Trial metadata is attached -- **WHEN** a case is expanded into trial executions -- **THEN** each trial execution SHALL preserve the original case id, trial index, and trial id in serializable case or state metadata - -### Requirement: pass@k and pass^k metrics - -Trial-based evaluation SHALL compute pass@k and pass^k metrics from independent trial outcomes. 
- -#### Scenario: pass@k is computed -- **WHEN** a suite declares pass@k for a metric and a case has at least k trials -- **THEN** the framework SHALL mark that case as pass@k when any of the first k independent trials passes the configured success metric - -#### Scenario: pass^k is computed -- **WHEN** a suite declares pass^k for a metric and a case has at least k trials -- **THEN** the framework SHALL mark that case as pass^k when all of the first k independent trials pass the configured success metric - -#### Scenario: Trial metrics are aggregated -- **WHEN** pass@k or pass^k is computed for all cases -- **THEN** the framework SHALL expose aggregate pass@k/pass^k rates as normal report metrics that composite gates can reference - -### Requirement: Retry attempts are not trials - -Trial-based evaluation SHALL keep runtime retry/fallback attempts separate from independent trials. - -#### Scenario: Retry wrapper runs inside a trial -- **WHEN** a runtime-composed trial uses a retry or fallback wrapper -- **THEN** the framework SHALL count the selected terminal rollout as one trial and preserve child attempts only as trial artifacts or metadata - -#### Scenario: pass@k excludes retry attempts -- **WHEN** pass@k or pass^k metrics are calculated -- **THEN** retry or fallback child attempts SHALL NOT increase the number of trials or directly contribute separate trial outcomes - -### Requirement: Trial reports - -Evaluator reports SHALL expose trial metadata and aggregate trial metrics additively. 
- -#### Scenario: Multiple trials are reported -- **WHEN** a suite runs multiple trials -- **THEN** the report SHALL include trial policy metadata, total trial counts, and per-trial metadata sufficient to group trials by original case id - -#### Scenario: Single-trial reports remain compatible -- **WHEN** a suite runs one trial -- **THEN** existing required report fields SHALL remain compatible and trial-specific fields SHALL be additive only - -### Requirement: Environment reset is deferred - -Trial-based evaluation SHALL acknowledge clean-environment isolation as a separate concern. - -#### Scenario: Suite requires clean environment per trial -- **WHEN** a suite requires filesystem, database, sandbox, or external state reset between trials -- **THEN** the framework SHALL treat that reset orchestration as out of scope for this change and leave it to a dedicated environment-isolation change diff --git a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/tasks.md b/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/tasks.md deleted file mode 100644 index d9f181c5c..000000000 --- a/openspec/changes/aworld-evaluator-trials-passk-2026-06-10/tasks.md +++ /dev/null @@ -1,40 +0,0 @@ -## 1. Trial Policy Model - -- [x] 1.1 Add `TrialPolicyDef` with `num_trials`, `pass_at_k`, `pass_caret_k`, and `success_metric`. -- [x] 1.2 Add `trial_policy` to `EvalSuiteDef` with a default one-trial policy. -- [x] 1.3 Validate trial policy during flow compilation. -- [x] 1.4 Preserve existing single-shot behavior when `num_trials == 1`. - -## 2. Trial Case Expansion - -- [x] 2.1 Expand each suite case into independent trial case rows when `num_trials > 1`. -- [x] 2.2 Preserve `original_case_id`, `trial_index`, and `trial_id` in case metadata. -- [x] 2.3 Ensure runtime-composed harnesses receive trial metadata without storing live handles. -- [x] 2.4 Add tests for stable case grouping and trial metadata. - -## 3. 
Trial Outcome Aggregation - -- [x] 3.1 Determine per-trial pass/fail from the configured `success_metric`. -- [x] 3.2 Compute pass@k per original case. -- [x] 3.3 Compute pass^k per original case. -- [x] 3.4 Aggregate pass@k/pass^k rates across original cases. -- [x] 3.5 Allow composite gates to reference trial aggregate metrics. - -## 4. Retry / Trial Separation - -- [x] 4.1 Add tests proving retry child attempts do not increase trial count. -- [x] 4.2 Ensure pass@k/pass^k uses the selected terminal rollout of each trial. -- [x] 4.3 Preserve retry attempts only inside trial artifacts/metadata. - -## 5. Report Shape - -- [x] 5.1 Add report-level trial policy metadata. -- [x] 5.2 Add report-level trial count summaries. -- [x] 5.3 Add per-case trial grouping or trial metadata sufficient to reconstruct groups. -- [x] 5.4 Keep existing report schema compatible via additive fields. - -## 6. Verification - -- [x] 6.1 Add focused tests for trial policy validation, trial expansion, pass@k/pass^k aggregation, retry separation, and report shape. -- [x] 6.2 Run evaluator regression tests. -- [x] 6.3 Validate this OpenSpec change with `openspec validate aworld-evaluator-trials-passk-2026-06-10 --strict`. diff --git a/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/design.md b/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/design.md deleted file mode 100644 index 9daed18b4..000000000 --- a/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/design.md +++ /dev/null @@ -1,183 +0,0 @@ -## Context - -Evaluator v1 deliberately optimized for shipping a working framework substrate and official CLI flow first. 
The result is structurally sound, but its abstraction ceiling is still low in four places: - -- execution can only target AWorld-native `agent` and `task` paths plus judge-only `static` -- eval targets still know too much about runner entrypoints -- judge contracts are validated structurally but not typed as first-class models -- gate policies are inspectable but too narrow for multi-metric release decisions - -The goal of this follow-up is to raise that ceiling without discarding the v1 substrate. The evaluator remains an AWorld framework capability under `aworld/evaluations/`, and `aworld-cli` remains only an official consumer and assembly layer. This is a v1 extensibility increment, not a verifiers-parity runtime: rollout-owning harnesses, user simulators, lifecycle hooks, child-state composition, and training reward semantics are deferred. - -## Goals / Non-Goals - -**Goals:** - -- Add a program-backed execution path that fits the existing suite/case/execution/state model. -- Add a lightweight harness definition so reusable execution behavior is explicit without adopting verifiers' broader object model. -- Isolate execution mechanics behind a framework-owned adapter boundary so runtime-specific invocation does not leak across evaluator code. -- Make typed judge-output models the primary contract for suite-backed evaluation. -- Support structured composite gate policies for multi-metric pass, fail, and approval decisions. -- Support suite-declared trajectory scoring alongside final-result scoring. -- Preserve compatibility for existing v1 suite-backed flows, reports, and CLI evaluator behavior. - -**Non-Goals:** - -- Replacing the existing `EvalTarget -> Evaluator -> EvaluateRunner` orchestration skeleton. -- Creating a public external evaluator API v2 in this change. -- Reworking the `aworld-cli evaluator` command shape beyond compatibility adjustments required by framework changes. 
-- Shipping baseline history, trend analysis, or evaluator comparison workflows. -- Converging AWorld onto verifiers' public API terminology or object model. -- Adding external harness package registries, lifecycle decorators, training reward semantics, sandbox command execution, or child-state composition. - -## Ownership Model - -| Concept | Owns | Must not own | -| --- | --- | --- | -| `EvalSuiteDef` / `EvalCaseDef` | Domain inputs, case metadata, judge contract, declared scorers, gates | Runtime handles in declarative or persisted suite definitions | -| `EvalHarnessDef` | Reusable execution selection and execution defaults for a suite flow | Scoring, judge validation, report assembly | -| `EvalExecutionSpec` | Typed execution configuration for one harness or suite execution path | Arbitrary workflow engines or command execution | -| `ExecutionAdapter` | Invocation and normalization into `EvalState` | Orchestration, score calculation, gate decisions | -| `Evaluator` / `EvaluateRunner` | Existing dataset, target, scorer orchestration | Suite-specific execution semantics | - -Cases remain serializable input data. `EvalState` remains serializable rollout output containing answer, completion, artifacts, trajectory, usage, timing, errors, raw response, and metadata. Runtime clients, runners, sandboxes, program objects, and other live handles may be used transiently by adapters but must not be stored in `EvalState`. - -In-memory framework callers may still pass live AWorld agent/task objects through `EvalExecutionSpec.target_config` for compatibility with existing agent/task evaluation APIs. That path is not a declarative or JSON-serializable suite contract. Declared JSON manifests intentionally do not accept `execution`, `target_ref`, `task_builder_ref`, or live runtime handles; they only layer safe suite metadata and simple gate overrides on supported builtin suites. - -## Decisions - -### 1. 
Add `PROGRAM` execution as an extension of the current execution model - -`EvalExecutionMode` should gain a `PROGRAM` mode that lets a suite execute an importable callable without pretending every evaluation target is an AWorld agent or task. - -`PROGRAM` is for evaluation targets that do not use AWorld's agent or task runtime, such as a third-party API client, local library evaluator, or custom callable harness. It is not for customizing AWorld agent behavior, preprocessing case inputs, replacing judge/scorer logic, command execution, sandbox placement, or general workflow engines. - -The callable reference must be an import string (`module:attribute` or `module.attribute`) that resolves to a callable. `EvalExecutionSpec` validation should reject `PROGRAM` specs without `target_ref` and reject unsupported command or workflow forms in this change. TASK builder references use the same importable-callable validation. - -Program callables receive `(case, spec, target)` and may be sync or async. They must return one of: - -1. an `EvalState` -2. a mapping matching `EvalState` fields, including optional `status`, `answer`, `completion`, `trajectory`, `tool_calls`, `usage`, `timing`, `error`, and `metadata` -3. a `TaskResponse` -4. a bare value, treated as the final answer with success status - -If custom normalization is needed, the program should return a mapping with all relevant `EvalState` fields set explicitly and document the mapping in suite metadata. Exceptions from the program should propagate as execution failures rather than being silently converted into judge payloads. 
- -The program-backed path should still compile into the same evaluator substrate: - -- case definitions still provide the task-level inputs -- execution specs still describe runtime wiring -- execution output must still normalize into `EvalState` -- scorers and gate policies remain agnostic to how execution happened - -`PROGRAM` is a framework extensibility mechanism, not a new CLI product mode. - -Importable callable execution is a trusted in-process extension point. Importing a module can execute module top-level code, so `PROGRAM` and TASK builder refs must only be used for evaluator code controlled by the runner or workspace owner. This change does not sandbox imported code, provide an allowlist, sanitize third-party program payloads, or make untrusted suite manifests executable. - -### 2. Add a lightweight harness boundary over execution specs - -AWorld should not adopt verifiers' `Taskset` / `Harness` / `Env` object model, but it should make the missing execution reuse boundary explicit. - -`EvalHarnessDef` should be a small framework-owned dataclass that can be attached to a suite or flow: - -- `harness_id`: stable reusable identifier -- `execution`: `EvalExecutionSpec` -- `metadata`: optional serializable harness metadata - -Suites may continue to set `execution` directly for v1 compatibility. At compile time, direct `suite.execution` lowers into an equivalent harness so the substrate has one execution boundary. Harnesses own execution defaults and adapter selection; suites still own cases, judges, scorers, and gates. - -This is intentionally not a BYO harness plugin system and not equivalent to verifiers' rollout-owning harness. External package loading, lifecycle decorators, retry/fallback composition, multi-turn rollout ownership, and runtime handle borrowing are deferred. - -### 3. 
Route execution through adapters instead of hardcoded runner calls - -The follow-up should introduce an internal adapter boundary in `aworld/evaluations/`, for example an `ExecutionAdapter` protocol plus concrete adapters for: - -- static/judge-only execution -- AWorld agent execution -- AWorld task execution -- program-backed execution - -This keeps runner coupling local. If runner invocation details change later, the evaluator substrate should only need adapter updates instead of cross-cutting target rewrites. - -Adapters are a hard internal boundary: they must not replace the current `EvalTarget -> Evaluator -> EvaluateRunner` orchestration skeleton. They only execute one case through the configured runtime and normalize the result into `EvalState`. - -### 4. Make typed judge models the primary schema contract - -Judge output validation should move from required-field checks toward typed models using Pydantic, which already exists across the codebase. - -The primary suite contract should become: - -- a typed judge-output model for validation and documentation -- JSON schema derivation from that model for report and tooling integration -- a compatibility bridge so current `JudgeSchemaDef(required_fields=...)` style suites continue to work during migration - -This change is about stronger framework contracts, not about forcing every existing scorer to migrate in one pass. - -Legacy required-field definitions should lower through the same `JudgeSchemaDef` validation and schema-export API used by typed models. They should not create a parallel scoring path. - -Judge schema metadata should be surfaced once at the top level of the evaluator report, not copied into every case result. Per-case judge metadata should continue to include judge payload fields and backend id. - -### 5. 
Use structured composite gate conditions instead of a string DSL - -The follow-up should expand gate expressiveness, but it should avoid introducing a loose string expression DSL as the first step. - -The preferred direction is a structured gate model such as: - -- condition objects over named metrics and comparison operators -- explicit combinators like `all` / `any` -- optional approval-stage conditions separate from pass/fail conditions -- compatibility lowering from the current single-threshold policy into the new structure - -Supported operators should include `>=`, `<=`, `>`, `<`, `==`, and `!=` from the first implementation so adding strict bounds or categorical metrics later does not require an API break. - -This keeps gate logic inspectable, serializable, and consistent with AWorld's existing preference for explicit typed configuration objects. - -Legacy threshold gates should lower into structured conditions at substrate boundaries. They should not keep a separate gate evaluation path. - -### 6. Add suite-declared trajectory scoring - -The substrate already preserves trajectory in `EvalState`, and existing scorer extractors can inspect it. This change should make trajectory evaluation explicit in suite definitions so final-result scoring and process scoring can be configured together. - -`EvalSuiteDef` should gain a `trajectory_scorers` tuple of structured scorer definitions. The first implementation should lower these definitions into normal `EvalCriteria` entries for existing trajectory scorer classes, preserving the current scorer registry and report metric shapes. - -Trajectory scorers evaluate `EvalState.trajectory` and related state fields produced by the current single-shot execution flow. They should not mutate state, replace the judge layer, introduce step-level reward semantics, run multi-turn user simulation, or introduce a separate report format. - -### 7. 
Keep CLI changes additive and framework-driven - -The official `aworld-cli evaluator` command should inherit these improvements through framework compilation and execution, not through CLI-owned evaluator semantics. - -That means: - -- no second evaluator stack inside `aworld-cli` -- no CLI-only gate language -- no CLI-only program execution abstraction - -If follow-up CLI work becomes necessary later, it should be a separate product-focused change. - -## Risks / Trade-offs - -- [Program execution shape too generic] -> Mitigation: keep `PROGRAM` scoped to trusted importable callable refs plus normalized `EvalState` output, not arbitrary workflow engines or untrusted manifest execution. -- [Typed judge model migration friction] -> Mitigation: provide compatibility bridging from current `JudgeSchemaDef`; builtin typed-model migration may be staged after the substrate lands. -- [Composite gate policies become overdesigned] -> Mitigation: prefer structured operators and combinators over a general-purpose DSL. -- [Adapter layer duplicates existing target abstractions] -> Mitigation: keep adapters narrowly focused on execution invocation and normalization, not on replacing the orchestration skeleton. -- [Harness concept expands into a second framework] -> Mitigation: keep `EvalHarnessDef` as a lightweight typed holder for execution specs and defer lifecycle/composition/package features. - -## Migration Plan - -1. Add the harness, execution, and adapter abstractions behind compatibility paths so current suites still resolve. -2. Introduce typed judge models and bridge legacy schema definitions. -3. Expand gate evaluation logic while preserving current threshold-style gate definitions. -4. Add suite-declared trajectory scorer lowering. -5. Keep builtin suites compatible and exercise the richer substrate through focused tests; migrate builtin suites to typed models only when their public output contract is ready to change. -6. 
Keep CLI evaluator behavior stable while letting it consume the new framework-owned capabilities. - -Rollback strategy: - -- retain current `static` / `agent` / `task` suite behavior through compatibility lowering -- keep legacy threshold gate definitions and lightweight judge schemas valid until follow-up migrations are complete - -## Deferred Questions - -- Rich harness lifecycle hooks, retry/fallback composition, and child-state borrowing should wait for a later runtime-composition change. -- Command-backed or sandbox-backed program execution should wait for a dedicated execution-runtime change. -- Manifest exposure for every structured gate and trajectory scorer field may be staged after the core substrate supports the model. diff --git a/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/implementation-plan.md b/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/implementation-plan.md deleted file mode 100644 index 7c6215e5a..000000000 --- a/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/implementation-plan.md +++ /dev/null @@ -1,681 +0,0 @@ -# AWorld Evaluator V2 Extensibility Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -> **Implementation status note:** This file records the original execution plan and is no longer the authoritative description of shipped behavior. The authoritative status is `tasks.md`, the delta spec, and the code/tests. Final implementation deliberately keeps declared JSON manifests metadata-only, defers builtin typed-model migration, treats `judge_schema` as an optional report-level object, and defines trajectory evaluation as single-shot `EvalState` inspection rather than verifiers-style rollout ownership. 
- -**Goal:** Extend the framework-owned evaluator substrate with lightweight harness reuse, bounded program-backed execution, adapter-isolated runtime invocation, typed judge-output contracts, structured composite gate policies, and suite-declared trajectory scoring while keeping v1 evaluator flows compatible. - -**Architecture:** Keep the current `EvalTarget -> Evaluator -> EvaluateRunner` skeleton, but move execution dispatch behind framework-owned adapters under `aworld/evaluations/`. Evolve suite-backed contracts additively: direct `suite.execution` lowers into a lightweight `EvalHarnessDef`, `EvalExecutionSpec` gains bounded import-callable `PROGRAM`, judge schemas gain typed-model support with a legacy bridge, gate policies gain structured composite conditions with lowering from the current threshold form, and trajectory scorer declarations lower into existing scorer criteria. - -**Tech Stack:** Python, AWorld evaluation substrate under `aworld/evaluations/`, Pydantic v2 models already used in the repo, pytest, OpenSpec. - ---- - -## File Structure - -- `aworld/evaluations/execution.py` - Extend execution mode definitions and shared normalization helpers used by all adapter paths. -- `aworld/evaluations/execution_adapters.py` - New internal adapter boundary for static, agent, task, and program-backed execution. -- `aworld/evaluations/substrate.py` - Compile suites onto harnesses/adapters, typed judge contracts, trajectory scorer criteria, and richer gate models while preserving compatibility. -- `aworld/evaluations/eval_targets/agent_eval.py` - Reduce direct runtime coupling so existing eval targets align with adapter-backed execution. -- `aworld/evaluations/report.py` - Surface typed judge schema metadata once at report level and structured gate outputs in the report contract where needed. -- `tests/evaluations/test_execution_state.py` - Extend execution-state tests to cover program-backed normalization. 
-- `tests/evaluations/test_execution_adapters.py` - New focused coverage for adapter selection and execution. -- `tests/evaluations/test_evaluation_substrate.py` - Add substrate-level coverage for harness lowering, typed judge schemas, composite gates, trajectory scorers, and backward compatibility. -- `tests/core/test_evaluator_runtime.py` - Guard that CLI-facing runtime assembly still works on top of the evolved framework substrate. -- `aworld/evaluations/README.md` - Document the new framework-owned extension points once implementation settles. - -### Task 1: Add harness lowering, execution adapters, and `PROGRAM` execution mode - -**Files:** -- Modify: `aworld/evaluations/execution.py` -- Create: `aworld/evaluations/execution_adapters.py` -- Modify: `aworld/evaluations/substrate.py` -- Modify: `aworld/evaluations/eval_targets/agent_eval.py` -- Test: `tests/evaluations/test_execution_state.py` -- Test: `tests/evaluations/test_execution_adapters.py` -- Test: `tests/evaluations/test_evaluation_substrate.py` - -- [ ] **Step 1: Write the failing harness, adapter, and program-execution tests** - -```python -# tests/evaluations/test_execution_adapters.py -from __future__ import annotations - -import pytest - -from aworld.evaluations.execution import EvalExecutionMode, EvalExecutionSpec -from aworld.evaluations.execution_adapters import resolve_execution_adapter -from aworld.evaluations.substrate import EvalCaseDef - - -async def _demo_program(case, spec, target): - return { - "status": "success", - "answer": f"ran:{case.input['query']}", - "completion": [{"role": "assistant", "content": "final"}], - "trajectory": [{"role": "assistant", "content": "step"}], - "usage": {"total_tokens": 7}, - } - - -@pytest.mark.asyncio -async def test_program_execution_adapter_normalizes_result(monkeypatch): - monkeypatch.setattr( - "aworld.evaluations.execution_adapters.load_program_callable", - lambda ref: _demo_program, - ) - adapter = resolve_execution_adapter( - 
EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case") - ) - state = await adapter.execute( - case=EvalCaseDef(case_id="case-1", input={"query": "demo"}), - target={"target_kind": "directory"}, - spec=EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="pkg.module:run_case"), - ) - - assert state.case_id == "case-1" - assert state.answer == "ran:demo" - assert state.completion[0]["content"] == "final" - assert state.trajectory[0]["content"] == "step" - assert state.usage["total_tokens"] == 7 - - -def test_resolve_execution_adapter_rejects_missing_program_ref(): - with pytest.raises(ValueError, match="target_ref"): - resolve_execution_adapter(EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM)) - - -def test_resolve_execution_adapter_rejects_command_style_program_ref(): - with pytest.raises(ValueError, match="importable callable"): - resolve_execution_adapter( - EvalExecutionSpec(mode=EvalExecutionMode.PROGRAM, target_ref="python script.py") - ) -``` - -```python -# tests/evaluations/test_execution_state.py -from aworld.evaluations.execution import normalize_task_response_to_eval_state - - -def test_normalize_mapping_response_preserves_completion_and_tool_calls(): - state = normalize_task_response_to_eval_state( - case_id="case-2", - response={ - "status": "success", - "answer": "ok", - "completion": [{"role": "assistant", "content": "ok"}], - "trajectory": [{"tool_calls": [{"name": "search"}]}], - }, - ) - - assert state.completion[0]["content"] == "ok" - assert state.tool_calls[0]["name"] == "search" -``` - -- [ ] **Step 2: Run the targeted tests and confirm they fail** - -Run: `pytest tests/evaluations/test_execution_state.py tests/evaluations/test_execution_adapters.py -q` -Expected: FAIL because `EvalExecutionMode.PROGRAM`, `execution_adapters.py`, harness lowering, and adapter resolution do not exist yet. 
- -- [ ] **Step 3: Add `PROGRAM` to execution definitions and create adapter implementations** - -```python -# aworld/evaluations/execution.py -class EvalExecutionMode(str, Enum): - STATIC = "static" - AGENT = "agent" - TASK = "task" - PROGRAM = "program" - - -def load_program_callable(ref: str): - if ":" in ref: - module_name, attr_name = ref.split(":", 1) - elif "." in ref: - module_name, attr_name = ref.rsplit(".", 1) - else: - raise ValueError(f"invalid program ref: {ref}") - module = importlib.import_module(module_name) - return getattr(module, attr_name) -``` - -```python -# aworld/evaluations/execution_adapters.py -from __future__ import annotations - -import inspect -from dataclasses import dataclass -from typing import Protocol - -from aworld.evaluations.execution import ( - EvalExecutionMode, - EvalExecutionSpec, - EvalState, - load_program_callable, - normalize_task_response_to_eval_state, -) -from aworld.runner import Runners - - -class ExecutionAdapter(Protocol): - async def execute(self, *, case, target: dict, spec: EvalExecutionSpec) -> EvalState: - pass - - -@dataclass(frozen=True) -class StaticExecutionAdapter: - async def execute(self, *, case, target: dict, spec: EvalExecutionSpec) -> EvalState: - return EvalState(case_id=case.case_id, status="not_evaluated", metadata={"_target": dict(target)}) - - -@dataclass(frozen=True) -class AgentExecutionAdapter: - async def execute(self, *, case, target: dict, spec: EvalExecutionSpec) -> EvalState: - query = case.input[spec.query_column or "query"] - response = await Runners.run(query, agent=spec.target_config["agent"]) - return normalize_task_response_to_eval_state(case_id=case.case_id, response=response, target=target) - - -@dataclass(frozen=True) -class TaskExecutionAdapter: - async def execute(self, *, case, target: dict, spec: EvalExecutionSpec) -> EvalState: - builder = load_program_callable(spec.task_builder_ref) - task = builder(case=case, target=target, spec=spec) - if inspect.isawaitable(task): - 
task = await task - response = await Runners.run_task(task=task) - return normalize_task_response_to_eval_state(case_id=case.case_id, response=response, target=target) - - -@dataclass(frozen=True) -class ProgramExecutionAdapter: - async def execute(self, *, case, target: dict, spec: EvalExecutionSpec) -> EvalState: - if not spec.target_ref: - raise ValueError("program execution requires target_ref") - program = load_program_callable(spec.target_ref) - result = program(case, spec, target) - if inspect.isawaitable(result): - result = await result - return normalize_task_response_to_eval_state( - case_id=case.case_id, - response=result, - target=target, - metadata={"_execution_mode": spec.mode.value}, - ) - - -def resolve_execution_adapter(spec: EvalExecutionSpec) -> ExecutionAdapter: - if spec.mode == EvalExecutionMode.STATIC: - return StaticExecutionAdapter() - if spec.mode == EvalExecutionMode.AGENT: - return AgentExecutionAdapter() - if spec.mode == EvalExecutionMode.TASK: - return TaskExecutionAdapter() - if spec.mode == EvalExecutionMode.PROGRAM: - if not spec.target_ref: - raise ValueError("program execution requires target_ref") - return ProgramExecutionAdapter() - raise ValueError(f"unsupported execution mode: {spec.mode}") -``` - -- [ ] **Step 4: Compile suite execution through lightweight harnesses and adapters in the substrate** - -```python -# aworld/evaluations/substrate.py -from aworld.evaluations.execution_adapters import resolve_execution_adapter - - -@dataclass(frozen=True) -class EvalHarnessDef: - harness_id: str - execution: EvalExecutionSpec = field(default_factory=EvalExecutionSpec) - metadata: dict[str, Any] = field(default_factory=dict) - - -def resolve_eval_harness(suite: EvalSuiteDef) -> EvalHarnessDef: - if suite.harness is not None: - return suite.harness - if suite.execution is not None: - return EvalHarnessDef( - harness_id=f"{suite.suite_id}-execution", - execution=suite.execution, - metadata={"lowered_from": "suite.execution"}, - ) - 
return EvalHarnessDef(harness_id=f"{suite.suite_id}-static") -``` - -```python -# aworld/evaluations/substrate.py -class _AdapterExecutionEvalTarget(EvalTarget[dict]): - async def predict(self, index: int, input: EvalDataCase[dict]) -> dict: - case = EvalCaseDef(case_id=input.eval_case_id, input=dict(input.case_data)) - state = await self._adapter.execute(case=case, target=self._target, spec=self._harness.execution) - return {"answer": state.answer, "state": state.to_dict()} -``` - -Adapters must not replace `EvalTarget -> Evaluator -> EvaluateRunner`; they only localize per-case invocation and normalization. - -```python -# aworld/evaluations/eval_targets/agent_eval.py -class AworldTaskEvalTarget(EvalTarget[dict]): - async def run_task_response(self, task: Task) -> TaskResponse | dict | object: - return await Runners.run_task(task=task) -``` - -- [ ] **Step 5: Run adapter and substrate tests until green** - -Run: `pytest tests/evaluations/test_execution_state.py tests/evaluations/test_execution_adapters.py tests/evaluations/test_evaluation_substrate.py -q` -Expected: PASS, including coverage for harness lowering, adapter-backed `PROGRAM` execution, rejected invalid program refs, and unchanged `static`/`agent`/`task` compatibility. 
- -- [ ] **Step 6: Commit the execution-extensibility slice** - -```bash -git add aworld/evaluations/execution.py aworld/evaluations/execution_adapters.py aworld/evaluations/substrate.py aworld/evaluations/eval_targets/agent_eval.py tests/evaluations/test_execution_state.py tests/evaluations/test_execution_adapters.py tests/evaluations/test_evaluation_substrate.py -git commit -m "feat: add adapter-backed evaluator execution" -``` - -### Task 2: Add typed judge-output contracts with a legacy compatibility bridge - -**Files:** -- Modify: `aworld/evaluations/substrate.py` -- Modify: `aworld/evaluations/scorers/suite_judge.py` -- Modify: `aworld/evaluations/report.py` -- Test: `tests/evaluations/test_evaluation_substrate.py` -- Test: `tests/core/test_evaluator_runtime.py` - -- [ ] **Step 1: Write failing tests for typed judge validation and legacy fallback** - -```python -# tests/evaluations/test_evaluation_substrate.py -from pydantic import BaseModel - -from aworld.evaluations.substrate import EvalSuiteDef, JudgeSchemaDef - - -class DemoJudgeOutput(BaseModel): - score: float - verdict: str - - -def test_typed_judge_model_accepts_valid_payload(): - suite = EvalSuiteDef( - suite_id="demo", - judge_schema=JudgeSchemaDef(output_model=DemoJudgeOutput), - ) - - payload = suite.judge_schema.validate_payload({"score": 0.8, "verdict": "ok"}) - assert payload["score"] == 0.8 - assert payload["verdict"] == "ok" - - -def test_typed_judge_model_rejects_invalid_payload(): - suite = EvalSuiteDef( - suite_id="demo", - judge_schema=JudgeSchemaDef(output_model=DemoJudgeOutput), - ) - - with pytest.raises(ValueError, match="verdict"): - suite.judge_schema.validate_payload({"score": 0.8}) - - -def test_legacy_required_fields_schema_still_validates(): - schema = JudgeSchemaDef(required_fields=("score", "rank")) - payload = schema.validate_payload({"score": 0.9, "rank": 1}) - assert payload["rank"] == 1 -``` - -- [ ] **Step 2: Run judge-schema tests to confirm failure** - -Run: `pytest 
tests/evaluations/test_evaluation_substrate.py -q` -Expected: FAIL because `JudgeSchemaDef` does not yet support `output_model` or `validate_payload()`. - -- [ ] **Step 3: Evolve `JudgeSchemaDef` into a typed contract with compatibility bridging** - -```python -# aworld/evaluations/substrate.py -from pydantic import BaseModel, ValidationError - - -@dataclass(frozen=True) -class JudgeSchemaDef: - required_fields: tuple[str, ...] = tuple() - output_model: type[BaseModel] | None = None - - def validate_payload(self, payload: Mapping[str, Any]) -> dict[str, Any]: - if self.output_model is not None: - try: - model = self.output_model.model_validate(dict(payload)) - except ValidationError as exc: - raise ValueError(str(exc)) from exc - return model.model_dump(mode="json") - - missing = [field for field in self.required_fields if field not in payload] - if missing: - raise ValueError(f"missing required judge fields: {', '.join(missing)}") - return dict(payload) - - def json_schema(self) -> dict[str, Any]: - if self.output_model is not None: - return self.output_model.model_json_schema() - return { - "type": "object", - "required": list(self.required_fields), - "properties": {field: {} for field in self.required_fields}, - } -``` - -- [ ] **Step 4: Route judge scoring and report metadata through the typed schema contract** - -```python -# aworld/evaluations/scorers/suite_judge.py -payload = self.suite.judge_schema.validate_payload(dict(execution.payload)) - -metric_result = { - "value": float(payload["score"]), - "metadata": { - **payload, - "_judge_backend": execution.backend_id, - }, -} -``` - -```python -# aworld/evaluations/report.py -"judge_backend": {"type": "object"}, -"judge_schema": {"type": ["object", "null"]}, -``` - -`run_evaluation_flow()` should attach `report["judge_schema"] = suite.judge_schema.json_schema()` once at the top level when the schema is non-empty. Do not copy schema metadata into every case result. 
- -- [ ] **Step 5: Run substrate and runtime tests until green** - -Run: `pytest tests/evaluations/test_evaluation_substrate.py tests/core/test_evaluator_runtime.py -q` -Expected: PASS, including typed-model validation and unchanged legacy required-field flows. - -- [ ] **Step 6: Commit the typed-judge-contract slice** - -```bash -git add aworld/evaluations/substrate.py aworld/evaluations/scorers/suite_judge.py aworld/evaluations/report.py tests/evaluations/test_evaluation_substrate.py tests/core/test_evaluator_runtime.py -git commit -m "feat: add typed evaluator judge schemas" -``` - -### Task 3: Add structured composite gate policies with threshold compatibility lowering - -**Files:** -- Modify: `aworld/evaluations/substrate.py` -- Modify: `aworld/evaluations/manifests.py` -- Modify: `aworld/evaluations/report.py` -- Test: `tests/evaluations/test_evaluation_substrate.py` -- Test: `tests/docs/test_evaluator_report_docs.py` - -- [ ] **Step 1: Write failing tests for composite gates and legacy threshold compatibility** - -```python -# tests/evaluations/test_evaluation_substrate.py -from aworld.evaluations.substrate import GateMetricCondition, GatePolicyDef - - -def test_composite_gate_returns_pass_when_all_conditions_hold(): - policy = GatePolicyDef( - pass_all=( - GateMetricCondition(metric_name="score", op=">=", threshold=0.9), - GateMetricCondition(metric_name="latency", op="<=", threshold=5.0), - ) - ) - - decision = policy.evaluate({"score": 0.95, "latency": 4.2}) - assert decision.status == "pass" - - -def test_composite_gate_returns_needs_approval_when_approval_conditions_hold(): - policy = GatePolicyDef( - pass_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.9),), - approval_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.75),), - ) - - decision = policy.evaluate({"score": 0.8}) - assert decision.status == "needs_approval" - - -def test_legacy_threshold_gate_lowers_to_structured_policy(): - policy = 
GatePolicyDef(metric_name="score", pass_threshold=0.9, approval_threshold=0.8) - decision = policy.evaluate({"score": 0.85}) - assert decision.status == "needs_approval" - - -@pytest.mark.parametrize( - ("op", "threshold", "value"), - [ - (">", 0.9, 0.91), - ("<", 0.9, 0.89), - (">=", 0.9, 0.9), - ("<=", 0.9, 0.9), - ("==", "approved", "approved"), - ("!=", "blocked", "approved"), - ], -) -def test_gate_metric_condition_supports_all_declared_operators(op, threshold, value): - policy = GatePolicyDef( - pass_all=(GateMetricCondition(metric_name="metric", op=op, threshold=threshold),) - ) - - assert policy.evaluate({"metric": value}).status == "pass" -``` - -- [ ] **Step 2: Run gate tests to confirm failure** - -Run: `pytest tests/evaluations/test_evaluation_substrate.py -q` -Expected: FAIL because structured gate condition types do not exist yet. - -- [ ] **Step 3: Add structured gate condition objects and compatibility lowering** - -```python -# aworld/evaluations/substrate.py -@dataclass(frozen=True) -class GateMetricCondition: - metric_name: str - op: str - threshold: float | int | str | bool - - def matches(self, metrics: Mapping[str, Any]) -> bool: - value = metrics[self.metric_name] - if self.op == ">=": - return float(value) >= float(self.threshold) - if self.op == "<=": - return float(value) <= float(self.threshold) - if self.op == ">": - return float(value) > float(self.threshold) - if self.op == "<": - return float(value) < float(self.threshold) - if self.op == "==": - return value == self.threshold - if self.op == "!=": - return value != self.threshold - raise ValueError(f"unsupported gate operator: {self.op}") - - -@dataclass(frozen=True) -class GatePolicyDef: - metric_name: str | None = None - pass_threshold: float | None = None - approval_threshold: float | None = None - pass_all: tuple[GateMetricCondition, ...] = tuple() - approval_all: tuple[GateMetricCondition, ...] 
= tuple() - - def normalized_conditions(self) -> tuple[tuple[GateMetricCondition, ...], tuple[GateMetricCondition, ...]]: - pass_all = self.pass_all - approval_all = self.approval_all - if not pass_all and self.metric_name is not None and self.pass_threshold is not None: - pass_all = (GateMetricCondition(metric_name=self.metric_name, op=">=", threshold=self.pass_threshold),) - if not approval_all and self.metric_name is not None and self.approval_threshold is not None: - approval_all = (GateMetricCondition(metric_name=self.metric_name, op=">=", threshold=self.approval_threshold),) - return pass_all, approval_all -``` - -Gate evaluation should collect every metric referenced by normalized pass/approval conditions. Missing metrics should raise a clear `KeyError` naming the metric, and unsupported operators should raise `ValueError`. - -- [ ] **Step 4: Reflect the richer gate structure into manifests and report payloads** - -```python -# aworld/evaluations/manifests.py -"gate_policy": { - "type": "object", - "properties": { - "metric_name": {"type": "string"}, - "pass_threshold": {"type": "number"}, - "approval_threshold": {"type": ["number", "null"]}, - "pass_all": {"type": "array"}, - "approval_all": {"type": "array"}, - }, -} -``` - -```python -# aworld/evaluations/report.py -"gateDecision": { - "type": "object", - "required": ["status", "metric_name", "value"], - "properties": { - "status": {"type": "string", "enum": ["pass", "fail", "needs_approval"]}, - "metric_name": {"type": ["string", "null"]}, - "value": {"type": ["number", "null"]}, - "matched_conditions": {"type": "array"}, - "failed_conditions": {"type": "array"}, - }, -} -``` - -- [ ] **Step 5: Run gate and report-contract tests until green** - -Run: `pytest tests/evaluations/test_evaluation_substrate.py tests/docs/test_evaluator_report_docs.py -q` -Expected: PASS, including both composite-gate and legacy-threshold cases. 
- -- [ ] **Step 6: Commit the composite-gate slice** - -```bash -git add aworld/evaluations/substrate.py aworld/evaluations/manifests.py aworld/evaluations/report.py tests/evaluations/test_evaluation_substrate.py tests/docs/test_evaluator_report_docs.py -git commit -m "feat: add composite evaluator gate policies" -``` - -### Task 4: Add trajectory scorer declarations, migrate builtin suites, document the new substrate, and run full verification - -**Files:** -- Modify: `aworld/evaluations/substrate.py` -- Modify: `aworld/evaluations/README.md` -- Modify: `openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/tasks.md` -- Test: `tests/evaluations/test_execution_state.py` -- Test: `tests/evaluations/test_execution_adapters.py` -- Test: `tests/evaluations/test_evaluation_substrate.py` -- Test: `tests/core/test_evaluator_runtime.py` -- Test: `tests/core/test_evaluator_top_level_command.py` -- Test: `tests/plugins/test_plugin_hooks.py` -- Test: `tests/test_plugin_cli_entrypoint.py` -- Test: `tests/docs/test_evaluator_report_docs.py` - -- [ ] **Step 1: Add suite-declared trajectory scorer lowering** - -```python -# aworld/evaluations/substrate.py -@dataclass(frozen=True) -class TrajectoryScorerDef: - metric_name: str - scorer_class: str | None = None - threshold: float = 0.0 - scorer_params: dict[str, Any] = field(default_factory=dict) - - -@dataclass(frozen=True) -class EvalSuiteDef: - # ... existing fields ... - trajectory_scorers: tuple[TrajectoryScorerDef, ...] = tuple() -``` - -`compile_evaluation_flow()` should append one `EvalCriteria` per trajectory scorer after the suite judge criterion. This reuses the existing scorer registry and report metric shape instead of creating a separate trajectory report path. 
- -- [ ] **Step 2: Add or migrate a builtin suite to exercise the new contracts end to end** - -```python -# aworld/evaluations/substrate.py -class AppEvaluatorJudgeOutput(BaseModel): - score: float - rank: int - criticism: str - praise: str - improvement_advice: str - - -def get_builtin_eval_suite(suite_id: str) -> EvalSuiteDef: - if suite_id == "app-evaluator": - return EvalSuiteDef( - suite_id="app-evaluator", - judge=_app_evaluator_judge, - judge_schema=JudgeSchemaDef(output_model=AppEvaluatorJudgeOutput), - gate_policy=GatePolicyDef( - pass_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.85),), - approval_all=(GateMetricCondition(metric_name="score", op=">=", threshold=0.7),), - ), - metadata={"builtin": True, "preferred_backend": "callable"}, - ) -``` - -- [ ] **Step 3: Update framework documentation and task checklist after implementation** - -```md - -- `program`: execute an importable callable through the evaluator adapter layer -- typed judge schemas: Pydantic-backed validation with JSON schema export -- composite gates: structured conditions with compatibility for threshold-style suites -- trajectory scorers: suite-declared process metrics that lower into normal evaluator criteria -``` - -```md - -- [x] 1.0 Add a lightweight `EvalHarnessDef` boundary and compatibility lowering from direct `suite.execution`. -- [x] 1.1 Add a `PROGRAM` execution mode to the framework-owned evaluation execution model. -- [x] 1.2 Introduce an internal execution adapter boundary under `aworld/evaluations/` for static, agent, task, and program-backed execution. -- [x] 2.1 Add typed judge-output model support as the primary suite-backed validation contract. -- [x] 3.1 Expand gate definitions from single-threshold checks to structured composite metric conditions. -- [x] 4.1 Add suite-declared trajectory scorer definitions that lower into normal evaluator criteria. 
-``` - -- [ ] **Step 4: Run the full evaluator regression suite** - -Run: `pytest tests/evaluations/test_execution_state.py tests/evaluations/test_execution_adapters.py tests/evaluations/test_evaluation_substrate.py tests/core/test_evaluator_runtime.py tests/core/test_evaluator_top_level_command.py tests/plugins/test_plugin_hooks.py tests/test_plugin_cli_entrypoint.py tests/docs/test_evaluator_report_docs.py -q` -Expected: PASS with all evaluator framework and CLI consumer tests green. - -- [ ] **Step 5: Validate the OpenSpec change after code and docs are aligned** - -Run: `openspec validate aworld-evaluator-v2-extensibility-2026-06-09 --strict` -Expected: `Change 'aworld-evaluator-v2-extensibility-2026-06-09' is valid` - -- [ ] **Step 6: Commit the migration and verification slice** - -```bash -git add aworld/evaluations/substrate.py aworld/evaluations/README.md openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/tasks.md tests/evaluations/test_execution_state.py tests/evaluations/test_execution_adapters.py tests/evaluations/test_evaluation_substrate.py tests/core/test_evaluator_runtime.py tests/core/test_evaluator_top_level_command.py tests/plugins/test_plugin_hooks.py tests/test_plugin_cli_entrypoint.py tests/docs/test_evaluator_report_docs.py -git commit -m "docs: finalize evaluator v2 extensibility rollout" -``` - -## Self-Review - -- Spec coverage: - - harness lowering, execution adapters, and `PROGRAM` mode -> Task 1 - - typed judge-output contracts with legacy compatibility -> Task 2 - - composite gate policies with threshold lowering -> Task 3 - - trajectory scorer declarations, verification, and spec alignment -> Task 4 -- Placeholder scan: - - no `TODO`, `TBD`, or "similar to previous task" shortcuts remain - - remaining `...` tokens only appear inside Python variadic tuple type annotations, not as placeholders - - each code-changing step contains concrete file paths and code snippets -- Type consistency: - - `EvalExecutionMode.PROGRAM`, 
`ExecutionAdapter`, `JudgeSchemaDef.validate_payload`, `GateMetricCondition`, and `GatePolicyDef` names are used consistently across tasks diff --git a/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/proposal.md b/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/proposal.md deleted file mode 100644 index 2c1d275a1..000000000 --- a/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/proposal.md +++ /dev/null @@ -1,34 +0,0 @@ -## Why - -`aworld-evaluation-substrate-2026-06-01` established the first execution-backed evaluator substrate and completed the v1 CLI flow, but it intentionally stopped short of several extensibility and contract-hardening steps: - -- suite execution modes are still limited to `static`, `agent`, and `task` -- execution targets still couple directly to current runner entrypoints -- judge schemas are still lightweight required-field checks -- gate policies still only express single-metric threshold decisions - -Those tradeoffs were acceptable for v1, but they limit AWorld's ability to expose evaluation as a broader framework capability for non-agent programs, stricter automation, and richer reusable evaluator definitions. - -This change is an incremental hardening of the v1 single-shot evaluator substrate. It is not intended to claim verifiers v1 parity: multi-turn rollout ownership, user simulators, lifecycle hooks, child-state composition, and training reward semantics remain out of scope for a later runtime-composition change. - -## What Changes - -- Add a lightweight first-class `EvalHarnessDef` so suites have an explicit execution boundary in the suite/case/harness/state hierarchy without adopting a full rollout-owning harness object model. -- Extend the framework-owned evaluation substrate with a bounded `PROGRAM` execution mode for importable program-backed evaluators that do not use AWorld's agent/task runtime. 
-- Add an internal execution adapter layer under `aworld/evaluations/` so suite-backed evaluation no longer hardcodes runner invocation details into eval targets. -- Promote judge output contracts from required-field-only validation to typed model validation with JSON-schema-friendly structure and a compatibility bridge for existing suites. -- Expand gate policies from single-threshold checks into structured composite conditions with explicit comparison operators while preserving the current simple threshold shape as compatibility sugar. -- Add suite-declared trajectory scorers so result evaluation and normalized trajectory/process metric evaluation can be configured side by side in the current single-shot flow. -- Keep `aworld-cli evaluator` compatible as an assembly layer on top of the evolved framework substrate rather than introducing a second evaluator stack. - -## Capabilities - -### Modified Capabilities - -- `evaluation-substrate`: add adapter-backed program execution, typed judge schemas, and richer composite gate policies without breaking the v1 evaluator substrate shape. - -## Impact - -- Affected code: `aworld/evaluations/**`, especially execution specification, substrate compilation, eval target execution, judge validation, and gate evaluation paths. -- Affected APIs: internal evaluation composition APIs gain additive extensions; existing suite-backed and legacy evaluation callers remain valid through compatibility paths. -- Affected systems: framework-owned evaluator execution and scoring; `aworld-cli evaluator` should inherit the new framework capabilities without owning their semantics. 
diff --git a/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/specs/evaluation-substrate/spec.md b/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/specs/evaluation-substrate/spec.md deleted file mode 100644 index c4130959e..000000000 --- a/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/specs/evaluation-substrate/spec.md +++ /dev/null @@ -1,101 +0,0 @@ -## MODIFIED Requirements - -### Requirement: Execution-backed suite flows reuse framework evaluation primitives - -The framework SHALL support suite-backed evaluation flows that execute targets through existing AWorld runtime primitives and adapter-backed program execution while exposing reusable execution results for downstream scoring. - -#### Scenario: Suite-backed flow lowers execution through a reusable harness boundary -- **WHEN** a suite-backed evaluator declares execution directly or through an evaluator harness -- **THEN** the framework SHALL compile the flow through a single harness boundary that owns execution configuration and adapter selection while preserving the existing suite/case/state model - -#### Scenario: Suite-backed flow executes through current task or agent runtime -- **WHEN** a suite-backed evaluator is configured to execute through the existing AWorld agent or task runtime -- **THEN** the framework SHALL adapt the suite flow through framework-owned execution adapters instead of hardcoding runner invocation details across evaluator targets - -#### Scenario: Suite-backed flow executes through a program-backed runtime -- **WHEN** a suite-backed evaluator is configured with a program-backed execution reference -- **THEN** the framework SHALL execute that program through a framework-owned execution adapter and normalize the result into the common evaluator execution state - -#### Scenario: Program-backed runtime is bounded to importable callables -- **WHEN** a suite-backed evaluator declares program-backed execution -- **THEN** the framework SHALL 
require an importable callable reference and reject unsupported command, sandbox, workflow-engine, or missing-reference configuration for this change - -#### Scenario: Importable callable execution is trusted -- **WHEN** a suite-backed evaluator declares a program reference or task-builder reference -- **THEN** the framework SHALL treat that reference as trusted in-process code controlled by the runner or workspace owner and SHALL NOT expose it through declared JSON manifests in this change - -#### Scenario: Program-backed runtime returns supported output -- **WHEN** a program-backed evaluator returns an `EvalState`, an `EvalState`-shaped mapping, a `TaskResponse`, or a bare answer value -- **THEN** the framework SHALL normalize that output into the common evaluator execution state without storing live runtime handles in the state - -#### Scenario: Existing static suite execution remains available -- **WHEN** a suite-backed evaluator is defined without execution-backed target settings -- **THEN** the framework SHALL continue to support the current static evaluation path as a valid suite execution mode - -### Requirement: Schema-constrained judge outputs - -Suite-backed evaluation flows SHALL validate judge outputs against an explicit typed judge schema before final scoring and reporting are completed, while preserving compatibility for current lightweight schema definitions. 
- -#### Scenario: Judge output matches the declared typed schema -- **WHEN** a suite-backed evaluator returns a result that satisfies the declared typed judge-output model -- **THEN** the framework SHALL accept the result for downstream scoring, gating, and reporting - -#### Scenario: Judge output violates the declared typed schema -- **WHEN** a suite-backed evaluator returns a result that fails the declared typed judge-output model -- **THEN** the framework SHALL surface the typed schema violation as an evaluation failure or invalid result state rather than silently accepting malformed output - -#### Scenario: Legacy schema definitions remain valid during migration -- **WHEN** an existing suite-backed evaluator still uses the current lightweight required-field schema definition -- **THEN** the framework SHALL continue to validate that suite through a compatibility path without forcing immediate migration - -#### Scenario: Judge schema metadata is exposed once per report -- **WHEN** a suite-backed evaluator has a typed or compatibility judge schema -- **THEN** the framework SHALL expose the derived judge schema metadata at the report level rather than duplicating the schema in every case result - -### Requirement: First-class gate outcomes - -Suite-backed evaluation flows SHALL evaluate a declared structured gate policy and produce a gate outcome of `pass`, `fail`, or `needs_approval`. 
- -#### Scenario: Composite pass conditions succeed -- **WHEN** all required pass conditions in the structured gate policy are satisfied -- **THEN** the framework SHALL emit a `pass` gate outcome - -#### Scenario: Approval-stage conditions match -- **WHEN** pass conditions are not satisfied but the structured gate policy marks the result as eligible for human review -- **THEN** the framework SHALL emit a `needs_approval` gate outcome - -#### Scenario: Composite gate conditions fail without approval path -- **WHEN** required pass conditions are not satisfied and no approval-stage conditions apply -- **THEN** the framework SHALL emit a `fail` gate outcome - -#### Scenario: Legacy threshold gates remain valid -- **WHEN** an existing suite-backed evaluator uses the current single-threshold gate definition -- **THEN** the framework SHALL preserve that behavior through a compatibility lowering into the structured gate policy model - -#### Scenario: Gate conditions use explicit comparison operators -- **WHEN** a structured gate condition compares a metric to a threshold -- **THEN** the framework SHALL support `>=`, `<=`, `>`, `<`, `==`, and `!=` operators and surface unsupported operators as invalid gate configuration - -#### Scenario: Gate references a missing metric -- **WHEN** a structured gate condition references a metric that is not present in aggregate results -- **THEN** the framework SHALL fail the gate closed, include the missing condition in `failed_conditions`, and still return the completed case results and available metrics - -### Requirement: Suite-declared trajectory evaluation - -Suite-backed evaluation flows SHALL support normalized trajectory-level scoring alongside result-level judge scoring while preserving the common report metric shape. 
- -#### Scenario: Suite declares trajectory scorers -- **WHEN** a suite-backed evaluator declares trajectory scorer definitions -- **THEN** the framework SHALL lower those definitions into normal evaluator scoring criteria that inspect the normalized execution state trajectory - -#### Scenario: Trajectory evaluation remains single-shot in this change -- **WHEN** a suite-backed evaluator uses trajectory scorers in this change -- **THEN** the framework SHALL score the trajectory already captured in `EvalState` and SHALL NOT claim multi-turn rollout ownership, user simulation, lifecycle hooks, or step-level training reward semantics - -#### Scenario: Trajectory scorer results participate in gates and reports -- **WHEN** a trajectory scorer emits a metric result -- **THEN** the framework SHALL include that metric in case metrics, aggregate metrics, and structured gate evaluation without replacing the final-result judge score - -#### Scenario: Suite has no trajectory scorers -- **WHEN** a suite-backed evaluator omits trajectory scorer definitions -- **THEN** the framework SHALL preserve the current result-focused scoring behavior diff --git a/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/tasks.md b/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/tasks.md deleted file mode 100644 index 72a881572..000000000 --- a/openspec/changes/aworld-evaluator-v2-extensibility-2026-06-09/tasks.md +++ /dev/null @@ -1,37 +0,0 @@ -## 1. Execution Extensibility - -- [x] 1.0 Add a lightweight `EvalHarnessDef` boundary and compatibility lowering from direct `suite.execution`. -- [x] 1.1 Add a `PROGRAM` execution mode to the framework-owned evaluation execution model. -- [x] 1.2 Introduce an internal execution adapter boundary under `aworld/evaluations/` for static, agent, task, and program-backed execution. -- [x] 1.3 Keep existing `static`, `agent`, and `task` suite-backed flows working through compatibility paths. 
-- [x] 1.4 Normalize program-backed execution results into the same `EvalState` shape used by current execution-backed evaluation. -- [x] 1.5 Validate `PROGRAM` specs up front, including required importable `target_ref` and unsupported command/workflow forms. -- [x] 1.6 Keep importable callable execution as a trusted in-memory framework contract and reject executable refs in declared JSON manifests. - -## 2. Typed Judge Contracts - -- [x] 2.1 Add typed judge-output model support as the primary suite-backed validation contract. -- [x] 2.2 Preserve compatibility for current required-field-based judge schema definitions during migration. -- [x] 2.3 Expose judge-model-derived schema metadata once at the report level for docs or downstream tooling. - -## 3. Composite Gate Policies - -- [x] 3.1 Expand gate definitions from single-threshold checks to structured composite metric conditions. -- [x] 3.2 Support `pass`, `fail`, and `needs_approval` outcomes from composite gate evaluation. -- [x] 3.3 Keep current threshold-style gate definitions valid as compatibility sugar over the richer gate model. -- [x] 3.4 Support `>=`, `<=`, `>`, `<`, `==`, and `!=` gate operators. -- [x] 3.5 Fail structured gates closed when a condition references a missing metric while preserving the completed report payload. - -## 4. Trajectory Evaluation - -- [x] 4.1 Add suite-declared trajectory scorer definitions that lower into normal evaluator criteria. -- [x] 4.2 Keep existing trajectory scorer/extractor behavior and report metric shapes compatible. -- [x] 4.3 Add coverage for trajectory metrics participating in reports and gate evaluation. - -## 5. Verification - -- [x] 5.1 Add regression coverage for adapter-backed execution across static, agent, task, and program-backed suites. -- [x] 5.2 Add coverage for typed judge validation success, failure, and legacy compatibility paths. 
-- [x] 5.3 Add coverage for composite gate evaluation, all supported operators, missing metrics, and legacy threshold compatibility. -- [x] 5.4 Add error-path coverage for program exceptions and malformed program output where applicable. -- [x] 5.5 Validate the OpenSpec change and keep it aligned with the framework-owned evaluator scope.