From b210605b436b0c8800cd771d499fe6335684bb92 Mon Sep 17 00:00:00 2001 From: "wuman.wyf" Date: Thu, 11 Jun 2026 10:41:18 +0800 Subject: [PATCH] Support named evaluator judge backends --- .../src/aworld_cli/commands/evaluation_cmd.py | 18 +- .../src/aworld_cli/evaluator_runtime.py | 223 ++++++++++++++++-- .../top_level_commands/evaluator_cmd.py | 15 +- docs/AWorld CLI/Commands/Evaluator.md | 26 +- tests/core/test_evaluator_runtime.py | 144 ++++++++++- .../core/test_evaluator_top_level_command.py | 132 ++++++++++- tests/docs/test_evaluator_report_docs.py | 3 + .../test_answer_quality_judge_fixtures.py | 67 ++++++ .../agents/answer_quality_judge.py | 42 ++++ .../evaluator_judges/answer_quality_agent.md | 116 +++++++++ .../answer_quality_backend.py | 62 +++++ tests/test_slash_commands.py | 70 ++++++ 12 files changed, 881 insertions(+), 37 deletions(-) create mode 100644 tests/evaluations/test_answer_quality_judge_fixtures.py create mode 100644 tests/fixtures/evaluator_judges/agents/answer_quality_judge.py create mode 100644 tests/fixtures/evaluator_judges/answer_quality_agent.md create mode 100644 tests/fixtures/evaluator_judges/answer_quality_backend.py diff --git a/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py b/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py index 74e8cb9ec..59c384cf7 100644 --- a/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py +++ b/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py @@ -15,14 +15,14 @@ def _usage() -> str: return """Usage: /evaluation --input --kind task --judge-agent [--agent ] [--out-dir ] - /evaluation --input --kind answer --judge-agent [--out-dir ] - /evaluation --input --kind trajectory --judge-agent [--agent ] [--out-dir ] + /evaluation --input --kind answer --judge-agent-name [--out-dir ] + /evaluation --input --kind trajectory --judge-backend-ref [--agent ] [--out-dir ] /evaluation --input --kind trajectory --task-id --judge-agent [--out-dir ] Examples: /evaluation --input ./tasks.jsonl --kind task --judge-agent ./judge_agents/answer_judge.md - /evaluation --input ./task_answers.jsonl --kind answer --judge-agent ./judge_agents/answer_judge.md - /evaluation --input ./tasks.jsonl --kind trajectory --judge-agent ./judge_agents/trajectory_judge.md + /evaluation --input ./task_answers.jsonl --kind answer --judge-agent-name JudgeTeam + /evaluation --input ./tasks.jsonl --kind trajectory --judge-backend-ref my_eval.judges:build_backend /evaluation --input ~/Documents/logs/trajectory.log --kind trajectory --task-id task_123 --judge-agent ./judge_agents/trajectory_judge.md """ @@ -31,7 +31,9 @@ def _build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(prog="/evaluation", add_help=False) parser.add_argument("--input", required=True) parser.add_argument("--kind", required=True) - parser.add_argument("--judge-agent", required=True) + parser.add_argument("--judge-agent") + parser.add_argument("--judge-agent-name") + parser.add_argument("--judge-backend-ref") parser.add_argument("--out-dir") parser.add_argument("--output") parser.add_argument("--task-id") @@ -88,12 +90,18 @@ async def execute(self, context: CommandContext) -> str: if args.help: return _usage() + judge_selectors = (args.judge_agent, args.judge_agent_name, args.judge_backend_ref) + if sum(1 for value in judge_selectors if value) != 1: + return "Evaluator error: exactly one of --judge-agent, --judge-agent-name, or --judge-backend-ref is required\n\n" + _usage() + try: report = await asyncio.to_thread( run_evaluator_source_cli, input=args.input, kind=args.kind, judge_agent=args.judge_agent, + judge_agent_name=args.judge_agent_name, + judge_backend_ref=args.judge_backend_ref, out_dir=args.out_dir, output=args.output, task_id=args.task_id, diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py index 863381b00..80f512721 100644 --- a/aworld-cli/src/aworld_cli/evaluator_runtime.py +++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py @@ -2,6 +2,8 @@ import asyncio import builtins +import importlib +import inspect import json import time from pathlib import Path @@ -20,9 +22,11 @@ ) from aworld.evaluations.substrate import ( AgentJudgeBackend, + CallableJudgeBackend, EvaluationFlowDef, GateMetricCondition, GatePolicyDef, + JudgeBackend, JudgeSchemaDef, StateCheckGrader, describe_eval_target, @@ -165,9 +169,9 @@ def _run_evaluator_hooks( - `evaluator.pre_discover` event payload: `target`, `workspace_path` - `evaluator.post_discover` event payload: `target`, `workspace_path`, `suite_names` - `evaluator.pre_run` event payload for target mode: `mode=target`, `target`, `suite`, `workspace_path` - - `evaluator.pre_run` event payload for source mode: `mode=source`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path` + - `evaluator.pre_run` event payload for source mode: `mode=source`, `input`, `kind`, `task_id`, judge selector fields, `agent`, `workspace_path`, `output_path` - `evaluator.post_run` event payload for target mode: `mode=target`, `report`, `target`, `suite`, `workspace_path` - - `evaluator.post_run` event payload for source mode: `mode=source`, `report`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path` + - `evaluator.post_run` event payload for source mode: `mode=source`, `report`, `input`, `kind`, `task_id`, judge selector fields, `agent`, `workspace_path`, `output_path` - `evaluator.render_summary` event payload: `report`, `workspace_path` - mutable state: lightweight CLI assembly metadata only - allowed side effects: report upload, notifications, summary augmentation @@ -253,6 +257,131 @@ def _case_source_metadata(case) -> dict[str, Any]: return {} +def _judge_selector_count( + *, + judge_agent: str | None, + judge_agent_name: str | None, + judge_backend_ref: str | None, +) -> int: + return sum( + 1 + for value in (judge_agent, judge_agent_name, judge_backend_ref) + if value is not None and str(value).strip() + ) + + +def _validate_judge_selectors( + *, + judge_agent: str | None, + judge_agent_name: str | None, + judge_backend_ref: str | None, +) -> None: + if _judge_selector_count( + judge_agent=judge_agent, + judge_agent_name=judge_agent_name, + judge_backend_ref=judge_backend_ref, + ) != 1: + raise ValueError("exactly one judge selector is required: --judge-agent, --judge-agent-name, or --judge-backend-ref") + + +def _load_ref(ref: str) -> Any: + module_name, separator, attr_path = ref.partition(":") + if not separator or not module_name or not attr_path: + raise ValueError(f"judge backend ref must use module:callable format: {ref}") + module = importlib.import_module(module_name) + value: Any = module + for attr in attr_path.split("."): + if not attr: + raise ValueError(f"judge backend ref has an empty attribute segment: {ref}") + value = getattr(value, attr) + return value + + +def _can_call_without_arguments(value: Any) -> bool: + try: + signature = inspect.signature(value) + except (TypeError, ValueError): + return False + for parameter in signature.parameters.values(): + if parameter.kind in (parameter.VAR_POSITIONAL, parameter.VAR_KEYWORD): + continue + if parameter.default is parameter.empty: + return False + return True + + +def _coerce_source_judge_backend(value: Any, *, backend_id: str) -> JudgeBackend: + if hasattr(value, "execute"): + return value + if callable(value): + return CallableJudgeBackend(backend_id=backend_id, judge=value) + raise ValueError("judge backend ref must resolve to a JudgeBackend-compatible object or callable") + + +def _load_source_judge_backend_ref(ref: str) -> JudgeBackend: + value = _load_ref(ref) + if hasattr(value, "execute"): + return value + if callable(value) and _can_call_without_arguments(value): + produced = value() + if inspect.isawaitable(produced): + raise ValueError("judge backend ref factory must be synchronous") + return _coerce_source_judge_backend(produced, backend_id=f"judge-backend-ref:{ref}") + return _coerce_source_judge_backend(value, backend_id=f"judge-backend-ref:{ref}") + + +def _build_cli_agent_judge_backend(*, agent_name: str, backend_id: str, prompt_builder): + executor_cache: dict[str, Any] = {} + + async def _executor(prompt, system_prompt): + if isinstance(prompt, tuple): + raise ValueError("CLI agent judge backend only supports text prompts") + executor = executor_cache.get("executor") + if executor is None: + executor = await _load_cli_agent_executor(agent_name) + executor_cache["executor"] = executor + swarm = getattr(executor, "swarm", None) + if swarm is not None: + response = await Runners.run(input=str(prompt), swarm=swarm) + else: + response = await executor.chat(str(prompt)) + return str(getattr(response, "answer", response)) + + return AgentJudgeBackend( + backend_id=backend_id, + system_prompt=f"CLI agent judge loaded from {agent_name}", + executor=_executor, + prompt_builder=prompt_builder, + ) + + +def _resolve_source_judge_backend( + *, + judge_agent_path: Path | None, + judge_agent_name: str | None, + judge_backend_ref: str | None, + file_backend_id: str, + named_backend_prefix: str, + prompt_builder, +) -> JudgeBackend: + if judge_agent_path is not None: + return AgentJudgeBackend.from_agent_markdown( + judge_agent_path, + backend_id=file_backend_id, + prompt_builder=prompt_builder, + ) + if judge_agent_name is not None and str(judge_agent_name).strip(): + resolved_name = str(judge_agent_name).strip() + return _build_cli_agent_judge_backend( + agent_name=resolved_name, + backend_id=f"{named_backend_prefix}:{resolved_name}", + prompt_builder=prompt_builder, + ) + if judge_backend_ref is not None and str(judge_backend_ref).strip(): + return _load_source_judge_backend_ref(str(judge_backend_ref).strip()) + raise ValueError("exactly one judge selector is required: --judge-agent, --judge-agent-name, or --judge-backend-ref") + + class _CliAgentRuntimeHarness: def __init__(self, *, agent_name: str): self.agent_name = agent_name @@ -440,7 +569,9 @@ def _build_source_suite( *, kind: str, input_path: Path, - judge_agent_path: Path, + judge_agent_path: Path | None, + judge_agent_name: str | None = None, + judge_backend_ref: str | None = None, task_id: str | None, id_field: str, task_field: str, @@ -486,15 +617,19 @@ def _build_source_suite( id_field=id_field, input_field=task_field, ) + judge_backend = _resolve_source_judge_backend( + judge_agent_path=judge_agent_path, + judge_agent_name=judge_agent_name, + judge_backend_ref=judge_backend_ref, + file_backend_id="source-agent-md", + named_backend_prefix="source-agent", + prompt_builder=_build_source_prompt, + ) return create_source_eval_suite( suite_id="task-source-evaluator", source=source, runtime_harness=_build_cli_agent_runtime_harness(agent_name=agent_name), - judge_backend=AgentJudgeBackend.from_agent_markdown( - judge_agent_path, - backend_id="source-agent-md", - prompt_builder=_build_source_prompt, - ), + judge_backend=judge_backend, judge_schema=JudgeSchemaDef(output_model=_SourceJudgeOutput), gate_policy=answer_gate, metadata={"agent": agent_name}, @@ -507,14 +642,18 @@ def _build_source_suite( input_field=task_field, answer_field=answer_field, ) + judge_backend = _resolve_source_judge_backend( + judge_agent_path=judge_agent_path, + judge_agent_name=judge_agent_name, + judge_backend_ref=judge_backend_ref, + file_backend_id="source-agent-md", + named_backend_prefix="source-agent", + prompt_builder=_build_source_prompt, + ) return create_source_eval_suite( suite_id="answer-source-evaluator", source=source, - judge_backend=AgentJudgeBackend.from_agent_markdown( - judge_agent_path, - backend_id="source-agent-md", - prompt_builder=_build_source_prompt, - ), + judge_backend=judge_backend, judge_schema=JudgeSchemaDef(output_model=_SourceJudgeOutput), gate_policy=answer_gate, ) @@ -534,15 +673,19 @@ def _build_source_suite( input_field=task_field, ) runtime_harness = _build_cli_agent_runtime_harness(agent_name=agent_name) + judge_backend = _resolve_source_judge_backend( + judge_agent_path=judge_agent_path, + judge_agent_name=judge_agent_name, + judge_backend_ref=judge_backend_ref, + file_backend_id="trajectory-evaluator-agent-md", + named_backend_prefix="trajectory-evaluator-agent", + prompt_builder=_build_trajectory_prompt, + ) return create_source_eval_suite( suite_id="trajectory-source-evaluator", source=source, runtime_harness=runtime_harness, - judge_backend=AgentJudgeBackend.from_agent_markdown( - judge_agent_path, - backend_id="trajectory-evaluator-agent-md", - prompt_builder=_build_trajectory_prompt, - ), + judge_backend=judge_backend, judge_schema=TrajectoryJudgeSchema.default(), outcome_scorers=trajectory_outcome_scorers, gate_policy=trajectory_gate, @@ -556,7 +699,9 @@ def run_evaluator_source_cli( *, input: str, kind: str, - judge_agent: str, + judge_agent: str | None = None, + judge_agent_name: str | None = None, + judge_backend_ref: str | None = None, out_dir: str | None = None, output: str | None = None, task_id: str | None = None, @@ -571,8 +716,13 @@ def run_evaluator_source_cli( input_path = Path(input).expanduser().resolve() if not input_path.exists(): raise FileNotFoundError(f"source input does not exist: {input_path}") - judge_agent_path = Path(judge_agent).expanduser().resolve() - if not judge_agent_path.exists(): + _validate_judge_selectors( + judge_agent=judge_agent, + judge_agent_name=judge_agent_name, + judge_backend_ref=judge_backend_ref, + ) + judge_agent_path = Path(judge_agent).expanduser().resolve() if judge_agent else None + if judge_agent_path is not None and not judge_agent_path.exists(): raise FileNotFoundError(f"judge agent does not exist: {judge_agent_path}") workspace_path = str(input_path.parent if input_path.is_file() else input_path) @@ -581,7 +731,9 @@ def run_evaluator_source_cli( "input": str(input_path), "kind": kind, "task_id": task_id, - "judge_agent": str(judge_agent_path), + "judge_agent": str(judge_agent_path) if judge_agent_path is not None else None, + "judge_agent_name": judge_agent_name, + "judge_backend_ref": judge_backend_ref, "agent": agent, "workspace_path": workspace_path, "output_path": str(Path(output).expanduser().resolve()) if output else None, @@ -595,7 +747,9 @@ def run_evaluator_source_cli( "input": str(input_path), "kind": kind, "task_id": task_id, - "judge_agent": str(judge_agent_path), + "judge_agent": str(judge_agent_path) if judge_agent_path is not None else None, + "judge_agent_name": judge_agent_name, + "judge_backend_ref": judge_backend_ref, "agent": agent, "interactive_approval": interactive_approval, }, @@ -604,6 +758,8 @@ def run_evaluator_source_cli( kind=kind, input_path=input_path, judge_agent_path=judge_agent_path, + judge_agent_name=judge_agent_name, + judge_backend_ref=judge_backend_ref, task_id=task_id, id_field=id_field, task_field=task_field, @@ -618,11 +774,24 @@ def run_evaluator_source_cli( "target_path": str(input_path), "source_kind": kind, "task_id": task_id, - "judge_agent": str(judge_agent_path), + "judge_agent": str(judge_agent_path) if judge_agent_path is not None else None, + "judge_agent_name": judge_agent_name, + "judge_backend_ref": judge_backend_ref, "agent": agent_name if executes_agent else agent, } for key, value in hook_state.items(): - if key not in {"mode", "input", "kind", "task_id", "judge_agent", "agent", "interactive_approval", "summary_suffix"}: + if key not in { + "mode", + "input", + "kind", + "task_id", + "judge_agent", + "judge_agent_name", + "judge_backend_ref", + "agent", + "interactive_approval", + "summary_suffix", + }: target_info[key] = value flow = EvaluationFlowDef( target=target_info, @@ -647,7 +816,9 @@ def run_evaluator_source_cli( "input": str(input_path), "kind": kind, "task_id": task_id, - "judge_agent": str(judge_agent_path), + "judge_agent": str(judge_agent_path) if judge_agent_path is not None else None, + "judge_agent_name": judge_agent_name, + "judge_backend_ref": judge_backend_ref, "agent": agent_name if executes_agent else agent, } report["automation"] = _build_automation_summary(report) diff --git a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py index 3d90f5186..e6fdfb365 100644 --- a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py +++ b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py @@ -45,6 +45,8 @@ def register_parser(self, subparsers) -> None: parser.add_argument("--input", type=str) parser.add_argument("--kind", type=str) parser.add_argument("--judge-agent", type=str) + parser.add_argument("--judge-agent-name", type=str) + parser.add_argument("--judge-backend-ref", type=str) parser.add_argument("--out-dir", type=str) parser.add_argument("--task-id", type=str) parser.add_argument("--agent", type=str) @@ -68,14 +70,21 @@ def run(self, args, context) -> int: if not getattr(args, "kind", None): print("Evaluator error: --kind is required with --input") return 1 - if not getattr(args, "judge_agent", None): - print("Evaluator error: --judge-agent is required with --input") + judge_selectors = [ + getattr(args, "judge_agent", None), + getattr(args, "judge_agent_name", None), + getattr(args, "judge_backend_ref", None), + ] + if sum(1 for value in judge_selectors if value) != 1: + print("Evaluator error: exactly one of --judge-agent, --judge-agent-name, or --judge-backend-ref is required with --input") return 1 try: report = run_evaluator_source_cli( input=args.input, kind=args.kind, judge_agent=args.judge_agent, + judge_agent_name=args.judge_agent_name, + judge_backend_ref=args.judge_backend_ref, out_dir=args.out_dir, output=args.output, task_id=args.task_id, @@ -94,6 +103,8 @@ def run(self, args, context) -> int: source_only_args = ( ("kind", "--kind"), ("judge_agent", "--judge-agent"), + ("judge_agent_name", "--judge-agent-name"), + ("judge_backend_ref", "--judge-backend-ref"), ("out_dir", "--out-dir"), ("task_id", "--task-id"), ("agent", "--agent"), diff --git a/docs/AWorld CLI/Commands/Evaluator.md b/docs/AWorld CLI/Commands/Evaluator.md index 5d86f73c9..2467bce98 100644 --- a/docs/AWorld CLI/Commands/Evaluator.md +++ b/docs/AWorld CLI/Commands/Evaluator.md @@ -45,6 +45,18 @@ aworld-cli evaluator \ --judge-agent ./judge_agents/answer_judge.md \ --out-dir ./reports +aworld-cli evaluator \ + --input ./task_answers.jsonl \ + --kind answer \ + --judge-agent-name answer-quality-judge \ + --out-dir ./reports + +aworld-cli evaluator \ + --input ./task_answers.jsonl \ + --kind answer \ + --judge-backend-ref my_eval.judges:build_backend \ + --out-dir ./reports + aworld-cli evaluator \ --input ~/Documents/logs/trajectory.log \ --kind trajectory \ @@ -67,6 +79,14 @@ aworld-cli evaluator \ For `task` JSONL inputs, the default fields are `id` and `input`; the evaluator runs each task through the CLI default `Aworld` agent unless `--agent` is supplied. For `trajectory`, passing `--task-id` replays one task from an existing AWorld trajectory log, omitting `--task-id` with a trajectory log replays all tasks in that log, and omitting `--task-id` with task JSONL runs the main agent, extracts the response trajectory, and evaluates that generated trajectory. For `answer` JSONL inputs, the default fields are `id`, `input`, and `answer`. Use `--id-field`, `--task-field`, and `--answer-field` only when the file uses different names. +Source-backed runs require exactly one judge selector: + +- `--judge-agent ` loads a markdown judge prompt directly. +- `--judge-agent-name ` loads an AWorld CLI registered local agent or team by name and runs the judge prompt through that agent's executor. Use `--agent-dir` in commands that support it, or set `LOCAL_AGENTS_DIR`, when the judge agent is not in the default agent search path. +- `--judge-backend-ref ` imports a process-local Python factory or callable. A factory may return a `JudgeBackend`-compatible object such as `AgentJudgeBackend` / `CallableJudgeBackend`; a raw callable is wrapped as a callable judge backend. + +The three judge selectors are mutually exclusive. They are CLI assembly choices and are not serialized into suite manifests as live handles. + Useful options: ```bash @@ -75,6 +95,8 @@ aworld-cli evaluator --target ./artifact --interactive-approval aworld-cli evaluator --input ./tasks.jsonl --kind task --judge-agent ./agent.md --agent Aworld --output ./report.json aworld-cli evaluator --input ./tasks.jsonl --kind trajectory --judge-agent ./trajectory_agent.md --output ./report.json aworld-cli evaluator --input ./task_answers.jsonl --kind answer --judge-agent ./agent.md --output ./report.json +LOCAL_AGENTS_DIR=./judge_agents aworld-cli evaluator --input ./task_answers.jsonl --kind answer --judge-agent-name answer-quality-judge +aworld-cli evaluator --input ./task_answers.jsonl --kind answer --judge-backend-ref my_eval.judges:build_backend ``` ## Declared Suite Manifests @@ -133,9 +155,9 @@ Current event payloads: - `evaluator.pre_discover`: `target`, `workspace_path` - `evaluator.post_discover`: `target`, `workspace_path`, `suite_names` - `evaluator.pre_run` for target mode: `mode`, `target`, `suite`, `workspace_path` -- `evaluator.pre_run` for source mode: `mode`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path` +- `evaluator.pre_run` for source mode: `mode`, `input`, `kind`, `task_id`, `judge_agent`, `judge_agent_name`, `judge_backend_ref`, `agent`, `workspace_path`, `output_path` - `evaluator.post_run` for target mode: `mode`, `report`, `target`, `suite`, `workspace_path` -- `evaluator.post_run` for source mode: `mode`, `report`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path` +- `evaluator.post_run` for source mode: `mode`, `report`, `input`, `kind`, `task_id`, `judge_agent`, `judge_agent_name`, `judge_backend_ref`, `agent`, `workspace_path`, `output_path` - `evaluator.render_summary`: `report`, `workspace_path` Hook boundaries: diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py index 65ffd1a8c..59ac8a159 100644 --- a/tests/core/test_evaluator_runtime.py +++ b/tests/core/test_evaluator_runtime.py @@ -29,6 +29,10 @@ from aworld_cli.evaluator_rendering import render_evaluator_summary +def _write_answer_source(path: Path) -> None: + path.write_text('{"id":"case-1","input":"question","answer":"existing"}\n', encoding="utf-8") + + @pytest.fixture(autouse=True) def _reset_eval_registry_state(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {}) @@ -83,7 +87,7 @@ def test_run_evaluator_source_cli_builds_task_answer_flow_with_default_fields( tmp_path: Path, ) -> None: input_path = tmp_path / "answers.jsonl" - input_path.write_text('{"id":"case-1","input":"question","answer":"existing"}\n', encoding="utf-8") + _write_answer_source(input_path) judge_agent = tmp_path / "agent.md" judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") output = tmp_path / "report.json" @@ -121,6 +125,144 @@ async def fake_run_evaluation_flow(flow): assert output.exists() +def test_run_evaluator_source_cli_supports_cli_judge_agent_name( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "answers.jsonl" + _write_answer_source(input_path) + captured = {} + + class FakeExecutor: + async def chat(self, prompt): + captured["prompt"] = prompt + return '{"score": 91, "verdict": "Pass", "veto_triggered": false}' + + async def fake_load_cli_agent_executor(agent_name): + captured["agent_name"] = agent_name + return FakeExecutor() + + monkeypatch.setattr( + "aworld_cli.evaluator_runtime._load_cli_agent_executor", + fake_load_cli_agent_executor, + ) + + async def fake_run_evaluation_flow(flow): + captured["flow"] = flow + execution = await flow.suite.judge_backend.execute( + flow.suite.cases[0].input, + {"answer": "existing"}, + flow.suite, + ) + return { + "report_version": 1, + "suite_id": "answer-source-evaluator", + "judge_backend": {"backend_id": execution.backend_id}, + "summary": {"answer-source-evaluator": {"score": {"mean": execution.payload["score"]}}}, + "results": [], + "gate": {"status": "pass", "metric_name": "score", "value": execution.payload["score"]}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_source_cli( + input=str(input_path), + kind="answer", + judge_agent_name="JudgeTeam", + output=str(tmp_path / "report.json"), + ) + + assert captured["agent_name"] == "JudgeTeam" + assert captured["flow"].suite.judge_backend.backend_id == "source-agent:JudgeTeam" + assert report["judge_backend"]["backend_id"] == "source-agent:JudgeTeam" + assert report["source_selection"]["judge_agent_name"] == "JudgeTeam" + assert report["source_selection"]["judge_agent"] is None + + +def test_run_evaluator_source_cli_supports_judge_backend_ref( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + input_path = tmp_path / "answers.jsonl" + _write_answer_source(input_path) + module_path = tmp_path / "custom_judge.py" + module_path.write_text( + "\n".join( + [ + "from aworld.evaluations.substrate import CallableJudgeBackend", + "", + "async def judge(case_input, target):", + " return {'score': 82, 'verdict': 'Pass', 'veto_triggered': False}", + "", + "def build_backend():", + " return CallableJudgeBackend(backend_id='custom-backend', judge=judge)", + ] + ), + encoding="utf-8", + ) + monkeypatch.syspath_prepend(str(tmp_path)) + captured = {} + + async def fake_run_evaluation_flow(flow): + captured["flow"] = flow + execution = await flow.suite.judge_backend.execute( + flow.suite.cases[0].input, + {"answer": "existing"}, + flow.suite, + ) + return { + "report_version": 1, + "suite_id": "answer-source-evaluator", + "judge_backend": {"backend_id": execution.backend_id}, + "summary": {"answer-source-evaluator": {"score": {"mean": execution.payload["score"]}}}, + "results": [], + "gate": {"status": "pass", "metric_name": "score", "value": execution.payload["score"]}, + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow) + + report = run_evaluator_source_cli( + input=str(input_path), + kind="answer", + judge_backend_ref="custom_judge:build_backend", + output=str(tmp_path / "report.json"), + ) + + assert captured["flow"].suite.judge_backend.backend_id == "custom-backend" + assert report["judge_backend"]["backend_id"] == "custom-backend" + assert report["source_selection"]["judge_backend_ref"] == "custom_judge:build_backend" + + +def test_run_evaluator_source_cli_rejects_missing_judge_selector(tmp_path: Path) -> None: + input_path = tmp_path / "answers.jsonl" + _write_answer_source(input_path) + + with pytest.raises(ValueError, match="exactly one judge selector"): + run_evaluator_source_cli( + input=str(input_path), + kind="answer", + output=str(tmp_path / "report.json"), + ) + + +def test_run_evaluator_source_cli_rejects_multiple_judge_selectors(tmp_path: Path) -> None: + input_path = tmp_path / "answers.jsonl" + _write_answer_source(input_path) + judge_agent = tmp_path / "agent.md" + judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8") + + with pytest.raises(ValueError, match="exactly one judge selector"): + run_evaluator_source_cli( + input=str(input_path), + kind="answer", + judge_agent=str(judge_agent), + judge_agent_name="JudgeTeam", + output=str(tmp_path / "report.json"), + ) + + def test_run_evaluator_source_cli_builds_task_flow_with_default_agent( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, diff --git a/tests/core/test_evaluator_top_level_command.py b/tests/core/test_evaluator_top_level_command.py index 41af8bbe9..77caa4ba6 100644 --- a/tests/core/test_evaluator_top_level_command.py +++ b/tests/core/test_evaluator_top_level_command.py @@ -122,6 +122,96 @@ def fake_run_evaluator_source_cli(**kwargs): assert "pass" in output +def test_maybe_dispatch_top_level_command_accepts_judge_agent_name( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + input_path = tmp_path / "answers.jsonl" + input_path.write_text('{"id":"case-1","input":"question","answer":"answer"}\n', encoding="utf-8") + calls = {} + + def fake_run_evaluator_source_cli(**kwargs): + calls.update(kwargs) + return { + "suite_id": "answer-source-evaluator", + "gate": {"status": "pass"}, + "summary": {"answer-source-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli", + fake_run_evaluator_source_cli, + ) + + handled = main_module._maybe_dispatch_top_level_command( + [ + "aworld-cli", + "evaluator", + "--input", + str(input_path), + "--kind", + "answer", + "--judge-agent-name", + "JudgeTeam", + ] + ) + output = capsys.readouterr().out + + assert handled is True + assert calls["judge_agent"] is None + assert calls["judge_agent_name"] == "JudgeTeam" + assert calls["judge_backend_ref"] is None + assert "answer-source-evaluator" in output + + +def test_maybe_dispatch_top_level_command_accepts_judge_backend_ref( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + input_path = tmp_path / "answers.jsonl" + input_path.write_text('{"id":"case-1","input":"question","answer":"answer"}\n', encoding="utf-8") + calls = {} + + def fake_run_evaluator_source_cli(**kwargs): + calls.update(kwargs) + return { + "suite_id": "answer-source-evaluator", + "gate": {"status": "pass"}, + "summary": {"answer-source-evaluator": {"score": {"mean": 0.9}}}, + "results": [], + "approval": {"required": False, "resolved": False, "approved": None}, + } + + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli", + fake_run_evaluator_source_cli, + ) + + handled = main_module._maybe_dispatch_top_level_command( + [ + "aworld-cli", + "evaluator", + "--input", + str(input_path), + "--kind", + "answer", + "--judge-backend-ref", + "custom_judge:build_backend", + ] + ) + output = capsys.readouterr().out + + assert handled is True + assert calls["judge_agent"] is None + assert calls["judge_agent_name"] is None + assert calls["judge_backend_ref"] == "custom_judge:build_backend" + assert "answer-source-evaluator" in output + + def test_maybe_dispatch_top_level_command_runs_task_source_with_default_agent( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, @@ -277,7 +367,7 @@ def test_evaluator_source_run_rejects_other_target_mode_arguments( assert expected in output -def test_evaluator_source_mode_requires_kind_and_judge_agent( +def test_evaluator_source_mode_requires_kind_and_judge_selector( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: @@ -293,6 +383,8 @@ def test_evaluator_source_mode_requires_kind_and_judge_agent( input="answers.jsonl", kind=None, judge_agent=None, + judge_agent_name=None, + judge_backend_ref=None, out_dir=None, output=None, task_id=None, @@ -313,6 +405,44 @@ def test_evaluator_source_mode_requires_kind_and_judge_agent( assert "--kind is required with --input" in output +def test_evaluator_source_mode_requires_one_judge_selector( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + monkeypatch.setattr( + "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli", + lambda **kwargs: pytest.fail("source runtime should not be called"), + ) + + exit_code = EvaluatorTopLevelCommand().run( + SimpleNamespace( + target=None, + suite=None, + input="answers.jsonl", + kind="answer", + judge_agent=None, + judge_agent_name=None, + judge_backend_ref=None, + out_dir=None, + output=None, + task_id=None, + agent=None, + id_field="id", + task_field="input", + answer_field="answer", + interactive_approval=False, + list_suites=False, + print_report_schema=False, + validate_report=None, + ), + TopLevelCommandContext(cwd="/tmp"), + ) + + output = capsys.readouterr().out + assert exit_code == 1 + assert "exactly one of --judge-agent, --judge-agent-name, or --judge-backend-ref is required" in output + + def test_evaluator_command_returns_nonzero_for_unresolved_approval( monkeypatch: pytest.MonkeyPatch, ) -> None: diff --git a/tests/docs/test_evaluator_report_docs.py b/tests/docs/test_evaluator_report_docs.py index 801aab593..ef1fd8994 100644 --- a/tests/docs/test_evaluator_report_docs.py +++ b/tests/docs/test_evaluator_report_docs.py @@ -17,6 +17,9 @@ def test_evaluator_report_command_doc_covers_schema_and_validation() -> None: assert "--kind task" in content assert "--kind answer" in content assert "--kind trajectory" in content + assert "--judge-agent-name" in content + assert "--judge-backend-ref" in content + assert "exactly one judge selector" in content assert "report_format" in content assert "automation" in content assert ".aworld/evaluators/*.json" in content diff --git a/tests/evaluations/test_answer_quality_judge_fixtures.py b/tests/evaluations/test_answer_quality_judge_fixtures.py new file mode 100644 index 000000000..498bc6a06 --- /dev/null +++ b/tests/evaluations/test_answer_quality_judge_fixtures.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +import asyncio +import importlib +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src")) + + +FIXTURE_ROOT = Path(__file__).resolve().parents[1] / "fixtures" / "evaluator_judges" + + +def test_answer_quality_prompt_fixture_matches_judge_contract() -> None: + prompt = (FIXTURE_ROOT / "answer_quality_agent.md").read_text(encoding="utf-8") + + assert "Answer Quality Evaluator" in prompt + assert "Q1_correctness" in prompt + assert "veto_triggered" in prompt + assert "最终回复必须是且仅是一个 JSON 对象" in prompt + + +def test_answer_quality_judge_agent_name_fixture_registers_local_swarm( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from aworld_cli.core.agent_registry import LocalAgentRegistry + from aworld_cli.core.loader import init_agents + + monkeypatch.setattr(LocalAgentRegistry, "_instance", None) + + init_agents(FIXTURE_ROOT / "agents") + + local_agent = LocalAgentRegistry.get_agent("answer-quality-judge") + assert local_agent is not None + assert local_agent.register_dir == str((FIXTURE_ROOT / "agents").resolve()) + + swarm = asyncio.run(local_agent.get_swarm(refresh=True)) + judge_agent = swarm.topology[0] + assert judge_agent.name() == "answer-quality-judge" + assert "Answer Quality Evaluator" in judge_agent.system_prompt + assert "Q1_correctness" in judge_agent.system_prompt + + +def test_answer_quality_backend_ref_fixture_builds_prompt_backed_backend( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.syspath_prepend(str(FIXTURE_ROOT)) + module = importlib.import_module("answer_quality_backend") + + backend = module.build_backend() + + assert backend.backend_id == "answer-quality-fixture-backend" + assert "Answer Quality Evaluator" in backend.system_prompt + assert "Q1_correctness" in backend.system_prompt + + execution = asyncio.run( + backend.execute( + {"input": "What is AWorld?"}, + {"answer": "AWorld is an agent framework."}, + suite=None, + ) + ) + assert execution.backend_id == "answer-quality-fixture-backend" + assert execution.payload["verdict"] == "Pass" + assert execution.payload["veto_triggered"] is False diff --git a/tests/fixtures/evaluator_judges/agents/answer_quality_judge.py b/tests/fixtures/evaluator_judges/agents/answer_quality_judge.py new file mode 100644 index 000000000..4a05b89cb --- /dev/null +++ b/tests/fixtures/evaluator_judges/agents/answer_quality_judge.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import os +from pathlib import Path + +from aworld.agents.llm_agent import Agent +from aworld.config import AgentConfig, ModelConfig +from aworld.core.agent.swarm import Swarm +from aworld_cli.core import agent + + +PROMPT_PATH = Path(__file__).resolve().parents[1] / "answer_quality_agent.md" + + +def _prompt() -> str: + return PROMPT_PATH.read_text(encoding="utf-8") + + +@agent( + name="answer-quality-judge", + desc="Answer-quality judge fixture for evaluator source tests.", + metadata={"source": "tests.fixture", "prompt_path": str(PROMPT_PATH)}, + unique=True, +) +def build_answer_quality_judge_swarm() -> Swarm: + judge = Agent( + name="answer-quality-judge", + desc="Evaluates task answers and returns the evaluator JSON schema.", + conf=AgentConfig( + llm_config=ModelConfig( + llm_model_name=os.environ.get("LLM_MODEL_NAME", "gpt-4"), + llm_provider=os.environ.get("LLM_PROVIDER", "openai"), + llm_api_key=os.environ.get("LLM_API_KEY") or os.environ.get("OPENAI_API_KEY"), + llm_base_url=os.environ.get("LLM_BASE_URL", "https://api.openai.com/v1"), + llm_temperature=float(os.environ.get("LLM_TEMPERATURE", "0.1")), + params={"max_completion_tokens": 4096}, + ), + skill_configs={}, + ), + system_prompt=_prompt(), + ) + return Swarm(judge, max_steps=1, name="answer-quality-judge-swarm") diff --git a/tests/fixtures/evaluator_judges/answer_quality_agent.md b/tests/fixtures/evaluator_judges/answer_quality_agent.md new file mode 100644 index 000000000..ea85ef4f2 --- /dev/null +++ b/tests/fixtures/evaluator_judges/answer_quality_agent.md @@ -0,0 +1,116 @@ +--- +name: answer-quality-judge +description: 使用 LLM-as-judge 对「问题 ↔ 答案」对做答案质量评估(reference-free,可选参考答案)。仅评估最终答案本身的正确性、完整性、贴合度、可读性与忠实度,不评估执行过程或工具使用。输入由 evaluator 框架以 JSON 注入(case + state.answer),无需读日志、无需调用工具。 +tools: Read, Write +model: opus +--- + +# Answer Quality Evaluator(LLM-as-Judge) + +你是一名严格、以证据为准的 **答案质量评审员**。你的职责是对一条「问题 ↔ 答案」对做可复现、可量化的评估,**只覆盖最终答案本身的质量**,不评估 agent 的执行过程、工具使用或轨迹。 + +你**就是**这里的 LLM judge:所有打分由你完成,不调用外部模型,也**不需要读取任何文件或运行任何命令**。 + +## 评估输入(由框架注入) + +evaluator 框架会以**单个 JSON 对象**作为你的输入消息,结构形如: + +```json +{ + "case": { "task_id": "...", "input": "用户的原始问题/任务" }, + "state": { "answer": "待评估的最终答案", "status": "...", "artifacts": {}, "trajectory": [], "tool_calls": [] }, + "required_output_schema": { "score": "number 0-100", "verdict": "string" }, + "instruction": "Evaluate the existing answer/state and return exactly one JSON object." +} +``` + +判据来源(按优先级): + +1. **`case.input`**:用户实际想要什么——这是判断「相关性/贴合度」和「完整性」的标尺。 +2. **`state.answer`**:被评估的答案——所有评分的对象。 +3. **参考答案(若存在)**:若 `case.input` 或 `state.artifacts` 中显式给出了 reference / 标准答案 / 验收要点,则以其为「正确性」基准;否则按 **reference-free** 处理,仅凭答案的内在一致性与常识可验证性判断。 + +> 若 `state.answer` 为空或缺失,直接判 `Fail`、`score=0`、`Q1=1`,并在 `notes` 中说明。 + +--- + +## 阶段 1 · 评分(五维,1–5 分,带锚点) + +对每个维度给出 1–5 的整数分,并**引用答案中的具体片段或问题中的具体要求**作为依据。严禁仅凭印象打分。 + +锚点统一含义:**5=优秀无明显问题 / 4=良好有小瑕疵 / 3=合格但有明确缺陷 / 2=较差影响可用性 / 1=不合格**。 + +| 维度 | 权重 | 评什么 | 扣分信号 | +|---|---|---|---| +| Q1 正确性 / Correctness | 30% | 答案中的事实性断言、计算、结论是否正确;有参考时是否与参考一致 | 与参考矛盾;事实错误;计算/逻辑错误;编造数字、引述、专有名词 | +| Q2 完整性 / Completeness | 25% | 是否覆盖问题的全部子诉求与关键要点,无关键遗漏 | 漏答子问题;只答一半;缺少必要前提/边界 | +| Q3 贴合度 / Relevance & Instruction-following | 20% | 是否回答了**实际被问的问题**,是否遵守问题中的显式约束(格式、语言、长度、口吻等) | 答非所问;主题漂移;违反明确的格式/语言/长度要求 | +| Q4 可读性 / Clarity | 15% | 组织、清晰度、长度适配、表达是否凝练无歧义 | 冗长堆砌;结构混乱;表述含混;语言与提问不一致 | +| Q5 忠实度 / Faithfulness | 10% | 是否不臆造、不过度自信;不确定处是否如实标注;有参考时是否不超出参考范围杜撰 | 把猜测当事实;编造来源;无依据的绝对化断言 | + +--- + +## 阶段 2 · 汇总与判定 + +1. 计算加权总分(百分制):`score = Σ(dim_score / 5 × weight) × 100`,四舍五入到整数。 +2. 给出等级:`≥85 Excellent / 70–84 Pass / 55–69 Marginal / <55 Fail`。 +3. **一票否决项**:若 Q1 正确性 ≤2(存在实质性事实/逻辑错误,或与参考答案直接矛盾),则置 `veto_triggered=true`,且最终 `verdict` 不得高于 `Marginal`,无论加权总分多少。 +4. 列出 **Top-3 优点** 与 **Top-3 待改进项**,每条附答案中的证据指针与可执行的改进建议。 + +### 评判纪律(消除 judge 偏差) + +- 不因答案**更长 / 更华丽 / 更自信**而加分;只认正确性与目标贴合度。 +- 不被答案的自述(「我已确认…」「显然…」)影响——这类措辞需用问题约束与内在一致性核实。 +- 无参考时不确定真伪的事实,按「未证实」处理:可影响 Q5,但不要据此武断判 Q1 错误,除非违背常识或自相矛盾。 +- 打分**先写推理(引证),后给分数**,避免先入为主。 +- 语言中立:答案语言与问题不一致时,扣 Q3,而非据此曲解内容。 + +--- + +## 阶段 3 · 产出(返回严格 JSON) + +**你的最终回复必须是且仅是一个 JSON 对象,不要包裹 markdown 代码块、不要前后缀说明文字。** 框架会直接解析它。`score` 与 `verdict` 为框架必需字段,其余字段供报告与诊断使用: + +```json +{ + "task_id": "string", + "score": 0, + "verdict": "Excellent|Pass|Marginal|Fail", + "veto_triggered": false, + "Q1_correctness": 0, + "Q2_completeness": 0, + "Q3_relevance": 0, + "Q4_clarity": 0, + "Q5_faithfulness": 0, + "dimensions": { + "Q1_correctness": {"score": 0, "weight": 0.30, "evidence": ["..."], "rationale": "..."}, + "Q2_completeness": {"score": 0, "weight": 0.25, "evidence": ["..."], "rationale": "..."}, + "Q3_relevance": {"score": 0, "weight": 0.20, "evidence": ["..."], "rationale": "..."}, + "Q4_clarity": {"score": 0, "weight": 0.15, "evidence": ["..."], "rationale": "..."}, + "Q5_faithfulness": {"score": 0, "weight": 0.10, "evidence": ["..."], "rationale": "..."} + }, + "errors": [{"claim": "...", "why_wrong": "..."}], + "top_strengths": ["..."], + "top_improvements": [{"issue": "...", "evidence": "...", "suggestion": "..."}], + "notes": "可选:边界情况说明(如答案缺失、无参考、语言不一致等)" +} +``` + +字段约束: +- `score`:0–100 的整数,**等于**阶段 2 第 1 步算出的加权总分。 +- `verdict`:四档之一,且与 `score` 区间一致;触发一票否决时不得高于 `Marginal`。 +- `Q1..Q5` 顶层字段:与 `dimensions` 内对应 `score` 相同,便于框架直接读取。 +- `evidence`/`rationale`:引用答案或问题中的具体片段,不可空泛。 +- 报告语言与被评估答案保持一致。 + +> 仅当用户在 directive 中显式提供了 `OUT_DIR` 时,才额外用 `Write` 落一份人类可读的 `OUT_DIR/answer_eval_.md`;默认情况下**只返回上面的 JSON**,不写文件、不调工具。 + +--- + +## 执行清单(按序) + +- [ ] 解析注入的 JSON,定位 `case.input`、`state.answer`、可选参考答案 +- [ ] 答案缺失 → 直接 Fail 并返回(见输入说明) +- [ ] 五维逐项打分(先证据后分数) +- [ ] 加权汇总 + 一票否决检查 + 优缺点 +- [ ] 返回严格 JSON(score/verdict 必填,无 markdown 包裹) diff --git a/tests/fixtures/evaluator_judges/answer_quality_backend.py b/tests/fixtures/evaluator_judges/answer_quality_backend.py new file mode 100644 index 000000000..b5d117853 --- /dev/null +++ b/tests/fixtures/evaluator_judges/answer_quality_backend.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +from aworld.evaluations.substrate import AgentJudgeBackend + + +PROMPT_PATH = Path(__file__).resolve().parent / "answer_quality_agent.md" + + +def _prompt() -> str: + return PROMPT_PATH.read_text(encoding="utf-8") + + +async def _deterministic_executor(prompt: Any, system_prompt: str) -> str: + payload = { + "task_id": "fixture", + "score": 88, + "verdict": "Pass", + "veto_triggered": False, + "Q1_correctness": 4, + "Q2_completeness": 4, + "Q3_relevance": 5, + "Q4_clarity": 4, + "Q5_faithfulness": 5, + "dimensions": { + "Q1_correctness": { + "score": 4, + "weight": 0.30, + "evidence": ["fixture answer"], + "rationale": "The fixture backend is deterministic for tests.", + } + }, + "errors": [], + "top_strengths": ["Deterministic fixture response"], + "top_improvements": [], + "notes": "This backend-ref fixture verifies loading and schema plumbing without calling an LLM.", + } + return json.dumps(payload, ensure_ascii=False) + + +def _prompt_builder(case_input: dict[str, Any], target: dict[str, Any], suite: Any) -> str: + return json.dumps( + { + "case": case_input, + "state": {"answer": target.get("answer")}, + "required_output_schema": {"score": "number 0-100", "verdict": "string"}, + "instruction": "Evaluate the existing answer/state and return exactly one JSON object.", + }, + ensure_ascii=False, + ) + + +def build_backend() -> AgentJudgeBackend: + return AgentJudgeBackend( + backend_id="answer-quality-fixture-backend", + system_prompt=_prompt(), + executor=_deterministic_executor, + prompt_builder=_prompt_builder, + ) diff --git a/tests/test_slash_commands.py b/tests/test_slash_commands.py index a230d89cb..b004af097 100644 --- a/tests/test_slash_commands.py +++ b/tests/test_slash_commands.py @@ -602,10 +602,80 @@ def fake_run_evaluator_source_cli(**kwargs): assert calls["kind"] == "trajectory" assert calls["task_id"] == "task-1" assert calls["judge_agent"] == str(agent_path) + assert calls["judge_agent_name"] is None + assert calls["judge_backend_ref"] is None assert calls["out_dir"] == str(tmp_path) assert "trajectory-source-evaluator" in result assert "Report:" in result + @pytest.mark.asyncio + async def test_evaluation_accepts_judge_agent_name(self, monkeypatch, tmp_path): + cmd = CommandRegistry.get("evaluation") + input_path = tmp_path / "answers.jsonl" + calls = {} + + def fake_run_evaluator_source_cli(**kwargs): + calls.update(kwargs) + return { + "suite_id": "answer-source-evaluator", + "gate": {"status": "pass"}, + "summary": {"answer-source-evaluator": {"score": {"mean": 88.0}}}, + "results": [], + "approval": {"required": False, "resolved": False, "approved": None}, + "report_path": str(tmp_path / "report.json"), + } + + monkeypatch.setattr( + "aworld_cli.commands.evaluation_cmd.run_evaluator_source_cli", + fake_run_evaluator_source_cli, + ) + + result = await cmd.execute( + CommandContext( + cwd=os.getcwd(), + user_args=f"--input {input_path} --kind answer --judge-agent-name JudgeTeam", + ) + ) + + assert calls["judge_agent"] is None + assert calls["judge_agent_name"] == "JudgeTeam" + assert calls["judge_backend_ref"] is None + assert "answer-source-evaluator" in result + + @pytest.mark.asyncio + async def test_evaluation_accepts_judge_backend_ref(self, monkeypatch, tmp_path): + cmd = CommandRegistry.get("evaluation") + input_path = tmp_path / "answers.jsonl" + calls = {} + + def fake_run_evaluator_source_cli(**kwargs): + calls.update(kwargs) + return { + "suite_id": "answer-source-evaluator", + "gate": {"status": "pass"}, + "summary": {"answer-source-evaluator": {"score": {"mean": 88.0}}}, + "results": [], + "approval": {"required": False, "resolved": False, "approved": None}, + "report_path": str(tmp_path / "report.json"), + } + + monkeypatch.setattr( + "aworld_cli.commands.evaluation_cmd.run_evaluator_source_cli", + fake_run_evaluator_source_cli, + ) + + result = await cmd.execute( + CommandContext( + cwd=os.getcwd(), + user_args=f"--input {input_path} --kind answer --judge-backend-ref custom_judge:build_backend", + ) + ) + + assert calls["judge_agent"] is None + assert calls["judge_agent_name"] is None + assert calls["judge_backend_ref"] == "custom_judge:build_backend" + assert "answer-source-evaluator" in result + @pytest.mark.asyncio async def test_evaluation_runs_source_runtime_without_nested_event_loop(self, monkeypatch, tmp_path): cmd = CommandRegistry.get("evaluation")