Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions aworld-cli/src/aworld_cli/commands/evaluation_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@
def _usage() -> str:
return """Usage:
/evaluation --input <path> --kind task --judge-agent <agent.md> [--agent <agent-name>] [--out-dir <dir>]
/evaluation --input <path> --kind answer --judge-agent <agent.md> [--out-dir <dir>]
/evaluation --input <task.jsonl> --kind trajectory --judge-agent <agent.md> [--agent <agent-name>] [--out-dir <dir>]
/evaluation --input <path> --kind answer --judge-agent-name <agent-or-team> [--out-dir <dir>]
/evaluation --input <task.jsonl> --kind trajectory --judge-backend-ref <module:callable> [--agent <agent-name>] [--out-dir <dir>]
/evaluation --input <trajectory.log> --kind trajectory --task-id <id> --judge-agent <agent.md> [--out-dir <dir>]

Examples:
/evaluation --input ./tasks.jsonl --kind task --judge-agent ./judge_agents/answer_judge.md
/evaluation --input ./task_answers.jsonl --kind answer --judge-agent ./judge_agents/answer_judge.md
/evaluation --input ./tasks.jsonl --kind trajectory --judge-agent ./judge_agents/trajectory_judge.md
/evaluation --input ./task_answers.jsonl --kind answer --judge-agent-name JudgeTeam
/evaluation --input ./tasks.jsonl --kind trajectory --judge-backend-ref my_eval.judges:build_backend
/evaluation --input ~/Documents/logs/trajectory.log --kind trajectory --task-id task_123 --judge-agent ./judge_agents/trajectory_judge.md
"""

Expand All @@ -31,7 +31,9 @@ def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(prog="/evaluation", add_help=False)
parser.add_argument("--input", required=True)
parser.add_argument("--kind", required=True)
parser.add_argument("--judge-agent", required=True)
parser.add_argument("--judge-agent")
parser.add_argument("--judge-agent-name")
parser.add_argument("--judge-backend-ref")
parser.add_argument("--out-dir")
parser.add_argument("--output")
parser.add_argument("--task-id")
Expand Down Expand Up @@ -88,12 +90,18 @@ async def execute(self, context: CommandContext) -> str:
if args.help:
return _usage()

judge_selectors = (args.judge_agent, args.judge_agent_name, args.judge_backend_ref)
if sum(1 for value in judge_selectors if value) != 1:
return "Evaluator error: exactly one of --judge-agent, --judge-agent-name, or --judge-backend-ref is required\n\n" + _usage()

try:
report = await asyncio.to_thread(
run_evaluator_source_cli,
input=args.input,
kind=args.kind,
judge_agent=args.judge_agent,
judge_agent_name=args.judge_agent_name,
judge_backend_ref=args.judge_backend_ref,
out_dir=args.out_dir,
output=args.output,
task_id=args.task_id,
Expand Down
223 changes: 197 additions & 26 deletions aworld-cli/src/aworld_cli/evaluator_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import asyncio
import builtins
import importlib
import inspect
import json
import time
from pathlib import Path
Expand All @@ -20,9 +22,11 @@
)
from aworld.evaluations.substrate import (
AgentJudgeBackend,
CallableJudgeBackend,
EvaluationFlowDef,
GateMetricCondition,
GatePolicyDef,
JudgeBackend,
JudgeSchemaDef,
StateCheckGrader,
describe_eval_target,
Expand Down Expand Up @@ -165,9 +169,9 @@ def _run_evaluator_hooks(
- `evaluator.pre_discover` event payload: `target`, `workspace_path`
- `evaluator.post_discover` event payload: `target`, `workspace_path`, `suite_names`
- `evaluator.pre_run` event payload for target mode: `mode=target`, `target`, `suite`, `workspace_path`
- `evaluator.pre_run` event payload for source mode: `mode=source`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path`
- `evaluator.pre_run` event payload for source mode: `mode=source`, `input`, `kind`, `task_id`, judge selector fields, `agent`, `workspace_path`, `output_path`
- `evaluator.post_run` event payload for target mode: `mode=target`, `report`, `target`, `suite`, `workspace_path`
- `evaluator.post_run` event payload for source mode: `mode=source`, `report`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path`
- `evaluator.post_run` event payload for source mode: `mode=source`, `report`, `input`, `kind`, `task_id`, judge selector fields, `agent`, `workspace_path`, `output_path`
- `evaluator.render_summary` event payload: `report`, `workspace_path`
- mutable state: lightweight CLI assembly metadata only
- allowed side effects: report upload, notifications, summary augmentation
Expand Down Expand Up @@ -253,6 +257,131 @@ def _case_source_metadata(case) -> dict[str, Any]:
return {}


def _judge_selector_count(
*,
judge_agent: str | None,
judge_agent_name: str | None,
judge_backend_ref: str | None,
) -> int:
return sum(
1
for value in (judge_agent, judge_agent_name, judge_backend_ref)
if value is not None and str(value).strip()
)


def _validate_judge_selectors(
*,
judge_agent: str | None,
judge_agent_name: str | None,
judge_backend_ref: str | None,
) -> None:
if _judge_selector_count(
judge_agent=judge_agent,
judge_agent_name=judge_agent_name,
judge_backend_ref=judge_backend_ref,
) != 1:
raise ValueError("exactly one judge selector is required: --judge-agent, --judge-agent-name, or --judge-backend-ref")


def _load_ref(ref: str) -> Any:
module_name, separator, attr_path = ref.partition(":")
if not separator or not module_name or not attr_path:
raise ValueError(f"judge backend ref must use module:callable format: {ref}")
module = importlib.import_module(module_name)
value: Any = module
for attr in attr_path.split("."):
if not attr:
raise ValueError(f"judge backend ref has an empty attribute segment: {ref}")
value = getattr(value, attr)
return value
Comment on lines +287 to +297

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The importlib.import_module and getattr calls can raise ModuleNotFoundError, ImportError, or AttributeError if the user provides an invalid module or callable name via --judge-backend-ref. Since these exceptions are not caught by the except (FileNotFoundError, ValueError, KeyError) blocks in the CLI command handlers, they will cause the CLI to crash with a traceback.

We should catch these exceptions inside _load_ref and wrap them in a ValueError to ensure a clean error message is displayed to the user.

def _load_ref(ref: str) -> Any:
    module_name, separator, attr_path = ref.partition(":")
    if not separator or not module_name or not attr_path:
        raise ValueError(f"judge backend ref must use module:callable format: {ref}")
    try:
        module = importlib.import_module(module_name)
    except ImportError as exc:
        raise ValueError(f"Failed to import module '{module_name}': {exc}") from exc
    value: Any = module
    for attr in attr_path.split("."):
        if not attr:
            raise ValueError(f"judge backend ref has an empty attribute segment: {ref}")
        try:
            value = getattr(value, attr)
        except AttributeError as exc:
            raise ValueError(f"Attribute '{attr}' not found in '{module_name}': {exc}") from exc
    return value



def _can_call_without_arguments(value: Any) -> bool:
try:
signature = inspect.signature(value)
except (TypeError, ValueError):
return False
for parameter in signature.parameters.values():
if parameter.kind in (parameter.VAR_POSITIONAL, parameter.VAR_KEYWORD):
continue
if parameter.default is parameter.empty:
return False
return True


def _coerce_source_judge_backend(value: Any, *, backend_id: str) -> JudgeBackend:
if hasattr(value, "execute"):
return value
if callable(value):
return CallableJudgeBackend(backend_id=backend_id, judge=value)
raise ValueError("judge backend ref must resolve to a JudgeBackend-compatible object or callable")


def _load_source_judge_backend_ref(ref: str) -> JudgeBackend:
value = _load_ref(ref)
if hasattr(value, "execute"):
return value
if callable(value) and _can_call_without_arguments(value):
produced = value()
if inspect.isawaitable(produced):
raise ValueError("judge backend ref factory must be synchronous")
return _coerce_source_judge_backend(produced, backend_id=f"judge-backend-ref:{ref}")
return _coerce_source_judge_backend(value, backend_id=f"judge-backend-ref:{ref}")
Comment on lines +321 to +330

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

If value is a class that implements execute (e.g., a custom subclass of JudgeBackend), hasattr(value, "execute") will evaluate to True on the class itself. Returning the class directly causes a runtime error when the framework attempts to call execute as an unbound method.

We should ensure we do not return the class directly, allowing it to fall through to the callable check where it can be instantiated.

Suggested change
def _load_source_judge_backend_ref(ref: str) -> JudgeBackend:
value = _load_ref(ref)
if hasattr(value, "execute"):
return value
if callable(value) and _can_call_without_arguments(value):
produced = value()
if inspect.isawaitable(produced):
raise ValueError("judge backend ref factory must be synchronous")
return _coerce_source_judge_backend(produced, backend_id=f"judge-backend-ref:{ref}")
return _coerce_source_judge_backend(value, backend_id=f"judge-backend-ref:{ref}")
def _load_source_judge_backend_ref(ref: str) -> JudgeBackend:
value = _load_ref(ref)
if hasattr(value, "execute") and not inspect.isclass(value):
return value
if callable(value) and _can_call_without_arguments(value):
produced = value()
if inspect.isawaitable(produced):
raise ValueError("judge backend ref factory must be synchronous")
return _coerce_source_judge_backend(produced, backend_id=f"judge-backend-ref:{ref}")
return _coerce_source_judge_backend(value, backend_id=f"judge-backend-ref:{ref}")



def _build_cli_agent_judge_backend(*, agent_name: str, backend_id: str, prompt_builder):
executor_cache: dict[str, Any] = {}

async def _executor(prompt, system_prompt):
if isinstance(prompt, tuple):
raise ValueError("CLI agent judge backend only supports text prompts")
executor = executor_cache.get("executor")
if executor is None:
executor = await _load_cli_agent_executor(agent_name)
executor_cache["executor"] = executor
swarm = getattr(executor, "swarm", None)
if swarm is not None:
response = await Runners.run(input=str(prompt), swarm=swarm)
else:
response = await executor.chat(str(prompt))
return str(getattr(response, "answer", response))

Comment on lines +333 to +349

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Since _executor is an asynchronous function, concurrent evaluation tasks can trigger _load_cli_agent_executor simultaneously before the first call finishes caching the executor. This leads to redundant loading of agents and creation of multiple CLI runtimes/executors.

Using an asyncio.Lock with a double-checked locking pattern avoids this race condition efficiently.

def _build_cli_agent_judge_backend(*, agent_name: str, backend_id: str, prompt_builder):
    executor_cache: dict[str, Any] = {}
    lock = asyncio.Lock()

    async def _executor(prompt, system_prompt):
        if isinstance(prompt, tuple):
            raise ValueError("CLI agent judge backend only supports text prompts")
        executor = executor_cache.get("executor")
        if executor is None:
            async with lock:
                executor = executor_cache.get("executor")
                if executor is None:
                    executor = await _load_cli_agent_executor(agent_name)
                    executor_cache["executor"] = executor
        swarm = getattr(executor, "swarm", None)
        if swarm is not None:
            response = await Runners.run(input=str(prompt), swarm=swarm)
        else:
            response = await executor.chat(str(prompt))
        return str(getattr(response, "answer", response))

return AgentJudgeBackend(
backend_id=backend_id,
system_prompt=f"CLI agent judge loaded from {agent_name}",
executor=_executor,
prompt_builder=prompt_builder,
)


def _resolve_source_judge_backend(
*,
judge_agent_path: Path | None,
judge_agent_name: str | None,
judge_backend_ref: str | None,
file_backend_id: str,
named_backend_prefix: str,
prompt_builder,
) -> JudgeBackend:
if judge_agent_path is not None:
return AgentJudgeBackend.from_agent_markdown(
judge_agent_path,
backend_id=file_backend_id,
prompt_builder=prompt_builder,
)
if judge_agent_name is not None and str(judge_agent_name).strip():
resolved_name = str(judge_agent_name).strip()
return _build_cli_agent_judge_backend(
agent_name=resolved_name,
backend_id=f"{named_backend_prefix}:{resolved_name}",
prompt_builder=prompt_builder,
)
if judge_backend_ref is not None and str(judge_backend_ref).strip():
return _load_source_judge_backend_ref(str(judge_backend_ref).strip())
raise ValueError("exactly one judge selector is required: --judge-agent, --judge-agent-name, or --judge-backend-ref")


class _CliAgentRuntimeHarness:
def __init__(self, *, agent_name: str):
self.agent_name = agent_name
Expand Down Expand Up @@ -440,7 +569,9 @@ def _build_source_suite(
*,
kind: str,
input_path: Path,
judge_agent_path: Path,
judge_agent_path: Path | None,
judge_agent_name: str | None = None,
judge_backend_ref: str | None = None,
task_id: str | None,
id_field: str,
task_field: str,
Expand Down Expand Up @@ -486,15 +617,19 @@ def _build_source_suite(
id_field=id_field,
input_field=task_field,
)
judge_backend = _resolve_source_judge_backend(
judge_agent_path=judge_agent_path,
judge_agent_name=judge_agent_name,
judge_backend_ref=judge_backend_ref,
file_backend_id="source-agent-md",
named_backend_prefix="source-agent",
prompt_builder=_build_source_prompt,
)
return create_source_eval_suite(
suite_id="task-source-evaluator",
source=source,
runtime_harness=_build_cli_agent_runtime_harness(agent_name=agent_name),
judge_backend=AgentJudgeBackend.from_agent_markdown(
judge_agent_path,
backend_id="source-agent-md",
prompt_builder=_build_source_prompt,
),
judge_backend=judge_backend,
judge_schema=JudgeSchemaDef(output_model=_SourceJudgeOutput),
gate_policy=answer_gate,
metadata={"agent": agent_name},
Expand All @@ -507,14 +642,18 @@ def _build_source_suite(
input_field=task_field,
answer_field=answer_field,
)
judge_backend = _resolve_source_judge_backend(
judge_agent_path=judge_agent_path,
judge_agent_name=judge_agent_name,
judge_backend_ref=judge_backend_ref,
file_backend_id="source-agent-md",
named_backend_prefix="source-agent",
prompt_builder=_build_source_prompt,
)
return create_source_eval_suite(
suite_id="answer-source-evaluator",
source=source,
judge_backend=AgentJudgeBackend.from_agent_markdown(
judge_agent_path,
backend_id="source-agent-md",
prompt_builder=_build_source_prompt,
),
judge_backend=judge_backend,
judge_schema=JudgeSchemaDef(output_model=_SourceJudgeOutput),
gate_policy=answer_gate,
)
Expand All @@ -534,15 +673,19 @@ def _build_source_suite(
input_field=task_field,
)
runtime_harness = _build_cli_agent_runtime_harness(agent_name=agent_name)
judge_backend = _resolve_source_judge_backend(
judge_agent_path=judge_agent_path,
judge_agent_name=judge_agent_name,
judge_backend_ref=judge_backend_ref,
file_backend_id="trajectory-evaluator-agent-md",
named_backend_prefix="trajectory-evaluator-agent",
prompt_builder=_build_trajectory_prompt,
)
return create_source_eval_suite(
suite_id="trajectory-source-evaluator",
source=source,
runtime_harness=runtime_harness,
judge_backend=AgentJudgeBackend.from_agent_markdown(
judge_agent_path,
backend_id="trajectory-evaluator-agent-md",
prompt_builder=_build_trajectory_prompt,
),
judge_backend=judge_backend,
judge_schema=TrajectoryJudgeSchema.default(),
outcome_scorers=trajectory_outcome_scorers,
gate_policy=trajectory_gate,
Expand All @@ -556,7 +699,9 @@ def run_evaluator_source_cli(
*,
input: str,
kind: str,
judge_agent: str,
judge_agent: str | None = None,
judge_agent_name: str | None = None,
judge_backend_ref: str | None = None,
out_dir: str | None = None,
output: str | None = None,
task_id: str | None = None,
Expand All @@ -571,8 +716,13 @@ def run_evaluator_source_cli(
input_path = Path(input).expanduser().resolve()
if not input_path.exists():
raise FileNotFoundError(f"source input does not exist: {input_path}")
judge_agent_path = Path(judge_agent).expanduser().resolve()
if not judge_agent_path.exists():
_validate_judge_selectors(
judge_agent=judge_agent,
judge_agent_name=judge_agent_name,
judge_backend_ref=judge_backend_ref,
)
judge_agent_path = Path(judge_agent).expanduser().resolve() if judge_agent else None
if judge_agent_path is not None and not judge_agent_path.exists():
raise FileNotFoundError(f"judge agent does not exist: {judge_agent_path}")

workspace_path = str(input_path.parent if input_path.is_file() else input_path)
Expand All @@ -581,7 +731,9 @@ def run_evaluator_source_cli(
"input": str(input_path),
"kind": kind,
"task_id": task_id,
"judge_agent": str(judge_agent_path),
"judge_agent": str(judge_agent_path) if judge_agent_path is not None else None,
"judge_agent_name": judge_agent_name,
"judge_backend_ref": judge_backend_ref,
"agent": agent,
"workspace_path": workspace_path,
"output_path": str(Path(output).expanduser().resolve()) if output else None,
Expand All @@ -595,7 +747,9 @@ def run_evaluator_source_cli(
"input": str(input_path),
"kind": kind,
"task_id": task_id,
"judge_agent": str(judge_agent_path),
"judge_agent": str(judge_agent_path) if judge_agent_path is not None else None,
"judge_agent_name": judge_agent_name,
"judge_backend_ref": judge_backend_ref,
"agent": agent,
"interactive_approval": interactive_approval,
},
Expand All @@ -604,6 +758,8 @@ def run_evaluator_source_cli(
kind=kind,
input_path=input_path,
judge_agent_path=judge_agent_path,
judge_agent_name=judge_agent_name,
judge_backend_ref=judge_backend_ref,
task_id=task_id,
id_field=id_field,
task_field=task_field,
Expand All @@ -618,11 +774,24 @@ def run_evaluator_source_cli(
"target_path": str(input_path),
"source_kind": kind,
"task_id": task_id,
"judge_agent": str(judge_agent_path),
"judge_agent": str(judge_agent_path) if judge_agent_path is not None else None,
"judge_agent_name": judge_agent_name,
"judge_backend_ref": judge_backend_ref,
"agent": agent_name if executes_agent else agent,
}
for key, value in hook_state.items():
if key not in {"mode", "input", "kind", "task_id", "judge_agent", "agent", "interactive_approval", "summary_suffix"}:
if key not in {
"mode",
"input",
"kind",
"task_id",
"judge_agent",
"judge_agent_name",
"judge_backend_ref",
"agent",
"interactive_approval",
"summary_suffix",
}:
target_info[key] = value
flow = EvaluationFlowDef(
target=target_info,
Expand All @@ -647,7 +816,9 @@ def run_evaluator_source_cli(
"input": str(input_path),
"kind": kind,
"task_id": task_id,
"judge_agent": str(judge_agent_path),
"judge_agent": str(judge_agent_path) if judge_agent_path is not None else None,
"judge_agent_name": judge_agent_name,
"judge_backend_ref": judge_backend_ref,
"agent": agent_name if executes_agent else agent,
}
report["automation"] = _build_automation_summary(report)
Expand Down
Loading
Loading