Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
6f0d7c4
feat: add suite-backed evaluator flow
wuman001 Jun 1, 2026
b4edd91
feat: add evaluator judge timeout fallback
wuman001 Jun 1, 2026
45cc8e5
feat: improve evaluator cli flow
wuman001 Jun 2, 2026
0e7cffc
feat: add visual-aware evaluator suite resolution
wuman001 Jun 2, 2026
d0bbade
feat: expose evaluator suite selection diagnostics
wuman001 Jun 2, 2026
5eb15dc
feat: add structured evaluator report metadata
wuman001 Jun 2, 2026
dd421c7
feat: export evaluator report schema
wuman001 Jun 2, 2026
850d7fa
feat: tighten evaluator report schema
wuman001 Jun 2, 2026
57e9d6a
feat: add evaluator report validation command
wuman001 Jun 2, 2026
67fbf7c
docs: add evaluator report contract guides
wuman001 Jun 2, 2026
262188b
feat: load declared evaluator suites
wuman001 Jun 2, 2026
1f94675
docs: add declared evaluator suite contract
wuman001 Jun 3, 2026
339a89e
fix: refresh declared evaluator suite discovery
wuman001 Jun 8, 2026
859c77e
fix: isolate declared evaluator suites by workspace
wuman001 Jun 8, 2026
834efc1
feat: add execution-backed evaluator substrate
wuman001 Jun 9, 2026
2cc7c30
docs: align evaluator substrate docs
wuman001 Jun 9, 2026
8dea37d
feat: extend evaluator v2 substrate
wuman001 Jun 9, 2026
99c9201
fix: harden evaluator v2 contracts
wuman001 Jun 9, 2026
fb7a2c4
docs: add evaluator v2 openspec change
wuman001 Jun 9, 2026
9feb9b6
fix: harden evaluator trajectory configuration
wuman001 Jun 10, 2026
34202d4
docs: add evaluator runtime composition change
wuman001 Jun 10, 2026
444a705
docs: align runtime composition outcomes and trials
wuman001 Jun 10, 2026
c2fb3c0
docs: specify runtime outcome and trial boundaries
wuman001 Jun 10, 2026
9f48600
feat: add evaluator runtime composition
wuman001 Jun 10, 2026
ef66197
docs: add evaluator trials pass metrics change
wuman001 Jun 10, 2026
ff2c1ac
feat: add evaluator trial pass metrics
wuman001 Jun 10, 2026
e4ebc7c
feat: add evaluator environment isolation
wuman001 Jun 10, 2026
caee5e5
feat: add adaptive evaluator user simulator
wuman001 Jun 10, 2026
de008e9
test: add manual trajectory evaluator replay case
wuman001 Jun 10, 2026
56225de
fix: trim evaluator report metadata and replay metrics
wuman001 Jun 10, 2026
5f8bc4f
feat: add evaluator input source framework
wuman001 Jun 10, 2026
2429022
feat: add source-backed evaluator cli run
wuman001 Jun 10, 2026
cb955f0
test: add source cli trajectory manual replay
wuman001 Jun 10, 2026
e4bb56a
feat: simplify evaluator source commands
wuman001 Jun 10, 2026
cfaaaf5
fix: avoid nested loop in evaluation slash command
wuman001 Jun 10, 2026
3aad470
feat: add answer quality evaluator agent
wuman001 Jun 10, 2026
a098b64
Add source-backed evaluator task flows
wuman001 Jun 10, 2026
1502cef
Ignore local evaluator artifacts
wuman001 Jun 10, 2026
512c9b4
Gate answer evaluations on veto signal
wuman001 Jun 10, 2026
f9c40d5
Handle non-numeric state check comparisons
wuman001 Jun 11, 2026
84998aa
Remove openspec changes from PR
wuman001 Jun 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ team_implementation_analysis.md
# Temporary AI-generated artifacts
ai_news_today.*
survey/
eval/

# OpenSpec design docs (not runtime)
openspec/
Expand All @@ -177,4 +178,3 @@ openspec/
*.tmp
__pycache__/
*.pyc

Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"id": "aworld-evaluator-cli",
"name": "aworld-evaluator-cli",
"version": "1.0.0",
"entrypoints": {
"cli_commands": [
{
"id": "evaluator",
"name": "evaluator",
"target": "cli_commands/evaluator.py",
"scope": "workspace",
"visibility": "public",
"metadata": {
"factory": "build_command"
}
}
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Built-in framework plugin providing the `evaluator` top-level CLI command."""
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from aworld_cli.top_level_commands.evaluator_cmd import EvaluatorTopLevelCommand


def build_command():
    """Create and return a fresh ``EvaluatorTopLevelCommand`` instance.

    NOTE(review): presumably this is the factory that the plugin manifest
    names under ``metadata.factory`` -- confirm against the manifest.
    """
    command = EvaluatorTopLevelCommand()
    return command
3 changes: 3 additions & 0 deletions aworld-cli/src/aworld_cli/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
- /cron: Manage scheduled tasks (tool command)
- /dispatch: Submit task to background execution (tool command)
- /tasks: Manage background tasks (tool command)
- /evaluation: Run evaluator flows (tool command)

Usage:
# Import to register all commands
Expand All @@ -33,6 +34,7 @@
from . import dispatch
from . import tasks
from . import plugins_cmd
from . import evaluation_cmd

__all__ = [
"help_cmd",
Expand All @@ -44,4 +46,5 @@
"dispatch",
"tasks",
"plugins_cmd",
"evaluation_cmd",
]
113 changes: 113 additions & 0 deletions aworld-cli/src/aworld_cli/commands/evaluation_cmd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
/evaluation command - Run evaluator flows from chat.
"""
from __future__ import annotations

import argparse
import asyncio
import shlex

from aworld_cli.core.command_system import Command, CommandContext, register_command
from aworld_cli.evaluator_rendering import render_evaluator_summary
from aworld_cli.evaluator_runtime import run_evaluator_source_cli


def _usage() -> str:
return """Usage:
/evaluation --input <path> --kind task --judge-agent <agent.md> [--agent <agent-name>] [--out-dir <dir>]
/evaluation --input <path> --kind answer --judge-agent <agent.md> [--out-dir <dir>]
/evaluation --input <task.jsonl> --kind trajectory --judge-agent <agent.md> [--agent <agent-name>] [--out-dir <dir>]
/evaluation --input <trajectory.log> --kind trajectory --task-id <id> --judge-agent <agent.md> [--out-dir <dir>]

Examples:
/evaluation --input ./tasks.jsonl --kind task --judge-agent ./judge_agents/answer_judge.md
/evaluation --input ./task_answers.jsonl --kind answer --judge-agent ./judge_agents/answer_judge.md
/evaluation --input ./tasks.jsonl --kind trajectory --judge-agent ./judge_agents/trajectory_judge.md
/evaluation --input ~/Documents/logs/trajectory.log --kind trajectory --task-id task_123 --judge-agent ./judge_agents/trajectory_judge.md
"""


def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(prog="/evaluation", add_help=False)
parser.add_argument("--input", required=True)
parser.add_argument("--kind", required=True)
parser.add_argument("--judge-agent", required=True)
parser.add_argument("--out-dir")
parser.add_argument("--output")
parser.add_argument("--task-id")
parser.add_argument("--agent")
parser.add_argument("--id-field", default="id")
parser.add_argument("--task-field", default="input")
parser.add_argument("--answer-field", default="answer")
parser.add_argument("--interactive-approval", action="store_true")
parser.add_argument("--help", action="store_true")
return parser


@register_command
class EvaluationCommand(Command):
    """Slash command ``/evaluation``: run an evaluator flow from chat.

    The raw argument string is split shell-style, parsed with the parser from
    ``_build_parser()``, and forwarded to ``run_evaluator_source_cli``. The
    resulting report is rendered with ``render_evaluator_summary``.
    """

    @property
    def name(self) -> str:
        """Name of the command."""
        return "evaluation"

    @property
    def description(self) -> str:
        """One-line description of the command."""
        return "Run evaluator flows"

    @property
    def command_type(self) -> str:
        """Command category; this command is a ``tool`` command."""
        return "tool"

    @property
    def completion_items(self) -> dict[str, str]:
        """Map of completion strings to their descriptions, one per kind."""
        return {
            "/evaluation --kind task": "Run tasks with the default agent, then evaluate the produced state",
            "/evaluation --kind answer": "Evaluate existing task+answer JSONL records",
            "/evaluation --kind trajectory": "Evaluate generated or replayed trajectories",
        }

    async def execute(self, context: CommandContext) -> str:
        """Parse the user's arguments, run the evaluator, and return a summary.

        Args:
            context: Command context; only ``context.user_args`` is read.

        Returns:
            The usage text when no arguments, a help token, or unparsable
            options are given; an ``Evaluator error: ...`` message when the
            arguments cannot be tokenised or the evaluator raises
            ``FileNotFoundError``, ``ValueError`` or ``KeyError``; otherwise
            the rendered evaluator summary.
        """
        raw_args = (context.user_args or "").strip()
        if not raw_args:
            return _usage()

        try:
            parts = shlex.split(raw_args)
        except ValueError as exc:
            # e.g. an unbalanced quote in the argument string.
            return f"Evaluator error: {exc}\n\n{_usage()}"

        if not parts or parts[0] in {"help", "--help", "-h"}:
            return _usage()

        parser = _build_parser()
        try:
            args = parser.parse_args(parts)
        except SystemExit:
            # argparse exits on invalid or missing options; answer with the
            # usage text instead of letting the exit propagate.
            return _usage()

        if args.help:
            return _usage()

        try:
            # The evaluator entry point is called as a plain function in a
            # worker thread rather than awaited on the current event loop.
            report = await asyncio.to_thread(
                run_evaluator_source_cli,
                input=args.input,
                kind=args.kind,
                judge_agent=args.judge_agent,
                out_dir=args.out_dir,
                output=args.output,
                task_id=args.task_id,
                agent=args.agent,
                id_field=args.id_field,
                task_field=args.task_field,
                answer_field=args.answer_field,
                interactive_approval=args.interactive_approval,
            )
        except (FileNotFoundError, ValueError, KeyError) as exc:
            return f"Evaluator error: {exc}"

        # render_evaluator_summary() already appends the "Report: <path>" line
        # when the report carries a report_path; appending it again here
        # printed that line twice.
        return render_evaluator_summary(report)
26 changes: 26 additions & 0 deletions aworld-cli/src/aworld_cli/evaluator_rendering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from __future__ import annotations


def render_evaluator_summary(report: dict, *, summary_suffix: str | None = None) -> str:
    """Render an evaluator report dict as a short multi-line text summary.

    Args:
        report: Report mapping. The keys read are ``suite_id``, ``gate``
            (``status``, ``value``), ``suite_selection`` (``mode``,
            ``resolved``), ``judge_backend`` (``backend_id``) and
            ``report_path``. Any of them may be missing or ``None``.
        summary_suffix: Optional extra line appended at the very end.

    Returns:
        A string that always has the suite and gate lines, followed by the
        suite-selection, judge-backend, report-path and suffix lines when the
        corresponding data is present.
    """
    suite_id = report.get("suite_id", "unknown-suite")

    # Use `or {}` (not a .get default) so a key that is present with a None
    # value is tolerated, matching how suite_selection is handled below.
    gate = report.get("gate") or {}
    status = gate.get("status", "unknown")
    metric_value = gate.get("value")

    summary_line = f"Evaluator suite: {suite_id}\nGate: {status}"
    if metric_value is not None:
        if isinstance(metric_value, (int, float)):
            # Numeric gate values are shown with two decimals.
            summary_line += f" ({metric_value:.2f})"
        else:
            summary_line += f" ({metric_value})"

    selection = report.get("suite_selection") or {}
    if selection.get("resolved"):
        summary_line += f"\nSuite selection: {selection.get('mode', 'unknown')} -> {selection['resolved']}"

    backend = (report.get("judge_backend") or {}).get("backend_id")
    if backend:
        summary_line += f"\nJudge backend: {backend}"

    report_path = report.get("report_path")
    if report_path:
        summary_line += f"\nReport: {report_path}"

    if summary_suffix:
        summary_line += f"\n{summary_suffix}"
    return summary_line
Loading
Loading