From b210605b436b0c8800cd771d499fe6335684bb92 Mon Sep 17 00:00:00 2001
From: "wuman.wyf" <wuman.wyf@antgroup.com>
Date: Thu, 11 Jun 2026 10:41:18 +0800
Subject: [PATCH] Support named evaluator judge backends

---
 .../src/aworld_cli/commands/evaluation_cmd.py |  18 +-
 .../src/aworld_cli/evaluator_runtime.py       | 223 ++++++++++++++++--
 .../top_level_commands/evaluator_cmd.py       |  15 +-
 docs/AWorld CLI/Commands/Evaluator.md         |  26 +-
 tests/core/test_evaluator_runtime.py          | 144 ++++++++++-
 .../core/test_evaluator_top_level_command.py  | 132 ++++++++++-
 tests/docs/test_evaluator_report_docs.py      |   3 +
 .../test_answer_quality_judge_fixtures.py     |  67 ++++++
 .../agents/answer_quality_judge.py            |  42 ++++
 .../evaluator_judges/answer_quality_agent.md  | 116 +++++++++
 .../answer_quality_backend.py                 |  62 +++++
 tests/test_slash_commands.py                  |  70 ++++++
 12 files changed, 881 insertions(+), 37 deletions(-)
 create mode 100644 tests/evaluations/test_answer_quality_judge_fixtures.py
 create mode 100644 tests/fixtures/evaluator_judges/agents/answer_quality_judge.py
 create mode 100644 tests/fixtures/evaluator_judges/answer_quality_agent.md
 create mode 100644 tests/fixtures/evaluator_judges/answer_quality_backend.py
diff --git a/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py b/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py
index 74e8cb9ec..59c384cf7 100644
--- a/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py
+++ b/aworld-cli/src/aworld_cli/commands/evaluation_cmd.py
@@ -15,14 +15,14 @@
 def _usage() -> str:
     return """Usage:
   /evaluation --input <path> --kind task --judge-agent <agent.md> [--agent <agent-name>] [--out-dir <dir>]
-  /evaluation --input <path> --kind answer --judge-agent <agent.md> [--out-dir <dir>]
-  /evaluation --input <task.jsonl> --kind trajectory --judge-agent <agent.md> [--agent <agent-name>] [--out-dir <dir>]
+  /evaluation --input <path> --kind answer --judge-agent-name <agent-or-team> [--out-dir <dir>]
+  /evaluation --input <task.jsonl> --kind trajectory --judge-backend-ref <module:callable> [--agent <agent-name>] [--out-dir <dir>]
   /evaluation --input <trajectory.log> --kind trajectory --task-id <id> --judge-agent <agent.md> [--out-dir <dir>]
 
 Examples:
   /evaluation --input ./tasks.jsonl --kind task --judge-agent ./judge_agents/answer_judge.md
-  /evaluation --input ./task_answers.jsonl --kind answer --judge-agent ./judge_agents/answer_judge.md
-  /evaluation --input ./tasks.jsonl --kind trajectory --judge-agent ./judge_agents/trajectory_judge.md
+  /evaluation --input ./task_answers.jsonl --kind answer --judge-agent-name JudgeTeam
+  /evaluation --input ./tasks.jsonl --kind trajectory --judge-backend-ref my_eval.judges:build_backend
   /evaluation --input ~/Documents/logs/trajectory.log --kind trajectory --task-id task_123 --judge-agent ./judge_agents/trajectory_judge.md
 """
 
@@ -31,7 +31,9 @@ def _build_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(prog="/evaluation", add_help=False)
     parser.add_argument("--input", required=True)
     parser.add_argument("--kind", required=True)
-    parser.add_argument("--judge-agent", required=True)
+    parser.add_argument("--judge-agent")
+    parser.add_argument("--judge-agent-name")
+    parser.add_argument("--judge-backend-ref")
     parser.add_argument("--out-dir")
     parser.add_argument("--output")
     parser.add_argument("--task-id")
@@ -88,12 +90,18 @@ async def execute(self, context: CommandContext) -> str:
         if args.help:
             return _usage()
 
+        judge_selectors = (args.judge_agent, args.judge_agent_name, args.judge_backend_ref)
+        if sum(1 for value in judge_selectors if value) != 1:
+            return "Evaluator error: exactly one of --judge-agent, --judge-agent-name, or --judge-backend-ref is required\n\n" + _usage()
+
         try:
             report = await asyncio.to_thread(
                 run_evaluator_source_cli,
                 input=args.input,
                 kind=args.kind,
                 judge_agent=args.judge_agent,
+                judge_agent_name=args.judge_agent_name,
+                judge_backend_ref=args.judge_backend_ref,
                 out_dir=args.out_dir,
                 output=args.output,
                 task_id=args.task_id,
diff --git a/aworld-cli/src/aworld_cli/evaluator_runtime.py b/aworld-cli/src/aworld_cli/evaluator_runtime.py
index 863381b00..80f512721 100644
--- a/aworld-cli/src/aworld_cli/evaluator_runtime.py
+++ b/aworld-cli/src/aworld_cli/evaluator_runtime.py
@@ -2,6 +2,8 @@
 
 import asyncio
 import builtins
+import importlib
+import inspect
 import json
 import time
 from pathlib import Path
@@ -20,9 +22,11 @@
 )
 from aworld.evaluations.substrate import (
     AgentJudgeBackend,
+    CallableJudgeBackend,
     EvaluationFlowDef,
     GateMetricCondition,
     GatePolicyDef,
+    JudgeBackend,
     JudgeSchemaDef,
     StateCheckGrader,
     describe_eval_target,
@@ -165,9 +169,9 @@ def _run_evaluator_hooks(
     - `evaluator.pre_discover` event payload: `target`, `workspace_path`
     - `evaluator.post_discover` event payload: `target`, `workspace_path`, `suite_names`
     - `evaluator.pre_run` event payload for target mode: `mode=target`, `target`, `suite`, `workspace_path`
-    - `evaluator.pre_run` event payload for source mode: `mode=source`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path`
+    - `evaluator.pre_run` event payload for source mode: `mode=source`, `input`, `kind`, `task_id`, judge selector fields, `agent`, `workspace_path`, `output_path`
     - `evaluator.post_run` event payload for target mode: `mode=target`, `report`, `target`, `suite`, `workspace_path`
-    - `evaluator.post_run` event payload for source mode: `mode=source`, `report`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path`
+    - `evaluator.post_run` event payload for source mode: `mode=source`, `report`, `input`, `kind`, `task_id`, judge selector fields, `agent`, `workspace_path`, `output_path`
     - `evaluator.render_summary` event payload: `report`, `workspace_path`
     - mutable state: lightweight CLI assembly metadata only
     - allowed side effects: report upload, notifications, summary augmentation
@@ -253,6 +257,131 @@ def _case_source_metadata(case) -> dict[str, Any]:
     return {}
 
 
+def _judge_selector_count(
+    *,
+    judge_agent: str | None,
+    judge_agent_name: str | None,
+    judge_backend_ref: str | None,
+) -> int:
+    """Count the judge selectors that carry a non-blank value."""
+    selectors = (judge_agent, judge_agent_name, judge_backend_ref)
+    # None and whitespace-only values are treated as "not provided".
+    populated = [item for item in selectors if item is not None and str(item).strip()]
+    return len(populated)
+
+
+def _validate_judge_selectors(
+    *,
+    judge_agent: str | None,
+    judge_agent_name: str | None,
+    judge_backend_ref: str | None,
+) -> None:
+    """Raise ValueError unless exactly one judge selector is populated."""
+    populated = _judge_selector_count(
+        judge_agent=judge_agent, judge_agent_name=judge_agent_name, judge_backend_ref=judge_backend_ref
+    )
+    if populated != 1:
+        raise ValueError("exactly one judge selector is required: --judge-agent, --judge-agent-name, or --judge-backend-ref")
+
+
+def _load_ref(ref: str) -> Any:
+    """Import ``module`` from a ``module:attr.path`` ref and walk the dotted attribute path."""
+    module_name, _, attr_path = ref.partition(":")  # no colon leaves attr_path empty
+    if not (module_name and attr_path):
+        raise ValueError(f"judge backend ref must use module:callable format: {ref}")
+    resolved: Any = importlib.import_module(module_name)
+    for segment in attr_path.split("."):
+        if segment == "":
+            raise ValueError(f"judge backend ref has an empty attribute segment: {ref}")
+        resolved = getattr(resolved, segment)
+    return resolved
+
+
+def _can_call_without_arguments(value: Any) -> bool:
+    """Return True when ``value`` has an inspectable signature with no required parameters."""
+    try:
+        parameters = inspect.signature(value).parameters.values()
+    except (TypeError, ValueError):
+        # No introspectable signature: treat the object as not zero-arg callable.
+        return False
+    variadic = (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)
+    return all(
+        item.kind in variadic or item.default is not inspect.Parameter.empty for item in parameters
+    )
+
+
+def _coerce_source_judge_backend(value: Any, *, backend_id: str) -> JudgeBackend:
+    """Return backend-like objects unchanged, wrap plain callables, and reject anything else."""
+    is_backend_like = hasattr(value, "execute")  # duck-typed: any object exposing ``execute``
+    if not is_backend_like and not callable(value):
+        raise ValueError("judge backend ref must resolve to a JudgeBackend-compatible object or callable")
+    return value if is_backend_like else CallableJudgeBackend(backend_id=backend_id, judge=value)
+
+
+def _load_source_judge_backend_ref(ref: str) -> JudgeBackend:
+    """Resolve a ``module:callable`` ref to a judge backend; zero-arg factories/classes are called once (must be sync)."""
+    value = _load_ref(ref)
+    if hasattr(value, "execute") and not inspect.isclass(value):  # a class is not a ready backend instance
+        return value
+    if callable(value) and _can_call_without_arguments(value):
+        value = value()
+        if inspect.isawaitable(value):
+            raise ValueError("judge backend ref factory must be synchronous")
+    return _coerce_source_judge_backend(value, backend_id=f"judge-backend-ref:{ref}")
+
+
+def _build_cli_agent_judge_backend(*, agent_name: str, backend_id: str, prompt_builder):
+    """Build an AgentJudgeBackend whose executor forwards judge prompts to the named CLI agent."""
+    loaded_executor: Any = None  # loaded on the first judge call, then reused
+
+    async def _executor(prompt, system_prompt):  # system_prompt is accepted but not used here
+        nonlocal loaded_executor
+        if isinstance(prompt, tuple):
+            raise ValueError("CLI agent judge backend only supports text prompts")
+        if loaded_executor is None:
+            loaded_executor = await _load_cli_agent_executor(agent_name)
+        swarm = getattr(loaded_executor, "swarm", None)
+        if swarm is None:
+            response = await loaded_executor.chat(str(prompt))
+        else:
+            response = await Runners.run(input=str(prompt), swarm=swarm)
+        return str(getattr(response, "answer", response))
+
+    return AgentJudgeBackend(
+        backend_id=backend_id,
+        system_prompt=f"CLI agent judge loaded from {agent_name}",
+        executor=_executor,
+        prompt_builder=prompt_builder,
+    )
+
+
+def _resolve_source_judge_backend(
+    *,
+    judge_agent_path: Path | None,
+    judge_agent_name: str | None,
+    judge_backend_ref: str | None,
+    file_backend_id: str,
+    named_backend_prefix: str,
+    prompt_builder,
+) -> JudgeBackend:
+    """Build the judge backend for the first populated selector: markdown path, agent name, then backend ref."""
+    if judge_agent_path is not None:
+        return AgentJudgeBackend.from_agent_markdown(
+            judge_agent_path, backend_id=file_backend_id, prompt_builder=prompt_builder
+        )
+    # Blank or whitespace-only strings are treated the same as a missing selector.
+    agent_name = str(judge_agent_name).strip() if judge_agent_name is not None else ""
+    if agent_name:
+        return _build_cli_agent_judge_backend(
+            agent_name=agent_name, backend_id=f"{named_backend_prefix}:{agent_name}", prompt_builder=prompt_builder
+        )
+    # file_backend_id, named_backend_prefix and prompt_builder are not used for backend refs.
+    backend_ref = str(judge_backend_ref).strip() if judge_backend_ref is not None else ""
+    if backend_ref:
+        return _load_source_judge_backend_ref(backend_ref)
+    raise ValueError("exactly one judge selector is required: --judge-agent, --judge-agent-name, or --judge-backend-ref")
+
+
 class _CliAgentRuntimeHarness:
     def __init__(self, *, agent_name: str):
         self.agent_name = agent_name
@@ -440,7 +569,9 @@ def _build_source_suite(
     *,
     kind: str,
     input_path: Path,
-    judge_agent_path: Path,
+    judge_agent_path: Path | None,
+    judge_agent_name: str | None = None,
+    judge_backend_ref: str | None = None,
     task_id: str | None,
     id_field: str,
     task_field: str,
@@ -486,15 +617,19 @@ def _build_source_suite(
             id_field=id_field,
             input_field=task_field,
         )
+        judge_backend = _resolve_source_judge_backend(
+            judge_agent_path=judge_agent_path,
+            judge_agent_name=judge_agent_name,
+            judge_backend_ref=judge_backend_ref,
+            file_backend_id="source-agent-md",
+            named_backend_prefix="source-agent",
+            prompt_builder=_build_source_prompt,
+        )
         return create_source_eval_suite(
             suite_id="task-source-evaluator",
             source=source,
             runtime_harness=_build_cli_agent_runtime_harness(agent_name=agent_name),
-            judge_backend=AgentJudgeBackend.from_agent_markdown(
-                judge_agent_path,
-                backend_id="source-agent-md",
-                prompt_builder=_build_source_prompt,
-            ),
+            judge_backend=judge_backend,
             judge_schema=JudgeSchemaDef(output_model=_SourceJudgeOutput),
             gate_policy=answer_gate,
             metadata={"agent": agent_name},
@@ -507,14 +642,18 @@ def _build_source_suite(
             input_field=task_field,
             answer_field=answer_field,
         )
+        judge_backend = _resolve_source_judge_backend(
+            judge_agent_path=judge_agent_path,
+            judge_agent_name=judge_agent_name,
+            judge_backend_ref=judge_backend_ref,
+            file_backend_id="source-agent-md",
+            named_backend_prefix="source-agent",
+            prompt_builder=_build_source_prompt,
+        )
         return create_source_eval_suite(
             suite_id="answer-source-evaluator",
             source=source,
-            judge_backend=AgentJudgeBackend.from_agent_markdown(
-                judge_agent_path,
-                backend_id="source-agent-md",
-                prompt_builder=_build_source_prompt,
-            ),
+            judge_backend=judge_backend,
             judge_schema=JudgeSchemaDef(output_model=_SourceJudgeOutput),
             gate_policy=answer_gate,
         )
@@ -534,15 +673,19 @@ def _build_source_suite(
                 input_field=task_field,
             )
             runtime_harness = _build_cli_agent_runtime_harness(agent_name=agent_name)
+        judge_backend = _resolve_source_judge_backend(
+            judge_agent_path=judge_agent_path,
+            judge_agent_name=judge_agent_name,
+            judge_backend_ref=judge_backend_ref,
+            file_backend_id="trajectory-evaluator-agent-md",
+            named_backend_prefix="trajectory-evaluator-agent",
+            prompt_builder=_build_trajectory_prompt,
+        )
         return create_source_eval_suite(
             suite_id="trajectory-source-evaluator",
             source=source,
             runtime_harness=runtime_harness,
-            judge_backend=AgentJudgeBackend.from_agent_markdown(
-                judge_agent_path,
-                backend_id="trajectory-evaluator-agent-md",
-                prompt_builder=_build_trajectory_prompt,
-            ),
+            judge_backend=judge_backend,
             judge_schema=TrajectoryJudgeSchema.default(),
             outcome_scorers=trajectory_outcome_scorers,
             gate_policy=trajectory_gate,
@@ -556,7 +699,9 @@ def run_evaluator_source_cli(
     *,
     input: str,
     kind: str,
-    judge_agent: str,
+    judge_agent: str | None = None,
+    judge_agent_name: str | None = None,
+    judge_backend_ref: str | None = None,
     out_dir: str | None = None,
     output: str | None = None,
     task_id: str | None = None,
@@ -571,8 +716,13 @@ def run_evaluator_source_cli(
     input_path = Path(input).expanduser().resolve()
     if not input_path.exists():
         raise FileNotFoundError(f"source input does not exist: {input_path}")
-    judge_agent_path = Path(judge_agent).expanduser().resolve()
-    if not judge_agent_path.exists():
+    _validate_judge_selectors(
+        judge_agent=judge_agent,
+        judge_agent_name=judge_agent_name,
+        judge_backend_ref=judge_backend_ref,
+    )
+    judge_agent_path = Path(judge_agent).expanduser().resolve() if judge_agent else None
+    if judge_agent_path is not None and not judge_agent_path.exists():
         raise FileNotFoundError(f"judge agent does not exist: {judge_agent_path}")
 
     workspace_path = str(input_path.parent if input_path.is_file() else input_path)
@@ -581,7 +731,9 @@ def run_evaluator_source_cli(
         "input": str(input_path),
         "kind": kind,
         "task_id": task_id,
-        "judge_agent": str(judge_agent_path),
+        "judge_agent": str(judge_agent_path) if judge_agent_path is not None else None,
+        "judge_agent_name": judge_agent_name,
+        "judge_backend_ref": judge_backend_ref,
         "agent": agent,
         "workspace_path": workspace_path,
         "output_path": str(Path(output).expanduser().resolve()) if output else None,
@@ -595,7 +747,9 @@ def run_evaluator_source_cli(
             "input": str(input_path),
             "kind": kind,
             "task_id": task_id,
-            "judge_agent": str(judge_agent_path),
+            "judge_agent": str(judge_agent_path) if judge_agent_path is not None else None,
+            "judge_agent_name": judge_agent_name,
+            "judge_backend_ref": judge_backend_ref,
             "agent": agent,
             "interactive_approval": interactive_approval,
         },
@@ -604,6 +758,8 @@ def run_evaluator_source_cli(
         kind=kind,
         input_path=input_path,
         judge_agent_path=judge_agent_path,
+        judge_agent_name=judge_agent_name,
+        judge_backend_ref=judge_backend_ref,
         task_id=task_id,
         id_field=id_field,
         task_field=task_field,
@@ -618,11 +774,24 @@ def run_evaluator_source_cli(
         "target_path": str(input_path),
         "source_kind": kind,
         "task_id": task_id,
-        "judge_agent": str(judge_agent_path),
+        "judge_agent": str(judge_agent_path) if judge_agent_path is not None else None,
+        "judge_agent_name": judge_agent_name,
+        "judge_backend_ref": judge_backend_ref,
         "agent": agent_name if executes_agent else agent,
     }
     for key, value in hook_state.items():
-        if key not in {"mode", "input", "kind", "task_id", "judge_agent", "agent", "interactive_approval", "summary_suffix"}:
+        if key not in {
+            "mode",
+            "input",
+            "kind",
+            "task_id",
+            "judge_agent",
+            "judge_agent_name",
+            "judge_backend_ref",
+            "agent",
+            "interactive_approval",
+            "summary_suffix",
+        }:
             target_info[key] = value
     flow = EvaluationFlowDef(
         target=target_info,
@@ -647,7 +816,9 @@ def run_evaluator_source_cli(
         "input": str(input_path),
         "kind": kind,
         "task_id": task_id,
-        "judge_agent": str(judge_agent_path),
+        "judge_agent": str(judge_agent_path) if judge_agent_path is not None else None,
+        "judge_agent_name": judge_agent_name,
+        "judge_backend_ref": judge_backend_ref,
         "agent": agent_name if executes_agent else agent,
     }
     report["automation"] = _build_automation_summary(report)
diff --git a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py
index 3d90f5186..e6fdfb365 100644
--- a/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py
+++ b/aworld-cli/src/aworld_cli/top_level_commands/evaluator_cmd.py
@@ -45,6 +45,8 @@ def register_parser(self, subparsers) -> None:
         parser.add_argument("--input", type=str)
         parser.add_argument("--kind", type=str)
         parser.add_argument("--judge-agent", type=str)
+        parser.add_argument("--judge-agent-name", type=str)
+        parser.add_argument("--judge-backend-ref", type=str)
         parser.add_argument("--out-dir", type=str)
         parser.add_argument("--task-id", type=str)
         parser.add_argument("--agent", type=str)
@@ -68,14 +70,21 @@ def run(self, args, context) -> int:
             if not getattr(args, "kind", None):
                 print("Evaluator error: --kind is required with --input")
                 return 1
-            if not getattr(args, "judge_agent", None):
-                print("Evaluator error: --judge-agent is required with --input")
+            judge_selectors = [
+                getattr(args, "judge_agent", None),
+                getattr(args, "judge_agent_name", None),
+                getattr(args, "judge_backend_ref", None),
+            ]
+            if sum(1 for value in judge_selectors if value) != 1:
+                print("Evaluator error: exactly one of --judge-agent, --judge-agent-name, or --judge-backend-ref is required with --input")
                 return 1
             try:
                 report = run_evaluator_source_cli(
                     input=args.input,
                     kind=args.kind,
                     judge_agent=args.judge_agent,
+                    judge_agent_name=args.judge_agent_name,
+                    judge_backend_ref=args.judge_backend_ref,
                     out_dir=args.out_dir,
                     output=args.output,
                     task_id=args.task_id,
@@ -94,6 +103,8 @@ def run(self, args, context) -> int:
         source_only_args = (
             ("kind", "--kind"),
             ("judge_agent", "--judge-agent"),
+            ("judge_agent_name", "--judge-agent-name"),
+            ("judge_backend_ref", "--judge-backend-ref"),
             ("out_dir", "--out-dir"),
             ("task_id", "--task-id"),
             ("agent", "--agent"),
diff --git a/docs/AWorld CLI/Commands/Evaluator.md b/docs/AWorld CLI/Commands/Evaluator.md
index 5d86f73c9..2467bce98 100644
--- a/docs/AWorld CLI/Commands/Evaluator.md	
+++ b/docs/AWorld CLI/Commands/Evaluator.md	
@@ -45,6 +45,18 @@ aworld-cli evaluator \
   --judge-agent ./judge_agents/answer_judge.md \
   --out-dir ./reports
 
+aworld-cli evaluator \
+  --input ./task_answers.jsonl \
+  --kind answer \
+  --judge-agent-name answer-quality-judge \
+  --out-dir ./reports
+
+aworld-cli evaluator \
+  --input ./task_answers.jsonl \
+  --kind answer \
+  --judge-backend-ref my_eval.judges:build_backend \
+  --out-dir ./reports
+
 aworld-cli evaluator \
   --input ~/Documents/logs/trajectory.log \
   --kind trajectory \
@@ -67,6 +79,14 @@ aworld-cli evaluator \
 
 For `task` JSONL inputs, the default fields are `id` and `input`; the evaluator runs each task through the CLI default `Aworld` agent unless `--agent` is supplied. For `trajectory`, passing `--task-id` replays one task from an existing AWorld trajectory log, omitting `--task-id` with a trajectory log replays all tasks in that log, and omitting `--task-id` with task JSONL runs the main agent, extracts the response trajectory, and evaluates that generated trajectory. For `answer` JSONL inputs, the default fields are `id`, `input`, and `answer`. Use `--id-field`, `--task-field`, and `--answer-field` only when the file uses different names.
 
+Source-backed runs require exactly one judge selector:
+
+- `--judge-agent <agent.md>` loads a markdown judge prompt directly.
+- `--judge-agent-name <agent-or-team>` loads an AWorld CLI registered local agent or team by name and runs the judge prompt through that agent's executor. Use `--agent-dir` in commands that support it, or set `LOCAL_AGENTS_DIR`, when the judge agent is not in the default agent search path.
+- `--judge-backend-ref <module:callable>` imports a process-local Python factory or callable. A synchronous factory that can be called without arguments is called once and may return a `JudgeBackend`-compatible object such as `AgentJudgeBackend` / `CallableJudgeBackend`; a raw callable (one that requires arguments) is wrapped as a callable judge backend.
+
+The three judge selectors are mutually exclusive. They are CLI assembly choices and are not serialized into suite manifests as live handles.
+
 Useful options:
 
 ```bash
@@ -75,6 +95,8 @@ aworld-cli evaluator --target ./artifact --interactive-approval
 aworld-cli evaluator --input ./tasks.jsonl --kind task --judge-agent ./agent.md --agent Aworld --output ./report.json
 aworld-cli evaluator --input ./tasks.jsonl --kind trajectory --judge-agent ./trajectory_agent.md --output ./report.json
 aworld-cli evaluator --input ./task_answers.jsonl --kind answer --judge-agent ./agent.md --output ./report.json
+LOCAL_AGENTS_DIR=./judge_agents aworld-cli evaluator --input ./task_answers.jsonl --kind answer --judge-agent-name answer-quality-judge
+aworld-cli evaluator --input ./task_answers.jsonl --kind answer --judge-backend-ref my_eval.judges:build_backend
 ```
 
 ## Declared Suite Manifests
@@ -133,9 +155,9 @@ Current event payloads:
 - `evaluator.pre_discover`: `target`, `workspace_path`
 - `evaluator.post_discover`: `target`, `workspace_path`, `suite_names`
 - `evaluator.pre_run` for target mode: `mode`, `target`, `suite`, `workspace_path`
-- `evaluator.pre_run` for source mode: `mode`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path`
+- `evaluator.pre_run` for source mode: `mode`, `input`, `kind`, `task_id`, `judge_agent`, `judge_agent_name`, `judge_backend_ref`, `agent`, `workspace_path`, `output_path`
 - `evaluator.post_run` for target mode: `mode`, `report`, `target`, `suite`, `workspace_path`
-- `evaluator.post_run` for source mode: `mode`, `report`, `input`, `kind`, `task_id`, `judge_agent`, `agent`, `workspace_path`, `output_path`
+- `evaluator.post_run` for source mode: `mode`, `report`, `input`, `kind`, `task_id`, `judge_agent`, `judge_agent_name`, `judge_backend_ref`, `agent`, `workspace_path`, `output_path`
 - `evaluator.render_summary`: `report`, `workspace_path`
 
 Hook boundaries:
diff --git a/tests/core/test_evaluator_runtime.py b/tests/core/test_evaluator_runtime.py
index 65ffd1a8c..59ac8a159 100644
--- a/tests/core/test_evaluator_runtime.py
+++ b/tests/core/test_evaluator_runtime.py
@@ -29,6 +29,10 @@
 from aworld_cli.evaluator_rendering import render_evaluator_summary
 
 
+def _write_answer_source(path: Path) -> None:  # writes the one-case answer JSONL fixture shared by these tests
+    path.write_text('{"id":"case-1","input":"question","answer":"existing"}\n', encoding="utf-8")
+
+
 @pytest.fixture(autouse=True)
 def _reset_eval_registry_state(monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.setattr(substrate_module, "_EVAL_SUITE_REGISTRY", {})
@@ -83,7 +87,7 @@ def test_run_evaluator_source_cli_builds_task_answer_flow_with_default_fields(
     tmp_path: Path,
 ) -> None:
     input_path = tmp_path / "answers.jsonl"
-    input_path.write_text('{"id":"case-1","input":"question","answer":"existing"}\n', encoding="utf-8")
+    _write_answer_source(input_path)
     judge_agent = tmp_path / "agent.md"
     judge_agent.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8")
     output = tmp_path / "report.json"
@@ -121,6 +125,144 @@ async def fake_run_evaluation_flow(flow):
     assert output.exists()
 
 
+def test_run_evaluator_source_cli_supports_cli_judge_agent_name(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:  # judge_agent_name alone should route judging through the named CLI agent executor
+    input_path = tmp_path / "answers.jsonl"
+    _write_answer_source(input_path)
+    captured = {}  # records what the fakes observe so the assertions below can inspect it
+
+    class FakeExecutor:  # exposes only ``chat``; there is no ``swarm`` attribute
+        async def chat(self, prompt):
+            captured["prompt"] = prompt
+            return '{"score": 91, "verdict": "Pass", "veto_triggered": false}'
+
+    async def fake_load_cli_agent_executor(agent_name):
+        captured["agent_name"] = agent_name
+        return FakeExecutor()
+
+    monkeypatch.setattr(
+        "aworld_cli.evaluator_runtime._load_cli_agent_executor",
+        fake_load_cli_agent_executor,
+    )
+
+    async def fake_run_evaluation_flow(flow):  # stands in for the real flow runner
+        captured["flow"] = flow
+        execution = await flow.suite.judge_backend.execute(  # drive the judge backend on the first case
+            flow.suite.cases[0].input,
+            {"answer": "existing"},
+            flow.suite,
+        )
+        return {
+            "report_version": 1,
+            "suite_id": "answer-source-evaluator",
+            "judge_backend": {"backend_id": execution.backend_id},
+            "summary": {"answer-source-evaluator": {"score": {"mean": execution.payload["score"]}}},
+            "results": [],
+            "gate": {"status": "pass", "metric_name": "score", "value": execution.payload["score"]},
+            "approval": {"required": False, "resolved": False, "approved": None},
+        }
+
+    monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow)
+
+    report = run_evaluator_source_cli(
+        input=str(input_path),
+        kind="answer",
+        judge_agent_name="JudgeTeam",
+        output=str(tmp_path / "report.json"),
+    )
+
+    assert captured["agent_name"] == "JudgeTeam"
+    assert captured["flow"].suite.judge_backend.backend_id == "source-agent:JudgeTeam"
+    assert report["judge_backend"]["backend_id"] == "source-agent:JudgeTeam"
+    assert report["source_selection"]["judge_agent_name"] == "JudgeTeam"
+    assert report["source_selection"]["judge_agent"] is None  # no markdown judge path was supplied
+
+
+def test_run_evaluator_source_cli_supports_judge_backend_ref(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:  # judge_backend_ref alone should import and use the referenced factory's backend
+    input_path = tmp_path / "answers.jsonl"
+    _write_answer_source(input_path)
+    module_path = tmp_path / "custom_judge.py"  # throwaway module that the ref below points at
+    module_path.write_text(
+        "\n".join(
+            [
+                "from aworld.evaluations.substrate import CallableJudgeBackend",
+                "",
+                "async def judge(case_input, target):",
+                "    return {'score': 82, 'verdict': 'Pass', 'veto_triggered': False}",
+                "",
+                "def build_backend():",
+                "    return CallableJudgeBackend(backend_id='custom-backend', judge=judge)",
+            ]
+        ),
+        encoding="utf-8",
+    )
+    monkeypatch.syspath_prepend(str(tmp_path))  # makes ``custom_judge`` importable by name
+    captured = {}
+
+    async def fake_run_evaluation_flow(flow):  # stands in for the real flow runner
+        captured["flow"] = flow
+        execution = await flow.suite.judge_backend.execute(  # drive the judge backend on the first case
+            flow.suite.cases[0].input,
+            {"answer": "existing"},
+            flow.suite,
+        )
+        return {
+            "report_version": 1,
+            "suite_id": "answer-source-evaluator",
+            "judge_backend": {"backend_id": execution.backend_id},
+            "summary": {"answer-source-evaluator": {"score": {"mean": execution.payload["score"]}}},
+            "results": [],
+            "gate": {"status": "pass", "metric_name": "score", "value": execution.payload["score"]},
+            "approval": {"required": False, "resolved": False, "approved": None},
+        }
+
+    monkeypatch.setattr("aworld_cli.evaluator_runtime.run_evaluation_flow", fake_run_evaluation_flow)
+
+    report = run_evaluator_source_cli(
+        input=str(input_path),
+        kind="answer",
+        judge_backend_ref="custom_judge:build_backend",
+        output=str(tmp_path / "report.json"),
+    )
+
+    assert captured["flow"].suite.judge_backend.backend_id == "custom-backend"  # id set by the factory
+    assert report["judge_backend"]["backend_id"] == "custom-backend"
+    assert report["source_selection"]["judge_backend_ref"] == "custom_judge:build_backend"
+
+
+def test_run_evaluator_source_cli_rejects_missing_judge_selector(tmp_path: Path) -> None:
+    """Omitting every judge selector must fail validation with the selector ValueError."""
+    source_file = tmp_path / "answers.jsonl"
+    _write_answer_source(source_file)
+    report_file = tmp_path / "report.json"
+    # No judge_agent / judge_agent_name / judge_backend_ref keyword is supplied on purpose.
+    cli_kwargs = {"input": str(source_file), "kind": "answer", "output": str(report_file)}
+
+    with pytest.raises(ValueError, match="exactly one judge selector"):
+        run_evaluator_source_cli(**cli_kwargs)
+
+
+def test_run_evaluator_source_cli_rejects_multiple_judge_selectors(tmp_path: Path) -> None:
+    """Supplying two judge selectors at once must fail with the selector ValueError."""
+    source_file = tmp_path / "answers.jsonl"
+    _write_answer_source(source_file)
+    markdown_judge = tmp_path / "agent.md"
+    markdown_judge.write_text("---\nname: judge\n---\nJudge.\n", encoding="utf-8")
+    # Both a markdown judge path and a named judge agent are passed together.
+    conflicting_selectors = {"judge_agent": str(markdown_judge), "judge_agent_name": "JudgeTeam"}
+
+    with pytest.raises(ValueError, match="exactly one judge selector"):
+        run_evaluator_source_cli(
+            input=str(source_file), kind="answer", output=str(tmp_path / "report.json"),
+            **conflicting_selectors,
+        )
+
+
 def test_run_evaluator_source_cli_builds_task_flow_with_default_agent(
     monkeypatch: pytest.MonkeyPatch,
     tmp_path: Path,
diff --git a/tests/core/test_evaluator_top_level_command.py b/tests/core/test_evaluator_top_level_command.py
index 41af8bbe9..77caa4ba6 100644
--- a/tests/core/test_evaluator_top_level_command.py
+++ b/tests/core/test_evaluator_top_level_command.py
@@ -122,6 +122,96 @@ def fake_run_evaluator_source_cli(**kwargs):
     assert "pass" in output
 
 
+def test_maybe_dispatch_top_level_command_accepts_judge_agent_name(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    """Dispatch `evaluator --judge-agent-name` to the source runtime.
+
+    The stubbed runtime must receive the name as the only judge selector that is set.
+    """
+    answers_file = tmp_path / "answers.jsonl"
+    answers_file.write_text('{"id":"case-1","input":"question","answer":"answer"}\n', encoding="utf-8")
+    forwarded = {}
+    stub_report = {
+        "suite_id": "answer-source-evaluator",
+        "gate": {"status": "pass"},
+        "summary": {"answer-source-evaluator": {"score": {"mean": 0.9}}},
+        "results": [],
+        "approval": {"required": False, "resolved": False, "approved": None},
+    }
+
+    def fake_run_evaluator_source_cli(**kwargs):
+        # Record what the command passed down instead of running a real evaluation.
+        forwarded.update(kwargs)
+        return stub_report
+
+    monkeypatch.setattr(
+        "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli",
+        fake_run_evaluator_source_cli,
+    )
+
+    argv = ["aworld-cli", "evaluator"]
+    argv += ["--input", str(answers_file)]
+    argv += ["--kind", "answer"]
+    argv += ["--judge-agent-name", "JudgeTeam"]
+
+    handled = main_module._maybe_dispatch_top_level_command(argv)
+    output = capsys.readouterr().out
+
+    assert handled is True
+    assert forwarded["judge_agent"] is None
+    assert forwarded["judge_agent_name"] == "JudgeTeam"
+    assert forwarded["judge_backend_ref"] is None
+    assert "answer-source-evaluator" in output
+
+
+def test_maybe_dispatch_top_level_command_accepts_judge_backend_ref(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    """Dispatch `evaluator --judge-backend-ref` to the source runtime.
+
+    The stubbed runtime must receive the backend ref as the only judge selector that is set.
+    """
+    answers_file = tmp_path / "answers.jsonl"
+    answers_file.write_text('{"id":"case-1","input":"question","answer":"answer"}\n', encoding="utf-8")
+    forwarded = {}
+    stub_report = {
+        "suite_id": "answer-source-evaluator",
+        "gate": {"status": "pass"},
+        "summary": {"answer-source-evaluator": {"score": {"mean": 0.9}}},
+        "results": [],
+        "approval": {"required": False, "resolved": False, "approved": None},
+    }
+
+    def fake_run_evaluator_source_cli(**kwargs):
+        # Record what the command passed down instead of running a real evaluation.
+        forwarded.update(kwargs)
+        return stub_report
+
+    monkeypatch.setattr(
+        "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli",
+        fake_run_evaluator_source_cli,
+    )
+
+    argv = ["aworld-cli", "evaluator"]
+    argv += ["--input", str(answers_file)]
+    argv += ["--kind", "answer"]
+    argv += ["--judge-backend-ref", "custom_judge:build_backend"]
+
+    handled = main_module._maybe_dispatch_top_level_command(argv)
+    output = capsys.readouterr().out
+
+    assert handled is True
+    assert forwarded["judge_agent"] is None
+    assert forwarded["judge_agent_name"] is None
+    assert forwarded["judge_backend_ref"] == "custom_judge:build_backend"
+    assert "answer-source-evaluator" in output
+
+
 def test_maybe_dispatch_top_level_command_runs_task_source_with_default_agent(
     monkeypatch: pytest.MonkeyPatch,
     tmp_path: Path,
@@ -277,7 +367,7 @@ def test_evaluator_source_run_rejects_other_target_mode_arguments(
     assert expected in output
 
 
-def test_evaluator_source_mode_requires_kind_and_judge_agent(
+def test_evaluator_source_mode_requires_kind_and_judge_selector(
     monkeypatch: pytest.MonkeyPatch,
     capsys: pytest.CaptureFixture[str],
 ) -> None:
@@ -293,6 +383,8 @@ def test_evaluator_source_mode_requires_kind_and_judge_agent(
             input="answers.jsonl",
             kind=None,
             judge_agent=None,
+            judge_agent_name=None,
+            judge_backend_ref=None,
             out_dir=None,
             output=None,
             task_id=None,
@@ -313,6 +405,44 @@ def test_evaluator_source_mode_requires_kind_and_judge_agent(
     assert "--kind is required with --input" in output
 
 
+def test_evaluator_source_mode_requires_one_judge_selector(
+    monkeypatch: pytest.MonkeyPatch,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    """Source mode with --kind but no judge selector must print the selector error and return 1."""
+    monkeypatch.setattr(
+        "aworld_cli.top_level_commands.evaluator_cmd.run_evaluator_source_cli",
+        lambda **kwargs: pytest.fail("source runtime should not be called"),
+    )
+    # Parsed-args stand-in: input and kind are set, all three judge selectors are None.
+    args = SimpleNamespace(
+        target=None,
+        suite=None,
+        input="answers.jsonl",
+        kind="answer",
+        judge_agent=None,
+        judge_agent_name=None,
+        judge_backend_ref=None,
+        out_dir=None,
+        output=None,
+        task_id=None,
+        agent=None,
+        id_field="id",
+        task_field="input",
+        answer_field="answer",
+        interactive_approval=False,
+        list_suites=False,
+        print_report_schema=False,
+        validate_report=None,
+    )
+
+    exit_code = EvaluatorTopLevelCommand().run(args, TopLevelCommandContext(cwd="/tmp"))
+    output = capsys.readouterr().out
+
+    assert exit_code == 1
+    assert "exactly one of --judge-agent, --judge-agent-name, or --judge-backend-ref is required" in output
+
+
 def test_evaluator_command_returns_nonzero_for_unresolved_approval(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
diff --git a/tests/docs/test_evaluator_report_docs.py b/tests/docs/test_evaluator_report_docs.py
index 801aab593..ef1fd8994 100644
--- a/tests/docs/test_evaluator_report_docs.py
+++ b/tests/docs/test_evaluator_report_docs.py
@@ -17,6 +17,9 @@ def test_evaluator_report_command_doc_covers_schema_and_validation() -> None:
     assert "--kind task" in content
     assert "--kind answer" in content
     assert "--kind trajectory" in content
+    assert "--judge-agent-name" in content
+    assert "--judge-backend-ref" in content
+    assert "exactly one judge selector" in content
     assert "report_format" in content
     assert "automation" in content
     assert ".aworld/evaluators/*.json" in content
diff --git a/tests/evaluations/test_answer_quality_judge_fixtures.py b/tests/evaluations/test_answer_quality_judge_fixtures.py
new file mode 100644
index 000000000..498bc6a06
--- /dev/null
+++ b/tests/evaluations/test_answer_quality_judge_fixtures.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+import asyncio
+import importlib
+import sys
+from pathlib import Path
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "aworld-cli" / "src"))
+
+
+FIXTURE_ROOT = Path(__file__).resolve().parents[1] / "fixtures" / "evaluator_judges"
+
+
+def test_answer_quality_prompt_fixture_matches_judge_contract() -> None:
+    """The prompt fixture must contain its title, the Q1 key, the veto flag and the JSON-only rule."""
+    prompt_file = FIXTURE_ROOT / "answer_quality_agent.md"
+    prompt_text = prompt_file.read_text(encoding="utf-8")
+    required = ("Answer Quality Evaluator", "Q1_correctness", "veto_triggered", "最终回复必须是且仅是一个 JSON 对象")
+    for marker in required:
+        assert marker in prompt_text
+
+
+def test_answer_quality_judge_agent_name_fixture_registers_local_swarm(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Loading the fixture agents directory registers the judge and yields a swarm using the prompt."""
+    from aworld_cli.core.agent_registry import LocalAgentRegistry
+    from aworld_cli.core.loader import init_agents
+
+    agents_dir = FIXTURE_ROOT / "agents"
+    # Clear LocalAgentRegistry._instance (presumably the cached singleton) for this test only.
+    monkeypatch.setattr(LocalAgentRegistry, "_instance", None)
+    init_agents(agents_dir)
+
+    registered = LocalAgentRegistry.get_agent("answer-quality-judge")
+    assert registered is not None
+    assert registered.register_dir == str(agents_dir.resolve())
+
+    first_agent = asyncio.run(registered.get_swarm(refresh=True)).topology[0]
+    assert first_agent.name() == "answer-quality-judge"
+    assert all(key in first_agent.system_prompt for key in ("Answer Quality Evaluator", "Q1_correctness"))
+
+
+def test_answer_quality_backend_ref_fixture_builds_prompt_backed_backend(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """The backend-ref fixture module builds a backend that carries the prompt and can be executed."""
+    expected_id = "answer-quality-fixture-backend"
+    # Make the fixture directory importable; monkeypatch restores sys.path afterwards.
+    monkeypatch.syspath_prepend(str(FIXTURE_ROOT))
+    backend = importlib.import_module("answer_quality_backend").build_backend()
+
+    assert backend.backend_id == expected_id
+    for marker in ("Answer Quality Evaluator", "Q1_correctness"):
+        assert marker in backend.system_prompt
+
+    case_input = {"input": "What is AWorld?"}
+    target = {"answer": "AWorld is an agent framework."}
+    execution = asyncio.run(backend.execute(case_input, target, suite=None))
+
+    assert execution.backend_id == expected_id
+    # The verdict fields checked below are produced by the fixture's canned executor.
+    payload = execution.payload
+    assert payload["verdict"] == "Pass"
+    assert payload["veto_triggered"] is False
diff --git a/tests/fixtures/evaluator_judges/agents/answer_quality_judge.py b/tests/fixtures/evaluator_judges/agents/answer_quality_judge.py
new file mode 100644
index 000000000..4a05b89cb
--- /dev/null
+++ b/tests/fixtures/evaluator_judges/agents/answer_quality_judge.py
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+from aworld.agents.llm_agent import Agent
+from aworld.config import AgentConfig, ModelConfig
+from aworld.core.agent.swarm import Swarm
+from aworld_cli.core import agent
+
+
+PROMPT_PATH = Path(__file__).resolve().parents[1] / "answer_quality_agent.md"
+
+
+def _prompt() -> str:  # Load the judge system prompt text from PROMPT_PATH (decoded as UTF-8).
+    return PROMPT_PATH.read_text(encoding="utf-8")  # entire file, YAML front matter included
+
+
+@agent(
+    name="answer-quality-judge",
+    desc="Answer-quality judge fixture for evaluator source tests.",
+    metadata={"source": "tests.fixture", "prompt_path": str(PROMPT_PATH)},
+    unique=True,
+)
+def build_answer_quality_judge_swarm() -> Swarm:
+    """Build a single-agent, single-step swarm whose judge uses the fixture prompt and env-driven LLM settings."""
+    env = os.environ
+    model = ModelConfig(
+        llm_model_name=env.get("LLM_MODEL_NAME", "gpt-4"),
+        llm_provider=env.get("LLM_PROVIDER", "openai"),
+        llm_api_key=env.get("LLM_API_KEY") or env.get("OPENAI_API_KEY"),  # falls back to OPENAI_API_KEY
+        llm_base_url=env.get("LLM_BASE_URL", "https://api.openai.com/v1"),
+        llm_temperature=float(env.get("LLM_TEMPERATURE", "0.1")),
+        params={"max_completion_tokens": 4096},
+    )
+    judge = Agent(
+        name="answer-quality-judge",
+        desc="Evaluates task answers and returns the evaluator JSON schema.",
+        conf=AgentConfig(llm_config=model, skill_configs={}),
+        system_prompt=_prompt(),
+    )
+    return Swarm(judge, max_steps=1, name="answer-quality-judge-swarm")
diff --git a/tests/fixtures/evaluator_judges/answer_quality_agent.md b/tests/fixtures/evaluator_judges/answer_quality_agent.md
new file mode 100644
index 000000000..ea85ef4f2
--- /dev/null
+++ b/tests/fixtures/evaluator_judges/answer_quality_agent.md
@@ -0,0 +1,116 @@
+---
+name: answer-quality-judge
+description: 使用 LLM-as-judge 对「问题 ↔ 答案」对做答案质量评估（reference-free，可选参考答案）。仅评估最终答案本身的正确性、完整性、贴合度、可读性与忠实度，不评估执行过程或工具使用。输入由 evaluator 框架以 JSON 注入（case + state.answer），无需读日志、无需调用工具。
+tools: Read, Write
+model: opus
+---
+
+# Answer Quality Evaluator（LLM-as-Judge）
+
+你是一名严格、以证据为准的 **答案质量评审员**。你的职责是对一条「问题 ↔ 答案」对做可复现、可量化的评估，**只覆盖最终答案本身的质量**，不评估 agent 的执行过程、工具使用或轨迹。
+
+你**就是**这里的 LLM judge：所有打分由你完成，不调用外部模型，也**不需要读取任何文件或运行任何命令**。
+
+## 评估输入（由框架注入）
+
+evaluator 框架会以**单个 JSON 对象**作为你的输入消息，结构形如：
+
+```json
+{
+  "case": { "task_id": "...", "input": "用户的原始问题/任务" },
+  "state": { "answer": "待评估的最终答案", "status": "...", "artifacts": {}, "trajectory": [], "tool_calls": [] },
+  "required_output_schema": { "score": "number 0-100", "verdict": "string" },
+  "instruction": "Evaluate the existing answer/state and return exactly one JSON object."
+}
+```
+
+判据来源（按优先级）：
+
+1. **`case.input`**：用户实际想要什么——这是判断「相关性/贴合度」和「完整性」的标尺。
+2. **`state.answer`**：被评估的答案——所有评分的对象。
+3. **参考答案（若存在）**：若 `case.input` 或 `state.artifacts` 中显式给出了 reference / 标准答案 / 验收要点，则以其为「正确性」基准；否则按 **reference-free** 处理，仅凭答案的内在一致性与常识可验证性判断。
+
+> 若 `state.answer` 为空或缺失，直接判 `Fail`、`score=0`（此特例不套用阶段 2 的加权公式）、`Q1=1`、`veto_triggered=true`，并在 `notes` 中说明。
+
+---
+
+## 阶段 1 · 评分（五维，1–5 分，带锚点）
+
+对每个维度给出 1–5 的整数分，并**引用答案中的具体片段或问题中的具体要求**作为依据。严禁仅凭印象打分。
+
+锚点统一含义：**5=优秀无明显问题 / 4=良好有小瑕疵 / 3=合格但有明确缺陷 / 2=较差影响可用性 / 1=不合格**。
+
+| 维度 | 权重 | 评什么 | 扣分信号 |
+|---|---|---|---|
+| Q1 正确性 / Correctness | 30% | 答案中的事实性断言、计算、结论是否正确；有参考时是否与参考一致 | 与参考矛盾；事实错误；计算/逻辑错误；编造数字、引述、专有名词 |
+| Q2 完整性 / Completeness | 25% | 是否覆盖问题的全部子诉求与关键要点，无关键遗漏 | 漏答子问题；只答一半；缺少必要前提/边界 |
+| Q3 贴合度 / Relevance & Instruction-following | 20% | 是否回答了**实际被问的问题**，是否遵守问题中的显式约束（格式、语言、长度、口吻等） | 答非所问；主题漂移；违反明确的格式/语言/长度要求 |
+| Q4 可读性 / Clarity | 15% | 组织、清晰度、长度适配、表达是否凝练无歧义 | 冗长堆砌；结构混乱；表述含混；语言与提问不一致 |
+| Q5 忠实度 / Faithfulness | 10% | 是否不臆造、不过度自信；不确定处是否如实标注；有参考时是否不超出参考范围杜撰 | 把猜测当事实；编造来源；无依据的绝对化断言 |
+
+---
+
+## 阶段 2 · 汇总与判定
+
+1. 计算加权总分（百分制）：`score = Σ(dim_score / 5 × weight) × 100`，四舍五入到整数。
+2. 给出等级：`≥85 Excellent / 70–84 Pass / 55–69 Marginal / <55 Fail`。
+3. **一票否决项**：若 Q1 正确性 ≤2（存在实质性事实/逻辑错误，或与参考答案直接矛盾），则置 `veto_triggered=true`，且最终 `verdict` 不得高于 `Marginal`，无论加权总分多少。
+4. 列出 **Top-3 优点** 与 **Top-3 待改进项**，每条附答案中的证据指针与可执行的改进建议。
+
+### 评判纪律（消除 judge 偏差）
+
+- 不因答案**更长 / 更华丽 / 更自信**而加分；只认正确性与目标贴合度。
+- 不被答案的自述（「我已确认…」「显然…」）影响——这类措辞需用问题约束与内在一致性核实。
+- 无参考时不确定真伪的事实，按「未证实」处理：可影响 Q5，但不要据此武断判 Q1 错误，除非违背常识或自相矛盾。
+- 打分**先写推理（引证），后给分数**，避免先入为主。
+- 语言中立：答案语言与问题不一致时，扣 Q3，而非据此曲解内容。
+
+---
+
+## 阶段 3 · 产出（返回严格 JSON）
+
+**你的最终回复必须是且仅是一个 JSON 对象，不要包裹 markdown 代码块、不要前后缀说明文字。** 框架会直接解析它。`score` 与 `verdict` 为框架必需字段，其余字段供报告与诊断使用：
+
+```json
+{
+  "task_id": "string",
+  "score": 0,
+  "verdict": "Excellent|Pass|Marginal|Fail",
+  "veto_triggered": false,
+  "Q1_correctness": 0,
+  "Q2_completeness": 0,
+  "Q3_relevance": 0,
+  "Q4_clarity": 0,
+  "Q5_faithfulness": 0,
+  "dimensions": {
+    "Q1_correctness":  {"score": 0, "weight": 0.30, "evidence": ["..."], "rationale": "..."},
+    "Q2_completeness": {"score": 0, "weight": 0.25, "evidence": ["..."], "rationale": "..."},
+    "Q3_relevance":    {"score": 0, "weight": 0.20, "evidence": ["..."], "rationale": "..."},
+    "Q4_clarity":      {"score": 0, "weight": 0.15, "evidence": ["..."], "rationale": "..."},
+    "Q5_faithfulness": {"score": 0, "weight": 0.10, "evidence": ["..."], "rationale": "..."}
+  },
+  "errors": [{"claim": "...", "why_wrong": "..."}],
+  "top_strengths": ["..."],
+  "top_improvements": [{"issue": "...", "evidence": "...", "suggestion": "..."}],
+  "notes": "可选：边界情况说明（如答案缺失、无参考、语言不一致等）"
+}
+```
+
+字段约束：
+- `score`：0–100 的整数，**等于**阶段 2 第 1 步算出的加权总分。
+- `verdict`：四档之一，且与 `score` 区间一致；触发一票否决时不得高于 `Marginal`。
+- `Q1..Q5` 顶层字段：与 `dimensions` 内对应 `score` 相同，便于框架直接读取。
+- `evidence`/`rationale`：引用答案或问题中的具体片段，不可空泛。
+- 报告语言与被评估答案保持一致。
+
+> 仅当用户在 directive 中显式提供了 `OUT_DIR` 时，才额外用 `Write` 落一份人类可读的 `OUT_DIR/answer_eval_<task_id>.md`；默认情况下**只返回上面的 JSON**，不写文件、不调工具。
+
+---
+
+## 执行清单（按序）
+
+- [ ] 解析注入的 JSON，定位 `case.input`、`state.answer`、可选参考答案
+- [ ] 答案缺失 → 直接 Fail 并返回（见输入说明）
+- [ ] 五维逐项打分（先证据后分数）
+- [ ] 加权汇总 + 一票否决检查 + 优缺点
+- [ ] 返回严格 JSON（score/verdict 必填，无 markdown 包裹）
diff --git a/tests/fixtures/evaluator_judges/answer_quality_backend.py b/tests/fixtures/evaluator_judges/answer_quality_backend.py
new file mode 100644
index 000000000..b5d117853
--- /dev/null
+++ b/tests/fixtures/evaluator_judges/answer_quality_backend.py
@@ -0,0 +1,62 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from aworld.evaluations.substrate import AgentJudgeBackend
+
+
+PROMPT_PATH = Path(__file__).resolve().parent / "answer_quality_agent.md"
+
+
+def _prompt() -> str:  # Load the judge system prompt text from PROMPT_PATH (decoded as UTF-8).
+    return PROMPT_PATH.read_text(encoding="utf-8")  # entire file, YAML front matter included
+
+
+async def _deterministic_executor(prompt: Any, system_prompt: str) -> str:
+    """Return a canned judge reply as a JSON string; both arguments are ignored (no LLM call)."""
+    # Keep score, verdict and the Q* values mutually consistent: 70-84 is the contract's "Pass" band.
+    # Weighted total per the prompt contract: (4*.30 + 4*.25 + 5*.20 + 4*.15 + 4*.10) / 5 * 100 = 84.
+    scores = {"Q1_correctness": 4, "Q2_completeness": 4, "Q3_relevance": 5, "Q4_clarity": 4, "Q5_faithfulness": 4}
+    payload = {
+        "task_id": "fixture",
+        "score": 84,
+        "verdict": "Pass",
+        "veto_triggered": False,
+        **scores,
+        "dimensions": {
+            "Q1_correctness": {
+                "score": 4,
+                "weight": 0.30,
+                "evidence": ["fixture answer"],
+                "rationale": "The fixture backend is deterministic for tests.",
+            }
+        },
+        "errors": [],
+        "top_strengths": ["Deterministic fixture response"],
+        "top_improvements": [],
+        "notes": "This backend-ref fixture verifies loading and schema plumbing without calling an LLM.",
+    }
+    return json.dumps(payload, ensure_ascii=False)
+
+
+def _prompt_builder(case_input: dict[str, Any], target: dict[str, Any], suite: Any) -> str:
+    """Serialize the case and the target's answer into the JSON message for the judge; `suite` is unused."""
+    answer_state = {"answer": target.get("answer")}
+    envelope = {
+        "case": case_input,
+        "state": answer_state,
+        "required_output_schema": {"score": "number 0-100", "verdict": "string"},
+        "instruction": "Evaluate the existing answer/state and return exactly one JSON object.",
+    }
+    return json.dumps(envelope, ensure_ascii=False)
+
+
+def build_backend() -> AgentJudgeBackend:
+    """Create the fixture judge backend from the prompt file, the canned executor and the prompt builder."""
+    judge_prompt = _prompt()
+    backend_id = "answer-quality-fixture-backend"
+    # The executor passed here returns a fixed JSON reply rather than calling a model.
+    hooks = {"executor": _deterministic_executor, "prompt_builder": _prompt_builder}
+    return AgentJudgeBackend(backend_id=backend_id, system_prompt=judge_prompt, **hooks)
diff --git a/tests/test_slash_commands.py b/tests/test_slash_commands.py
index a230d89cb..b004af097 100644
--- a/tests/test_slash_commands.py
+++ b/tests/test_slash_commands.py
@@ -602,10 +602,80 @@ def fake_run_evaluator_source_cli(**kwargs):
         assert calls["kind"] == "trajectory"
         assert calls["task_id"] == "task-1"
         assert calls["judge_agent"] == str(agent_path)
+        assert calls["judge_agent_name"] is None
+        assert calls["judge_backend_ref"] is None
         assert calls["out_dir"] == str(tmp_path)
         assert "trajectory-source-evaluator" in result
         assert "Report:" in result
 
+    @pytest.mark.asyncio
+    async def test_evaluation_accepts_judge_agent_name(self, monkeypatch, tmp_path):
+        """`/evaluation --judge-agent-name` reaches the source runtime as the only judge selector set."""
+        command = CommandRegistry.get("evaluation")
+        answers_file = tmp_path / "answers.jsonl"
+        forwarded = {}
+        stub_report = {
+            "suite_id": "answer-source-evaluator",
+            "gate": {"status": "pass"},
+            "summary": {"answer-source-evaluator": {"score": {"mean": 88.0}}},
+            "results": [],
+            "approval": {"required": False, "resolved": False, "approved": None},
+            "report_path": str(tmp_path / "report.json"),
+        }
+
+        def fake_run_evaluator_source_cli(**kwargs):
+            # Capture the forwarded keyword arguments; no real evaluation runs.
+            forwarded.update(kwargs)
+            return stub_report
+
+        monkeypatch.setattr(
+            "aworld_cli.commands.evaluation_cmd.run_evaluator_source_cli",
+            fake_run_evaluator_source_cli,
+        )
+        user_args = f"--input {answers_file} --kind answer --judge-agent-name JudgeTeam"
+        context = CommandContext(cwd=os.getcwd(), user_args=user_args)
+
+        result = await command.execute(context)
+
+        assert forwarded["judge_agent"] is None
+        assert forwarded["judge_agent_name"] == "JudgeTeam"
+        assert forwarded["judge_backend_ref"] is None
+        assert "answer-source-evaluator" in result
+
+    @pytest.mark.asyncio
+    async def test_evaluation_accepts_judge_backend_ref(self, monkeypatch, tmp_path):
+        """`/evaluation --judge-backend-ref` reaches the source runtime as the only judge selector set."""
+        command = CommandRegistry.get("evaluation")
+        answers_file = tmp_path / "answers.jsonl"
+        forwarded = {}
+        stub_report = {
+            "suite_id": "answer-source-evaluator",
+            "gate": {"status": "pass"},
+            "summary": {"answer-source-evaluator": {"score": {"mean": 88.0}}},
+            "results": [],
+            "approval": {"required": False, "resolved": False, "approved": None},
+            "report_path": str(tmp_path / "report.json"),
+        }
+
+        def fake_run_evaluator_source_cli(**kwargs):
+            # Capture the forwarded keyword arguments; no real evaluation runs.
+            forwarded.update(kwargs)
+            return stub_report
+
+        monkeypatch.setattr(
+            "aworld_cli.commands.evaluation_cmd.run_evaluator_source_cli",
+            fake_run_evaluator_source_cli,
+        )
+        user_args = f"--input {answers_file} --kind answer --judge-backend-ref custom_judge:build_backend"
+        context = CommandContext(cwd=os.getcwd(), user_args=user_args)
+
+        result = await command.execute(context)
+
+        assert forwarded["judge_agent"] is None
+        assert forwarded["judge_agent_name"] is None
+        assert forwarded["judge_backend_ref"] == "custom_judge:build_backend"
+        assert "answer-source-evaluator" in result
+
     @pytest.mark.asyncio
     async def test_evaluation_runs_source_runtime_without_nested_event_loop(self, monkeypatch, tmp_path):
         cmd = CommandRegistry.get("evaluation")