Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion webeval/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ dependencies = [
"mlflow",
"rpds-py",
"pillow",
"tiktoken"
"tiktoken",
"imagehash",
"jinja2"
]


Expand Down
37 changes: 33 additions & 4 deletions webeval/scripts/verify_trajectories.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
MMRubricAgentConfig,
MMRubricOutcomeResult,
MMRubricResult,
VerifierAgent,
VerifierAgentConfig,
)
from webeval.trajectory import Trajectory

Expand Down Expand Up @@ -127,12 +129,13 @@ def load_webtailbench_tasks(path: Path) -> Dict[str, Dict[str, Any]]:
# Globals populated by ``_pool_init`` so each worker only builds its judge
# clients once, not once per task.
_GLOBAL_AGENT: Optional[MMRubricAgent] = None
_GLOBAL_VERIFIER: Optional[VerifierAgent] = None
_GLOBAL_TASKS: Optional[Dict[str, Dict[str, Any]]] = None
_GLOBAL_ARGS: Dict[str, Any] = {}


def _pool_init(args_dict: Dict[str, Any], tasks: Dict[str, Dict[str, Any]]):
global _GLOBAL_AGENT, _GLOBAL_TASKS, _GLOBAL_ARGS
global _GLOBAL_AGENT, _GLOBAL_VERIFIER, _GLOBAL_TASKS, _GLOBAL_ARGS
_GLOBAL_TASKS = tasks
_GLOBAL_ARGS = args_dict

Expand All @@ -158,6 +161,13 @@ def _pool_init(args_dict: Dict[str, Any], tasks: Dict[str, Dict[str, Any]]):
action_definitions=FARA_ACTION_DEFINITIONS,
)
)
_GLOBAL_VERIFIER = VerifierAgent(
config=VerifierAgentConfig(
o4mini_client=o4mini_client,
gpt5_client=gpt5_client,
action_definitions=FARA_ACTION_DEFINITIONS,
)
)
worker_logger.info(
f"Worker ready: gpt5={len(gpt5_client._clients)} clients, "
f"o4mini={len(o4mini_client._clients)} clients"
Expand Down Expand Up @@ -208,9 +218,26 @@ def _run_one(traj_dir_str: str) -> Dict[str, Any]:
redo_eval=_GLOBAL_ARGS["redo_eval"],
)

result = asyncio.run(_GLOBAL_AGENT._generate_reply(input_dict))
if not isinstance(result, dict):
raise TypeError(f"Expected dict from MMRubricAgent, got {type(result)}")
async def _run_pipeline(inp: Dict[str, Any]) -> Dict[str, Any]:
rubric_out = await _GLOBAL_AGENT._generate_reply(inp)
if not isinstance(rubric_out, dict):
raise TypeError(
f"Expected dict from MMRubricAgent, got {type(rubric_out)}"
)
if "error" in rubric_out:
return rubric_out
outcome_block = rubric_out.get("outcome_verification") or {}
failure_analysis = await _GLOBAL_VERIFIER.verify(
rubric_dict=rubric_out,
outcome_dict=outcome_block,
input_dict=inp,
)
rubric_out.setdefault("intermediate_mm_rubric_steps", {}).update(
failure_analysis
)
return rubric_out

result = asyncio.run(_run_pipeline(input_dict))
if "error" in result:
raise RuntimeError(f"Rubric agent reported error: {result.get('error')}")

Expand Down Expand Up @@ -254,6 +281,8 @@ def _run_one(traj_dir_str: str) -> Dict[str, Any]:
"outcome_primary_intent": outcome_vr.primary_intent,
"rubric_total_max_points": rubric_vr.total_max_points,
"rubric_total_earned_points": rubric_vr.total_earned_points,
"cp_type_used": outcome_vr.cp_type_used,
"cp_violation": outcome_vr.cp_violation,
"error_taxonomy": error_taxonomy,
}
)
Expand Down
24 changes: 22 additions & 2 deletions webeval/src/webeval/oai_clients/graceful_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(
max_retries: int = 8,
max_tokens: int = 115000,
timeout: Optional[float] = None,
max_total_attempts: Optional[int] = None,
):
super().__init__(max_tokens=max_tokens)
if not clients:
Expand All @@ -57,6 +58,13 @@ def __init__(
self.max_retries = max_retries
self.max_tokens = max_tokens
self.timeout = timeout
# Hard ceiling on total loop iterations per create() call so the
# loop always terminates even when branches don't consume `tries`.
self.max_total_attempts = (
max_total_attempts
if max_total_attempts is not None
else max_retries + 2 * len(clients)
)
self.support_json = support_json
self.blocklist: set = set()
self._client_idx = random.randint(0, len(clients) - 1)
Expand Down Expand Up @@ -177,12 +185,14 @@ async def create(
extra_create_args: Mapping[str, Any] = {},
) -> CreateResult:
tries = self.max_retries
total_attempts = 0
last_error: Optional[Exception] = None
client = self.next_client(no_increment=True)
# Mutable copy so we can mutate per-request without affecting callers.
extra = dict(extra_create_args)

while tries > 0:
while tries > 0 and total_attempts < self.max_total_attempts:
total_attempts += 1
request_tokens = client.count_tokens(messages=messages)
self._remove_reasoning_effort_if_needed(client, extra)
self.logger.info(
Expand Down Expand Up @@ -222,6 +232,10 @@ async def create(
continue
except openai.BadRequestError as e:
if "check-access-response-enc" in str(e):
# Consume the retry budget so a persistent access-token
# failure on a small pool can't loop forever.
tries -= 1
last_error = e
self.logger.error(
f"GracefulRetryClient.create() AccessTokenError: {client.description}, refreshing credentials\n{e}"
)
Expand Down Expand Up @@ -298,12 +312,17 @@ async def create(
await asyncio.sleep(1)
continue
except openai.AuthenticationError as e:
# Consume the retry budget so a persistent auth failure on a
# single-endpoint pool can't loop forever.
tries -= 1
last_error = e
self.logger.error(
f"GracefulRetryClient.create() AuthenticationError: {client.description}: {e}"
)
if hasattr(client, "refresh_credentials"):
client.refresh_credentials()
client = self.next_client()
await asyncio.sleep(1)
continue
except openai.APIStatusError as e:
if "Prompt is too large" in str(e):
Expand Down Expand Up @@ -346,7 +365,8 @@ async def create(
raise last_error
valid_clients = [c for c in self._clients if c.endpoint not in self.blocklist]
raise Exception(
f"GracefulRetryClient.create(): all clients exhausted after {self.max_retries} retries; "
f"GracefulRetryClient.create(): gave up after {total_attempts} attempts "
f"(max_retries={self.max_retries}, max_total_attempts={self.max_total_attempts}); "
f"{len(valid_clients)}/{len(self._clients)} clients reachable. Blocklist size: {len(self.blocklist)}"
)

Expand Down
15 changes: 14 additions & 1 deletion webeval/src/webeval/rubric_agent/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
"""Universal Verifier (MMRubricAgent).
"""Universal Verifier (MMRubricAgent) + companion VerifierAgent.

Self-contained multimodal rubric verification pipeline used by the fara
``webeval`` package to score agent trajectories. Uses the
:class:`webeval.oai_clients.ChatCompletionClient` interface.

- :class:`MMRubricAgent` runs Steps 0–8: rubric generation → action-only
scoring → screenshot evidence analysis → multimodal rescoring →
outcome verification + critical-point classification + CP violation
check.
- :class:`VerifierAgent` runs Steps 9a/9b/10: failure-point analysis,
trajectory-informed task verification, unified task verification —
consumes the scored rubric produced by ``MMRubricAgent``.
"""

from .mm_rubric_agent import MMRubricAgent, MMRubricAgentConfig
from .verifier_agent import VerifierAgent, VerifierAgentConfig
from .data_point import (
Action,
ComputerObservation,
CriticalPointClassificationResult,
DataPoint,
DataPointMetadata,
MMRubricOutcomeResult,
Expand All @@ -25,6 +35,8 @@
__all__ = [
"MMRubricAgent",
"MMRubricAgentConfig",
"VerifierAgent",
"VerifierAgentConfig",
"DataPoint",
"DataPointMetadata",
"Task",
Expand All @@ -38,4 +50,5 @@
"VerificationResult",
"MMRubricResult",
"MMRubricOutcomeResult",
"CriticalPointClassificationResult",
]
117 changes: 117 additions & 0 deletions webeval/src/webeval/rubric_agent/_cp_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""Critical-point taxonomy schema (inline-ported from agento_next).

The original lives at
``agento_next/agents/critical_point_compliance/schema.py``. Only the
two types consumed by ``critical_point_classifier.py`` are kept here:

- :class:`Confidence` — judge confidence enum with descriptions.
- :class:`CriticalPointTypesConfig` — YAML-loaded type taxonomy.

The form-field / compliance dataclasses (``ClassificationResult``,
``ComplianceResult``, etc.) and the ``ImageObj``-using
``ComplianceInput`` are intentionally excluded — they pull
``aztool.clients.messages`` and would re-introduce the external
dependency this fork is designed to avoid.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum
from typing import Any


class Confidence(str, Enum):
    """Confidence level a judge attaches to a verdict.

    Every member is a ``str`` whose value is the short label
    (``"HIGH"``, ``"MEDIUM"`` or ``"LOW"``). Each member additionally
    exposes a longer ``description`` meant to be embedded in LLM prompts.
    """

    HIGH = (
        "HIGH",
        "The screenshots clearly show what the agent did, and the "
        "compliance verdict is unambiguous.",
    )
    MEDIUM = (
        "MEDIUM",
        "Some agent actions are unclear from the screenshots "
        "(e.g., fields are partially visible, scrolled off-screen), "
        "or the expected behavior has some room for interpretation.",
    )
    LOW = (
        "LOW",
        "The screenshots are unclear, the agent's actions are hard "
        "to trace, or the expected behavior is ambiguous enough that "
        "the verdict could go either way.",
    )

    def __new__(cls, value: str, description: str = "") -> "Confidence":
        # Only the short label becomes the str payload and the enum value;
        # the description is attached afterwards by ``__init__``.
        member = str.__new__(cls, value)
        member._value_ = value
        return member

    def __init__(self, value: str, description: str = "") -> None:
        # Enum hands the same member tuple to ``__init__``; only the second
        # element still needs to be stored.
        self._description = description

    @property
    def description(self) -> str:
        """Return the long-form explanation of this confidence level."""
        return self._description


@dataclass(frozen=True)
class CriticalPointType:
    """Configuration for a single critical point type.

    Attributes:
        name: Identifier of the type (``from_dict`` uses the mapping key).
        description: Free-text description of the type.
        expected_behavior: Free-text statement of the expected behavior;
            empty when not configured.
        examples: Example strings for the type; empty when not configured.
    """

    name: str
    description: str
    expected_behavior: str = ""
    examples: tuple[str, ...] = ()


@dataclass(frozen=True)
class CriticalPointTypesConfig:
    """Loaded configuration for all critical point types.

    Typically deserialized from ``critical_point_types.yaml``.

    Attributes:
        definition: Top-level ``definition`` text from the config.
        types: Mapping of type name to its :class:`CriticalPointType`.
    """

    definition: str = ""
    types: dict[str, CriticalPointType] = field(default_factory=dict)

    @staticmethod
    def _get_or(mapping: dict[str, Any], key: str, default: Any) -> Any:
        """Return ``mapping[key]``, or ``default`` when missing or ``None``."""
        value = mapping.get(key)
        return default if value is None else value

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "CriticalPointTypesConfig":
        """Build a config from a plain (e.g. YAML-loaded) mapping.

        YAML loaders produce ``None`` for keys that are present but empty
        (``types:``, ``foo:``, ``examples:``) and for an empty document, so
        every level treats ``None`` the same as "not provided" and falls
        back to the field default instead of raising.

        Args:
            data: Mapping with optional ``definition`` and ``types`` keys.
                ``types`` maps a type name to a mapping with optional
                ``description``, ``expected_behavior`` and ``examples``.

        Returns:
            The populated :class:`CriticalPointTypesConfig`.
        """
        data = data or {}
        types_data = cls._get_or(data, "types", {})

        types: dict[str, CriticalPointType] = {}
        for name, config in types_data.items():
            config = config or {}
            examples = cls._get_or(config, "examples", ())
            if isinstance(examples, str):
                # A lone scalar would otherwise be split into characters
                # by ``tuple(...)``; treat it as a one-element list.
                examples = (examples,)
            types[name] = CriticalPointType(
                name=name,
                description=cls._get_or(config, "description", ""),
                expected_behavior=cls._get_or(config, "expected_behavior", ""),
                examples=tuple(examples),
            )

        return cls(definition=cls._get_or(data, "definition", ""), types=types)

    def items(self) -> Any:
        """Iterates over types as ``(name, dict)`` pairs for template use.

        Each dict carries ``description``, ``expected_behavior`` and
        ``examples`` (as a list) for the corresponding type.
        """
        return {
            name: {
                "description": t.description,
                "expected_behavior": t.expected_behavior,
                "examples": list(t.examples),
            }
            for name, t in self.types.items()
        }.items()

    def keys(self) -> set[str]:
        """Returns the set of type names."""
        return set(self.types.keys())
Loading