microsoft · corbyrosset · May 28, 2026 · May 29, 2026 · May 29, 2026
diff --git a/webeval/pyproject.toml b/webeval/pyproject.toml
@@ -39,7 +39,9 @@ dependencies = [
     "mlflow",
     "rpds-py",
     "pillow",
-    "tiktoken"
+    "tiktoken",
+    "imagehash",
+    "jinja2"
 ]
 
 

diff --git a/webeval/scripts/verify_trajectories.py b/webeval/scripts/verify_trajectories.py
@@ -60,6 +60,8 @@
     MMRubricAgentConfig,
     MMRubricOutcomeResult,
     MMRubricResult,
+    VerifierAgent,
+    VerifierAgentConfig,
 )
 from webeval.trajectory import Trajectory
 
@@ -127,12 +129,13 @@ def load_webtailbench_tasks(path: Path) -> Dict[str, Dict[str, Any]]:
 # Globals populated by ``_pool_init`` so each worker only builds its judge
 # clients once, not once per task.
 _GLOBAL_AGENT: Optional[MMRubricAgent] = None
+_GLOBAL_VERIFIER: Optional[VerifierAgent] = None
 _GLOBAL_TASKS: Optional[Dict[str, Dict[str, Any]]] = None
 _GLOBAL_ARGS: Dict[str, Any] = {}
 
 
 def _pool_init(args_dict: Dict[str, Any], tasks: Dict[str, Dict[str, Any]]):
-    global _GLOBAL_AGENT, _GLOBAL_TASKS, _GLOBAL_ARGS
+    global _GLOBAL_AGENT, _GLOBAL_VERIFIER, _GLOBAL_TASKS, _GLOBAL_ARGS
     _GLOBAL_TASKS = tasks
     _GLOBAL_ARGS = args_dict
 
@@ -158,6 +161,13 @@ def _pool_init(args_dict: Dict[str, Any], tasks: Dict[str, Dict[str, Any]]):
             action_definitions=FARA_ACTION_DEFINITIONS,
         )
     )
+    _GLOBAL_VERIFIER = VerifierAgent(
+        config=VerifierAgentConfig(
+            o4mini_client=o4mini_client,
+            gpt5_client=gpt5_client,
+            action_definitions=FARA_ACTION_DEFINITIONS,
+        )
+    )
     worker_logger.info(
         f"Worker ready: gpt5={len(gpt5_client._clients)} clients, "
         f"o4mini={len(o4mini_client._clients)} clients"
@@ -208,9 +218,26 @@ def _run_one(traj_dir_str: str) -> Dict[str, Any]:
             redo_eval=_GLOBAL_ARGS["redo_eval"],
         )
 
-        result = asyncio.run(_GLOBAL_AGENT._generate_reply(input_dict))
-        if not isinstance(result, dict):
-            raise TypeError(f"Expected dict from MMRubricAgent, got {type(result)}")
+        async def _run_pipeline(inp: Dict[str, Any]) -> Dict[str, Any]:
+            rubric_out = await _GLOBAL_AGENT._generate_reply(inp)
+            if not isinstance(rubric_out, dict):
+                raise TypeError(
+                    f"Expected dict from MMRubricAgent, got {type(rubric_out)}"
+                )
+            if "error" in rubric_out:
+                return rubric_out
+            outcome_block = rubric_out.get("outcome_verification") or {}
+            failure_analysis = await _GLOBAL_VERIFIER.verify(
+                rubric_dict=rubric_out,
+                outcome_dict=outcome_block,
+                input_dict=inp,
+            )
+            rubric_out.setdefault("intermediate_mm_rubric_steps", {}).update(
+                failure_analysis
+            )
+            return rubric_out
+
+        result = asyncio.run(_run_pipeline(input_dict))
         if "error" in result:
             raise RuntimeError(f"Rubric agent reported error: {result.get('error')}")
 
@@ -254,6 +281,8 @@ def _run_one(traj_dir_str: str) -> Dict[str, Any]:
                 "outcome_primary_intent": outcome_vr.primary_intent,
                 "rubric_total_max_points": rubric_vr.total_max_points,
                 "rubric_total_earned_points": rubric_vr.total_earned_points,
+                "cp_type_used": outcome_vr.cp_type_used,
+                "cp_violation": outcome_vr.cp_violation,
                 "error_taxonomy": error_taxonomy,
             }
         )

diff --git a/webeval/src/webeval/oai_clients/graceful_client.py b/webeval/src/webeval/oai_clients/graceful_client.py
@@ -48,6 +48,7 @@ def __init__(
         max_retries: int = 8,
         max_tokens: int = 115000,
         timeout: Optional[float] = None,
+        max_total_attempts: Optional[int] = None,
     ):
         super().__init__(max_tokens=max_tokens)
         if not clients:
@@ -57,6 +58,13 @@ def __init__(
         self.max_retries = max_retries
         self.max_tokens = max_tokens
         self.timeout = timeout
+        # Hard ceiling on total loop iterations per create() call so the
+        # loop always terminates even when branches don't consume `tries`.
+        self.max_total_attempts = (
+            max_total_attempts
+            if max_total_attempts is not None
+            else max_retries + 2 * len(clients)
+        )
         self.support_json = support_json
         self.blocklist: set = set()
         self._client_idx = random.randint(0, len(clients) - 1)
@@ -177,12 +185,14 @@ async def create(
         extra_create_args: Mapping[str, Any] = {},
     ) -> CreateResult:
         tries = self.max_retries
+        total_attempts = 0
         last_error: Optional[Exception] = None
         client = self.next_client(no_increment=True)
         # Mutable copy so we can mutate per-request without affecting callers.
         extra = dict(extra_create_args)
 
-        while tries > 0:
+        while tries > 0 and total_attempts < self.max_total_attempts:
+            total_attempts += 1
             request_tokens = client.count_tokens(messages=messages)
             self._remove_reasoning_effort_if_needed(client, extra)
             self.logger.info(
@@ -222,6 +232,10 @@ async def create(
                 continue
             except openai.BadRequestError as e:
                 if "check-access-response-enc" in str(e):
+                    # Consume the retry budget so a persistent access-token
+                    # failure on a small pool can't loop forever.
+                    tries -= 1
+                    last_error = e
                     self.logger.error(
                         f"GracefulRetryClient.create() AccessTokenError: {client.description}, refreshing credentials\n{e}"
                     )
@@ -298,12 +312,17 @@ async def create(
                 await asyncio.sleep(1)
                 continue
             except openai.AuthenticationError as e:
+                # Consume the retry budget so a persistent auth failure on a
+                # single-endpoint pool can't loop forever.
+                tries -= 1
+                last_error = e
                 self.logger.error(
                     f"GracefulRetryClient.create() AuthenticationError: {client.description}: {e}"
                 )
                 if hasattr(client, "refresh_credentials"):
                     client.refresh_credentials()
                 client = self.next_client()
+                await asyncio.sleep(1)
                 continue
             except openai.APIStatusError as e:
                 if "Prompt is too large" in str(e):
@@ -346,7 +365,8 @@ async def create(
             raise last_error
         valid_clients = [c for c in self._clients if c.endpoint not in self.blocklist]
         raise Exception(
-            f"GracefulRetryClient.create(): all clients exhausted after {self.max_retries} retries; "
+            f"GracefulRetryClient.create(): gave up after {total_attempts} attempts "
+            f"(max_retries={self.max_retries}, max_total_attempts={self.max_total_attempts}); "
             f"{len(valid_clients)}/{len(self._clients)} clients reachable. Blocklist size: {len(self.blocklist)}"
         )
 

diff --git a/webeval/src/webeval/rubric_agent/__init__.py b/webeval/src/webeval/rubric_agent/__init__.py
@@ -1,14 +1,24 @@
-"""Universal Verifier (MMRubricAgent).
+"""Universal Verifier (MMRubricAgent) + companion VerifierAgent.
 
 Self-contained multimodal rubric verification pipeline used by the fara
 ``webeval`` package to score agent trajectories. Uses the
 :class:`webeval.oai_clients.ChatCompletionClient` interface.
+
+- :class:`MMRubricAgent` runs Steps 0–8: rubric generation → action-only
+  scoring → screenshot evidence analysis → multimodal rescoring →
+  outcome verification + critical-point classification + CP violation
+  check.
+- :class:`VerifierAgent` runs Steps 9a/9b/10: failure-point analysis,
+  trajectory-informed task verification, unified task verification —
+  consumes the scored rubric produced by ``MMRubricAgent``.
 """
 
 from .mm_rubric_agent import MMRubricAgent, MMRubricAgentConfig
+from .verifier_agent import VerifierAgent, VerifierAgentConfig
 from .data_point import (
     Action,
     ComputerObservation,
+    CriticalPointClassificationResult,
     DataPoint,
     DataPointMetadata,
     MMRubricOutcomeResult,
@@ -25,6 +35,8 @@
 __all__ = [
     "MMRubricAgent",
     "MMRubricAgentConfig",
+    "VerifierAgent",
+    "VerifierAgentConfig",
     "DataPoint",
     "DataPointMetadata",
     "Task",
@@ -38,4 +50,5 @@
     "VerificationResult",
     "MMRubricResult",
     "MMRubricOutcomeResult",
+    "CriticalPointClassificationResult",
 ]
diff --git a/webeval/src/webeval/rubric_agent/_cp_schema.py b/webeval/src/webeval/rubric_agent/_cp_schema.py
@@ -0,0 +1,117 @@
+"""Critical-point taxonomy schema (inline-ported from agento_next).
+
+The original lives at
+``agento_next/agents/critical_point_compliance/schema.py``. Only the
+two types consumed by ``critical_point_classifier.py`` are kept here:
+
+- :class:`Confidence` — judge confidence enum with descriptions.
+- :class:`CriticalPointTypesConfig` — YAML-loaded type taxonomy.
+
+The form-field / compliance dataclasses (``ClassificationResult``,
+``ComplianceResult``, etc.) and the ``ImageObj``-using
+``ComplianceInput`` are intentionally excluded — they pull
+``aztool.clients.messages`` and would re-introduce the external
+dependency this fork is designed to avoid.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any
+
+
+class Confidence(str, Enum):
+    """Judge confidence levels declared as ``(value, description)`` tuples.
+
+    The short value (``"HIGH"``, etc.) is the member's string value; the
+    longer ``description`` is suitable for inclusion in LLM prompts.
+    """
+
+    HIGH = (
+        "HIGH",
+        (
+            "The screenshots clearly show what the agent did, and the "
+            "compliance verdict is unambiguous."
+        ),
+    )
+    MEDIUM = (
+        "MEDIUM",
+        (
+            "Some agent actions are unclear from the screenshots "
+            "(e.g., fields are partially visible, scrolled off-screen), "
+            "or the expected behavior has some room for interpretation."
+        ),
+    )
+    LOW = (
+        "LOW",
+        (
+            "The screenshots are unclear, the agent's actions are hard "
+            "to trace, or the expected behavior is ambiguous enough that "
+            "the verdict could go either way."
+        ),
+    )
+
+    def __new__(cls, value: str, description: str = "") -> "Confidence":
+        obj = str.__new__(cls, value)  # str payload is the short value only
+        obj._value_ = value  # keep ``.value`` the short string, not the tuple
+        return obj
+
+    def __init__(self, value: str, description: str = "") -> None:
+        self._description = description  # second element of the member's tuple
+
+    @property
+    def description(self) -> str:
+        """Human-readable explanation of this confidence level."""
+        return self._description
+
+
+@dataclass(frozen=True)
+class CriticalPointType:
+    """Configuration for a single critical point type (immutable)."""
+
+    name: str  # identifier of this critical point type
+    description: str
+    expected_behavior: str = ""  # optional; empty string when not provided
+    examples: tuple[str, ...] = ()  # tuple rather than list; instances are frozen
+
+
+@dataclass(frozen=True)
+class CriticalPointTypesConfig:
+    """Loaded configuration for all critical point types.
+
+    Typically deserialized from ``critical_point_types.yaml``.
+    """
+
+    definition: str = ""
+    types: dict[str, CriticalPointType] = field(default_factory=dict)
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "CriticalPointTypesConfig":
+        """Builds the config from a mapping; null (``None``) values act as absent."""
+        # ``.get(k, default)`` returns None for an explicit null, hence ``or``.
+        types = {
+            name: CriticalPointType(
+                name=name,
+                description=(config or {}).get("description") or "",
+                expected_behavior=(config or {}).get("expected_behavior") or "",
+                examples=tuple((config or {}).get("examples") or ()),
+            )
+            for name, config in (data.get("types") or {}).items()
+        }
+        return cls(definition=data.get("definition") or "", types=types)
+
+    def items(self) -> Any:
+        """Returns ``(name, dict)`` pairs of all types, for template use."""
+        return {
+            name: {
+                "description": t.description,
+                "expected_behavior": t.expected_behavior,
+                "examples": list(t.examples),
+            }
+            for name, t in self.types.items()
+        }.items()
+
+    def keys(self) -> set[str]:
+        """Returns the set of type names."""
+        return set(self.types.keys())