braintrustdata · Barrett Pyke (barrettpyke) · Jun 11, 2026 · Jun 10, 2026
diff --git a/py/autoevals/ragas.py b/py/autoevals/ragas.py
@@ -983,7 +983,7 @@ async def _run_eval_async(self, output, expected=None, input=None, context=None,
 
         statements = (
             await aextract_statements(
-                client=self.client, question=input, answer=expected, model=self.model, **self.extra_args
+                client=self.client, question=input, answer=output, model=self.model, **self.extra_args
             )
         )["statements"]
 
@@ -1003,12 +1003,10 @@ async def _run_eval_async(self, output, expected=None, input=None, context=None,
         )
 
     def _run_eval_sync(self, output, expected=None, input=None, context=None, **kwargs):
-        check_required("Faithfulness", input=input, context=context)
+        check_required("Faithfulness", input=input, output=output, context=context)
 
         statements = (
-            extract_statements(
-                client=self.client, question=input, answer=expected, model=self.model, **self.extra_args
-            )
+            extract_statements(client=self.client, question=input, answer=output, model=self.model, **self.extra_args)
         )["statements"]
 
         faithfulness = (

diff --git a/py/autoevals/test_ragas.py b/py/autoevals/test_ragas.py
@@ -6,6 +6,7 @@
 from httpx import Response
 from openai import OpenAI
 
+import autoevals.ragas as ragas_module
 from autoevals import init
 from autoevals.ragas import *
 
@@ -125,6 +126,51 @@ def test_context_relevancy_score_normal_case():
     assert result.score >= 0.0
 
 
+def test_faithfulness_extracts_statements_from_output(monkeypatch):
+    """Regression test for Faithfulness answer routing.
+
+    Verifies that Faithfulness extracts statements from ``output`` (the model
+    answer being evaluated), not from ``expected`` (ground truth). ``output``
+    and ``expected`` deliberately differ, and the fakes derive statements and
+    verdicts from their inputs, so the score reveals which field was routed in.
+    NOTE(review): only the sync helpers are patched; ``aextract_statements`` is not covered.
+    """
+    captured_answer = None  # overwritten by the fake with the ``answer`` it receives
+
+    def fake_extract_statements(question, answer, client=None, **extra_args):
+        nonlocal captured_answer
+        captured_answer = answer
+        statement = answer.strip().rstrip(".")  # single statement: the answer minus trailing "."
+        return {"statements": [statement]}
+
+    def fake_extract_faithfulness(context, statements, client=None, **extra_args):
+        faithfulness = []
+        for statement in statements:
+            verdict = int(statement in context)  # 1 iff the statement is a substring of context
+            faithfulness.append(
+                {
+                    "statement": statement,
+                    "verdict": verdict,
+                    "reason": "Supported by context" if verdict else "Not found in context",
+                }
+            )
+        return {"faithfulness": faithfulness}
+
+    monkeypatch.setattr(ragas_module, "extract_statements", fake_extract_statements)
+    monkeypatch.setattr(ragas_module, "extract_faithfulness", fake_extract_faithfulness)
+
+    scorer = Faithfulness()
+    score = scorer.eval(
+        input="What is the capital of France?",
+        output="Paris is the capital of France.",
+        expected="Lyon is the capital of France.",  # deliberately absent from ``context``
+        context="Paris is the capital of France.",
+    )
+
+    assert score.score == 1  # the ``expected`` text is not in context: misrouting gives verdict 0
+    assert captured_answer == "Paris is the capital of France."
+
+
 @respx.mock
 def test_answer_correctness_uses_custom_embedding_model():
     """Test that AnswerCorrectness passes embedding_model parameter through to embeddings API."""