From 37796fbfd129a27117be5b64600b67e04019b6a1 Mon Sep 17 00:00:00 2001
From: Alex Rhee <arheebikes@gmail.com>
Date: Tue, 9 Jun 2026 19:06:19 -0700
Subject: [PATCH 1/2] Add thread_with_system template vars

---
 js/llm.test.ts               | 85 ++++++++++++++++++++++++++++++++++++
 js/llm.ts                    |  3 +-
 js/render-messages.test.ts   | 39 +++++++++++++++++
 js/thread-utils.ts           |  3 ++
 py/autoevals/llm.py          |  4 +-
 py/autoevals/test_llm.py     | 83 ++++++++++++++++++++++++++++++++++-
 py/autoevals/thread_utils.py |  9 +++-
 7 files changed, 221 insertions(+), 5 deletions(-)

diff --git a/js/llm.test.ts b/js/llm.test.ts
index 2b1b18be..e67977cf 100644
--- a/js/llm.test.ts
+++ b/js/llm.test.ts
@@ -8,6 +8,7 @@ import {
   buildClassificationTools,
   LLMClassifierFromTemplate,
   OpenAIClassifier,
+  templateUsesThreadVariables,
 } from "../js/llm";
 import {
   openaiClassifierShouldEvaluateArithmeticExpressions,
@@ -64,6 +65,15 @@ afterAll(() => {
 });
 
 describe("LLM Tests", () => {
+  test("templateUsesThreadVariables recognizes thread_with_system", () => {
+    expect(templateUsesThreadVariables("{{thread_with_system}}")).toBe(true);
+    expect(
+      templateUsesThreadVariables(
+        "Full thread: {{thread_with_system.0.content}}",
+      ),
+    ).toBe(true);
+  });
+
   test("openai classifier should evaluate titles", async () => {
     let callCount = -1;
     server.use(
@@ -342,6 +352,81 @@ Issue Description: {{page_content}}
     expect(capturedRequestBody.reasoning_effort).toBeUndefined();
   });
 
+  test("LLMClassifierFromTemplate keeps thread filtered while exposing thread_with_system", async () => {
+    let capturedRequestBody: unknown;
+    const systemMarker = "TRACE_SYSTEM_MESSAGE";
+
+    server.use(
+      http.post("https://api.openai.com/v1/responses", async ({ request }) => {
+        capturedRequestBody = await request.json();
+
+        return HttpResponse.json({
+          id: "resp-test",
+          object: "response",
+          created: 1234567890,
+          model: "gpt-5-mini",
+          output: [
+            {
+              type: "function_call",
+              call_id: "call_test",
+              name: "select_choice",
+              arguments: JSON.stringify({ choice: "1" }),
+            },
+          ],
+        });
+      }),
+    );
+
+    const classifier = LLMClassifierFromTemplate({
+      name: "thread-template",
+      promptTemplate:
+        "Filtered thread:\n{{thread}}\n\nFull thread:\n{{thread_with_system}}",
+      choiceScores: { "1": 1, "2": 0 },
+      useCoT: false,
+    });
+
+    await classifier({
+      output: "",
+      expected: "",
+      trace: {
+        async getThread() {
+          return [
+            { role: "system", content: systemMarker },
+            { role: "user", content: "Hello" },
+            { role: "assistant", content: "Hi there" },
+          ];
+        },
+      },
+    });
+
+    if (
+      !capturedRequestBody ||
+      typeof capturedRequestBody !== "object" ||
+      !("input" in capturedRequestBody) ||
+      !Array.isArray(capturedRequestBody.input)
+    ) {
+      throw new Error("Unexpected request body shape");
+    }
+
+    const firstInput = capturedRequestBody.input[0];
+    if (
+      !firstInput ||
+      typeof firstInput !== "object" ||
+      !("content" in firstInput) ||
+      typeof firstInput.content !== "string"
+    ) {
+      throw new Error("Unexpected request input shape");
+    }
+
+    const [filteredThread, fullThread] = firstInput.content.split(
+      "\n\nFull thread:\n",
+    );
+    expect(filteredThread).toContain("Hello");
+    expect(filteredThread).toContain("Hi there");
+    expect(filteredThread).not.toContain(systemMarker);
+    expect(fullThread).toContain(systemMarker);
+  });
+
   test("useResponsesApi forces the Responses API for a non-gpt-5 model", async () => {
     let responsesHit = false;
     let chatCompletionsHit = false;
diff --git a/js/llm.ts b/js/llm.ts
index 2826fab3..1b7f318e 100644
--- a/js/llm.ts
+++ b/js/llm.ts
@@ -30,6 +30,7 @@ export interface TraceForScorer {
 // Thread-related template variable names that require preprocessor invocation
 export const THREAD_VARIABLE_NAMES = [
   "thread",
+  "thread_with_system",
   "thread_count",
   "first_message",
   "last_message",
@@ -335,7 +336,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
     if (runtimeArgs.trace && templateUsesThreadVariables(promptTemplate)) {
       const thread = await runtimeArgs.trace.getThread();
       const scorerThread = filterSystemMessagesFromThread(thread);
-      const computed = computeThreadTemplateVars(scorerThread);
+      const computed = computeThreadTemplateVars(scorerThread, thread);
       // Build threadVars from THREAD_VARIABLE_NAMES to keep in sync with the pattern
       for (const name of THREAD_VARIABLE_NAMES) {
         threadVars[name] = computed[name as keyof ThreadTemplateVars];
diff --git a/js/render-messages.test.ts b/js/render-messages.test.ts
index 3d419394..d3975cb9 100644
--- a/js/render-messages.test.ts
+++ b/js/render-messages.test.ts
@@ -1,6 +1,7 @@
 import { describe, expect, it } from "vitest";
 import { renderMessages } from "./render-messages";
 import { ChatCompletionMessageParam } from "openai/resources";
+import { computeThreadTemplateVars } from "./thread-utils";
 
 describe("renderMessages", () => {
   it("should never HTML-escape values, regardless of mustache syntax", () => {
@@ -182,4 +183,42 @@ describe("renderMessages with thread variables", () => {
     expect(rendered[0].content).toContain("Assistant:");
     expect(rendered[0].content).toContain("Simple response");
   });
+
+  it("computeThreadTemplateVars can expose thread_with_system separately", () => {
+    const fullThread = [
+      { role: "system", content: "You are a helpful assistant." },
+      ...sampleThread,
+    ];
+
+    const renderedVars = computeThreadTemplateVars(sampleThread, fullThread);
+
+    expect(renderedVars.thread).toEqual(sampleThread);
+    expect(renderedVars.thread_with_system).toEqual(fullThread);
+    expect(renderedVars.thread_count).toBe(sampleThread.length);
+    expect(renderedVars.first_message).toEqual(sampleThread[0]);
+  });
+
+  it("{{thread_with_system}} renders full conversation and supports indexing", () => {
+    const fullThread = [
+      { role: "system", content: "You are a helpful assistant." },
+      ...sampleThread,
+    ];
+    const messages: ChatCompletionMessageParam[] = [
+      {
+        role: "user",
+        content:
+          "Full thread: {{thread_with_system}}\n\nFirst full: {{thread_with_system.0}}",
+      },
+    ];
+    const rendered = renderMessages(
+      messages,
+      computeThreadTemplateVars(sampleThread, fullThread),
+    );
+
+    expect(rendered[0].content).toContain("System:");
+    expect(rendered[0].content).toContain("You are a helpful assistant.");
+    expect(rendered[0].content).toContain(
+      "First full: system: You are a helpful assistant.",
+    );
+  });
 });
diff --git a/js/thread-utils.ts b/js/thread-utils.ts
index b999b623..1f2048f8 100644
--- a/js/thread-utils.ts
+++ b/js/thread-utils.ts
@@ -252,6 +252,7 @@ export function formatMessageArrayAsText(messages: LLMMessage[]): string {
  */
 export interface ThreadTemplateVars {
   thread: unknown[];
+  thread_with_system: unknown[];
   thread_count: number;
   first_message: unknown | null;
   last_message: unknown | null;
@@ -270,6 +271,7 @@ export interface ThreadTemplateVars {
  */
 export function computeThreadTemplateVars(
   thread: unknown[],
+  threadWithSystem: unknown[] = thread,
 ): ThreadTemplateVars {
   let _user_messages: unknown[] | undefined;
   let _assistant_messages: unknown[] | undefined;
@@ -279,6 +281,7 @@ export function computeThreadTemplateVars(
 
   return {
     thread,
+    thread_with_system: threadWithSystem,
     thread_count: thread.length,
 
     get first_message(): unknown | null {
diff --git a/py/autoevals/llm.py b/py/autoevals/llm.py
index a027d42c..b0093cb4 100644
--- a/py/autoevals/llm.py
+++ b/py/autoevals/llm.py
@@ -434,7 +434,7 @@ def _compute_thread_vars_sync(self, trace) -> dict[str, object]:
         if not isinstance(thread, list):
             thread = list(thread)
 
-        computed = compute_thread_template_vars(filter_system_messages_from_thread(thread))
+        computed = compute_thread_template_vars(filter_system_messages_from_thread(thread), thread)
         return {name: computed[name] for name in self._thread_variable_names}
 
     async def _compute_thread_vars_async(self, trace) -> dict[str, object]:
@@ -450,7 +450,7 @@ async def _compute_thread_vars_async(self, trace) -> dict[str, object]:
         if not isinstance(thread, list):
             thread = list(thread)
 
-        computed = compute_thread_template_vars(filter_system_messages_from_thread(thread))
+        computed = compute_thread_template_vars(filter_system_messages_from_thread(thread), thread)
         return {name: computed[name] for name in self._thread_variable_names}
 
     def _request_args(self, output, expected, **kwargs):
diff --git a/py/autoevals/test_llm.py b/py/autoevals/test_llm.py
index a1c2e01c..8e987efa 100644
--- a/py/autoevals/test_llm.py
+++ b/py/autoevals/test_llm.py
@@ -11,7 +11,7 @@
 from autoevals import init
 from autoevals.llm import Battle, Factuality, LLMClassifier, OpenAILLMClassifier, build_classification_tools
 from autoevals.oai import OpenAIV1Module, get_default_model
-from autoevals.thread_utils import compute_thread_template_vars
+from autoevals.thread_utils import compute_thread_template_vars, template_uses_thread_variables
 
 
 class TestModel(BaseModel):
@@ -96,6 +96,87 @@ def test_render_messages_with_thread_variables():
     assert rendered[6]["content"].startswith("Messages:\n- user: Hello, how are you?")
 
 
+def test_thread_template_detection_and_split_thread_vars():
+    assert template_uses_thread_variables("{{thread_with_system}}")
+
+    full_thread = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Hello, how are you?"},
+        {"role": "assistant", "content": "I am doing well, thank you!"},
+    ]
+    filtered_thread = full_thread[1:]
+
+    thread_vars = compute_thread_template_vars(filtered_thread, full_thread)
+
+    assert len(thread_vars["thread"]) == 2
+    assert len(thread_vars["thread_with_system"]) == 3
+    assert str(thread_vars["thread"][0]) == "user: Hello, how are you?"
+    assert str(thread_vars["thread_with_system"][0]) == "system: You are a helpful assistant."
+    assert thread_vars["thread_count"] == 2
+
+
+class _FakeTrace:
+    def __init__(self, thread):
+        self._thread = thread
+
+    async def get_thread(self, options=None):
+        del options
+        return self._thread
+
+
+def test_llm_classifier_request_args_keep_thread_filtered_and_thread_with_system_unfiltered():
+    system_marker = "PY_AUTOEVALS_SYSTEM_MARKER"
+    trace = _FakeTrace(
+        [
+            {"role": "system", "content": system_marker},
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi there"},
+        ]
+    )
+    classifier = LLMClassifier(
+        "test",
+        "Filtered thread:\n{{thread}}\n\nFull thread:\n{{thread_with_system}}",
+        {"Yes": 1, "No": 0},
+        use_cot=False,
+    )
+
+    request_args = classifier._request_args(output="", expected="", trace=trace)
+    rendered_prompt = request_args["messages"][0]["content"]
+
+    filtered_thread, full_thread = rendered_prompt.split("\n\nFull thread:\n", 1)
+    assert "Hello" in filtered_thread
+    assert "Hi there" in filtered_thread
+    assert system_marker not in filtered_thread
+    assert system_marker in full_thread
+
+
+@pytest.mark.asyncio
+async def test_llm_classifier_request_args_async_keep_thread_filtered_and_thread_with_system_unfiltered():
+    system_marker = "PY_AUTOEVALS_SYSTEM_MARKER_ASYNC"
+    trace = _FakeTrace(
+        [
+            {"role": "system", "content": system_marker},
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi there"},
+        ]
+    )
+    classifier = LLMClassifier(
+        "test",
+        "Filtered thread:\n{{thread}}\n\nFull thread:\n{{thread_with_system}}",
+        {"Yes": 1, "No": 0},
+        use_cot=False,
+    )
+
+    request_args = await classifier._request_args_async(output="", expected="", trace=trace)
+    rendered_prompt = request_args["messages"][0]["content"]
+
+    filtered_thread, full_thread = rendered_prompt.split("\n\nFull thread:\n", 1)
+    assert "Hello" in filtered_thread
+    assert "Hi there" in filtered_thread
+    assert system_marker not in filtered_thread
+    assert system_marker in full_thread
+
+
 def test_openai():
     e = OpenAILLMClassifier(
         "title",
diff --git a/py/autoevals/thread_utils.py b/py/autoevals/thread_utils.py
index bdfb13c0..0bbcbe8f 100644
--- a/py/autoevals/thread_utils.py
+++ b/py/autoevals/thread_utils.py
@@ -14,6 +14,7 @@
 
 THREAD_VARIABLE_NAMES = [
     "thread",
+    "thread_with_system",
     "thread_count",
     "first_message",
     "last_message",
@@ -245,8 +246,13 @@ def _to_renderable_message_array(messages: list[Any]) -> RenderableMessageArray:
     return RenderableMessageArray(wrapped)
 
 
-def compute_thread_template_vars(thread: list[Any]) -> dict[str, Any]:
+def compute_thread_template_vars(thread: list[Any], thread_with_system: list[Any] | None = None) -> dict[str, Any]:
     renderable_thread = _to_renderable_message_array(thread) if is_llm_message_array(thread) else thread
+    if thread_with_system is None:
+        thread_with_system = thread
+    renderable_thread_with_system = (
+        _to_renderable_message_array(thread_with_system) if is_llm_message_array(thread_with_system) else thread_with_system
+    )
 
     first_message = renderable_thread[0] if len(renderable_thread) > 0 else None
     last_message = renderable_thread[-1] if len(renderable_thread) > 0 else None
@@ -264,6 +270,7 @@ def compute_thread_template_vars(thread: list[Any]) -> dict[str, Any]:
 
     return {
         "thread": renderable_thread,
+        "thread_with_system": renderable_thread_with_system,
         "thread_count": len(thread),
         "first_message": first_message,
         "last_message": last_message,

From f115ae935133c3356b7e67d84c15e37e5f14f598 Mon Sep 17 00:00:00 2001
From: Alex Rhee <arheebikes@gmail.com>
Date: Tue, 9 Jun 2026 19:20:14 -0700
Subject: [PATCH 2/2] Fix autoevals formatting

---
 js/llm.test.ts               | 5 ++---
 py/autoevals/thread_utils.py | 4 +++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/js/llm.test.ts b/js/llm.test.ts
index e67977cf..5863eecc 100644
--- a/js/llm.test.ts
+++ b/js/llm.test.ts
@@ -418,9 +418,8 @@ Issue Description: {{page_content}}
       throw new Error("Unexpected request input shape");
     }
 
-    const [filteredThread, fullThread] = firstInput.content.split(
-      "\n\nFull thread:\n",
-    );
+    const [filteredThread, fullThread] =
+      firstInput.content.split("\n\nFull thread:\n");
     expect(filteredThread).toContain("Hello");
     expect(filteredThread).toContain("Hi there");
     expect(filteredThread).not.toContain(systemMarker);
diff --git a/py/autoevals/thread_utils.py b/py/autoevals/thread_utils.py
index 0bbcbe8f..019d1680 100644
--- a/py/autoevals/thread_utils.py
+++ b/py/autoevals/thread_utils.py
@@ -251,7 +251,9 @@ def compute_thread_template_vars(thread: list[Any], thread_with_system: list[Any
     if thread_with_system is None:
         thread_with_system = thread
     renderable_thread_with_system = (
-        _to_renderable_message_array(thread_with_system) if is_llm_message_array(thread_with_system) else thread_with_system
+        _to_renderable_message_array(thread_with_system)
+        if is_llm_message_array(thread_with_system)
+        else thread_with_system
     )
 
     first_message = renderable_thread[0] if len(renderable_thread) > 0 else None