diff --git a/docs.json b/docs.json
index 6f1b72afe5..9706d5a072 100644
--- a/docs.json
+++ b/docs.json
@@ -520,9 +520,9 @@
{
"group": "Get Started",
"pages": [
+ "weave/agents-quickstart",
"weave/quickstart",
- "weave/tutorial-eval",
- "weave/tutorial-rag"
+ "weave/guides/integrations"
]
},
{
@@ -530,7 +530,17 @@
"pages": [
"weave/concepts/what-is-weave",
{
- "group": "Trace your application",
+ "group": "Trace your agents",
+ "pages": [
+ "weave/guides/tracking/trace-agents",
+ "weave/guides/tracking/view-agent-activity",
+ "weave/guides/tracking/trace-agents-batch",
+ "weave/guides/tracking/trace-sub-agents",
+ "weave/guides/evaluation/monitors"
+ ]
+ },
+ {
+ "group": "Trace your applications",
"pages": [
{
"group": "Tracing basics",
@@ -573,9 +583,11 @@
]
},
{
- "group": "Evaluate your application",
+ "group": "Evaluate your agents and applications",
"pages": [
"weave/guides/core-types/evaluations",
+ "weave/tutorial-eval",
+ "weave/tutorial-rag",
"weave/guides/core-types/datasets",
"weave/guides/evaluation/scorers",
"weave/guides/evaluation/builtin_scorers",
@@ -618,6 +630,7 @@
},
"weave/guides/tracking/redact-pii",
"weave/guides/evaluation/monitors",
+ "weave/guides/evaluation/custom-monitors",
"weave/guides/evaluation/automations",
"weave/guides/evaluation/guardrails",
"weave/guides/tracking/otel"
@@ -629,6 +642,16 @@
"weave/guides/integrations",
"weave/guides/integrations/autopatching",
"weave/guides/integrations/js",
+ {
+ "group": "Trace agents",
+ "pages": [
+ "weave/guides/integrations/agents/google-adk",
+ "weave/guides/integrations/agents/openai-agents-sdk",
+ "weave/guides/integrations/agents/claude-agents-sdk",
+ "weave/guides/integrations/agents/codex-harness",
+ "weave/guides/integrations/agents/pi-dev-harness"
+ ]
+ },
{
"group": "LLM Providers",
"pages": [
@@ -6689,4 +6712,4 @@
"stylesheet": "/css/styles.css"
},
"theme": "aspen"
-}
\ No newline at end of file
+}
diff --git a/snippets/_includes/agents-public-preview.mdx b/snippets/_includes/agents-public-preview.mdx
new file mode 100644
index 0000000000..24d5928466
--- /dev/null
+++ b/snippets/_includes/agents-public-preview.mdx
@@ -0,0 +1,3 @@
+
+Weave for Agents is in public preview. Features, APIs, and the Agents view UI may change before general availability.
+
diff --git a/support.mdx b/support.mdx
index 545aa89e74..43aebda78b 100644
--- a/support.mdx
+++ b/support.mdx
@@ -52,7 +52,7 @@ and the W&B community.
{/* AUTO-GENERATED: counts */}
- 16 articles · 8 tags
+ 18 articles · 11 tags
{/* END AUTO-GENERATED: counts */}
diff --git a/support/weave.mdx b/support/weave.mdx
index 16454173b9..671b2ccb28 100644
--- a/support/weave.mdx
+++ b/support/weave.mdx
@@ -6,6 +6,9 @@ template: "scripts/knowledgebase-nav/templates/support_product_index.mdx.j2"
## Browse by category
+
+ 2 articles
+
2 articles
@@ -27,6 +30,12 @@ template: "scripts/knowledgebase-nav/templates/support_product_index.mdx.j2"
4 articles
+
+ 2 articles
+
2 articles
+
+ 2 articles
+
diff --git a/support/weave/articles/what-is-the-difference-between-weave-op-and-weave-sdk-for-agents.mdx b/support/weave/articles/what-is-the-difference-between-weave-op-and-weave-sdk-for-agents.mdx
new file mode 100644
index 0000000000..1500c7650f
--- /dev/null
+++ b/support/weave/articles/what-is-the-difference-between-weave-op-and-weave-sdk-for-agents.mdx
@@ -0,0 +1,16 @@
+---
+title: "What is the difference between @weave.op and weave.start_session?"
+keywords: ["Agents", "Tracing", "weave.op"]
+---
+
+[`@weave.op`](/weave/guides/tracking/tracing) traces individual Python functions and surfaces results in the **Traces** tab.
+
+Weave's [agentic tracing](/weave/guides/tracking/trace-agents) (`weave.start_session`, `weave.start_turn`, `weave.start_llm`, `weave.start_tool`) models a full multi-turn agent conversation and surfaces results in the **Agents** tab.
+
+Both are part of the same `weave` API — the right choice depends on whether you're tracing functions and pipelines, or a multi-turn agentic application.
+
+---
+
+{/* AUTO-GENERATED: tab badges */}
+[Agents](/support/weave/tags/agents)[Tracing](/support/weave/tags/tracing)[weave.op](/support/weave/tags/weave-op)
+{/* END AUTO-GENERATED: tab badges */}
diff --git a/support/weave/articles/which-weave-api-should-i-use-for-agents.mdx b/support/weave/articles/which-weave-api-should-i-use-for-agents.mdx
new file mode 100644
index 0000000000..041cd59d2f
--- /dev/null
+++ b/support/weave/articles/which-weave-api-should-i-use-for-agents.mdx
@@ -0,0 +1,15 @@
+---
+title: "Which Weave API should I use to trace my agent?"
+keywords: ["Agents", "Tracing", "weave.op"]
+---
+
+It depends on what you're building:
+ - If you're running an off-the-shelf agent framework (such as Claude Code), install the corresponding [Weave integration](/weave/guides/integrations). Weave then traces the agent automatically and surfaces results in the **Agents** tab.
+ - If you're building your own agent with sessions and turns, use the [Weave SDK for Agents](/weave/guides/tracking/trace-agents) (`weave.start_session`, `weave.start_turn`), which surfaces results in the **Agents** tab.
+ - If you're tracing individual functions or pipelines rather than agent conversations, use [`@weave.op`](/weave/guides/tracking/tracing), which surfaces results in the **Traces** tab.
+
+---
+
+{/* AUTO-GENERATED: tab badges */}
+[Agents](/support/weave/tags/agents)[Tracing](/support/weave/tags/tracing)[weave.op](/support/weave/tags/weave-op)
+{/* END AUTO-GENERATED: tab badges */}
diff --git a/support/weave/tags/agents.mdx b/support/weave/tags/agents.mdx
new file mode 100644
index 0000000000..bf84133780
--- /dev/null
+++ b/support/weave/tags/agents.mdx
@@ -0,0 +1,13 @@
+---
+title: "Agents"
+tag: "2"
+generator: "knowledgebase-nav"
+template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2"
+---
+
+
+ @weave.op traces individual Python functions and surfaces results in the Traces tab. Weave's agentic tracing (weave.star ...
+
+
+ It depends on what you're building: If you're running an off-the-shelf agent framework (such as Claude Code), install th ...
+
diff --git a/support/weave/tags/tracing.mdx b/support/weave/tags/tracing.mdx
new file mode 100644
index 0000000000..3cf8c89d08
--- /dev/null
+++ b/support/weave/tags/tracing.mdx
@@ -0,0 +1,13 @@
+---
+title: "Tracing"
+tag: "2"
+generator: "knowledgebase-nav"
+template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2"
+---
+
+
+ @weave.op traces individual Python functions and surfaces results in the Traces tab. Weave's agentic tracing (weave.star ...
+
+
+ It depends on what you're building: If you're running an off-the-shelf agent framework (such as Claude Code), install th ...
+
diff --git a/support/weave/tags/weave-op.mdx b/support/weave/tags/weave-op.mdx
new file mode 100644
index 0000000000..728a78c980
--- /dev/null
+++ b/support/weave/tags/weave-op.mdx
@@ -0,0 +1,13 @@
+---
+title: "weave.op"
+tag: "2"
+generator: "knowledgebase-nav"
+template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2"
+---
+
+
+ @weave.op traces individual Python functions and surfaces results in the Traces tab. Weave's agentic tracing (weave.star ...
+
+
+ It depends on what you're building: If you're running an off-the-shelf agent framework (such as Claude Code), install th ...
+
diff --git a/weave/agents-quickstart.mdx b/weave/agents-quickstart.mdx
new file mode 100644
index 0000000000..ea0b8e1368
--- /dev/null
+++ b/weave/agents-quickstart.mdx
@@ -0,0 +1,340 @@
+---
+title: "Quickstart: Trace an agent"
+description: Trace a multi-turn agent with the Weave SDK. Sessions, turns, LLM calls, and tool calls render in the Agents view of your project.
+---
+
+import AgentsPreview from '/snippets/_includes/agents-public-preview.mdx';
+
+
+
+[Try in Colab](https://colab.research.google.com/github/wandb/weave/blob/master/docs/weave/cookbooks/source/agents-quickstart.ipynb) · [GitHub source](https://github.com/wandb/weave/blob/master/docs/weave/cookbooks/source/agents-quickstart.ipynb)
+
+The Weave SDK allows you to trace agents built with popular SDKs or custom harnesses. This quickstart guides you through how to manually integrate Weave into a custom-built multi-turn agent to emit and capture OpenTelemetry spans.
+
+If you are looking to integrate Weave with popular SDKs or harnesses, such as the Claude Agent SDK or Codex, see the [Weave integration section](/weave/guides/integrations). Weave autopatches into several popular agent-building SDKs and agent harnesses for quick integration.
+
+## What you'll learn
+
+The code in this guide sets up a small research agent that can look things up on Wikipedia. It asks three questions (three turns), lets the AI decide when to search Wikipedia for an answer, and uses Weave to record every step (the conversation, each question, each AI response, and each Wikipedia lookup) so you can see exactly what happened in the Weave Agents view.
+
+This guide shows you how to:
+
+- Initialize Weave for agent tracing with `weave.init()`
+- Open a session and a turn with `start_session` / `startSession` and `start_turn` / `startTurn`
+- Wrap LLM calls with `start_llm` / `startLLM` and record usage
+- Wrap tool executions with `start_tool` / `startTool` and record results
+- View the resulting session, turns, and tool calls in the Agents view
+
+## How the Weave SDK works with agents
+
+The Weave SDK includes a generic OTel ingest system for agents, meaning that Weave can capture information from any OTel span in your agent's code. However, Weave requires special handling of the following spans to render your agent's traces in the Agents view of the Weave UI.
+
+| Concept | Python | TypeScript | OTel span |
+| --- | --- | --- | --- |
+| A conversation | `weave.start_session(...)` | `weave.startSession(...)` | (no span — groups turns) |
+| One user / agent exchange | `weave.start_turn(...)` | `weave.startTurn(...)` | `invoke_agent` |
+| One LLM API call | `weave.start_llm(...)` | `weave.startLLM(...)` | `chat` |
+| One tool execution | `weave.start_tool(...)` | `weave.startTool(...)` | `execute_tool` |
+
+In Python, all four functions work as context managers (`with weave.start_*(...) as obj:`). On exit, they end the span and flush attributes, including on exceptions. In TypeScript, call `.end()` on each returned object — use `try { ... } finally { obj.end(); }` to guarantee cleanup on exceptions.
+
+Other [GenAI semantic-convention attributes](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/), such as `gen_ai.usage.*` and `gen_ai.agent.name`, enable additional rendering, but they are optional.
+
+## Prerequisites
+
+- A W&B account and [API key](https://wandb.ai/authorize)
+- An OpenAI API key
+- Python 3.10+ (for the Python examples)
+- Node.js 18+ (for the TypeScript examples — built-in `fetch` is required)
+
+## Install packages
+
+Install the following packages into your developer environment:
+
+
+```bash Python
+pip install weave openai requests
+```
+
+```bash TypeScript
+npm install weave openai
+```
+
+
+## Initialize Weave
+
+`weave.init()` authenticates with W&B and configures the OTel exporter that sends agent spans to the **Agents** view. If the project does not exist on your team, Weave creates it the first time you write to it.
+
+
+```python lines Python
+import getpass
+import os
+
+os.environ["WANDB_API_KEY"] = getpass.getpass("Enter your W&B API key: ")
+os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
+
+TEAM = input("Enter your W&B team name: ")
+PROJECT = input("Enter your W&B project name: ")
+
+import weave
+weave.init(f"{TEAM}/{PROJECT}")
+```
+
+```typescript lines highlight="4" TypeScript
+// Set WANDB_API_KEY, OPENAI_API_KEY in your environment before running this project
+import * as weave from 'weave';
+
+await weave.init(`[YOUR-TEAM]/[YOUR-PROJECT]`);
+```
+
+
+## Define a tool
+
+The following code defines the agent's Wikipedia search tool and an OpenAI tool schema to determine when and how to use the tool.
+
+
+```python lines Python
+import json
+import requests
+
+def wikipedia_search(query: str) -> str:
+ r = requests.get(
+ "https://en.wikipedia.org/w/api.php",
+ params={
+ "action": "query", "generator": "search", "gsrsearch": query, "gsrlimit": 1,
+ "prop": "extracts", "exintro": True, "explaintext": True, "format": "json",
+ },
+ headers={"User-Agent": "weave-demo"},
+ ).json()
+ return next(iter(r["query"]["pages"].values()))["extract"]
+
+wikipedia_tool_schema = {
+ "type": "function",
+ "function": {
+ "name": "wikipedia_search",
+ "description": "Search Wikipedia for a topic and return its intro paragraph.",
+ "parameters": {
+ "type": "object",
+ "properties": {"query": {"type": "string"}},
+ "required": ["query"],
+ },
+ },
+}
+```
+
+```typescript lines TypeScript
+async function wikipediaSearch(query: string): Promise<string> {
+ const url = new URL('https://en.wikipedia.org/w/api.php');
+ url.search = new URLSearchParams({
+ action: 'query',
+ generator: 'search',
+ gsrsearch: query,
+ gsrlimit: '1',
+ prop: 'extracts',
+ exintro: 'true',
+ explaintext: 'true',
+ format: 'json',
+ }).toString();
+ const res = await fetch(url, { headers: { 'User-Agent': 'weave-demo' } });
+ const data = (await res.json()) as {
+ query: { pages: Record<string, { extract: string }> };
+ };
+ return Object.values(data.query.pages)[0].extract;
+}
+
+const wikipediaToolSchema = {
+ type: 'function' as const,
+ function: {
+ name: 'wikipedia_search',
+ description: 'Search Wikipedia for a topic and return its intro paragraph.',
+ parameters: {
+ type: 'object',
+ properties: { query: { type: 'string' } },
+ required: ['query'],
+ },
+ },
+};
+```
+
+
+## Run a traced multi-turn agent
+
+The example below runs three turns in a single session. Each turn:
+
+1. Opens a `chat` span and lets the LLM decide whether to call the tool
+2. If the LLM requested a tool, opens an `execute_tool` span around the call and feeds the result back to the LLM
+3. Opens a second `chat` span to produce the final answer
+
+
+```python lines highlight="9,11,17,29,42,46,53" Python
+from openai import OpenAI
+
+openai_client = OpenAI()
+MODEL = "gpt-4o-mini"
+
+def run_turn(history, user_message):
+ history.append({"role": "user", "content": user_message})
+
+ with weave.start_turn(user_message=user_message, model=MODEL):
+ # LLM call 1 — the model may decide to use a tool.
+ with weave.start_llm(model=MODEL, provider_name="openai") as llm:
+ resp = openai_client.chat.completions.create(
+ model=MODEL, messages=history, tools=[wikipedia_tool_schema],
+ )
+ msg = resp.choices[0].message
+ llm.output(msg.content or "")
+ llm.usage = weave.Usage(
+ input_tokens=resp.usage.prompt_tokens,
+ output_tokens=resp.usage.completion_tokens,
+ )
+ history.append(msg.model_dump(exclude_none=True))
+
+ # If no tool was requested, the first LLM response is the answer.
+ if not msg.tool_calls:
+ return msg.content
+
+ # Execute each requested tool call.
+ for tc in msg.tool_calls:
+ with weave.start_tool(
+ name=tc.function.name,
+ arguments=tc.function.arguments,
+ tool_call_id=tc.id,
+ ) as tool:
+ tool.result = wikipedia_search(**json.loads(tc.function.arguments))
+ history.append({
+ "role": "tool",
+ "tool_call_id": tc.id,
+ "content": tool.result,
+ })
+
+ # LLM call 2 — synthesize the final answer.
+ with weave.start_llm(model=MODEL, provider_name="openai") as llm:
+ resp = openai_client.chat.completions.create(model=MODEL, messages=history)
+ msg = resp.choices[0].message
+ llm.output(msg.content)
+ llm.usage = weave.Usage(
+ input_tokens=resp.usage.prompt_tokens,
+ output_tokens=resp.usage.completion_tokens,
+ )
+ history.append({"role": "assistant", "content": msg.content})
+ return msg.content
+
+with weave.start_session(agent_name="research-bot") as session:
+ history = []
+ for question in [
+ "Who founded Anthropic?",
+ "What is Claude (the AI assistant)?",
+ "Summarize what we discussed in one sentence.",
+ ]:
+ print(f"USER: {question}")
+ print(f"AGENT: {run_turn(history, question)}\n")
+```
+
+```typescript lines highlight="10,13,39,54,76" TypeScript
+import OpenAI from 'openai';
+
+const openaiClient = new OpenAI();
+const MODEL = 'gpt-4o-mini';
+
+// history is a list of OpenAI chat messages; typed loosely for brevity.
+async function runTurn(history: any[], userMessage: string): Promise<string | null> {
+ history.push({ role: 'user', content: userMessage });
+
+ const turn = weave.startTurn({ userMessage, model: MODEL });
+ try {
+ // LLM call 1 — the model may decide to use a tool.
+ const llm1 = weave.startLLM({ model: MODEL, providerName: 'openai' });
+ let msg;
+ try {
+ const resp = await openaiClient.chat.completions.create({
+ model: MODEL,
+ messages: history,
+ tools: [wikipediaToolSchema],
+ });
+ msg = resp.choices[0].message;
+ llm1.output(msg.content ?? '');
+ llm1.usage = {
+ inputTokens: resp.usage?.prompt_tokens,
+ outputTokens: resp.usage?.completion_tokens,
+ };
+ history.push(msg);
+ } finally {
+ llm1.end();
+ }
+
+ // If no tool was requested, the first LLM response is the answer.
+ if (!msg.tool_calls?.length) {
+ return msg.content ?? null;
+ }
+
+ // Execute each requested tool call.
+ for (const tc of msg.tool_calls) {
+ const tool = weave.startTool({
+ name: tc.function.name,
+ args: tc.function.arguments,
+ toolCallId: tc.id,
+ });
+ try {
+ const { query } = JSON.parse(tc.function.arguments);
+ tool.result = await wikipediaSearch(query);
+ history.push({ role: 'tool', tool_call_id: tc.id, content: tool.result });
+ } finally {
+ tool.end();
+ }
+ }
+
+ // LLM call 2 — synthesize the final answer.
+ const llm2 = weave.startLLM({ model: MODEL, providerName: 'openai' });
+ try {
+ const resp = await openaiClient.chat.completions.create({
+ model: MODEL,
+ messages: history,
+ });
+ const msg2 = resp.choices[0].message;
+ llm2.output(msg2.content ?? '');
+ llm2.usage = {
+ inputTokens: resp.usage?.prompt_tokens,
+ outputTokens: resp.usage?.completion_tokens,
+ };
+ history.push({ role: 'assistant', content: msg2.content });
+ return msg2.content ?? null;
+ } finally {
+ llm2.end();
+ }
+ } finally {
+ turn.end();
+ }
+}
+
+const session = weave.startSession({ agentName: 'research-bot' });
+try {
+ const history: any[] = [];
+ for (const question of [
+ 'Who founded Anthropic?',
+ 'What is Claude (the AI assistant)?',
+ 'Summarize what we discussed in one sentence.',
+ ]) {
+ console.log(`USER: ${question}`);
+ console.log(`AGENT: ${await runTurn(history, question)}\n`);
+ }
+} finally {
+ session.end();
+}
+```
+
+
+## See your agent traces in the Agents view
+
+When `weave.init()` runs, it prints a link to your project where you can see:
+
+- A row in the **Agents** tab for `research-bot`
+- One session containing three turns
+- Each turn (`invoke_agent`) with its `chat` spans and, when the model called the tool, an `execute_tool` span nested inside
+- Token counts, latency, model, and the full message exchange on each `chat`
+
+Click into any turn to inspect the inputs, outputs, tool arguments, and tool results.
+
+## Next steps
+
+* Get a better understanding of how to [trace agents with Weave](/weave/guides/tracking/trace-agents) and what features and options are available in the Weave SDK.
+* See the [integration section](/weave/guides/integrations) for more options on how to integrate Weave with your agents.
\ No newline at end of file
diff --git a/weave/concepts/what-is-weave.mdx b/weave/concepts/what-is-weave.mdx
index 871c3c522b..7f9e5b1690 100644
--- a/weave/concepts/what-is-weave.mdx
+++ b/weave/concepts/what-is-weave.mdx
@@ -10,7 +10,7 @@ Building LLM applications is fundamentally different from traditional software d
## The main threads of Weave
Weave provides the following core functionality:
-- **Visibility** into every LLM call, input, and output in your application.
+- **Visibility** into agent sessions and multi-turn conversations, or into individual function calls and outputs in application code.
- **Systematic evaluation** to measure performance against curated test cases.
- **Version tracking** for prompts, models, and data so you can understand what changed.
- **Experimentation** with different prompt and model comparisons.
@@ -18,20 +18,30 @@ Weave provides the following core functionality:
- **Monitoring** in production using guardrails and scorers for LLM safety and quality.
-### Traces
-Track end-to-end how data flows through your LLM application.
+### Agentic tracing
-- See inputs and outputs of each application usage.
+Weave provides agentic observability for the full lifecycle of agent conversations, including sessions, LLM calls, and tool executions.
+
+If you're building an agent, follow the [agent tracing quickstart](/weave/agents-quickstart) or learn to use the Weave SDK to [trace your agents](/weave/guides/tracking/trace-agents).
+
+If you're using a supported third-party agent harness, such as Claude Code or the OpenAI Agents SDK, Weave instruments it automatically with no additional code. See [Integrations](/weave/guides/integrations) for all supported frameworks.
+
+
+### Application tracing
+If you want to trace individual function calls, application code, or custom logic, use Weave Ops and Calls. Add one line to any function to track inputs, outputs, cost, token count, and latency.
+
+- Track end-to-end how data flows through your LLM application.
- See source documents used to produce the LLM feedback.
-- See cost, token count, and latency of LLM calls.
- Drill down into specific prompts and how answers are produced.
-- Collect feedback on responses from users.
-- In your code, you can use Weave [ops and calls](/weave/guides/tracking/tracing) to track what your functions are doing.
-[Get started with tracing](/weave/quickstart)
+To trace individual functions, follow the Weave [Op tracing quickstart](/weave/quickstart) or learn to use Weave [Ops and Calls](/weave/guides/tracking/tracing).
+
+If you're using a supported third-party agent framework, such as Claude Code, Weave instruments it automatically with no additional code. See [Integrations](/weave/guides/integrations) for all supported frameworks.
+
+
### Evaluations
-Systematically benchmark your LLM application's performance to gain confidence when deploying to production.
+Systematically benchmark and monitor your LLM application's performance with evaluations to iteratively improve quality and reliability.
- Easily track which versions of model/prompt resulted in what performance.
- Define metrics to evaluate responses using one or more scoring functions.
@@ -81,41 +91,23 @@ pip install weave
npm install weave
```
-3. In your script, import Weave and initialize a project:
+3. In your script, import Weave and initialize a project.
+
+Replace `[YOUR-TEAM]` with your W&B team name and `[YOUR-PROJECT]` with your W&B project name.
```Python Python
import weave
-client = weave.init('your-team/your-project-name')
+client = weave.init('[YOUR-TEAM]/[YOUR-PROJECT]')
```
```TypeScript Typescript
import * as weave from 'weave';
-const client = await weave.init('your-team/your-project-name');
+const client = await weave.init('[YOUR-TEAM]/[YOUR-PROJECT]');
```
You're now ready to use Weave.
-Weave integrates with popular LLM providers and frameworks. When you use a [supported integration](/weave/guides/integrations/), Weave automatically traces LLM calls without additional code changes.
-
-4. Beyond relying on the supported integrations, you can also use Weave to log traces for custom functions by adding one line to your call function.
-
-When you decorate a function with `@weave.op()` (in Python), or wrap it with `weave.op()` (in TypeScript), Weave automatically captures its code, inputs, outputs, and execution metadata.
-
-```python Python
- @weave.op
- async def my_function(){
- ... }
-```
-
-```typescript Typescript
-function myFunction() {
- ...
-}
-const myFunctionOp = weave.op(myFunction)
-```
-
-To try it out with a guided tutorial, see [Get started with tracing](/weave/quickstart).
diff --git a/weave/cookbooks/source/agents-quickstart.ipynb b/weave/cookbooks/source/agents-quickstart.ipynb
new file mode 100644
index 0000000000..3d7e76508d
--- /dev/null
+++ b/weave/cookbooks/source/agents-quickstart.ipynb
@@ -0,0 +1,256 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Quickstart: Trace an agent\n",
+ "\n",
+ "Trace a multi-turn agent with the Weave SDK. Sessions, turns, LLM calls, and tool calls render in the Agents view of your project.\n",
+ "\n",
+ "The Weave SDK allows you to trace custom agents or agents created using popular SDKs. This quickstart guides you through how to manually integrate Weave into a custom-built multi-turn agent to emit and capture OpenTelemetry spans and render them in Weave's Agents view.\n",
+ "\n",
+ "If you are looking to integrate Weave with popular SDKs or harnesses, such as the Claude Agent SDK or Codex, see the [Weave integration section](https://docs.wandb.ai/weave/guides/integrations). Weave autopatches into several popular agent-building SDKs and agent harnesses for quick integration.\n",
+ "\n",
+ "## What you'll learn\n",
+ "\n",
+ "The code in this guide sets up a small research agent that can look things up on Wikipedia. It asks three questions (three turns), lets the AI decide when to search Wikipedia for an answer, and uses Weave to record every step (the conversation, each question, each AI response, and each Wikipedia lookup) so you can see exactly what happened in the Weave Agents view.\n",
+ "\n",
+ "This guide shows you how to:\n",
+ "\n",
+ "- Initialize Weave for agent tracing with `weave.init()`\n",
+ "- Open a session and a turn with `weave.start_session()` and `weave.start_turn()`\n",
+ "- Wrap LLM calls with `weave.start_llm()` and record usage\n",
+ "- Wrap tool executions with `weave.start_tool()` and record results\n",
+ "- View the resulting session, turns, and tool calls in the Agents view\n",
+ "\n",
+ "## How the Weave SDK works with agents\n",
+ "\n",
+ "The Weave SDK includes a generic OTel ingest system for agents, meaning that Weave can capture information from any OTel span in your agent's code. However, Weave requires special handling of the following spans to render your agent's traces in the Agents view of the Weave UI.\n",
+ "\n",
+ "| Function | Maps to | OTel span |\n",
+ "| --- | --- | --- |\n",
+ "| `weave.start_session(...)` | A conversation | (no span — groups turns) |\n",
+ "| `weave.start_turn(...)` | One user / agent exchange | `invoke_agent` |\n",
+ "| `weave.start_llm(...)` | One LLM API call | `chat` |\n",
+ "| `weave.start_tool(...)` | One tool execution | `execute_tool` |\n",
+ "\n",
+ "All four are context managers. On exit, they end the span and flush attributes, including on exceptions.\n",
+ "\n",
+ "Other [GenAI semantic-convention attributes](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/), such as `gen_ai.usage.*` and `gen_ai.agent.name`, enable additional rendering, but they are optional."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Prerequisites\n",
+ "\n",
+ "- A W&B account and [API key](https://wandb.ai/authorize)\n",
+ "- Python 3.10+\n",
+ "- An OpenAI API key\n",
+ "\n",
+ "## Install packages\n",
+ "\n",
+ "Install the following packages into your developer environment:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%pip install -q weave openai requests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Initialize Weave\n",
+ "\n",
+ "`weave.init()` authenticates with W&B and configures the OTel exporter that sends agent spans to the **Agents** view. If the project does not exist on your team, Weave creates it the first time you write to it."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import getpass\n",
+ "import os\n",
+ "\n",
+ "os.environ[\"WANDB_API_KEY\"] = getpass.getpass(\"Enter your W&B API key: \")\n",
+ "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter your OpenAI API key: \")\n",
+ "\n",
+ "TEAM = input(\"Enter your W&B team name: \")\n",
+ "PROJECT = input(\"Enter your W&B project name: \")\n",
+ "\n",
+ "import weave\n",
+ "weave.init(f\"{TEAM}/{PROJECT}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define a tool\n",
+ "\n",
+ "The following code defines the agent's Wikipedia search tool and an OpenAI tool schema to determine when and how to use the tool."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import requests\n",
+ "\n",
+ "def wikipedia_search(query: str) -> str:\n",
+ " r = requests.get(\n",
+ " \"https://en.wikipedia.org/w/api.php\",\n",
+ " params={\n",
+ " \"action\": \"query\", \"generator\": \"search\", \"gsrsearch\": query, \"gsrlimit\": 1,\n",
+ " \"prop\": \"extracts\", \"exintro\": True, \"explaintext\": True, \"format\": \"json\",\n",
+ " },\n",
+ " headers={\"User-Agent\": \"weave-demo\"},\n",
+ " ).json()\n",
+ " return next(iter(r[\"query\"][\"pages\"].values()))[\"extract\"]\n",
+ "\n",
+ "wikipedia_tool_schema = {\n",
+ " \"type\": \"function\",\n",
+ " \"function\": {\n",
+ " \"name\": \"wikipedia_search\",\n",
+ " \"description\": \"Search Wikipedia for a topic and return its intro paragraph.\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\"query\": {\"type\": \"string\"}},\n",
+ " \"required\": [\"query\"],\n",
+ " },\n",
+ " },\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run a traced multi-turn agent\n",
+ "\n",
+ "The example below runs three turns in a single session. Each turn:\n",
+ "\n",
+ "1. Opens a `chat` span and lets the LLM decide whether to call the tool\n",
+ "2. If the LLM requested a tool, opens an `execute_tool` span around the call and feeds the result back to the LLM\n",
+ "3. Opens a second `chat` span to produce the final answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from openai import OpenAI\n",
+ "\n",
+ "openai_client = OpenAI()\n",
+ "MODEL = \"gpt-4o-mini\"\n",
+ "\n",
+ "def run_turn(history, user_message):\n",
+ " history.append({\"role\": \"user\", \"content\": user_message})\n",
+ "\n",
+ " with weave.start_turn(user_message=user_message, model=MODEL):\n",
+ " # LLM call 1 — the model may decide to use a tool.\n",
+ " with weave.start_llm(model=MODEL, provider_name=\"openai\") as llm:\n",
+ " resp = openai_client.chat.completions.create(\n",
+ " model=MODEL, messages=history, tools=[wikipedia_tool_schema],\n",
+ " )\n",
+ " msg = resp.choices[0].message\n",
+ " llm.output(msg.content or \"\")\n",
+ " llm.usage = weave.Usage(\n",
+ " input_tokens=resp.usage.prompt_tokens,\n",
+ " output_tokens=resp.usage.completion_tokens,\n",
+ " )\n",
+ " history.append(msg.model_dump(exclude_none=True))\n",
+ "\n",
+ " # If no tool was requested, the first LLM response is the answer.\n",
+ " if not msg.tool_calls:\n",
+ " return msg.content\n",
+ "\n",
+ " # Execute each requested tool call.\n",
+ " for tc in msg.tool_calls:\n",
+ " with weave.start_tool(\n",
+ " name=tc.function.name,\n",
+ " arguments=tc.function.arguments,\n",
+ " tool_call_id=tc.id,\n",
+ " ) as tool:\n",
+ " tool.result = wikipedia_search(**json.loads(tc.function.arguments))\n",
+ " history.append({\n",
+ " \"role\": \"tool\",\n",
+ " \"tool_call_id\": tc.id,\n",
+ " \"content\": tool.result,\n",
+ " })\n",
+ "\n",
+ " # LLM call 2 — synthesize the final answer.\n",
+ " with weave.start_llm(model=MODEL, provider_name=\"openai\") as llm:\n",
+ " resp = openai_client.chat.completions.create(model=MODEL, messages=history)\n",
+ " msg = resp.choices[0].message\n",
+ " llm.output(msg.content)\n",
+ " llm.usage = weave.Usage(\n",
+ " input_tokens=resp.usage.prompt_tokens,\n",
+ " output_tokens=resp.usage.completion_tokens,\n",
+ " )\n",
+ " history.append({\"role\": \"assistant\", \"content\": msg.content})\n",
+ " return msg.content\n",
+ "\n",
+ "with weave.start_session(agent_name=\"research-bot\") as session:\n",
+ " history = []\n",
+ " for question in [\n",
+ " \"Who founded Anthropic?\",\n",
+ " \"What is Claude (the AI assistant)?\",\n",
+ " \"Summarize what we discussed in one sentence.\",\n",
+ " ]:\n",
+ " print(f\"USER: {question}\")\n",
+ " print(f\"AGENT: {run_turn(history, question)}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## See your agent traces in the Agents view\n",
+ "\n",
+ "When `weave.init()` runs, it prints a link to your project where you can see:\n",
+ "\n",
+ "- A row in the **Agents** tab for `research-bot`\n",
+ "- One session containing three turns\n",
+ "- Each turn (`invoke_agent`) with its `chat` spans and, when the model called the tool, an `execute_tool` span nested inside\n",
+ "- Token counts, latency, model, and the full message exchange on each `chat`\n",
+ "\n",
+ "Click into any turn to inspect the inputs, outputs, tool arguments, and tool results.\n",
+ "\n",
+ "## Next steps\n",
+ "\n",
+ "- Get a better understanding of how to [trace agents with Weave](https://docs.wandb.ai/weave/guides/tracking/trace-agents) and what features and options are available in the Weave SDK.\n",
+ "- See the [integration section](https://docs.wandb.ai/weave/guides/integrations) for more options on how to integrate Weave with your agents."
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/weave/guides/core-types/datasets.mdx b/weave/guides/core-types/datasets.mdx
index a62bb185b9..47f01b2656 100644
--- a/weave/guides/core-types/datasets.mdx
+++ b/weave/guides/core-types/datasets.mdx
@@ -266,7 +266,22 @@ You can create, edit, and delete `Dataset`s in the UI. Creating datasets in the
Now, the `Dataset` is deleted, and no longer visible in the **Datasets** tab in your Weave dashboard.
-### Add a new example to a `Dataset`
+### Add a new agent trace to a `Dataset`
+To add agent traces such as sessions, turns, and tool calls to a `Dataset`:
+
+1. Navigate to [wandb.ai](https://wandb.ai) and select your project.
+1. In the Weave project sidebar, click **Agents**.
+1. In the top tab bar, click **Conversations**.
+1. In the Conversations table, select a conversation row to load the conversation details side panel. The conversation details side panel has a header with an agent name and a UUID.
+1. In the conversation details side panel header, click the table icon to the right to open the **Add example to dataset** drawer.
+
+1. In **Choose a dataset**, use the dropdown to choose the dataset you want to add traces to.
+1. In **Select context**, select which messages to add to the dataset. Click **Next**.
+1. Review your selections and click **Add to dataset**.
+
+
+### Add a new trace to a `Dataset`
+To add traces generated from Ops and Calls to a `Dataset`:
1. Navigate to the Weave project you want to edit.
diff --git a/weave/guides/core-types/imgs/agent-conversation-details-header-add-dataset.png b/weave/guides/core-types/imgs/agent-conversation-details-header-add-dataset.png
new file mode 100644
index 0000000000..6707c8238b
Binary files /dev/null and b/weave/guides/core-types/imgs/agent-conversation-details-header-add-dataset.png differ
diff --git a/weave/guides/evaluation/custom-monitors.mdx b/weave/guides/evaluation/custom-monitors.mdx
new file mode 100644
index 0000000000..fc2647a698
--- /dev/null
+++ b/weave/guides/evaluation/custom-monitors.mdx
@@ -0,0 +1,178 @@
+---
+title: "Set up custom monitors"
+description: "Passively score production traffic to surface trends and issues"
+---
+
+Monitors use LLM judges to passively score production traffic to surface trends and issues in your LLM applications. For example, you can monitor your application's responses for correctness or helpfulness, or you can monitor user input to identify trends in what they're asking your agents about. Monitors automatically store all scoring results in W&B Weave's database, allowing you to analyze historical trends and patterns.
+
+You can monitor text, images, and audio in your application's input and output.
+
+Monitors require no code changes to your application. Set them up using the W&B Weave UI.
+
+If you need to actively intervene in your application's behavior based on scores, use [guardrails](/weave/guides/evaluation/guardrails) instead.
+
+### When to use signals and custom monitors
+
+Use [signals](/weave/guides/evaluation/monitors), preset automated scorers for production traces, to get started with production monitoring quickly, then add custom monitors for evaluation criteria specific to your application.
+
+| | Signals | Custom monitors |
+|---|---------|----------------|
+| **Configuration** | One-click enable, no prompt writing | Full control over scoring prompt, model, and parameters |
+| **Scope** | Preset quality and error classifiers | Any evaluation criteria you define |
+| **Trace selection** | Automatic (successful root traces for quality, failed traces for errors) | Configurable operations, filters, and sampling rate |
+| **Model** | Serverless Inference (preset) | Any commercial or Serverless Inference model |
+| **Use case** | Quick production monitoring with proven classifiers | Custom evaluation criteria specific to your application |
+
+
+
+## How to create a monitor in Weave
+
+To create a custom monitor in Weave:
+
+1. Open the [W&B UI](https://wandb.ai/home) and then open your Weave project.
+2. From the Weave side-nav, select **Monitors** and then select the **+ New Monitor** button. This opens the **Create new monitor** modal dialog.
+3. In the Create new monitor menu, configure the following fields:
+ - **Name**: Must start with a letter or number. Can contain letters, numbers, hyphens, and underscores.
+ - **Description** (Optional): Explain what the monitor does.
+ - **Active monitor** toggle: Turn the monitor on or off.
+ - **Calls to monitor**:
+ - **Operations**: Choose one or more `@weave.op`s to monitor. You must log at least one trace that uses the op before it appears in the list of available ops.
+ - **Filter** (Optional): Narrow down which calls are eligible (for example, by `max_tokens` or `top_p`).
+ - **Sampling rate**: The percentage of calls to score (0% to 100%).
+
+ A lower sampling rate reduces costs, since each scoring call has an associated cost.
+
+ - **LLM-as-a-judge configuration**:
+ - **Scorer name**: Must start with a letter or number. Can contain letters, numbers, hyphens, and underscores.
+ - **Score Audio**: Filters the available LLM models to display only audio-enabled models, and opens the Media Scoring JSON Paths field.
+ - **Score Images**: Filters the available LLM models to display only image-enabled models, and opens the Media Scoring JSON Paths field.
+ - **Judge model**: Select the model to score your ops. The menu contains commercial LLM models you have configured in your W&B account, as well as [Serverless Inference models](/inference/models). Audio-enabled models have an **Audio Input** label beside their names. For the selected model, configure the following settings:
+ - **Configuration name**: A name for this model configuration.
+ - **System prompt**: Defines the judging model's role and persona, for example, "You are an impartial AI judge."
+ - **Response format**: The format the judge should output its response in, such as a `json_object` or plain `text`.
+ - **Scoring prompt**: The evaluation task used to score your ops. You can reference [prompt variables](/weave/guides/evaluation/scorers#access-variables-from-your-ops-in-scoring-prompts) from your ops in your scoring prompts. For example, "Evaluate whether `{output}` is accurate based on `{ground_truth}`."
+ - **Media Scoring JSON Paths**: Specify JSONPath expressions (RFC 9535) to extract media from your trace data. If no paths are specified, all scorable media from user messages will be included. This field appears when you enable **Score Audio** or **Score Images**.
+
+4. Once you have configured the monitor's fields, select **Create monitor**. This adds the monitor to your Weave project. When your code starts generating traces, you can review the scores in the **Traces** tab by selecting the monitor's name and reviewing the data in the resulting panel.
+
+You can also [compare](/weave/guides/tools/comparison) and visualize the monitor's trace data in the Weave UI, or download it in various formats (such as CSV and JSON) using the download button () in the Traces tab.
+
+Weave automatically stores all scorer results in the [Call](/weave/guides/tracking/tracing#calls) object's `feedback` field.
+
+### Example: Create a truthfulness monitor
+
+The following example creates a monitor that evaluates the truthfulness of generated statements.
+
+1. Define a function that generates statements. Some statements are truthful, others are not:
+
+
+
+```python lines
+import weave
+import random
+import openai
+
+weave.init("my-team/my-weave-project")
+
+client = openai.OpenAI()
+
+@weave.op()
+def generate_statement(ground_truth: str) -> str:
+ if random.random() < 0.5:
+ response = client.chat.completions.create(
+ model="gpt-4.1",
+ messages=[
+ {
+ "role": "user",
+ "content": f"Generate a statement that is incorrect based on this fact: {ground_truth}"
+ }
+ ]
+ )
+ return response.choices[0].message.content
+ else:
+ return ground_truth
+
+generate_statement("The Earth revolves around the Sun.")
+```
+
+
+```typescript lines
+import * as weave from 'weave';
+import OpenAI from 'openai';
+
+await weave.init('my-team/my-weave-project');
+
+const client = new OpenAI();
+
+const generateStatement = weave.op(async (ground_truth: string): Promise<string> => {
+ if (Math.random() < 0.5) {
+ const response = await client.chat.completions.create({
+ model: 'gpt-4.1',
+ messages: [
+ {
+ role: 'user',
+ content: `Generate a statement that is incorrect based on this fact: ${ground_truth}`,
+ },
+ ],
+ });
+ return response.choices[0]?.message?.content ?? '';
+ }
+ return ground_truth;
+});
+
+await generateStatement("The Earth revolves around the Sun.");
+```
+
+
+
+2. Run the function at least once to log a trace in your project. This makes the op available for monitoring in the W&B UI.
+
+3. Open your Weave project in the W&B UI and select **Monitors** from the side-nav. Then select **New Monitor**.
+4. In the Create new monitor menu, configure the fields using the following values:
+ - **Name**: `truthfulness-monitor`
+ - **Description**: `Evaluates the truthfulness of generated statements.`
+ - **Active monitor**: Toggle **on**.
+ - **Operations**: Select `generate_statement`.
+ - **Sampling rate**: Set to `100%` to score every call.
+ - **Scorer name**: `truthfulness-scorer`
+ - **Judge model**: `o3-mini-2025-01-31`
+ - **System prompt**: `You are an impartial AI judge. Your task is to evaluate the truthfulness of statements.`
+ - **Response format**: `json_object`
+ - **Scoring prompt**:
+ ```text
+ Evaluate whether the output statement is accurate based on the input statement.
+
+ This is the input statement: {ground_truth}
+
+ This is the output statement: {output}
+
+ The response should be a JSON object with the following fields:
+ - is_true: a boolean stating whether the output statement is true or false based on the input statement.
+ - reasoning: your reasoning as to why the statement is true or false.
+ ```
+
+5. Select **Create Monitor**. This adds the monitor to your Weave project.
+
+
+6. In your script, invoke your function using statements of varying degrees of truthfulness to test the scoring function:
+
+
+
+```python lines
+generate_statement("The Earth revolves around the Sun.")
+generate_statement("Water freezes at 0 degrees Celsius.")
+generate_statement("The Great Wall of China was built over several centuries.")
+```
+
+
+```typescript lines
+await generateStatement("The Earth revolves around the Sun.");
+await generateStatement("Water freezes at 0 degrees Celsius.");
+await generateStatement("The Great Wall of China was built over several centuries.");
+```
+
+
+
+7. After running the script using several different statements, open the W&B UI and navigate to the **Traces** tab. Select any **LLMAsAJudgeScorer.score** trace to see the results.
+
+
diff --git a/weave/guides/evaluation/img/weave_signals_project_dash.png b/weave/guides/evaluation/img/weave_signals_project_dash.png
new file mode 100644
index 0000000000..99c98b59f7
Binary files /dev/null and b/weave/guides/evaluation/img/weave_signals_project_dash.png differ
diff --git a/weave/guides/evaluation/img/weave_signals_trace_hover.png b/weave/guides/evaluation/img/weave_signals_trace_hover.png
new file mode 100644
index 0000000000..4c6785e44f
Binary files /dev/null and b/weave/guides/evaluation/img/weave_signals_trace_hover.png differ
diff --git a/weave/guides/evaluation/img/weave_signals_trace_reasoning.png b/weave/guides/evaluation/img/weave_signals_trace_reasoning.png
new file mode 100644
index 0000000000..a6e0bca101
Binary files /dev/null and b/weave/guides/evaluation/img/weave_signals_trace_reasoning.png differ
diff --git a/weave/guides/evaluation/monitors.mdx b/weave/guides/evaluation/monitors.mdx
index 3edc36a538..d49d48c092 100644
--- a/weave/guides/evaluation/monitors.mdx
+++ b/weave/guides/evaluation/monitors.mdx
@@ -1,164 +1,136 @@
---
-title: "Set up monitors"
-description: "Passively score production traffic to surface trends and issues"
+title: "Monitor using built-in signals"
+description: "W&B provides built-in scoring solutions for agents in production"
---
-Monitors use LLM judges to passively score production traffic to surface trends and issues in your LLM applications. For example, you can monitor your application's responses for correctness or helpfulness, or you can monitor user input to identify trends in what they're asking your agents about. Monitors automatically store all scoring results in Weave's database, allowing you to analyze historical trends and patterns.
-
-You can monitor text, images, and audio in your application's input and output.
-
-Monitors require no code changes to your application. Set them up using the W&B Weave UI.
-
-If you need to actively intervene in your application's behavior based on scores, use [guardrails](/weave/guides/evaluation/guardrails) instead.
-
-## How to create a monitor in Weave
-
-To create a monitor in Weave:
-
-1. Open the [W&B UI](https://wandb.ai/home) and then open your Weave project.
-2. From the Weave side-nav, select **Monitors** and then select the **+ New Monitor** button. This opens the **Create new monitor** modal dialog.
-3. In the Create new monitor menu, configure the following fields:
- - **Name**: Must start with a letter or number. Can contain letters, numbers, hyphens, and underscores.
- - **Description** (Optional): Explain what the monitor does.
- - **Active monitor** toggle: Turn the monitor on or off.
- - **Calls to monitor**:
- - **Operations**: Choose one or more `@weave.op`s to monitor. You must log at least one trace that uses the op before it appears in the list of available ops.
- - **Filter** (Optional): Narrow down which calls are eligible (for example, by `max_tokens` or `top_p`).
- - **Sampling rate**: The percentage of calls to score (0% to 100%).
-
- A lower sampling rate reduces costs, since each scoring call has an associated cost.
-
- - **LLM-as-a-judge configuration**:
- - **Scorer name**: Must start with a letter or number. Can contain letters, numbers, hyphens, and underscores.
- - **Score Audio**: Filters the available LLM models to display only audio-enabled models, and opens the Media Scoring JSON Paths field.
- - **Score Images**: Filters the available LLM models to display only image-enabled models, and opens the Media Scoring JSON Paths field.
- - **Judge model**: Select the model to score your ops. The menu contains commercial LLM models you have configured in your W&B account, as well as [Serverless Inference models](/inference/models). Audio-enabled models have an **Audio Input** label beside their names. For the selected model, configure the following settings:
- - **Configuration name**: A name for this model configuration.
- - **System prompt**: Defines the judging model's role and persona, for example, "You are an impartial AI judge."
- - **Response format**: The format the judge should output its response in, such as a `json_object` or plain `text`.
- - **Scoring prompt**: The evaluation task used to score your ops. You can reference [prompt variables](/weave/guides/evaluation/scorers#access-variables-from-your-ops-in-scoring-prompts) from your ops in your scoring prompts. For example, "Evaluate whether `{output}` is accurate based on `{ground_truth}`."
- - **Media Scoring JSON Paths**: Specify JSONPath expressions (RFC 9535) to extract media from your trace data. If no paths are specified, all scorable media from user messages will be included. This field appears when you enable **Score Audio** or **Score Images**.
-
-4. Once you have configured the monitor's fields, click **Create monitor**. This adds the monitor to your Weave project. When your code starts generating traces, you can review the scores in the **Traces** tab by selecting the monitor's name and reviewing the data in the resulting panel.
-
-You can also [compare](/weave/guides/tools/comparison) and visualize the monitor's trace data in the Weave UI, or download it in various formats (such as CSV and JSON) using the download button () in the Traces tab.
-
-Weave automatically stores all scorer results in the [Call](/weave/guides/tracking/tracing#calls) object's `feedback` field.
-
-### Example: Create a truthfulness monitor
-
-The following example creates a monitor that evaluates the truthfulness of generated statements.
-
-1. Define a function that generates statements. Some statements are truthful, others are not:
-
-
-
-```python
-import weave
-import random
-import openai
-
-weave.init("my-team/my-weave-project")
-
-client = openai.OpenAI()
-
-@weave.op()
-def generate_statement(ground_truth: str) -> str:
- if random.random() < 0.5:
- response = client.chat.completions.create(
- model="gpt-4.1",
- messages=[
- {
- "role": "user",
- "content": f"Generate a statement that is incorrect based on this fact: {ground_truth}"
- }
- ]
- )
- return response.choices[0].message.content
- else:
- return ground_truth
-
-generate_statement("The Earth revolves around the Sun.")
-```
-
-
-```typescript
-import * as weave from 'weave';
-import OpenAI from 'openai';
-
-await weave.init('my-team/my-weave-project');
-
-const client = new OpenAI();
-
-const generateStatement = weave.op(async (ground_truth: string): Promise => {
- if (Math.random() < 0.5) {
- const response = await client.chat.completions.create({
- model: 'gpt-4.1',
- messages: [
- {
- role: 'user',
- content: `Generate a statement that is incorrect based on this fact: ${ground_truth}`,
- },
- ],
- });
- return response.choices[0]?.message?.content ?? '';
- }
- return ground_truth;
-});
-
-await generateStatement("The Earth revolves around the Sun.");
-```
-
-
-
-2. Run the function at least once to log a trace in your project. This makes the op available for monitoring in the W&B UI.
-
-3. Open your Weave project in the W&B UI and select **Monitors** from the side-nav. Then select **New Monitor**.
-4. In the Create new monitor menu, configure the fields using the following values:
- - **Name**: `truthfulness-monitor`
- - **Description**: `Evaluates the truthfulness of generated statements.`
- - **Active monitor**: Toggle **on**.
- - **Operations**: Select `generate_statement`.
- - **Sampling rate**: Set to `100%` to score every call.
- - **Scorer name**: `truthfulness-scorer`
- - **Judge model**: `o3-mini-2025-01-31`
- - **System prompt**: `You are an impartial AI judge. Your task is to evaluate the truthfulness of statements.`
- - **Response format**: `json_object`
- - **Scoring prompt**:
- ```text
- Evaluate whether the output statement is accurate based on the input statement.
-
- This is the input statement: {ground_truth}
-
- This is the output statement: {output}
-
- The response should be a JSON object with the following fields:
- - is_true: a boolean stating whether the output statement is true or false based on the input statement.
- - reasoning: your reasoning as to why the statement is true or false.
- ```
-
-5. Click **Create Monitor**. This adds the monitor to your Weave project.
-
-
-6. In your script, invoke your function using statements of varying degrees of truthfulness to test the scoring function:
-
-
-
-```python
-generate_statement("The Earth revolves around the Sun.")
-generate_statement("Water freezes at 0 degrees Celsius.")
-generate_statement("The Great Wall of China was built over several centuries.")
-```
-
-
-```typescript
-await generateStatement("The Earth revolves around the Sun.");
-await generateStatement("Water freezes at 0 degrees Celsius.");
-await generateStatement("The Great Wall of China was built over several centuries.");
-```
-
-
-
-7. After running the script using several different statements, open the W&B UI and navigate to the **Traces** tab. Select any **LLMAsAJudgeScorer.score** trace to see the results.
-
-
+import AgentsPreview from '/snippets/_includes/agents-public-preview.mdx';
+
+
+
+In modern agent development, standard system metrics like latency, token count, and cost are insufficient for understanding complex agent behavior. Inspecting individual traces provides deep insight but doesn't scale to the millions of traces generated in a live environment.
+
+Signals provide a high-level monitoring solution to this problem by offering automated, behavioral scoring for agents in production:
+- Automated scoring: Every incoming production trace is automatically processed and scored on common quality issues and errors.
+- Infrastructure: Processing is powered by CoreWeave compute and CoreWeave GPUs for scalability across millions of traces.
+
+By using signals within production, you can:
+- Gain behavioral insight: Move beyond simple system metrics to understand if your agent is hallucinating, failing to follow conversation patterns, or losing grounding in its evidence.
+- Accelerate the research loop: Use the scores and failure analyses generated by signals to identify specific weaknesses, which you can use to research model improvement, data annotation, or reinforcement learning.
+
+## Available signals
+
+W&B Weave offers monitors with built-in signals: preset scorers that evaluate production traces for common quality issues and errors out of the box, with no custom setup. Each built-in signal uses a benchmarked LLM prompt to classify traces and saves the results as comma-delimited tags representing the detected issues.
+
+Signals use a [Serverless Inference](/inference/) model to score traces, so no external API keys are required.
+
+W&B Weave provides 13 preset signals organized into two groups.
+
+### Quality signals
+
+Quality signals evaluate successful root-level traces for output quality and safety issues.
+
+| Signal | What it detects |
+|--------|----------------|
+| **Hallucination** | Fabricated facts or claims that contradict the provided input context |
+| **Low quality** | Responses with poor format, insufficient effort, or incomplete content |
+| **User frustration** | Signs of user frustration such as repeated questions, negative sentiment, or complaints |
+| **Jailbreaking** | Prompt injection and jailbreak attempts that try to bypass safety guidelines |
+| **NSFW** | Explicit, violent, or otherwise inappropriate content in inputs or outputs |
+| **Lazy** | Low-effort responses such as excessive brevity, refusals to help, or deferred work |
+| **Forgetful** | Failure to use context from earlier in the conversation, ignoring previously stated facts or instructions |
+
+### Error signals
+
+Error signals categorize failed traces by root cause to help you identify and resolve infrastructure and application issues.
+
+| Signal | What it detects |
+|--------|----------------|
+| **Network Error** | DNS failures, timeouts, connection resets, and other connectivity issues |
+| **Ratelimited** | HTTP 429 responses, quota exhaustion, and throttling from upstream APIs |
+| **Request Too Large** | Requests exceeding size or token limits, such as context window exceeded |
+| **Bad Request** | Client-side errors where the server rejected the request (4xx except 429) |
+| **Bad Response** | Invalid, unexpected, or unusable responses from remote services (5xx) |
+| **Bug** | Flaws in application code such as `KeyError`, `TypeError`, or logic errors |
+
+### How signals work
+
+Each signal uses an LLM-as-a-judge approach to classify traces:
+
+- **Trace selection**: Quality signals evaluate successful root-level traces. Error signals evaluate failed traces. Child spans and intermediate Calls are not scored.
+- **Prompt construction**: Weave constructs a prompt that includes the trace metadata, inputs, outputs, exception details (if any), and the operation's source code. The signal's classifier prompt is appended with instructions for the specific issue to detect.
+- **LLM scoring**: For each signal, a Serverless Inference model performs a binary classification (whether that issue is present on the trace). Detected issues are returned as comma-delimited string tags (for example, `"Low-quality, User-frustration, Forgetful"`).
+
+When multiple signals from the same group (Quality or Error) are active, Weave batches the signals into a single LLM call for efficiency. The model evaluates all active classifiers in one pass and returns results for each.
+
+## Enable signals from the Monitors page
+
+To enable signals:
+
+1. Navigate to [wandb.ai](https://wandb.ai/) and then open your Weave project.
+2. In the Weave project sidebar, select **Monitors**.
+3. At the top of the Monitors page, a row of suggested signal cards appears. Each card shows the signal name, a description, and an **+ Add signal** button.
+4. To enable a single signal, select the **+ Add signal** button on the signal card. The signal begins scoring new traces immediately.
+5. To enable multiple signals at once, select the **+ [X] more signals** button. This opens a drawer that lists all available signals grouped by category.
+6. Select the signals you want to turn on, then select **Add signals**.
+
+After enabling signals, Weave automatically scores incoming traces.
+
+## Manage active signals
+
+To view or remove active signals:
+
+1. From the Monitors page, select the **Manage signals** () button. This opens a drawer listing all currently active signals grouped by category.
+2. Hover over a signal and select the **Remove signal** () button to disable the signal.
+
+Removing a signal stops scoring new traces. Existing scores from the signal are preserved.
+
+## Use built-in signals
+
+### See tagged agent traces on the Agents page
+
+If you are tracing agentic applications, signal results appear in two places
+on the **Agents** page:
+
+- The **Signals** tab shows all scored turns across your agents, with the
+ scorer name, numeric score, and 24-hour trend for each.
+- The **Conversations** tab's conversation detail panel shows a **Scores** section in the Events
+ panel. This includes the rubric breakdown and per-criterion confidence for each
+ active scorer.
+
+For full details, see [View agent activity](/weave/guides/tracking/view-agent-activity).
+
+### See tagged Call traces on the Traces page
+
+If you are tracing individual functions as Ops with the `@weave.op` decorator, signal results are stored as feedback on the Call object and are queryable from the **Traces** page.
+
+You can quickly scan your traces for certain behavior in the **Traces** page using the **Signals** column. The Signals column is populated with tags when their criteria are met, and you can hover over these tags to see the confidence in the score and the reasoning.
+
+
+
+Using the trace table toolbar, you can filter the trace table to only show traces that have triggered certain signals.
+
+You can view additional signal details in the Traces page by selecting the classifier Call that the signal generates and reviewing the **Trace Details view**. Under Call **Output**, review `classifier_meta` for the reasoning. For example, the following screenshot shows a `Quality-classifiers` signal with a `Low-quality` match, a confidence of 0.9, and the reason for this rating.
+
+
+
+### See signals in the project dashboard
+
+You can also review signals at a project level:
+
+1. In the project sidebar, select **Project**.
+2. At the top of the Project dashboard, select the **Weave** tab.
+3. In the Weave dashboard panels, locate **Monitor Scores**.
+
+In the Monitor Scores project panel, you can see time-based graphs of signals that occurred for the project.
+
+
+
+### Alert on signals
+
+You can set up automated triggers that notify your team through tools like Slack when an agent's performance drops below a certain threshold.
+
+To get notified when a signal is triggered, set up an [automation](/weave/guides/evaluation/automations).
+
+---
+
+For specific monitoring beyond what is provided by the built-in signals, see [Set up custom monitors](/weave/guides/evaluation/custom-monitors).
diff --git a/weave/guides/integrations.mdx b/weave/guides/integrations.mdx
index 73a1bed59e..e77e519084 100644
--- a/weave/guides/integrations.mdx
+++ b/weave/guides/integrations.mdx
@@ -1,19 +1,50 @@
---
title: Integrations overview
-description: "Trace and monitor LLM calls across 30+ providers and frameworks with Weave's automatic patching for OpenAI, Anthropic, and more."
+description: "Trace agents and LLM applications with Weave's integrations for popular agent SDKs, harnesses, LLM providers, and orchestration frameworks."
---
+Weave provides two ways to integrate with your AI stack:
+- **Trace agents**: For multi-turn agentic applications built with agent SDKs (such as the OpenAI Agents SDK or Google ADK) or run inside agent harnesses (such as Claude Code, Codex, or Pi.dev). These integrations capture sessions, turns, LLM calls, and tool calls, and render them in the Weave **Agents** view.
+- **Trace LLM applications**: For code that calls LLM providers (OpenAI, Anthropic, Bedrock, and others) or uses orchestration frameworks (LangChain, LlamaIndex, DSPy, and others). These integrations capture individual LLM calls and pipeline steps as Weave **Calls** in the **Traces** view.
+If you're not sure which path to take, start with [Trace your agents](/weave/guides/tracking/trace-agents) for new multi-turn agent work, or [Trace LLM applications](/weave/guides/tracking/tracing) for existing LLM-call workflows.
-W&B Weave provides logging integrations for popular LLM providers and orchestration frameworks. These integrations allow you to seamlessly trace calls made through various libraries, enhancing your ability to monitor and analyze your AI applications.
+## Trace agents
-If you use LLM provider libraries (such as OpenAI, Anthropic, Cohere, or Mistral) in your application, you want those API calls to show up in W&B Weave as traced Calls: inputs, outputs, latency, token usage, and cost. Without help, you would have to wrap every `client.chat.completions.create()` (or equivalent) in `@weave.op` or manual instrumentation, which is tedious and easy to miss something.
+The Weave SDK models the full lifecycle of a multi-turn agent: sessions, turns, LLM calls, and tool calls. For supported agent SDKs and harnesses, Weave autopatches the framework so you only need to call `weave.init()` — every agent invocation, sub-agent handoff, model call, and tool call is captured automatically and rendered in the **Agents** view.
-Weave automatically intercepts (patches) supported LLM client libraries. Your application code stays unchanged: you use the provider SDK as usual, and each request is recorded as a Weave Call. You get full tracing with minimal setup.
+For custom agents, you can instrument any agent code by hand using `weave.start_session`, `weave.start_turn`, `weave.start_llm`, and `weave.start_tool`. See the [agents quickstart](/weave/agents-quickstart) for a walkthrough.
+### Integrate Weave with agent SDKs
-## LLM providers
+Agent SDKs are libraries for building agents and multi-agent workflows in your own application code. Weave autopatches the following SDKs:
+
+- **[OpenAI Agents SDK](/weave/guides/integrations/agents/openai-agents-sdk)**
+- **[Google Agent Development Kit (ADK)](/weave/guides/integrations/agents/google-adk)**
+- **[Claude Agent SDK](/weave/guides/integrations/agents/claude-agents-sdk)**
+
+### Integrate Weave with agent harnesses
+
+Agent harnesses are end-user agent runtimes (such as coding agents and developer tools) that produce spans Weave can capture. Install the appropriate plugin or extension and your harness sessions are routed to the **Agents** view:
+
+- **[Claude Code](/weave/guides/integrations/agents/claude-code-harness)**
+- **[Codex](/weave/guides/integrations/agents/codex-harness)**
+- **[Pi.dev](/weave/guides/integrations/agents/pi-dev-harness)**
+
+### Build your own
+
+Use the Weave SDK directly to instrument custom agents, including any agent that emits OpenTelemetry spans. Weave accepts any OTel span and has special handling for [GenAI semantic-convention attributes](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/) so your spans render correctly in the **Agents** view of the Weave UI.
+
+See the [Trace an agent quickstart](/weave/agents-quickstart) for information on how to trace custom agents.
+
+## Trace LLM applications
+
+If your application calls an LLM provider's API directly or uses an orchestration framework, Weave can automatically intercept traces (using autopatching) for many popular libraries and frameworks. By importing the Weave SDK into your code and initializing with `weave.init`, each request is recorded as a Weave **Call** with inputs, outputs, latency, token usage, and cost.
+
+For libraries Weave doesn't autopatch, you can manually apply [Weave Ops](/weave/guides/tracking/create-call#2-tracking-of-custom-functions) to your code to capture traces.
+
+### LLM providers
LLM providers are the vendors that offer access to large language models for generating predictions. Weave integrates with these providers to log and trace the interactions with their APIs:
@@ -35,12 +66,13 @@ LLM providers are the vendors that offer access to large language models for gen
**[Local Models](/weave/guides/integrations/local_models)**: For when you're running models on your own infrastructure.
-## Frameworks
+### Frameworks
-Frameworks help orchestrate the actual execution pipelines in AI applications. They provide tools and abstractions for building complex workflows. Weave integrates with these frameworks to trace the entire pipeline:
+Frameworks help orchestrate the execution pipelines in AI applications. They provide tools and abstractions for building complex workflows. Weave integrates with these frameworks to trace the entire pipeline:
-- **[OpenAI Agents SDK](/weave/guides/integrations/openai_agents)**
-- **[Claude Code](/weave/guides/integrations/claude_code)**
+- **[OpenAI Agents SDK (call-level tracing)](/weave/guides/integrations/openai_agents)**
+- **[Claude Agent SDK (call-level tracing)](/weave/guides/integrations/claude_agent)**
+- **[Claude Code (call-level tracing)](/weave/guides/integrations/claude_code)**
- **[LangChain](/weave/guides/integrations/langchain)**
- **[LlamaIndex](/weave/guides/integrations/llamaindex)**
- **[Haystack](/weave/guides/integrations/haystack)**
@@ -49,7 +81,7 @@ Frameworks help orchestrate the actual execution pipelines in AI applications. T
- **[CrewAI](/weave/guides/integrations/crewai)**
- **[Smolagents](/weave/guides/integrations/smolagents)**
- **[PydanticAI](/weave/guides/integrations/pydantic_ai)**
-- **[Google Agent Development Kit (ADK)](/weave/guides/integrations/google_adk)**
+- **[Google Agent Development Kit (ADK, call-level tracing)](/weave/guides/integrations/google_adk)**
- **[AutoGen](/weave/guides/integrations/autogen)**
- **[Verdict](/weave/guides/integrations/verdict)**
- **[TypeScript SDK](/weave/guides/integrations/js)**
@@ -57,13 +89,12 @@ Frameworks help orchestrate the actual execution pipelines in AI applications. T
- **[Agno](/weave/guides/integrations/agno)**
- **[Koog](/weave/guides/integrations/koog)**
-## RL Frameworks
+### RL Frameworks
+
- **[Verifiers](/weave/guides/integrations/verifiers)**
-## Protocols
+### Protocols
Weave integrates with standardized protocols that enable communication between AI applications and their supporting services:
- **[Model Context Protocol (MCP)](/weave/guides/integrations/mcp)**
-
-Choose an integration from the lists above to learn more about how to use Weave with your preferred LLM provider, framework, or protocol. Whether you're directly accessing LLM APIs, building complex pipelines, or using standardized protocols, Weave provides the tools to trace and analyze your AI applications effectively.
diff --git a/weave/guides/integrations/agents/claude-agents-sdk.mdx b/weave/guides/integrations/agents/claude-agents-sdk.mdx
new file mode 100644
index 0000000000..97b66b91ec
--- /dev/null
+++ b/weave/guides/integrations/agents/claude-agents-sdk.mdx
@@ -0,0 +1,89 @@
+---
+title: "Claude Agent SDK"
+description: Trace an agent built with the Claude Agent SDK using Weave.
+---
+
+import AgentsPreview from '/snippets/_includes/agents-public-preview.mdx';
+
+
+
+The [Claude Agent SDK](https://github.com/anthropics/claude-agent-sdk-python) is a Python SDK for building agent applications with Claude.
+
+Weave automatically traces Claude Agent SDK calls, including agent queries, model responses, tool use, and multi-turn conversations. Weave displays the captured data in the **Agents** view of your project.
+
+## Trace Claude Agent SDK agents with Weave
+
+The Weave SDK autopatches the Claude Agent SDK, allowing you to capture traces from your Claude agents with minimal setup.
+
+This doc shows how to initialize Weave and then run a Claude agent with MCP tools through `ClaudeSDKClient` so that Weave automatically traces the conversation, model calls, and tool calls end-to-end.
+
+### Prerequisites
+
+- A W&B account and [API key](https://wandb.ai/authorize) set as a `WANDB_API_KEY` environment variable
+- An Anthropic API key set as an `ANTHROPIC_API_KEY` environment variable
+- Python 3.10+
+
+### Install packages
+
+Install the following packages in your developer environment:
+
+```bash
+pip install weave claude-agent-sdk
+```
+
+### Initialize Weave in your code
+
+Add `weave.init` to the project, update your W&B team and project names, and then build an agent the way you normally would.
+
+The following code creates a Claude agent with two MCP math tools and runs it while Weave captures its traces.
+
+```python lines
+import anyio
+import weave
+
+from claude_agent_sdk import (
+ ClaudeAgentOptions,
+ ClaudeSDKClient,
+ create_sdk_mcp_server,
+ tool,
+)
+
+weave.init("[YOUR-TEAM]/[YOUR-PROJECT]")
+
+
+@tool("add", "Add two numbers", {"a": float, "b": float})
+async def add(args: dict) -> dict:
+ return {"content": [{"type": "text", "text": str(args["a"] + args["b"])}]}
+
+
+@tool("multiply", "Multiply two numbers", {"a": float, "b": float})
+async def multiply(args: dict) -> dict:
+ return {"content": [{"type": "text", "text": str(args["a"] * args["b"])}]}
+
+
+math_server = create_sdk_mcp_server(
+ name="math",
+ version="1.0.0",
+ tools=[add, multiply],
+)
+
+
+async def main():
+ options = ClaudeAgentOptions(
+ mcp_servers={"math": math_server},
+ allowed_tools=["mcp__math__add", "mcp__math__multiply"],
+ )
+
+ async with ClaudeSDKClient(options=options) as client:
+ await client.query("Using the math tools, compute (3 + 7) * 2.")
+
+ async for message in client.receive_response():
+ print(message)
+
+
+anyio.run(main)
+```
+
+`weave.init()` prints a link to your project when it runs.
+
+For details on viewing Agents data in Weave, see [View agent activity](/weave/guides/tracking/view-agent-activity).
diff --git a/weave/guides/integrations/agents/codex-harness.mdx b/weave/guides/integrations/agents/codex-harness.mdx
new file mode 100644
index 0000000000..78ee13df51
--- /dev/null
+++ b/weave/guides/integrations/agents/codex-harness.mdx
@@ -0,0 +1,9 @@
+---
+title: "Codex"
+description: "Trace Codex agentic sessions, LLM calls, and tool executions in Weave."
+---
+
+import AgentsPreview from '/snippets/_includes/agents-public-preview.mdx';
+
+
+
diff --git a/weave/guides/integrations/agents/google-adk.mdx b/weave/guides/integrations/agents/google-adk.mdx
new file mode 100644
index 0000000000..010ccc8a87
--- /dev/null
+++ b/weave/guides/integrations/agents/google-adk.mdx
@@ -0,0 +1,113 @@
+---
+title: "Google ADK"
+description: Trace an agent built with Google's Agent Development Kit (ADK) using Weave.
+---
+
+import AgentsPreview from '/snippets/_includes/agents-public-preview.mdx';
+
+
+
+[Google's Agent Development Kit (ADK)](https://google.github.io/adk-docs/) is a flexible, model-agnostic Python framework for building and orchestrating agents. While optimized for Gemini, ADK works with any model and handles both simple tasks and complex multi-agent workflows. Weave automatically traces agents built with ADK, including each agent invocation, sub-agent handoff, model call, and tool call. Weave displays the captured data in the **Agents** view of your project.
+
+## Trace Google ADK agents with Weave
+
+The Weave SDK autopatches Google ADK, allowing you to capture traces from your ADK agents with minimal setup. This doc shows how to initialize Weave and then run a multi-turn research agent built with Google ADK so that Weave captures every agent invocation, model call, and tool call across the session.
+
+### Prerequisites
+
+- A W&B account and [API key](https://wandb.ai/authorize) set as a `WANDB_API_KEY` environment variable
+- A [Google API key](https://aistudio.google.com/apikey) for Gemini set as a `GOOGLE_API_KEY` environment variable
+- Python 3.10+
+
+### Install packages
+
+Install the following packages in your developer environment:
+
+```bash
+pip install weave google-adk requests
+```
+
+### Initialize Weave in your code
+
+Add `weave.init` to the project, along with your W&B team and project names, and then build an agent the way you normally would. The following code creates a `research_assistant` agent that uses `gemini-2.5-flash` and a `wikipedia_search` tool, then runs three questions through a single ADK session while Weave captures the trace.
+
+```python lines
+import asyncio
+import requests
+import weave
+from google.adk.agents import Agent
+from google.adk.runners import InMemoryRunner
+from google.genai import types
+
+weave.init("[YOUR-TEAM]/[YOUR-PROJECT]")
+
+def wikipedia_search(query: str) -> dict:
+ """Search Wikipedia for a topic and return its title and intro paragraph.
+
+ Args:
+ query: The topic to search for.
+
+ Returns:
+ A dictionary with the article title and intro extract.
+ """
+ r = requests.get(
+ "https://en.wikipedia.org/w/api.php",
+ params={
+ "action": "query", "generator": "search", "gsrsearch": query, "gsrlimit": 1,
+ "prop": "extracts", "exintro": True, "explaintext": True, "format": "json",
+ },
+ headers={"User-Agent": "weave-demo"},
+ ).json()
+ page = next(iter(r["query"]["pages"].values()))
+ return {"title": page["title"], "extract": page["extract"]}
+
+agent = Agent(
+ name="research_assistant",
+ model="gemini-2.5-flash",
+ instruction=(
+ "You are a research assistant. Use the wikipedia_search tool to look up "
+ "topics when needed, and cite the article titles you used."
+ ),
+ tools=[wikipedia_search],
+)
+
+async def main():
+ runner = InMemoryRunner(agent=agent, app_name="research-app")
+ session = await runner.session_service.create_session(
+ app_name="research-app", user_id="user-1"
+ )
+
+ questions = [
+ "Who founded Anthropic?",
+ "What is Claude (the AI assistant)?",
+ "Summarize what we discussed in one sentence.",
+ ]
+
+ for question in questions:
+ print(f"USER: {question}")
+ async for event in runner.run_async(
+ user_id="user-1",
+ session_id=session.id,
+ new_message=types.Content(
+ role="user",
+ parts=[types.Part(text=question)],
+ ),
+ ):
+ if event.is_final_response() and event.content:
+ print(f"AGENT: {event.content.parts[0].text}\n")
+
+asyncio.run(main())
+```
+
+The example runs three turns in a single ADK session. The first two turns trigger Wikipedia lookups, and the third uses the prior conversation context to produce a summary without a tool call.
+
+### See your agent traces in the Agents view
+
+`weave.init()` prints a link to your project when it runs. Open the **Agents** view to inspect:
+
+- A row in the **Agents** tab for `research_assistant`
+- A single session containing three turns
+- Each turn rendered as an `invoke_agent` span with nested model calls and tool calls
+- The full input, model, output, token usage, and Wikipedia results at each step
+
+For details on viewing Agents data in Weave, see [View agent activity](/weave/guides/tracking/view-agent-activity).
\ No newline at end of file
diff --git a/weave/guides/integrations/agents/openai-agents-sdk.mdx b/weave/guides/integrations/agents/openai-agents-sdk.mdx
new file mode 100644
index 0000000000..ae7b0d54e2
--- /dev/null
+++ b/weave/guides/integrations/agents/openai-agents-sdk.mdx
@@ -0,0 +1,96 @@
+---
+title: "OpenAI Agents SDK"
+description: Trace an agent built with the OpenAI Agents SDK using Weave.
+---
+
+import AgentsPreview from '/snippets/_includes/agents-public-preview.mdx';
+
+
+
+The [OpenAI Agents SDK](https://github.com/openai/openai-agents-python) is a lightweight framework for building agents and multi-agent workflows on top of OpenAI's API. Weave automatically traces agents built with the OpenAI Agents SDK, including each agent invocation, sub-agent handoff, model call, and tool call. Weave displays the captured data in the **Agents** view of your project.
+
+
+The Weave TypeScript SDK does not support autopatching for the OpenAI Agents SDK.
+
+
+## Trace OpenAI Agents SDK agents with Weave
+
+The Weave SDK autopatches the OpenAI Agents SDK, allowing you to capture traces from your agents with minimal setup. This doc shows how to initialize Weave and then run a multi-turn research agent built with the OpenAI Agents SDK so that Weave captures every agent invocation, model call, and tool call across the session.
+
+### Prerequisites
+
+- A W&B account and [API key](https://wandb.ai/authorize) set as a `WANDB_API_KEY` environment variable
+- An [OpenAI API key](https://platform.openai.com/api-keys) set as an `OPENAI_API_KEY` environment variable
+- Python 3.10+
+
+### Install packages
+
+Install the following packages in your developer environment:
+
+```bash
+pip install weave openai-agents requests
+```
+
+### Initialize Weave in your code
+
+Add `weave.init` to the project, along with your W&B team and project names, and then build an agent the way you normally would. The following code defines a `wikipedia_search` function tool and a `Research assistant` agent, then runs three questions through the OpenAI Agents SDK `Runner` while Weave captures the trace.
+
+```python lines
+import asyncio
+import requests
+import weave
+from agents import Agent, Runner, function_tool
+
+weave.init("[YOUR-TEAM]/[YOUR-PROJECT]")
+
+@function_tool
+def wikipedia_search(query: str) -> str:
+ """Search Wikipedia for a topic and return its title and intro paragraph."""
+ r = requests.get(
+ "https://en.wikipedia.org/w/api.php",
+ params={
+ "action": "query", "generator": "search", "gsrsearch": query, "gsrlimit": 1,
+ "prop": "extracts", "exintro": True, "explaintext": True, "format": "json",
+ },
+ headers={"User-Agent": "weave-demo"},
+ ).json()
+ page = next(iter(r["query"]["pages"].values()))
+ return f"{page['title']}: {page['extract']}"
+
+agent = Agent(
+ name="Research assistant",
+ instructions=(
+ "You are a research assistant. Use the wikipedia_search tool to look up "
+ "topics when needed, and cite the article titles you used."
+ ),
+ tools=[wikipedia_search],
+)
+
+async def main():
+ history = []
+ for question in [
+ "Who founded Anthropic?",
+ "What is Claude (the AI assistant)?",
+ "Summarize what we discussed in one sentence.",
+ ]:
+ history.append({"role": "user", "content": question})
+ print(f"USER: {question}")
+ result = await Runner.run(agent, input=history)
+ print(f"AGENT: {result.final_output}\n")
+ history = result.to_input_list()
+
+asyncio.run(main())
+```
+
+The example runs three turns in a single conversation. The first two turns trigger Wikipedia lookups, and the third uses the prior conversation context to produce a summary without a tool call. Each call to `Runner.run` continues the conversation by passing the previous result's input list back as the next request.
+
+### See your agent traces in the Agents view
+
+`weave.init()` prints a link to your project when it runs. Open the **Agents** view to inspect:
+
+- A row in the **Agents** tab for `Research assistant`
+- A session containing three turns
+- Each turn rendered as an `invoke_agent` span with nested model calls and tool calls
+- The full input, model, output, token usage, and Wikipedia results at each step
+
+For details on viewing Agents data in Weave, see [View agent activity](/weave/guides/tracking/view-agent-activity).
\ No newline at end of file
diff --git a/weave/guides/integrations/agents/pi-dev-harness.mdx b/weave/guides/integrations/agents/pi-dev-harness.mdx
new file mode 100644
index 0000000000..2411144361
--- /dev/null
+++ b/weave/guides/integrations/agents/pi-dev-harness.mdx
@@ -0,0 +1,101 @@
+---
+title: "Pi"
+description: "Trace Pi agentic sessions, LLM calls, and tool executions in Weave."
+---
+
+import AgentsPreview from '/snippets/_includes/agents-public-preview.mdx';
+
+
+
+[Pi](https://pi.dev/) is a terminal-based coding agent. Weave traces Pi sessions, LLM calls, and tool executions automatically using the `createOtelExtension` integration, which conforms to the [GenAI semantic conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/).
+
+
+Pi is a TypeScript/Node.js framework with no Python equivalent. Pi requires the ESM module system — your project must use `"type": "module"` in `package.json`, or compile TypeScript to ESM output. CommonJS projects will error. For more information on setting up an ESM project, see [TypeScript SDK integration](/weave/guides/integrations/js#set-up-an-esm-project).
+
+
+## Prerequisites
+
+- Install [Node.js](https://nodejs.org/) (v18 or later).
+- Set `WANDB_API_KEY` in your environment. Create your API key at [wandb.ai/authorize](https://wandb.ai/authorize).
+
+## Install packages
+
+Install Weave and Pi as local project dependencies, along with the TypeScript tooling (Node type definitions, `tsx`, and `typescript`) as development dependencies:
+
+```bash lines
+npm install weave @earendil-works/pi-coding-agent
+npm install --save-dev @types/node tsx typescript
+```
+
+## Trace a Pi prompt and response
+
+Call `weave.init()` before creating your agent session, then pass `createOtelExtension()` as an extension factory. Weave traces the full agent lifecycle: the session, each prompt/response cycle (`invoke_agent`), individual LLM calls (`chat`), and tool executions (`execute_tool`). The session ID is generated automatically by `SessionManager.inMemory()`.
+
+```typescript lines
+import {init, createOtelExtension} from 'weave';
+
+import {
+ createAgentSession,
+ DefaultResourceLoader,
+ SessionManager,
+ getAgentDir,
+} from '@earendil-works/pi-coding-agent';
+
+async function main() {
+ // 1. Initialize Weave — sets up the OTEL TracerProvider pointing at your
+ // Weave project. All spans created by createOtelExtension() are
+ // automatically exported here.
+ await init('[YOUR-TEAM]/[YOUR-PROJECT]');
+
+ // 2. Create a resource loader and inject the Weave OTEL extension.
+ // The resource loader provides the Pi runtime environment and
+ // extension lifecycle used for tracing agent activity.
+ const resourceLoader = new DefaultResourceLoader({
+ cwd: process.cwd(),
+ agentDir: getAgentDir(),
+ extensionFactories: [createOtelExtension({})],
+ });
+
+ await resourceLoader.reload();
+
+ // 3. Start the agent session
+ const {session} = await createAgentSession({
+ resourceLoader,
+ sessionManager: SessionManager.inMemory(),
+ });
+
+ // 4. Bind extensions — triggers session_start event so the OTEL adapter
+ // creates the root session span and captures the conversation ID.
+ await session.bindExtensions({});
+
+ // 5. Stream assistant output to stdout
+ session.subscribe(event => {
+ if (
+ event.type === 'message_update' &&
+ event.assistantMessageEvent.type === 'text_delta'
+ ) {
+ process.stdout.write(event.assistantMessageEvent.delta);
+ }
+ });
+
+ // 6. Send a prompt and wait for the full response
+ await session.prompt('What files are in the current directory?');
+ console.log();
+}
+
+main();
+```
+
+Run the script with `tsx`:
+
+```bash
+npx tsx [filename].ts
+```
+
+When you run your code, your traces appear in the **Agents** tab of your Weave project at `https://wandb.ai/[YOUR-TEAM]/[YOUR-PROJECT]/weave/agents`.
+
+### Next steps
+
+You can turn this example into a multi-turn session by adding additional prompts. Each call to `session.prompt()` is traced as a separate `invoke_agent` span, all nested under a single root span. The agent retains context across prompts automatically.
+
+After running the code, the **Agents** tab shows the full multi-turn timeline with nested LLM calls, tool executions, token usage, and cost.
+
+
diff --git a/weave/guides/tracking/imgs/agent-view-agent-detail.png b/weave/guides/tracking/imgs/agent-view-agent-detail.png
new file mode 100644
index 0000000000..405ea3d1f7
Binary files /dev/null and b/weave/guides/tracking/imgs/agent-view-agent-detail.png differ
diff --git a/weave/guides/tracking/imgs/agent-view-agent.png b/weave/guides/tracking/imgs/agent-view-agent.png
new file mode 100644
index 0000000000..29110c05d7
Binary files /dev/null and b/weave/guides/tracking/imgs/agent-view-agent.png differ
diff --git a/weave/guides/tracking/imgs/agent-view-conversation-detail.png b/weave/guides/tracking/imgs/agent-view-conversation-detail.png
new file mode 100644
index 0000000000..58ca12dcf3
Binary files /dev/null and b/weave/guides/tracking/imgs/agent-view-conversation-detail.png differ
diff --git a/weave/guides/tracking/imgs/agent-view-conversation.png b/weave/guides/tracking/imgs/agent-view-conversation.png
new file mode 100644
index 0000000000..f3d991ce90
Binary files /dev/null and b/weave/guides/tracking/imgs/agent-view-conversation.png differ
diff --git a/weave/guides/tracking/imgs/agent-view-signals.png b/weave/guides/tracking/imgs/agent-view-signals.png
new file mode 100644
index 0000000000..1357caf037
Binary files /dev/null and b/weave/guides/tracking/imgs/agent-view-signals.png differ
diff --git a/weave/guides/tracking/imgs/agent-view-spans-detail.png b/weave/guides/tracking/imgs/agent-view-spans-detail.png
new file mode 100644
index 0000000000..2a644f3344
Binary files /dev/null and b/weave/guides/tracking/imgs/agent-view-spans-detail.png differ
diff --git a/weave/guides/tracking/trace-agents-batch.mdx b/weave/guides/tracking/trace-agents-batch.mdx
new file mode 100644
index 0000000000..329f54d585
--- /dev/null
+++ b/weave/guides/tracking/trace-agents-batch.mdx
@@ -0,0 +1,99 @@
+---
+title: Batch logging for your agent
+description: Manually log agent traces for frameworks that have already completed the LLM call and just need to record it.
+---
+
+import AgentsPreview from '/snippets/_includes/agents-public-preview.mdx';
+
+
+
+For frameworks that have already completed the LLM call and just need to record it, use `weave.log_turn` and `weave.log_session`. All spans are created and ended immediately without keeping any context managers open.
+
+Data logged this way can be historical — no live session is needed. Set `session_id` to any stable string that uniquely identifies the conversation. Turns that share the same `session_id` are grouped as a single session in the **Agents** view.
+
+If you are building your own agent loop, use the real-time instrumentation APIs described in [Trace your agents](/weave/guides/tracking/trace-agents) instead.
+
+## Log a turn
+
+`weave.log_turn` accepts a fully-formed turn, including all LLM and tool spans.
+
+
+
+
+```python lines highlight="3,5,30"
+import weave
+
+weave.init("[YOUR-TEAM]/[YOUR-PROJECT]")
+
+from weave.session.session import LLM, Message, Tool, Usage
+
+llm_span = LLM(
+ model="gpt-4o",
+ provider_name="openai",
+ input_messages=[Message(role="user", content="What is the weather?")],
+ output_messages=[Message(role="assistant", content="Let me check.")],
+ usage=Usage(input_tokens=100, output_tokens=20),
+)
+
+tool_span = Tool(
+ name="get_weather",
+ arguments='{"city": "Tokyo"}',
+ result='"24°C, sunny"',
+)
+
+llm_span2 = LLM(
+ model="gpt-4o",
+ provider_name="openai",
+ input_messages=[Message(role="user", content="What is the weather?")],
+ output_messages=[Message(role="assistant", content="It is 24°C and sunny.")],
+ usage=Usage(input_tokens=150, output_tokens=30),
+)
+
+# Log a turn with all its spans.
+weave.log_turn(
+ session_id="my-session-abc",
+ agent_name="weather-bot",
+ messages=[
+ Message(role="user", content="What is the weather in Tokyo?"),
+ Message(role="assistant", content="It is 24°C and sunny in Tokyo."),
+ ],
+ spans=[llm_span, tool_span, llm_span2],
+)
+```
+
+
+
+
+```plaintext
+This feature is not available in the TypeScript SDK yet.
+```
+
+
+
+
+`log_turn` returns a `LogResult` containing the trace IDs of the emitted spans.
+
+An optional `model` parameter on `log_turn` sets the model on the turn's own span, not on the child LLM spans. Each `LLM` span carries its own `model` independently. If a turn uses multiple models, set `model` on `log_turn` to whichever you consider the primary model for that turn.
+
+## Log a session
+
+To bulk-import a complete, multi-turn session at once, use `weave.log_session`. The `turns` parameter accepts a list of `Turn` objects, each constructed the same way as the `log_turn` example above.
+
+
+
+
+```python lines
+weave.log_session(
+ session_id="my-session-abc",
+ agent_name="weather-bot",
+ turns=[turn_1, turn_2],
+)
+```
+
+
+
+
+```plaintext
+This feature is not available in the TypeScript SDK yet.
+```
+
+
+
diff --git a/weave/guides/tracking/trace-agents.mdx b/weave/guides/tracking/trace-agents.mdx
new file mode 100644
index 0000000000..178248e70f
--- /dev/null
+++ b/weave/guides/tracking/trace-agents.mdx
@@ -0,0 +1,495 @@
+---
+title: Trace your agents
+description: Use the Weave SDK to instrument multi-turn agentic applications and view them in the Agents tab.
+---
+
+import AgentsPreview from '/snippets/_includes/agents-public-preview.mdx';
+
+
+
+Learn how to instrument a multi-turn agentic application using the W&B Weave SDK so that you can view, debug, and evaluate your agent's behavior. This is intended for developers who are building or integrating agents and want structured visibility into sessions, turns, LLM calls, and tool executions.
+
+The Weave SDK for Agents models the full lifecycle of a multi-turn agent conversation: the agent that owns many sessions, the session that groups turns together, each user-agent exchange (turn), the LLM calls within a turn, and the tool executions that an LLM triggers. Traces appear in the **Agents** tab of your Weave project. Each session shows a multi-turn timeline with nested tool calls, token usage, and feedback.
+
+If you are tracing individual functions as Ops with the `@weave.op` decorator, see [Trace LLM applications](/weave/guides/tracking/tracing) instead.
+
+## Before you begin
+
+To get started, install the `weave` package and initialize your project. This makes Weave aware of your team and project so that spans are routed to the correct location in the UI.
+
+Install Weave and initialize your project:
+
+
+
+
+```bash lines
+pip install weave
+```
+
+Replace `[YOUR-TEAM]` with your W&B team name and `[YOUR-PROJECT]` with your W&B project name.
+
+```python lines
+import weave
+
+weave.init("[YOUR-TEAM]/[YOUR-PROJECT]")
+```
+
+Call `weave.init()` before any `start_session()`, `start_turn()`, `start_llm()`, or `start_tool()` call. All agent tracing functions no-op silently when tracing is disabled or the init call is absent, so you can leave instrumentation in production code and control it through configuration.
+
+
+
+
+```bash lines
+npm install weave
+```
+
+Replace `[YOUR-TEAM]` with your W&B team name and `[YOUR-PROJECT]` with your W&B project name.
+
+```typescript lines
+import * as weave from 'weave';
+
+await weave.init('[YOUR-TEAM]/[YOUR-PROJECT]');
+```
+
+Call `weave.init()` before any `startSession()`, `startTurn()`, `startLLM()`, or `startTool()` call. All agent tracing functions no-op silently when tracing is disabled or the init call is absent, so you can leave instrumentation in production code and control it through configuration.
+
+
+
+
+## The agent data model
+
+Weave models agent behavior as a hierarchy of one-to-many relationships. Each agent can have many sessions, each session can have many turns, each turn can have many LLM calls, and each LLM call can trigger many tool calls.
+
+| Concept | Weave SDK class | OTel span type | Description |
+|---|---|---|---|
+| Agent | *(no class)* | *(no span; grouped by `agent_name`)* | An agentic application in the Agents tab; contains one or more sessions |
+| Session | `Session` | *(no span; turns are grouped by `conversation_id`)* | A conversation or run that contains one or more turns |
+| Turn | `Turn` | `invoke_agent` | One user message and the agent's complete response |
+| LLM call | `LLM` | `chat` | One call to a language model API |
+| Tool call | `Tool` | `execute_tool` | One tool call triggered by an LLM response |
+
+The following diagram shows how one agent spans many sessions, one session spans many turns, and so on.
+
+```mermaid
+flowchart TB
+ Agent["Agent<br/>agent_name"]
+
+ Agent --> S1 & S2
+
+ S1["Session 1<br/>conversation_id<br/>(no OTel span)"]
+ S2["Session 2<br/>conversation_id<br/>(no OTel span)"]
+
+ S1 --> T1 & T2
+ S2 --> T3
+
+ T1["Turn 1<br/>invoke_agent<br/>(root span, own trace)"]
+ T2["Turn 2<br/>invoke_agent<br/>(root span, own trace)"]
+ T3["Turn 1<br/>invoke_agent<br/>(root span, own trace)"]
+
+ T1 --> L1 & L2
+ L1["LLM call<br/>chat"]
+ L2["LLM call<br/>chat"]
+
+ L1 --> Tool1["Tool call<br/>execute_tool"]
+
+ classDef agent fill:#DE72FF33,stroke:#454B52,stroke-width:2px
+ classDef session fill:#FFD95C33,stroke:#454B52,stroke-width:2px
+ classDef turn fill:#00CDDB33,stroke:#454B52,stroke-width:2px
+ classDef llm fill:#FFCBAD33,stroke:#454B52,stroke-width:2px
+ classDef tool fill:#f4f4f5,stroke:#454B52,stroke-width:2px
+
+ class Agent agent
+ class S1,S2 session
+ class T1,T2,T3 turn
+ class L1,L2 llm
+ class Tool1 tool
+```
+
+A session groups turns by a shared `conversation_id` attribute rather than a parent span, so each turn starts its own OTel trace. This design supports distributed tracing and parallel execution. The client sends spans directly to the OTel collector without any server-side aggregation.
+
+
+**Using a third-party agent SDK or harness?** Start with the [Weave integrations](/weave/guides/integrations) page instead of manual SDK instrumentation. Weave autopatches supported agent SDKs (such as OpenAI Agents SDK) and agent harnesses (such as Claude Code) for built-in agentic observability.
+
+
+## Agent tracing APIs
+
+Weave exposes the following top-level functions. Each function returns an object that works as a context manager (using `with` in Python, or `try/finally` in TypeScript) or that you can close manually by calling `.end()`.
+
+### Start a session
+
+`start_session()` / `startSession()` sets a `conversation_id` attribute on all child spans so that turns are grouped in the Agents tab. If you pass a `session_id`, it must be stable across the lifetime of the conversation. Reuse the same ID to add new turns to an existing session. When you omit `session_id`, the SDK generates a UUID automatically.
+
+The active session is stored in context (a Python `ContextVar` or Node.js `AsyncLocalStorage`), so any code running in the same async context can retrieve it with `weave.get_current_session()` / `weave.getCurrentSession()` without passing the session object explicitly.
+
+
+
+
+```python lines
+session = weave.start_session(
+ agent_name="my-agent", # Required: identifies the agent in the UI.
+ session_id="", # Optional: stable ID to group turns; auto-generated when empty.
+ model="", # Optional: default model for turns in this session.
+ session_name="", # Optional: human-readable label shown in the UI.
+ include_content=True, # Optional: set False to omit message bodies from spans.
+ continue_parent_trace=False, # Optional: attach to an existing OTel trace instead of starting a new one.
+)
+```
+
+
+
+
+```typescript lines
+const session = weave.startSession({
+ agentName: 'my-agent', // Optional: identifies the agent in the UI.
+ sessionId: '', // Optional: stable ID to group turns, auto-generated when empty.
+ model: '', // Optional: default model for turns in this session.
+});
+```
+
+
+
+
+
+### Start a turn
+
+`start_turn()` / `startTurn()` creates a new `invoke_agent` span that becomes the root of a new OTel trace. Weave uses this span to represent one complete user-agent exchange in the timeline view.
+
+When called as a top-level function, it resolves the active session from context and inherits its conversation ID. If no session is active, the turn is created without a `conversation_id` and won't be grouped with other turns.
+
+
+
+
+```python lines
+turn = weave.start_turn(
+ user_message="What is the weather in Tokyo?", # The user's input text.
+ agent_name="my-agent", # Optional: overrides the session-level agent name.
+ model="gpt-4o", # Optional: model used for this turn.
+)
+```
+
+
+
+
+```typescript lines
+const turn = weave.startTurn({
+ agentName: 'my-agent', // Optional: overrides the session-level agent name.
+ model: 'gpt-4o', // Optional: model used for this turn.
+});
+```
+
+
+
+
+
+
+### Start an LLM call
+
+`start_llm()` / `startLLM()` creates a `chat` span nested under the current turn. Weave uses this span to display token usage, model name, input and output messages, and reasoning in the Agents view.
+
+
+
+
+```python lines
+llm = weave.start_llm(
+ model="gpt-4o", # The model identifier.
+ provider_name="openai", # Required: provider name, for example "openai", "anthropic".
+ system_instructions=["Be concise."], # Optional: system prompt strings.
+)
+```
+
+
+
+
+```typescript lines
+const llm = weave.startLLM({
+ model: 'gpt-4o', // The model identifier.
+ providerName: 'openai', // Optional: provider name, for example "openai", "anthropic".
+});
+```
+
+
+
+
+
+
+After the LLM call completes, assign the response data to the `llm` object before it closes:
+
+
+
+
+```python lines
+with weave.start_llm(model="gpt-4o", provider_name="openai") as llm:
+ response = openai_client.chat.completions.create(...)
+ llm.input_messages = [Message(role="user", content="...")]
+ llm.output_messages = [Message(role="assistant", content=response.choices[0].message.content)]
+ llm.usage = Usage(
+ input_tokens=response.usage.prompt_tokens,
+ output_tokens=response.usage.completion_tokens,
+ )
+```
+
+
+
+
+```typescript lines
+const llm = weave.startLLM({ model: 'gpt-4o', providerName: 'openai' });
+try {
+ const response = await openaiClient.chat.completions.create({ ... });
+ llm.inputMessages = [{ role: 'user', content: '...' }];
+ llm.outputMessages = [{ role: 'assistant', content: response.choices[0].message.content ?? '' }];
+ llm.usage = {
+ inputTokens: response.usage?.prompt_tokens,
+ outputTokens: response.usage?.completion_tokens,
+ };
+} finally {
+ llm.end();
+}
+```
+
+
+
+
+Pass `provider_name` / `providerName` explicitly. Weave doesn't infer it from the model string.
+
+### Start a tool call
+
+`start_tool()` / `startTool()` creates an `execute_tool` span. The span becomes a child of whatever OTel span is active in context (typically the `chat` span of the LLM call that produced the tool call).
+
+
+
+
+```python lines
+tool = weave.start_tool(
+ name="get_weather", # Tool name as declared to the LLM.
+ arguments='{"city": "Tokyo"}', # JSON string of the tool arguments.
+ tool_call_id="call_abc123", # Optional: tool call ID from the LLM response.
+)
+```
+
+
+
+
+```typescript lines
+const tool = weave.startTool({
+ name: 'get_weather', // Tool name as declared to the LLM.
+ args: '{"city": "Tokyo"}', // Optional: JSON string of the tool arguments.
+ toolCallId: 'call_abc123', // Optional: tool call ID from the LLM response.
+});
+```
+
+
+
+
+Assign the tool result before closing:
+
+
+
+
+```python lines
+with weave.start_tool(name="get_weather", arguments='{"city": "Tokyo"}') as tool:
+ result = get_weather_api("Tokyo")
+ tool.result = result # Accepts dict, list, or string. JSON-encoded automatically.
+```
+
+
+
+
+```typescript lines
+const tool = weave.startTool({ name: 'get_weather', args: '{"city": "Tokyo"}' });
+try {
+ tool.result = await getWeatherApi('Tokyo');
+} finally {
+ tool.end();
+}
+```
+
+
+
+
+## Usage patterns for agent tracing
+
+The following sections describe how to combine these functions depending on how your agent code is structured.
+
+The examples below use two types from the Weave SDK:
+
+- `Message` represents a single entry in a conversation: a user input, an assistant response, a system prompt, or a tool result. Assign messages to `llm.input_messages` / `llm.inputMessages` to record what the model received, and to `llm.output_messages` / `llm.outputMessages` to record what it produced.
+- `Usage` captures token counts from the LLM response and is assigned to `llm.usage`.
+
+Weave uses both to populate the Agents view with the inputs, outputs, and token usage of each LLM call. For all supported data types, see the API reference.
+
+### Context manager / try-finally pattern
+
+The recommended approach for most agents is to use a context manager in Python or a try-finally block in TypeScript. The span is closed and sent at the end of the block, even if an exception occurs.
+
+Weave stores the active session, turn, and LLM call in context, so any function called within a block can call `start_llm()` / `startLLM()` or `start_tool()` / `startTool()` without holding an explicit reference to the parent. This works across module boundaries as long as the code runs in the same async context. To retrieve the active objects from anywhere in the call stack, use `weave.get_current_session()` / `weave.getCurrentSession()`, `weave.get_current_turn()` / `weave.getCurrentTurn()`, and `weave.get_current_llm()` / `weave.getCurrentLLM()`.
+
+
+
+
+```python lines highlight="13,14,17,25,29"
+import weave
+from weave.session.session import Message, Usage
+
+# Placeholder functions: replace with your own implementations.
+def call_openai(*args, **kwargs):
+ pass # Replace with your LLM client call.
+
+def get_weather_api(city: str) -> str:
+ return "24°C, sunny" # Replace with your weather API call.
+
+weave.init("[YOUR-TEAM]/[YOUR-PROJECT]")
+
+with weave.start_session(agent_name="weather-bot") as session:
+ with session.start_turn(user_message="What is the weather in Tokyo?") as turn:
+
+ # First LLM call: returns a tool call.
+ with weave.start_llm(model="gpt-4o", provider_name="openai") as llm:
+ response = call_openai(...)
+ llm.input_messages = [Message(role="user", content="What is the weather?")]
+ llm.think("User wants weather data, I should call get_weather.")
+ llm.output("Let me check the weather for you.")
+ llm.usage = Usage(input_tokens=100, output_tokens=20)
+
+ # Tool call: child of the LLM call that requested it.
+ with weave.start_tool(name="get_weather", arguments='{"city":"Tokyo"}') as tool:
+ tool.result = get_weather_api("Tokyo") # Returns "24°C, sunny".
+
+ # Second LLM call: synthesizes the final answer.
+ with weave.start_llm(model="gpt-4o", provider_name="openai") as llm:
+ llm.input_messages = [Message(role="user", content="What is the weather?")]
+ llm.output("It is 24°C and sunny in Tokyo today.")
+ llm.usage = Usage(input_tokens=150, output_tokens=30)
+```
+
+
+
+
+```typescript lines highlight="11,13,16,24,35"
+import * as weave from 'weave';
+import type { Message, Usage } from 'weave';
+
+// Placeholder function: replace with your own implementation.
+async function getWeatherApi(city: string): Promise<string> {
+ return '24°C, sunny'; // Replace with your weather API call.
+}
+
+await weave.init('[YOUR-TEAM]/[YOUR-PROJECT]');
+
+const session = weave.startSession({ agentName: 'weather-bot' });
+try {
+ const turn = session.startTurn({ agentName: 'weather-bot' });
+ try {
+ // First LLM call: returns a tool call.
+ const llm = weave.startLLM({ model: 'gpt-4o', providerName: 'openai' });
+ try {
+ llm.inputMessages = [{ role: 'user', content: 'What is the weather?' }];
+ llm.think('User wants weather data, I should call get_weather.');
+ llm.output('Let me check the weather for you.');
+ llm.usage = { inputTokens: 100, outputTokens: 20 };
+
+ // Tool call: child of the LLM call that requested it.
+ const tool = weave.startTool({ name: 'get_weather', args: '{"city":"Tokyo"}' });
+ try {
+ tool.result = await getWeatherApi('Tokyo'); // Returns "24°C, sunny".
+ } finally {
+ tool.end();
+ }
+ } finally {
+ llm.end();
+ }
+
+ // Second LLM call: synthesizes the final answer.
+ const llm2 = weave.startLLM({ model: 'gpt-4o', providerName: 'openai' });
+ try {
+ llm2.inputMessages = [{ role: 'user', content: 'What is the weather?' }];
+ llm2.output('It is 24°C and sunny in Tokyo today.');
+ llm2.usage = { inputTokens: 150, outputTokens: 30 };
+ } finally {
+ llm2.end();
+ }
+ } finally {
+ turn.end();
+ }
+} finally {
+ session.end();
+}
+```
+
+
+
+
+### Manual start and end pattern
+
+Use `.end()` explicitly when you can't use `with` blocks or `try/finally`, for example, when spans are opened and closed in different function calls, or when you manage an async lifecycle outside a coroutine.
+
+
+
+
+```python lines highlight="1,2,4,9,15"
+session = weave.start_session(agent_name="weather-bot")
+turn = session.start_turn(user_message="What is the weather?")
+
+llm = weave.start_llm(model="gpt-4o", provider_name="openai")
+llm.input_messages = [Message(role="user", content="What is the weather?")]
+llm.output("Let me check.")
+llm.usage = Usage(input_tokens=100, output_tokens=20)
+
+tool = weave.start_tool(name="get_weather", arguments='{"city": "Tokyo"}')
+tool.result = "24°C, sunny"
+tool.end() # end() is idempotent: safe to call more than once.
+
+llm.end()
+
+llm2 = weave.start_llm(model="gpt-4o", provider_name="openai")
+llm2.output("It is 24°C and sunny in Tokyo.")
+llm2.usage = Usage(input_tokens=150, output_tokens=30)
+llm2.end()
+
+turn.end()
+session.end()
+```
+
+
+
+
+```typescript lines highlight="1,2,4,9,15"
+const session = weave.startSession({ agentName: 'weather-bot' });
+const turn = session.startTurn({ agentName: 'weather-bot' });
+
+const llm = weave.startLLM({ model: 'gpt-4o', providerName: 'openai' });
+llm.inputMessages = [{ role: 'user', content: 'What is the weather?' }];
+llm.output('Let me check.');
+llm.usage = { inputTokens: 100, outputTokens: 20 };
+
+const tool = weave.startTool({ name: 'get_weather', args: '{"city": "Tokyo"}' });
+tool.result = '24°C, sunny';
+tool.end(); // end() is idempotent: safe to call more than once.
+
+llm.end();
+
+const llm2 = weave.startLLM({ model: 'gpt-4o', providerName: 'openai' });
+llm2.output('It is 24°C and sunny in Tokyo.');
+llm2.usage = { inputTokens: 150, outputTokens: 30 };
+llm2.end();
+
+turn.end();
+session.end();
+```
+
+
+
+
+## Semantic conventions
+
+The Weave SDK emits OTel spans that conform to the [GenAI semantic conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/) and [GenAI agent span conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/). Any OTel span is accepted. Weave stores all attributes and makes them queryable. You can add arbitrary attributes to spans using the standard OTel span API alongside Weave's tracing objects.
+
+## How spans appear in the Weave UI
+
+Once you run instrumented code, your traces appear in the **Agents** tab of your Weave project at `https://wandb.ai/[YOUR-TEAM]/[YOUR-PROJECT]/weave/agents`.
+
+- The **Sessions list** shows all sessions with a minimap of turn activity.
+- Clicking a session opens the **multi-turn session view** showing each turn, its LLM calls, tool executions, token counts, and any attached feedback.
+- Each `chat` span shows the input messages, output messages, model name, and usage.
+- Each `execute_tool` span shows the tool name, arguments, and result.
+
+For details on viewing Agents data in Weave, see [View agent activity](/weave/guides/tracking/view-agent-activity).
diff --git a/weave/guides/tracking/trace-sub-agents.mdx b/weave/guides/tracking/trace-sub-agents.mdx
new file mode 100644
index 0000000000..650e91233d
--- /dev/null
+++ b/weave/guides/tracking/trace-sub-agents.mdx
@@ -0,0 +1,160 @@
+---
+title: Trace sub-agents
+description: Trace nested agents and agents that have been delegated tasks using the Weave sub-agent span.
+---
+
+import AgentsPreview from '/snippets/_includes/agents-public-preview.mdx';
+
+
+
+A sub-agent is a delegated agent invocation that runs inside a turn. Use sub-agents when one agent hands off to another, such as when a supervisor agent dispatches a specialist agent.
+
+When instrumented using Weave, sub-agents emit a nested `invoke_agent` OTel span in the same trace as the parent turn. In the **Agents** view, this nesting renders as a sub-agent invocation under the turn that triggered it, with its own LLM calls and tool calls grouped beneath.
+
+## Sub-agent data model
+
+The `weave.start_subagent` span maps to the OTel `invoke_agent` span and emits the same operation name as the parent turn. Weave distinguishes the two by their parent-child relationship in the trace:
+
+```plaintext
+Turn (invoke_agent — root span)
+├── LLM call (chat) ← parent agent's reasoning
+│ └── SubAgent (invoke_agent) ← delegation happens here
+│ ├── LLM call (chat) ← sub-agent's own LLM call
+│ └── Tool call (execute_tool)
+└── LLM call (chat) ← parent agent synthesizes the final answer
+```
+
+Sub-agents inherit the active session's `conversation_id`, so they're grouped with the rest of the conversation in the Agents view.
+
+```python lines
+sub = weave.start_subagent(
+ name="research-specialist", # Required: identifies this sub-agent in the UI.
+ model="gpt-4o", # Optional: defaults to the parent session's model if empty.
+)
+```
+
+`weave.start_subagent` creates an `invoke_agent` span that automatically becomes a child of whatever span is currently active in OTel context, typically the parent turn or the LLM call that triggered the delegation. OTel context propagation handles the parent-child relationship, so you don't need to pass an explicit reference to the parent span.
+
+## Trace a single sub-agent
+
+The following example runs a supervisor agent that receives a request and delegates it to a research-specialist sub-agent that uses a Wikipedia search tool to find the answer.
+
+Weave captures the full hierarchy by wrapping the conversation in a `weave.start_session` block and then a `session.start_turn` block. Weave then captures the sub-agent trace using a `weave.start_subagent` block for the specialist, and records each LLM call and tool execution as child spans.
+
+We have intentionally omitted the routing logic in these examples to focus on the tracing between agents.
+
+
+```python lines highlight="4,10,16,22,30"
+import weave
+from weave.session.session import Message, Usage
+
+weave.init("[YOUR-TEAM]/[YOUR-PROJECT]")
+
+with weave.start_session(agent_name="supervisor") as session:
+ with session.start_turn(user_message="Research the founders of Anthropic.") as turn:
+
+ # Supervisor LLM call: decide which specialist to delegate to.
+ with weave.start_llm(model="gpt-4o", provider_name="openai") as llm:
+ llm.input_messages = [Message(role="user", content="Research the founders of Anthropic.")]
+ llm.output("Delegating to the research specialist.")
+ llm.usage = Usage(input_tokens=80, output_tokens=10)
+
+ # Delegate to the research specialist as a sub-agent.
+ with weave.start_subagent(name="research-specialist", model="gpt-4o") as sub:
+ with sub.llm(model="gpt-4o", provider_name="openai") as sub_llm:
+ sub_llm.input_messages = [Message(role="user", content="Find founders of Anthropic.")]
+ sub_llm.output("I should search for this.")
+ sub_llm.usage = Usage(input_tokens=120, output_tokens=15)
+
+ with weave.start_tool(name="wikipedia_search", arguments='{"query":"Anthropic"}') as tool:
+ tool.result = "Anthropic was founded by Dario and Daniela Amodei in 2021."
+
+ with sub.llm(model="gpt-4o", provider_name="openai") as sub_llm:
+ sub_llm.output("Anthropic was founded by Dario and Daniela Amodei in 2021.")
+ sub_llm.usage = Usage(input_tokens=200, output_tokens=25)
+
+ # Back in the supervisor turn: synthesize the final answer.
+ with weave.start_llm(model="gpt-4o", provider_name="openai") as llm:
+ llm.output("Anthropic was founded by Dario and Daniela Amodei in 2021.")
+ llm.usage = Usage(input_tokens=300, output_tokens=20)
+```
+
+In the Agents view, the sub-agent appears as a nested `invoke_agent` block inside the turn, with its own LLM calls and tool calls grouped beneath. The supervisor's direct LLM calls remain siblings of the sub-agent.
+
+## Trace multiple sub-agents
+
+The following example runs a content-pipeline agent that handles a single request by delegating to three sibling sub-agents in sequence: a `researcher` that gathers facts, a `writer` that drafts the post, and a `reviewer` that polishes the final output.
+
+Weave captures all three sub-agents as siblings under the same turn by opening a separate `weave.start_subagent` block for each. Because each sub-agent inherits the active turn's OTel context, they appear as peer `invoke_agent` spans nested under the turn rather than under each other.
+
+```python lines highlight="1,5,11,15,22"
+with weave.start_session(agent_name="content-pipeline") as session:
+ with session.start_turn(user_message="Write a short blog post about Anthropic.") as turn:
+
+ # Researcher sub-agent: gather facts.
+ with weave.start_subagent(name="researcher", model="gpt-4o") as researcher:
+ with researcher.llm(model="gpt-4o", provider_name="openai") as sub_llm:
+ sub_llm.input_messages = [Message(role="user", content="Find key facts about Anthropic.")]
+ sub_llm.output("I should search Wikipedia.")
+ sub_llm.usage = Usage(input_tokens=80, output_tokens=15)
+
+ with weave.start_tool(name="wikipedia_search", arguments='{"query":"Anthropic"}') as tool:
+ tool.result = "Anthropic was founded by Dario and Daniela Amodei in 2021."
+
+ # Writer sub-agent: draft the post.
+ with weave.start_subagent(name="writer", model="gpt-4o") as writer:
+ with writer.llm(model="gpt-4o", provider_name="openai") as sub_llm:
+ sub_llm.input_messages = [Message(role="user", content="Draft a post using the research.")]
+ sub_llm.output("Anthropic, founded in 2021 by Dario and Daniela Amodei, builds AI safety research...")
+ sub_llm.usage = Usage(input_tokens=180, output_tokens=120)
+
+ # Reviewer sub-agent: polish the draft.
+ with weave.start_subagent(name="reviewer", model="gpt-4o") as reviewer:
+ with reviewer.llm(model="gpt-4o", provider_name="openai") as sub_llm:
+ sub_llm.input_messages = [Message(role="user", content="Review and tighten the draft.")]
+ sub_llm.output("Final post: Anthropic, founded in 2021 by Dario and Daniela Amodei, builds AI safety research...")
+ sub_llm.usage = Usage(input_tokens=200, output_tokens=140)
+```
+
+In the Agents view, the turn contains three sibling sub-agent invocations, each with its own LLM call nested beneath, and the `researcher` includes its tool call. None of the sub-agents are children of each other.
+
+### Trace nested sub-agents
+
+A sub-agent can itself delegate to another sub-agent. Each `start_subagent` call nests under whatever span is currently active in OTel context.
+
+```python lines highlight="1,2,4,5,11,16"
+with weave.start_session(agent_name="orchestrator") as session:
+ with session.start_turn(user_message="Compare Anthropic and OpenAI.") as turn:
+
+ with weave.start_subagent(name="research-coordinator") as coordinator:
+ with weave.start_subagent(name="anthropic-researcher") as r1:
+ with r1.llm(model="gpt-4o", provider_name="openai") as sub_llm:
+ sub_llm.output("Anthropic facts...")
+ sub_llm.usage = Usage(input_tokens=120, output_tokens=30)
+
+ # Nested: the researcher delegates to its own summarizer sub-agent.
+ with weave.start_subagent(name="anthropic-summarizer") as summarizer:
+ with summarizer.llm(model="gpt-4o", provider_name="openai") as sub_llm:
+ sub_llm.output("Anthropic summary: ...")
+ sub_llm.usage = Usage(input_tokens=80, output_tokens=20)
+
+ with weave.start_subagent(name="openai-researcher") as r2:
+ with r2.llm(model="gpt-4o", provider_name="openai") as sub_llm:
+ sub_llm.output("OpenAI facts...")
+ sub_llm.usage = Usage(input_tokens=120, output_tokens=30)
+```
+
+The example produces three levels of nesting under the turn:
+
+```plaintext
+turn (invoke_agent)
+└── research-coordinator (invoke_agent)
+ ├── anthropic-researcher (invoke_agent)
+ │ ├── chat
+ │ └── anthropic-summarizer (invoke_agent) ← nested inside anthropic-researcher
+ │ └── chat
+ └── openai-researcher (invoke_agent) ← sibling of anthropic-researcher
+ └── chat
+```
+
+In the Agents view, `research-coordinator` appears as a sub-agent of the turn, `anthropic-researcher` and `openai-researcher` appear as siblings under the coordinator, and `anthropic-summarizer` appears as a sub-agent of `anthropic-researcher`.
diff --git a/weave/guides/tracking/view-agent-activity.mdx b/weave/guides/tracking/view-agent-activity.mdx
new file mode 100644
index 0000000000..fd32ed7c5a
--- /dev/null
+++ b/weave/guides/tracking/view-agent-activity.mdx
@@ -0,0 +1,320 @@
+---
+title: "View agent activity"
+description: "Use W&B Weave's Agents view to understand what your agent did, how much it cost, and exactly where things went right or wrong."
+---
+
+import AgentsPreview from '/snippets/_includes/agents-public-preview.mdx';
+
+
+
+The Agents view gives you a turn-by-turn record of every conversation your agent had, along with token usage, tool invocations, and execution spans.
+
+Agent applications are hard to debug because the interesting behavior happens between the user's request and the final response. The Agents view in W&B Weave makes that middle layer visible. Every conversation your agent had is captured here, with the full message history, span-level execution detail, and token costs attached. You can see at a glance whether an agent completed its task, how many tool calls it made, and where time or budget was spent. For teams building and iterating on agents, this is the starting point for understanding behavior in production.
+
+## Get started
+
+To enter the Agents view:
+
+1. Navigate to [https://wandb.ai](https://wandb.ai) and select your project.
+2. In the sidebar menu, select **Agents** to view all agent conversations saved for your project.
+
+## Agents tab
+
+The **Agents** tab gives you a high-level view of all agents that have logged
+traces to this project. Use it to spot which agents are active, compare
+latency and error rates across agents, and identify agents that need
+attention before drilling into individual conversations.
+
+
+
+It is useful for scenarios such as:
+
+- **Monitoring a fleet of agents.** The card grid lets you compare latency and
+ error rate across all agents at once without opening individual conversations.
+ A latency spike or a newly red error rate on one card signals a regression
+ worth investigating.
+- **Identifying stale agents.** Sorting by **Last seen** highlights agents that
+ haven't recorded activity recently. This is useful for confirming a deployment
+ is live or spotting agents that may have stopped logging traces unexpectedly.
+- **Comparing versions.** The version count on each card tells you how many
+ distinct versions of that agent have been deployed. A high version count
+ alongside a rising error rate may indicate a regression introduced in a recent
+ deployment.
+- **Drilling into an agent.** Click any card to open the detail panel for that
+ agent, from which you can navigate to its conversations or spans.
+
+
+
+### Agent cards
+
+Each agent is represented as a card showing:
+
+| Field | Description |
+|---|---|
+| **Agent name** | The name logged with the agent's traces. |
+| **Last seen** | How long ago the agent last recorded activity. |
+| **Version** | The number of distinct `agent_version` values recorded across the agent's spans. |
+| **Activity histogram** | A bar chart of recent conversation volume, giving a quick sense of usage trends. |
+| **Conversations** | Total number of conversations recorded. |
+| **Spans** | Total number of spans recorded across all conversations. |
+| **Latency (avg.)** | Average end-to-end duration per invocation. |
+| **Error rate** | Percentage of invocations that returned an error. Displays in red when greater than 0%. |
+
+### Find and sort agents
+
+Use the **Search agents** field to filter cards by agent name.
+
+Use the sort dropdown (default: **Last seen**) to reorder the grid. The
+available sort options are:
+
+- **Last seen**: Most recently active agents first.
+- **Most invocations**: Highest conversation volume first.
+- **Most input tokens**: Highest input token consumption first.
+- **Most errors**: Highest error count first.
+
+Sorting by **Most errors** is useful for a quick daily health check: agents
+with non-zero error rates surface immediately, and the red error rate on the
+card confirms at a glance which need investigation.
+
+## Conversations tab
+
+The **Conversations** tab on the Agents page lets you browse, filter, and
+inspect individual agent runs. Use it to investigate failures, measure token
+costs, and understand the sequence of LLM calls and tool executions that made
+up a run.
+
+For high-level questions about what an agent said and did across a conversation, start with the Conversations tab.
+
+
+
+### Conversations table
+
+The conversation table shows one row per conversation. The following columns
+appear by default.
+
+| Column | Description |
+|---|---|
+| **Conversation** | The conversation ID and a preview of the first message. |
+| **Last message** | A preview of the most recent message, with a role indicator. |
+| **Agent** | The name of the agent or agents involved. |
+| **Invocations** | How many times the agent was invoked during the conversation. |
+| **Spans** | Total number of spans recorded. Higher span counts indicate more branching or tool use. |
+| **In tokens** | Input tokens consumed. |
+| **Out tokens** | Output tokens generated. |
+| **Started** | When the conversation began. |
+| **Last activity** | How long ago the last message was recorded. |
+
+To show or hide additional columns, click **Columns** in the toolbar.
+
+### Filtering and time window
+
+Use the **Filter** bar to narrow results by agent, model, error status, or
+other attributes.
+
+Use the time window selector (**1m**, **1h**, **6h**, **24h**, **7d**, or
+**30d**) to restrict the list to conversations that were active within that
+period. The conversation volume histogram above the list updates to reflect the
+selected window.
+
+Hover over any column header in the conversation list to filter that column to
+a specific value or range.
+
+### Agent conversation detail
+
+Click a conversation row to open a detail panel with two sub-tabs: turns and events.
+
+
+
+#### Turns
+
+The conversation detail turns panel shows each turn in chronological order, numbered from 1.
+
+Each turn displays the number of intermediate responses and tool calls, and
+the total wall-clock duration. Expand a turn to see the full message thread.
+
+##### Messages
+
+Within a turn, messages are grouped by role.
+
+**User messages** show the message text and any attached media or content
+references.
+
+**Assistant messages** show:
+
+- The agent name and the model used (for example, `gpt-5.5-2026-04-23`).
+- Timestamp and duration.
+- Input and output token counts (for example, `16086 in 295 out`).
+- An expandable **Reasoning** section when the model used extended thinking.
+- The response text, which collapses automatically for long responses.
+
+**Tool calls** show the tool name, timestamp, and duration. If argument or
+result data is available, the tool call is expandable and shows **Args** and
+**Result** in a key-value table. If the call failed, an **ERROR** badge
+appears.
+
+##### Error states
+
+When a tool call returns an error status, a red **ERROR** badge appears inline
+next to it. In the Events timeline, that event also displays in red regardless
+of its type.
+
+#### Events
+
+The **Events** panel on the right shows a color-coded strip that represents
+the sequence of events within the selected turn.
+
+In the events timeline, each segment's color indicates the event type.
+
+| Color | Event type |
+|---|---|
+| Purple | User message |
+| Green | Assistant message |
+| Blue | Tool call |
+| Sienna | Sub-agent invocation |
+| Magenta | Agent handoff |
+| Gray | Context compaction |
+| Red | Any event that returned an error |
+
+Use the Events timeline to get a quick sense of how a turn was structured. For
+example, you can see whether it was LLM-heavy, tool-heavy, or involved sub-agent
+delegation before reading the full message thread.
+
+##### Scores
+
+If any signals are active for this project, a **Scores** section provides metrics for the conversation. It shows the signal scorer name, an overall numeric rating
+from 0 to 1, a confidence percentage, and the individual rubric points that
+contributed to the score. Each rubric point also shows its own confidence. Use this to
+understand not just whether a turn scored well, but which specific rubric
+criteria passed or failed.
+
+##### Meta summary
+
+The **Meta summary** section shows aggregate statistics for the selected
+conversation.
+
+| Field | Description |
+|---|---|
+| **Tokens** | Total input and output tokens. |
+| **Tool calls** | Number of tool calls across all turns. |
+| **Messages** | Total message count. |
+| **Session time** | Wall-clock duration from first to last message. |
+| **Turn page** | Which turns are currently displayed, and the total turn count. |
+
+##### Token breakdown
+
+The **Token breakdown** section shows cache and reasoning details for the
+selected conversation.
+
+| Field | Description |
+|---|---|
+| **Cache read** | Tokens served from the prompt cache. |
+| **Cache written** | Tokens written to the prompt cache. |
+| **Cache hit rate** | Percentage of input tokens served from cache. A higher rate reduces cost and latency. |
+| **Reasoning** | Tokens spent on extended thinking. |
+| **Reasoning ratio** | Percentage of output tokens spent on extended thinking. |
+
+##### Participants
+
+The **Participants** section lists the agents and models involved in the
+conversation. In multi-agent conversations, different turns may show different
+model names here.
+
+## Spans tab
+
+The **Spans** tab shows every individual span recorded across all agent
+activity in the project. Where the Conversations tab aggregates activity into
+dialogue-level rows, the Spans tab exposes the raw operations underneath: each
+LLM call, tool execution, and agent invocation as its own row. Use it to trace
+exactly which call was slow, which model consumed unexpected tokens, or which
+tool invocation failed.
+
+
+
+### Spans table
+
+The span table shares most columns with the Conversations table (agent, model,
+tool, token counts, status). Some columns unique to this view are:
+
+| Column | Description |
+|---|---|
+| **Span** | The span name and ID, with its trace ID below. |
+| **Kind** | The OpenTelemetry span kind for this operation (such as `INTERNAL`, `SERVER`, or `CLIENT`). |
+| **Operation** | The operation type (such as `chat`, `execute_tool`, or `invoke_agent`). |
+| **Finished** | The finish reason returned by the model (such as `stop` or `max_tokens`). Populated only for `chat` spans where the model reports a finish reason. |
+
+Additional columns for cache token breakdowns, reasoning tokens, LLM
+parameters, and W&B run metadata are available through the **Columns** button.
+
+The Spans tab is most useful when you need operation-level precision that the
+Conversations tab doesn't provide:
+
+- **Identifying expensive calls.** Sort by **In** or **Out** tokens to find
+ which individual LLM calls are driving cost, rather than seeing totals at
+ the conversation level.
+- **Debugging a specific operation type.** Filter by **Operation** to isolate
+ all `execute_tool` spans and check error rates, or all `chat` spans for a
+ specific model.
+- **Investigating truncation.** Filter **Finished** by `max_tokens` to find
+ spans where the model hit its token limit rather than completing normally.
+- **Correlating with a W&B run.** Hidden-by-default columns expose W&B run IDs
+ and run steps, letting you link a specific span back to a training or
+ evaluation run in W&B.
+
+### Trace grouping
+
+Click any row to select its trace and highlight all other spans that share the same trace ID. This shows you the full set of operations that were executed as part of one agent invocation. Grouping here is by trace, not by conversation. This means a single conversation may contain multiple traces, for example, when it spans multiple turns that each start their own trace.
+
+### Agent invocation detail
+
+Click a row in the **Spans** table to open a detail panel with two sub-tabs that are populated with data from the complete agent invocation.
+
+- The **Chat** sub-tab shows the reconstructed conversation for the selected trace, giving narrative context for the spans you're inspecting.
+- The **Spans** sub-tab shows the individual spans belonging to that trace with their operation, model, and duration. This is useful for comparing timings within a single trace without scrolling the full list.
+
+## Signals tab
+
+The **Signals** tab shows the output of automated behavioral scoring applied
+to your agent's turns. Where the Conversations and Spans tabs tell you *what*
+happened, Signals tell you *how well* it happened. They surface quality issues
+like low-effort responses or hallucinations, and error conditions like rate
+limiting or bugs, without requiring you to read individual traces.
+
+
+
+### Signals table
+
+Each row represents one signal score applied to a turn. The following columns
+appear by default.
+
+| Column | Description |
+|---|---|
+| **Type** | The level at which the signal was scored. |
+| **Scorer** | The name of the signal that produced this score (for example, **Response Quality** or **User Satisfaction**). |
+| **Last message** | A preview of the last message in the scored turn, with the role shown below. |
+| **Agent** | The agent associated with the scored turn. |
+| **Scores** | The numeric score from 0 to 1. Scores near 0 indicate a detected issue. Scores near 1 indicate no issue detected. |
+| **Trend (24h)** | Score trend for this scorer over the past 24 hours. |
+| **When** | When the signal was scored. |
+
+Use the time window selector and **Filter** bar to narrow results by scorer,
+agent, score range, or time period.
+
+### Manage and edit signals
+
+Select **Manage signals** to open a drawer that lists all active signals for
+the project. From there you can toggle signals on or off, delete them, or
+click the edit icon on any signal to modify it.
+
+The signal editor has the following fields:
+
+| Field | Description |
+|---|---|
+| **Scorer type** | The scoring method. **Rating** emits a single numeric score in [0, 1] per agent turn. |
+| **Only score turns matching** | Optional filters to restrict which turns this signal scores. Multiple filters are combined with AND, so a turn must match all of them. Leave empty to score every turn. |
+| **Prompt template** | A preset template to use as a starting point for the scorer prompt. |
+| **Scorer prompt** | The full prompt sent to the scoring model. Supports template variables (`{input_messages}`, `{output_messages}`, `{system_instructions}`, and `{agent_name}`), which are filled in at scoring time. |
+| **Scorer name** | The display name for this signal. |
+
+Select **+ New signal** to create a new scorer for your agent.
+
+For more information on built-in signals, see [Monitor using built-in signals](/weave/guides/evaluation/monitors).