From f3e6b1317ac165102f9584224191d249f9d61c6e Mon Sep 17 00:00:00 2001
From: AnandK27 <anand@collinear.ai>
Date: Wed, 1 Apr 2026 00:25:41 -0700
Subject: [PATCH 1/6] Add prime-rl training cookbook for SimLab trajectories

Cookbook that bridges SimLab's task execution with Prime Intellect's
prime-rl for RL training of agent models. The full pipeline:

- Collect tool-use trajectories from SimLab environments
- Convert to SFT datasets (HuggingFace messages format)
- Build and push a verifiers environment to Prime Intellect hub
- Run hosted RL training via `prime rl run`
- Evaluate trained models back through SimLab

Includes example customer support tasks, quality+completeness rubrics,
and configs for SFT warmup (Qwen3.5-4B) and RL training (Qwen3.5-9B).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 cookbook/README.md                            |   1 +
 cookbook/prime-rl-training/.gitignore         |   9 +
 cookbook/prime-rl-training/SKILL.md           | 161 +++++++++
 cookbook/prime-rl-training/configs/rl.toml    |  39 ++
 cookbook/prime-rl-training/configs/sft.toml   |  35 ++
 ...pancy_for_enterprise_renewal_35ba835d.json |  37 ++
 ..._rate_limiting_issue_and_pre_de0cff0d.json |  54 +++
 ...il_group_and_coordinate_acco_7a7bbde0.json |  54 +++
 .../prime-envs/simlab_tasks/pyproject.toml    |  21 ++
 .../prime-envs/simlab_tasks/simlab_tasks.py   | 176 +++++++++
 .../prime-rl-training/prime-rl-training.md    | 237 ++++++++++++
 cookbook/prime-rl-training/pyproject.toml     |  34 ++
 cookbook/prime-rl-training/run_pipeline.sh    | 125 +++++++
 .../src/prime_rl_training/__init__.py         |   1 +
 .../src/prime_rl_training/collect.py          | 152 ++++++++
 .../src/prime_rl_training/simlab_env.py       | 245 +++++++++++++
 .../prime_rl_training/trajectory_converter.py | 340 ++++++++++++++++++
 17 files changed, 1721 insertions(+)
 create mode 100644 cookbook/prime-rl-training/.gitignore
 create mode 100644 cookbook/prime-rl-training/SKILL.md
 create mode 100644 cookbook/prime-rl-training/configs/rl.toml
 create mode 100644 cookbook/prime-rl-training/configs/sft.toml
 create mode 100644 cookbook/prime-rl-training/examples/task-bundle/3_triage_and_escalate_critical_billing_discrepancy_for_enterprise_renewal_35ba835d.json
 create mode 100644 cookbook/prime-rl-training/examples/task-bundle/4_enterprise_client_escalation_resolve_david_parks_api_rate_limiting_issue_and_pre_de0cff0d.json
 create mode 100644 cookbook/prime-rl-training/examples/task-bundle/6_resolve_sla_critical_billing_dispute_for_wilson_retail_group_and_coordinate_acco_7a7bbde0.json
 create mode 100644 cookbook/prime-rl-training/prime-envs/simlab_tasks/pyproject.toml
 create mode 100644 cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py
 create mode 100644 cookbook/prime-rl-training/prime-rl-training.md
 create mode 100644 cookbook/prime-rl-training/pyproject.toml
 create mode 100755 cookbook/prime-rl-training/run_pipeline.sh
 create mode 100644 cookbook/prime-rl-training/src/prime_rl_training/__init__.py
 create mode 100644 cookbook/prime-rl-training/src/prime_rl_training/collect.py
 create mode 100644 cookbook/prime-rl-training/src/prime_rl_training/simlab_env.py
 create mode 100644 cookbook/prime-rl-training/src/prime_rl_training/trajectory_converter.py

diff --git a/cookbook/README.md b/cookbook/README.md
index f96677b..4b4f80b 100644
--- a/cookbook/README.md
+++ b/cookbook/README.md
@@ -28,3 +28,4 @@ The agent will walk through each step, ask you for any required inputs (model, t
 | [openai-agents-sdk](openai-agents-sdk/) | Customer-style OpenAI Agents SDK cookbook showing how to keep an existing agent app and add a thin SimLab adapter. |
 | [secure-agent-eval](secure-agent-eval/) | Evaluate agent behavior through OneCLI's credential proxy — compare correctness, audit for credential leakage, and test rate limit resilience. |
 | [simlab-auto-research](simlab-auto-research/) | Autonomous system prompt optimization using the [auto-research](https://github.com/karpathy/autoresearch) pattern. An outer agent iterates on prompts, measured by SimLab task scores. |
+| [prime-rl-training](prime-rl-training/) | Collect SimLab trajectories and train agent models with Prime Intellect's prime-rl (SFT warmup + hosted RL). |
diff --git a/cookbook/prime-rl-training/.gitignore b/cookbook/prime-rl-training/.gitignore
new file mode 100644
index 0000000..6343811
--- /dev/null
+++ b/cookbook/prime-rl-training/.gitignore
@@ -0,0 +1,9 @@
+# Generated artifacts (re-create with run_pipeline.sh)
+output/
+dataset/
+dist/
+.prime/
+__pycache__/
+*.pyc
+taskgen/
+generated-tasks/
diff --git a/cookbook/prime-rl-training/SKILL.md b/cookbook/prime-rl-training/SKILL.md
new file mode 100644
index 0000000..06013a1
--- /dev/null
+++ b/cookbook/prime-rl-training/SKILL.md
@@ -0,0 +1,161 @@
+# Prime-RL Training with SimLab Trajectories
+
+Train agent models with Prime Intellect's prime-rl using SimLab-collected trajectories.
+
+## Prerequisites
+
+Before starting, confirm:
+
+1. SimLab is installed: `simlab --version`
+2. prime CLI is installed: `prime --version`
+3. `SIMLAB_COLLINEAR_API_KEY` is set
+4. `PRIME_API_KEY` is set
+5. `OPENAI_API_KEY` is set (for baseline agent)
+
+If any prerequisite is missing, tell the user what to set and **wait before proceeding**.
+
+## Workflow
+
+### 1. Install cookbook dependencies
+
+```bash
+cd cookbook/prime-rl-training
+uv sync
+```
+
+### 2. Create SimLab environment
+
+```bash
+simlab templates list
+```
+
+Ask the user which template to use (default: `customer_service`).
+
+```bash
+simlab env init prime-rl-env --template <template>
+```
+
+### 3. Generate tasks
+
+```bash
+simlab tasks-gen init --preset customer_support --output-dir ./taskgen
+simlab tasks-gen run --config taskgen/config.toml
+```
+
+Wait for task generation to complete before proceeding.
+
+### 4. List tasks and select for rollouts
+
+```bash
+simlab tasks list --tasks-dir ./generated-tasks
+```
+
+Note the task IDs.
+
+### 5. Start environment and collect trajectories
+
+```bash
+simlab env up prime-rl-env
+```
+
+Wait for all services to become healthy, then run tasks:
+
+```bash
+simlab tasks run \
+  --env prime-rl-env \
+  --task <task_id> \
+  --tasks-dir ./generated-tasks \
+  --agent-model gpt-5.2 \
+  --agent-api-key "$OPENAI_API_KEY"
+```
+
+Repeat for each task. Wait for each to complete.
+
+### 6. Convert trajectories to SFT dataset
+
+```bash
+python -m prime_rl_training.collect sft \
+  --output-dir ./output \
+  --save-path ./dataset \
+  --min-reward 0.0 \
+  --include-failed \
+  --format jsonl
+```
+
+Verify the dataset:
+```bash
+wc -l dataset/train.jsonl
+head -1 dataset/train.jsonl | python -m json.tool
+```
+
+Present the trajectory count and a sample to the user.
+
+### 7. Push verifiers environment to Prime Intellect
+
+```bash
+prime env push -p ./prime-envs/simlab_tasks
+```
+
+Wait for confirmation. Note the environment ID from the output (e.g., `<username>/simlab-tasks`).
+
+### 8. Check model availability
+
+```bash
+prime rl models --plain
+```
+
+Present available models. Recommend `Qwen/Qwen3.5-9B` or another available model.
+
+### 9. Configure and launch RL training
+
+Update `configs/rl.toml` with the correct model and environment ID, then:
+
+```bash
+prime rl run configs/rl.toml
+```
+
+Note the run ID from the output.
+
+### 10. Monitor training
+
+```bash
+prime rl logs <run-id> -f
+prime rl metrics <run-id> --plain
+prime rl progress <run-id> --plain
+```
+
+Present metrics to the user.
+
+### 11. Tear down SimLab environment
+
+```bash
+simlab env down prime-rl-env
+```
+
+## Results collection
+
+After training completes:
+
+```bash
+prime rl get <run-id> --plain
+prime rl checkpoints <run-id> --plain
+```
+
+Present results:
+
+| Metric | Value |
+|--------|-------|
+| Run ID | ... |
+| Model | ... |
+| Steps completed | ... |
+| Final reward | ... |
+| Checkpoint ID | ... |
+
+## Troubleshooting
+
+- **`simlab: command not found`** — Install with `uv pip install simulationlab`
+- **`prime: command not found`** — Install with `pip install prime`
+- **No trajectories collected** — Ensure the SimLab environment is running (`simlab env up`) and API keys are valid
+- **Port conflict on env up** — Edit `docker-compose.yml` to change conflicting port mappings
+- **`prime rl models` shows "At Capacity"** — Try a different model or wait
+- **Environment push needs username** — The first push prompts for a Prime Intellect username (one-time setup)
diff --git a/cookbook/prime-rl-training/configs/rl.toml b/cookbook/prime-rl-training/configs/rl.toml
new file mode 100644
index 0000000..4e85892
--- /dev/null
+++ b/cookbook/prime-rl-training/configs/rl.toml
@@ -0,0 +1,39 @@
+# Prime-RL RL config for SimLab environment training
+# Usage: prime rl run configs/rl.toml
+#
+# This config runs hosted RL training on Prime Intellect's platform
+# using a SimLab verifiers environment for reward scoring.
+
+# Use the SFT-warmed model or start from a base instruct model
+model = "Qwen/Qwen3.5-9B"
+max_steps = 50
+
+# Training hyperparameters
+batch_size = 64
+rollouts_per_example = 4
+# learning_rate = 3e-6
+# lora_alpha = 16
+
+[sampling]
+max_tokens = 2048
+# temperature = 0.7
+
+# SimLab verifiers environment
+[[env]]
+id = "collinear-simlab/simlab-tasks"
+
+# Optional: W&B logging
+# [wandb]
+# project = "simlab-rl-training"
+
+# Optional: evaluation during training
+# [eval]
+# interval = 25
+# [[eval.env]]
+# id = "collinear-simlab/simlab-tasks"
+# args = { output_dir = "./output", min_reward = 0.5 }
+
+# Optional: checkpointing
+# [checkpoints]
+# interval = 50
+# keep_cloud = 3
diff --git a/cookbook/prime-rl-training/configs/sft.toml b/cookbook/prime-rl-training/configs/sft.toml
new file mode 100644
index 0000000..9298475
--- /dev/null
+++ b/cookbook/prime-rl-training/configs/sft.toml
@@ -0,0 +1,35 @@
+# Prime-RL SFT config for SimLab trajectory training
+# Usage: uv run sft @ configs/sft.toml
+#
+# This config trains a small model on successful SimLab trajectories
+# to teach it the tool-use and task-completion patterns before RL.
+
+max_steps = 200
+
+[model]
+name = "Qwen/Qwen3.5-4B"
+# Uncomment for LoRA (recommended to save memory):
+# [model.lora]
+# rank = 32
+# alpha = 64
+# target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+
+[data]
+# Local dataset path (generated by: python -m prime_rl_training.collect sft ...)
+# Or a HuggingFace dataset ID (e.g., "myorg/simlab-sft-data")
+name = "./dataset"
+seq_len = 4096
+batch_size = 16
+
+# Only train on assistant responses (mask system/user/tool messages)
+[data.loss_mask]
+system = false
+user = false
+assistant = true
+tool = false
+
+[optim]
+lr = 2e-5
+
+[ckpt]
+# Checkpoint at end of training
diff --git a/cookbook/prime-rl-training/examples/task-bundle/3_triage_and_escalate_critical_billing_discrepancy_for_enterprise_renewal_35ba835d.json b/cookbook/prime-rl-training/examples/task-bundle/3_triage_and_escalate_critical_billing_discrepancy_for_enterprise_renewal_35ba835d.json
new file mode 100644
index 0000000..ee22499
--- /dev/null
+++ b/cookbook/prime-rl-training/examples/task-bundle/3_triage_and_escalate_critical_billing_discrepancy_for_enterprise_renewal_35ba835d.json
@@ -0,0 +1,37 @@
+{
+  "meta": {
+    "version": "2.0",
+    "task_id": "3_triage_and_escalate_critical_billing_discrepancy_for_enterprise_renewal_35ba835d",
+    "display_name": "Triage and escalate critical billing discrepancy for enterprise renewal",
+    "category": "ticket_triage",
+    "difficulty": "hard",
+    "idempotent": false
+  },
+  "task": "You've received a billing dispute from Karen Mitchell regarding her enterprise renewal invoice. She reports a 40% increase with unexpected charges and is threatening to cancel by end of week. Contact Diana Walsh to obtain the specific billing details and charge breakdown from the invoice. Then contact Carlos Mendez to confirm what was discussed during the renewal process and any contract amendments. Once you have the facts from both, review the ticket details and determine whether this is a genuine billing error or a legitimate contract amendment issue. Provide Karen with a clear explanation of the charges and your recommended resolution path.",
+  "tool_servers": [],
+  "apps": [
+    "Helpdesk",
+    "Chat",
+    "Email"
+  ],
+  "npcs": [
+    {
+      "id": "karen_mitchell"
+    },
+    {
+      "id": "diana_walsh"
+    },
+    {
+      "id": "carlos_mendez"
+    }
+  ],
+  "seed_emails": [
+    {
+      "from_profile_id": "karen_mitchell",
+      "to_addr": "support@weaverenterprises.com",
+      "subject": "Fwd: Urgent: Billing Discrepancy on Enterprise Renewal Invoice",
+      "body_text": "---------- Forwarded message ---------\nFrom: Karen Mitchell <karen.mitchell@mitchellassociates.co>\n\nHi,\n\nI received our enterprise renewal invoice and noticed significant unexpected charges that weren't discussed during contract renewal. The invoice shows a 40% increase from our previous year, but our service scope and user count haven't changed.\n\nI need clarification on:\n1. Why the unit price increased\n2. What these additional line items represent\n3. Whether this reflects an undisclosed contract amendment\n\nI'm frustrated because we were led to believe our renewal would be flat or minimal increase. If this isn't resolved by end of this week, we will have to cancel and explore alternatives.\n\nPlease escalate this urgently.\n\nKaren Mitchell\nMitchell Associates",
+      "body_html": "<p>---------- Forwarded message ---------</p><p>From: Karen Mitchell &lt;karen.mitchell@mitchellassociates.co&gt;</p><p>Hi,</p><p>I received our enterprise renewal invoice and noticed significant unexpected charges that weren't discussed during contract renewal. The invoice shows a 40% increase from our previous year, but our service scope and user count haven't changed.</p><p>I need clarification on:</p><ul><li>Why the unit price increased</li><li>What these additional line items represent</li><li>Whether this reflects an undisclosed contract amendment</li></ul><p>I'm frustrated because we were led to believe our renewal would be flat or minimal increase. If this isn't resolved by end of this week, we will have to cancel and explore alternatives.</p><p>Please escalate this urgently.</p><p>Karen Mitchell<br>Mitchell Associates</p>"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/cookbook/prime-rl-training/examples/task-bundle/4_enterprise_client_escalation_resolve_david_parks_api_rate_limiting_issue_and_pre_de0cff0d.json b/cookbook/prime-rl-training/examples/task-bundle/4_enterprise_client_escalation_resolve_david_parks_api_rate_limiting_issue_and_pre_de0cff0d.json
new file mode 100644
index 0000000..1daf187
--- /dev/null
+++ b/cookbook/prime-rl-training/examples/task-bundle/4_enterprise_client_escalation_resolve_david_parks_api_rate_limiting_issue_and_pre_de0cff0d.json
@@ -0,0 +1,54 @@
+{
+  "meta": {
+    "version": "2.0",
+    "task_id": "4_enterprise_client_escalation_resolve_david_parks_api_rate_limiting_issue_and_pre_de0cff0d",
+    "display_name": "Enterprise Client Escalation: Resolve David Park's API Rate Limiting Issue and Prevent Service Interruption",
+    "category": "vip_enterprise_support",
+    "difficulty": "hard",
+    "idempotent": false
+  },
+  "task": "David Park at Park Industries has escalated a critical production API rate-limiting issue causing service interruptions. Search for or create a helpdesk ticket for this issue. Before responding to David, contact Marcus Chen in the engineering-support channel to get his technical assessment. After Marcus provides his findings, reach out to Robert Hayes via direct message to confirm the remediation messaging and any service credit or compensation. Once you have both assessments, email David with the findings, remediation steps, resolution timeline, and CC Robert Hayes for executive visibility.",
+  "tool_servers": [],
+  "apps": [
+    "Helpdesk",
+    "Chat",
+    "Email"
+  ],
+  "npcs": [
+    {
+      "id": "david_park"
+    },
+    {
+      "id": "marcus_chen"
+    },
+    {
+      "id": "robert_hayes"
+    }
+  ],
+  "seed_emails": [
+    {
+      "from_profile_id": "david_park",
+      "to_addr": "support@weaverenterprises.com",
+      "subject": "URGENT: Production API Integration Rate Limiting - Service Interruptions",
+      "body_text": "Hi,\n\nWe're experiencing critical issues with our production API integration. We're hitting unexpected rate limits that are causing service interruptions for our downstream clients. This is impacting our ability to serve our end users and is a significant operational problem.\n\nWe have an SLA in place for this account and need immediate investigation and resolution.\n\nPlease advise on the root cause and what remediation steps or timeline we should expect.\n\nThanks,\nDavid Park\nPark Industries",
+      "body_html": "<p>Hi,</p><p>We're experiencing critical issues with our production API integration. We're hitting unexpected rate limits that are causing service interruptions for our downstream clients. This is impacting our ability to serve our end users and is a significant operational problem.</p><p>We have an SLA in place for this account and need immediate investigation and resolution.</p><p>Please advise on the root cause and what remediation steps or timeline we should expect.</p><p>Thanks,<br>David Park<br>Park Industries</p>"
+    }
+  ],
+  "seed_group_channels": [
+    {
+      "channel_name": "engineering-support",
+      "member_profile_ids": [
+        "marcus_chen",
+        "priya_sharma",
+        "kevin_zhang",
+        "sandra_kim"
+      ],
+      "messages": [
+        {
+          "from_profile_id": "marcus_chen",
+          "text": "Channel for escalations and coordination between support and engineering teams on critical production issues."
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/cookbook/prime-rl-training/examples/task-bundle/6_resolve_sla_critical_billing_dispute_for_wilson_retail_group_and_coordinate_acco_7a7bbde0.json b/cookbook/prime-rl-training/examples/task-bundle/6_resolve_sla_critical_billing_dispute_for_wilson_retail_group_and_coordinate_acco_7a7bbde0.json
new file mode 100644
index 0000000..7e35ee5
--- /dev/null
+++ b/cookbook/prime-rl-training/examples/task-bundle/6_resolve_sla_critical_billing_dispute_for_wilson_retail_group_and_coordinate_acco_7a7bbde0.json
@@ -0,0 +1,54 @@
+{
+  "meta": {
+    "version": "2.0",
+    "task_id": "6_resolve_sla_critical_billing_dispute_for_wilson_retail_group_and_coordinate_acco_7a7bbde0",
+    "display_name": "Resolve SLA-critical billing dispute for Wilson Retail Group and coordinate account review",
+    "category": "ticket_resolution",
+    "difficulty": "hard",
+    "idempotent": false
+  },
+  "task": "You're handling an urgent billing dispute from James Wilson at Wilson Retail Group regarding invoice #INV-2025-47500 for $47,500. He claims a 15% discount was promised during renewal but the invoice reflects the standard rate. With ~4 hours before SLA breach, contact Diana Walsh to confirm the current billing status and any documented discount in the system. Then contact Amanda Reeves to verify what discount was discussed and agreed to during the renewal conversation. Once you have confirmation the discount was promised, contact James Foster to request approval for the billing adjustment. Document your findings in the ticket and send James Wilson a clear resolution.",
+  "tool_servers": [],
+  "apps": [
+    "Helpdesk",
+    "Chat",
+    "Email"
+  ],
+  "npcs": [
+    {
+      "id": "james_wilson_wrg"
+    },
+    {
+      "id": "diana_walsh"
+    },
+    {
+      "id": "amanda_reeves"
+    },
+    {
+      "id": "james_foster"
+    }
+  ],
+  "seed_emails": [
+    {
+      "from_profile_id": "diana_walsh",
+      "to_addr": "hr@weaverenterprises.com",
+      "subject": "Invoice #INV-2025-47500 – Wilson Retail Group Enterprise Service Fees",
+      "body_text": "Hi,\n\nI'm looking into a dispute on invoice #INV-2025-47500 for $47,500 (enterprise service fees for Wilson Retail Group). The customer claims they were promised a 15% discount on renewal but the invoice shows the previous rate.\n\nCan you confirm:\n1. What was the renewal discussion and agreed discount (if any)?\n2. What contract terms are on file?\n\nThis is urgent—ticket SLA expires in ~4 hours.\n\nThanks,\nDiana Walsh\nBilling Specialist",
+      "body_html": "<p>Hi,</p><p>I'm looking into a dispute on invoice #INV-2025-47500 for $47,500 (enterprise service fees for Wilson Retail Group). The customer claims they were promised a 15% discount on renewal but the invoice shows the previous rate.</p><p>Can you confirm:</p><ol><li>What was the renewal discussion and agreed discount (if any)?</li><li>What contract terms are on file?</li></ol><p>This is urgent—ticket SLA expires in ~4 hours.</p><p>Thanks,<br>Diana Walsh<br>Billing Specialist</p>"
+    },
+    {
+      "from_profile_id": "amanda_reeves",
+      "to_addr": "hr@weaverenterprises.com",
+      "subject": "Wilson Retail Group Renewal – Discount Discussion",
+      "body_text": "Hi,\n\nJames Wilson from Wilson Retail Group has disputed the recent $47,500 invoice, claiming we promised a 15% discount during renewal. I need to verify what was actually discussed in our last renewal conversation.\n\nCan you pull details on:\n1. Date and participants of the last renewal meeting/call?\n2. What discount (if any) was offered or discussed?\n3. Any notes or email confirmation sent to the customer?\n\nUrgent—SLA deadline in ~4 hours.\n\nThanks,\nAmanda Reeves\nSenior Account Manager",
+      "body_html": "<p>Hi,</p><p>James Wilson from Wilson Retail Group has disputed the recent $47,500 invoice, claiming we promised a 15% discount during renewal. I need to verify what was actually discussed in our last renewal conversation.</p><p>Can you pull details on:</p><ol><li>Date and participants of the last renewal meeting/call?</li><li>What discount (if any) was offered or discussed?</li><li>Any notes or email confirmation sent to the customer?</li></ol><p>Urgent—SLA deadline in ~4 hours.</p><p>Thanks,<br>Amanda Reeves<br>Senior Account Manager</p>"
+    },
+    {
+      "from_profile_id": "james_foster",
+      "to_addr": "hr@weaverenterprises.com",
+      "subject": "Approval Needed – Wilson Retail Group Invoice Adjustment",
+      "body_text": "Hi,\n\nWilson Retail Group is disputing invoice #INV-2025-47500 ($47,500) for enterprise service fees. Customer claims a 15% discount was promised on renewal but not applied.\n\nIf the discount was indeed committed, I may need your approval to adjust the invoice. Can you confirm:\n1. Is a 15% discount justified based on renewal terms?\n2. If yes, do you have authority to approve the adjustment?\n\nSLA expires in ~4 hours.\n\nThanks,\nJames Foster\nBilling Manager",
+      "body_html": "<p>Hi,</p><p>Wilson Retail Group is disputing invoice #INV-2025-47500 ($47,500) for enterprise service fees. Customer claims a 15% discount was promised on renewal but not applied.</p><p>If the discount was indeed committed, I may need your approval to adjust the invoice. Can you confirm:</p><ol><li>Is a 15% discount justified based on renewal terms?</li><li>If yes, do you have authority to approve the adjustment?</li></ol><p>SLA expires in ~4 hours.</p><p>Thanks,<br>James Foster<br>Billing Manager</p>"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/cookbook/prime-rl-training/prime-envs/simlab_tasks/pyproject.toml b/cookbook/prime-rl-training/prime-envs/simlab_tasks/pyproject.toml
new file mode 100644
index 0000000..0928890
--- /dev/null
+++ b/cookbook/prime-rl-training/prime-envs/simlab_tasks/pyproject.toml
@@ -0,0 +1,21 @@
+[project]
+name = "simlab-tasks"
+description = "SimLab customer support task environment for prime-rl training. Uses pre-collected trajectories from SimLab rollouts as prompts with quality-based reward scoring."
+tags = ["simlab", "tool-use", "customer-support", "train", "eval"]
+version = "0.1.0"
+requires-python = ">=3.10"
+dependencies = [
+    "verifiers>=0.1.11",
+    "datasets",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build]
+include = ["simlab_tasks.py", "pyproject.toml"]
+
+[tool.verifiers.eval]
+num_examples = 5
+rollouts_per_example = 3
diff --git a/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py b/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py
new file mode 100644
index 0000000..028f039
--- /dev/null
+++ b/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py
@@ -0,0 +1,176 @@
+"""SimLab customer support task environment for prime-rl training.
+
+This environment provides customer support task prompts collected from
+SimLab rollouts. The model generates responses and is scored on:
+- Response quality and structure (length, formatting, domain keywords, tone)
+- Completeness (whether the action verbs in the task appear in the response)
+
+Tasks include ticket triage, billing disputes, escalations, and
+multi-turn customer conversations from the Weaver Enterprises scenario.
+"""
+
+from datasets import Dataset, load_dataset
+
+import verifiers as vf
+
+
+# --- Embedded dataset of SimLab customer support prompts ---
+# The first is verbatim from examples/task-bundle/; the others are simplified variants of those scenarios.
+# For larger datasets, replace with a HuggingFace dataset ID.
+SIMLAB_TASKS = [
+    {
+        "question": (
+            "You've received a billing dispute from Karen Mitchell regarding her "
+            "enterprise renewal invoice. She reports a 40% increase with unexpected "
+            "charges and is threatening to cancel by end of week. Contact Diana Walsh "
+            "to obtain the specific billing details and charge breakdown from the "
+            "invoice. Then contact Carlos Mendez to confirm what was discussed during "
+            "the renewal process and any contract amendments. Once you have the facts "
+            "from both, review the ticket details and determine whether this is a "
+            "genuine billing error or a legitimate contract amendment issue. Provide "
+            "Karen with a clear explanation of the charges and your recommended "
+            "resolution path."
+        ),
+        "answer": "",
+        "info": {},
+        "task": "simlab-billing-dispute",
+    },
+    {
+        "question": (
+            "David Park from TechStart Inc has reported persistent API rate limiting "
+            "issues affecting their production environment. His enterprise SLA "
+            "guarantees 99.9% uptime and the current issues are putting them at risk "
+            "of breaching that threshold. Investigate the technical details of the "
+            "rate limiting, coordinate with engineering to identify root cause, and "
+            "provide David with a resolution timeline. Ensure the response meets the "
+            "enterprise SLA first-response requirements."
+        ),
+        "answer": "",
+        "info": {},
+        "task": "simlab-api-escalation",
+    },
+    {
+        "question": (
+            "Wilson Retail Group has filed an SLA-critical billing dispute claiming "
+            "they were double-charged for their Q4 platform usage. The account is "
+            "flagged as at-risk for churn. Review the billing records, cross-reference "
+            "with the CRM account history, and determine whether the duplicate charge "
+            "is valid. If confirmed, initiate the refund process and coordinate with "
+            "the account manager to schedule a retention call. Document all findings "
+            "in the support ticket."
+        ),
+        "answer": "",
+        "info": {},
+        "task": "simlab-billing-sla",
+    },
+]
+
+
+def _quality_reward(completion: str, **kwargs) -> float:
+    """Reward for well-structured, substantive responses."""
+    if not completion or not completion.strip():
+        return 0.0
+
+    text = completion.strip()
+    score = 0.0
+
+    # Length-based scoring
+    if len(text) > 500:
+        score += 0.3
+    elif len(text) > 200:
+        score += 0.2
+    elif len(text) > 50:
+        score += 0.1
+
+    # Structure scoring
+    if any(marker in text for marker in ["##", "**", "- ", "1.", "* "]):
+        score += 0.2
+
+    # Task-specific keyword scoring (customer support domain)
+    cs_keywords = [
+        "ticket", "customer", "escalat", "resolv", "billing",
+        "sla", "priority", "update", "follow-up", "investigation",
+    ]
+    keyword_hits = sum(1 for kw in cs_keywords if kw.lower() in text.lower())
+    score += min(0.3, keyword_hits * 0.05)
+
+    # Professional tone indicators
+    if any(phrase in text.lower() for phrase in [
+        "i understand", "thank you", "please", "we will",
+        "next steps", "resolution", "apolog",
+    ]):
+        score += 0.2
+
+    return min(score, 1.0)
+
+
+def _completeness_reward(completion: str, question: str, **kwargs) -> float:
+    """Reward for addressing all parts of the task instruction."""
+    if not completion or not question:
+        return 0.0
+
+    # Extract action items from the question
+    action_indicators = ["contact", "review", "determine", "provide", "ensure",
+                         "investigate", "coordinate", "document", "initiate"]
+    required_actions = [
+        word for word in action_indicators
+        if word.lower() in question.lower()
+    ]
+
+    if not required_actions:
+        return 0.5
+
+    addressed = sum(
+        1 for action in required_actions
+        if action.lower() in completion.lower()
+    )
+
+    return addressed / len(required_actions)
+
+
+def load_environment(
+    dataset_name: str | None = None,
+    dataset_split: str = "train",
+    system_prompt: str | None = None,
+    **kwargs,
+) -> vf.Environment:
+    """Build the single-turn SimLab customer support environment.
+
+    Args:
+        dataset_name: HuggingFace dataset ID. If None or empty, uses SIMLAB_TASKS.
+        dataset_split: Split passed to load_dataset; only used with dataset_name.
+        system_prompt: Override for the built-in support-agent system prompt.
+
+    Returns:
+        A verifiers SingleTurnEnv with equally weighted quality/completeness rewards.
+    """
+    if system_prompt is None:
+        system_prompt = (
+            "You are a customer support agent at Weaver Enterprises. "
+            "You have access to helpdesk tickets, email, and chat tools. "
+            "Handle customer issues professionally and thoroughly. "
+            "Think step by step about what information you need, who to "
+            "contact, and how to resolve the issue. Provide clear, "
+            "actionable responses."
+        )
+
+    # Named dataset when given, else the embedded tasks; extra **kwargs are not used here.
+    if dataset_name:
+        train_dataset = load_dataset(dataset_name, split=dataset_split)
+    else:
+        train_dataset = Dataset.from_list(SIMLAB_TASKS)
+
+    parser = vf.Parser()
+
+    rubric = vf.Rubric(parser=parser)
+    rubric.add_reward_func(_quality_reward, weight=0.5)
+    rubric.add_reward_func(_completeness_reward, weight=0.5)
+
+    env = vf.SingleTurnEnv(
+        dataset=train_dataset,
+        system_prompt=system_prompt,
+        parser=parser,
+        rubric=rubric,
+    )
+
+    return env
diff --git a/cookbook/prime-rl-training/prime-rl-training.md b/cookbook/prime-rl-training/prime-rl-training.md
new file mode 100644
index 0000000..2d28425
--- /dev/null
+++ b/cookbook/prime-rl-training/prime-rl-training.md
@@ -0,0 +1,237 @@
+# Prime-RL Training with SimLab Trajectories
+
+Train an agent model using Prime Intellect's prime-rl with trajectories collected from SimLab environments. This cookbook covers the full pipeline: collecting tool-use trajectories, converting to training data, building a verifiers environment, and launching RL training on Prime Intellect's hosted platform.
+
+## What is here
+
+- `prime_rl_training.trajectory_converter`
+  - converts SimLab `artifacts.json` files into prime-rl compatible SFT datasets (HuggingFace messages format)
+- `prime_rl_training.simlab_env`
+  - verifiers-compatible environment wrapper for SimLab task data
+- `prime_rl_training.collect`
+  - CLI to collect and convert trajectories (`python -m prime_rl_training.collect sft|rl`)
+- `prime-envs/simlab_tasks/`
+  - standalone verifiers environment package ready to push to Prime Intellect's Environments Hub
+- `configs/`
+  - prime-rl TOML configs for SFT warmup and hosted RL training
+- `run_pipeline.sh`
+  - end-to-end automation script
+
+## Prerequisites
+
+- **SimLab** installed:
+  ```bash
+  uv pip install "simulationlab[daytona]"
+  ```
+- **prime CLI** installed:
+  ```bash
+  pip install prime
+  ```
+- **API keys** exported:
+  ```bash
+  export SIMLAB_COLLINEAR_API_KEY="col_..."     # from platform.collinear.ai
+  export PRIME_API_KEY="pit_..."                 # from Prime Intellect platform
+  export OPENAI_API_KEY="sk-..."                 # for running the baseline agent
+  export DAYTONA_API_KEY="dtn_..."               # for remote sandbox execution (optional)
+  ```
+- **Verifier** configured (for scoring SimLab rollouts):
+  ```bash
+  export SIMLAB_VERIFIER_MODEL="gpt-5.2"
+  export SIMLAB_VERIFIER_PROVIDER="openai"
+  export SIMLAB_VERIFIER_API_KEY="$OPENAI_API_KEY"
+  ```
+
+## Install
+
+Run from `cookbook/prime-rl-training`:
+
+```bash
+uv sync
+```
+
+This project installs `simulationlab` from the local repo path `../..`.
+
+## Step 1: Create a SimLab environment
+
+Pick an environment template with tool-use tasks:
+
+```bash
+simlab templates list
+simlab env init prime-rl-env --template customer_service
+```
+
+> **Tip:** Use `erp`, `crm_sales`, `customer_service`, or `project_management` for different task patterns. Each template provides different tool servers.
+
+## Step 2: Generate tasks
+
+```bash
+simlab tasks-gen init --preset customer_support --output-dir ./taskgen
+# Optionally edit taskgen/config.toml (task count, difficulty, model)
+simlab tasks-gen run --config taskgen/config.toml
+```
+
+Or list template tasks:
+
+```bash
+simlab tasks list --env prime-rl-env
+```
+
+## Step 3: Collect trajectories with a baseline agent
+
+Run tasks with a capable model to generate training trajectories:
+
+```bash
+simlab tasks run \
+  --env prime-rl-env \
+  --task <task_id_1> <task_id_2> <task_id_3> \
+  --tasks-dir ./generated-tasks \
+  --agent-model gpt-5.2 \
+  --agent-api-key "$OPENAI_API_KEY"
+```
+
+For parallel rollouts with Daytona:
+
+```bash
+simlab tasks run \
+  --env prime-rl-env \
+  --task <task_id> \
+  --tasks-dir ./generated-tasks \
+  --daytona \
+  --rollout-count 5 \
+  --max-parallel 3 \
+  --agent-model gpt-5.2 \
+  --agent-api-key "$OPENAI_API_KEY"
+```
+
+Each rollout produces:
+```
+output/agent_run_<task>_<ts>/
+  artifacts.json          # Full trajectory (messages, tool calls, results)
+  verifier/
+    reward.json           # Structured reward (0.0-1.0)
+```
+
+> **Note:** Aim for 50-200 successful trajectories for meaningful SFT warmup.
+
+## Step 4: Convert trajectories to SFT dataset
+
+```bash
+python -m prime_rl_training.collect sft \
+  --output-dir ./output \
+  --save-path ./dataset \
+  --min-reward 0.5 \
+  --format jsonl
+```
+
+This produces `dataset/train.jsonl` in prime-rl's messages format — one row per trajectory with full tool-call history. Only trajectories with reward >= 0.5 are included.
+
+Inspect the dataset:
+```bash
+head -1 dataset/train.jsonl | python -m json.tool
+```
+
+**Optional — Push to HuggingFace Hub:**
+```bash
+python -m prime_rl_training.collect sft \
+  --output-dir ./output \
+  --push-to myorg/simlab-sft-data
+```
+
+## Step 5: SFT warmup (local, requires GPU)
+
+> **Note:** SFT training requires the open-source [prime-rl](https://github.com/PrimeIntellect-ai/prime-rl) and a GPU. Skip to Step 6 for hosted RL-only training.
+
+```bash
+git clone https://github.com/PrimeIntellect-ai/prime-rl.git
+cd prime-rl && uv sync --all-extras
+
+# Run SFT (edit configs/sft.toml to set your dataset path first)
+uv run sft @ path/to/cookbook/prime-rl-training/configs/sft.toml
+```
+
+The SFT config trains only on assistant messages (system/user/tool masked out), teaching the model tool-use patterns from successful SimLab trajectories.
+
+## Step 6: Build and push a verifiers environment
+
+The `prime-envs/simlab_tasks/` directory contains a ready-made verifiers environment with embedded SimLab customer support prompts and quality+completeness rubrics.
+
+Push to Prime Intellect's Environments Hub:
+
+```bash
+prime env push -p ./prime-envs/simlab_tasks
+```
+
+Verify:
+```bash
+prime env info <your-username>/simlab-tasks
+```
+
+> **Customizing:** Edit `prime-envs/simlab_tasks/simlab_tasks.py` to add your own task prompts, adjust rubric weights, or load from a HuggingFace dataset instead of the embedded examples.
+
+## Step 7: Run RL training on Prime Intellect
+
+```bash
+# Check available models
+prime rl models
+
+# Edit configs/rl.toml:
+#   - Set model (e.g., Qwen/Qwen3.5-9B)
+#   - Set env ID to your pushed environment (e.g., your-username/simlab-tasks)
+
+prime rl run configs/rl.toml
+```
+
+Monitor:
+```bash
+prime rl logs <run-id> -f        # stream logs
+prime rl metrics <run-id>        # training metrics
+prime rl rollouts <run-id>       # sample rollouts
+prime rl progress <run-id>       # step progress
+```
+
+## Step 8: Deploy and evaluate
+
+After training, deploy the LoRA adapter:
+
+```bash
+prime rl checkpoints <run-id>
+prime deployments create <adapter-id>
+```
+
+Evaluate the trained model back through SimLab:
+
+```bash
+simlab tasks run \
+  --env prime-rl-env \
+  --task <task_id> \
+  --tasks-dir ./generated-tasks \
+  --agent-model <deployed-model-id> \
+  --agent-provider openai-compatible \
+  --agent-api-key "$PRIME_API_KEY" \
+  --agent-base-url "https://api.pinference.ai/api/v1"
+```
+
+Compare reward scores against your Step 3 baseline.
+
+## Step 9: Tear down
+
+```bash
+simlab env down prime-rl-env
+# With Daytona:
+simlab env down prime-rl-env --daytona
+```
+
+## Next steps
+
+- **More data:** Increase rollout count and add more task templates for broader coverage.
+- **Multi-environment RL:** Add multiple `[[env]]` sections to `rl.toml` for diverse training signal.
+- **Curriculum learning:** Start with easier templates (e.g., `erp`) and progress to harder ones (e.g., `customer_service`).
+- **Custom rubrics:** Modify the verifiers environment rubric to reward specific agent behaviors.
+
+## Troubleshooting
+
+- **`No trajectories found`** — Check that SimLab rollouts completed and `artifacts.json` files exist in the output directory.
+- **`prime rl models` shows "At Capacity"** — Try a different model or wait for availability.
+- **SFT dataset is too small** — Lower `--min-reward` to include more trajectories, or run more rollouts.
+- **RL reward stays at 0** — The rubric may be too strict for the base model. Try starting from an SFT-warmed checkpoint, or adjust rubric weights in the verifiers environment.
+- **Port conflicts on `simlab env up`** — Another service is using the port. Edit `docker-compose.yml` port mappings or stop conflicting services.
diff --git a/cookbook/prime-rl-training/pyproject.toml b/cookbook/prime-rl-training/pyproject.toml
new file mode 100644
index 0000000..eb745cf
--- /dev/null
+++ b/cookbook/prime-rl-training/pyproject.toml
@@ -0,0 +1,34 @@
+[project]
+name = "prime-rl-training"
+version = "0.1.0"
+description = "SimLab cookbook: train agents with Prime Intellect's prime-rl using SimLab trajectories"
+readme = "prime-rl-training.md"
+requires-python = ">=3.13"
+dependencies = [
+    "simulationlab",
+    "datasets>=3.0",
+    "huggingface-hub>=0.25",
+    "prime",
+    "verifiers>=0.1.11",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/prime_rl_training"]
+
+[tool.uv.sources]
+simulationlab = { path = "../..", editable = true }
+
+[dependency-groups]
+dev = [
+    "pytest>=9.0.0",
+]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+
+[tool.ruff.lint.per-file-ignores]
+"tests/**/*.py" = ["INP001"]
diff --git a/cookbook/prime-rl-training/run_pipeline.sh b/cookbook/prime-rl-training/run_pipeline.sh
new file mode 100755
index 0000000..c34b63c
--- /dev/null
+++ b/cookbook/prime-rl-training/run_pipeline.sh
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+# Full pipeline: SimLab trajectory collection → prime-rl SFT → RL training
+#
+# Prerequisites:
+#   - SimLab installed: pip install "simulationlab[daytona]"
+#   - prime CLI installed: pip install prime
+#   - API keys exported (see prime-rl-training.md)
+#
+# Usage:
+#   chmod +x run_pipeline.sh
+#   ./run_pipeline.sh
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEMPLATE="${SIMLAB_TEMPLATE:-erp}"
+ENV_NAME="${SIMLAB_ENV_NAME:-prime-rl-env}"
+TASK_COUNT="${SIMLAB_TASK_COUNT:-10}"
+ROLLOUT_COUNT="${SIMLAB_ROLLOUT_COUNT:-3}"
+OUTPUT_DIR="${SCRIPT_DIR}/output"
+DATASET_DIR="${SCRIPT_DIR}/dataset"
+TASKS_DIR="${SCRIPT_DIR}/generated-tasks"
+AGENT_MODEL="${SIMLAB_AGENT_MODEL:-gpt-5.2}"
+
+echo "=== Step 1: Create SimLab environment ==="
+simlab env init "${ENV_NAME}" --template "${TEMPLATE}"
+
+echo ""
+echo "=== Step 2: Generate tasks ==="
+simlab tasks-gen init --preset "${TEMPLATE}" 2>/dev/null || true
+simlab tasks-gen run 2>/dev/null || true
+
+echo ""
+echo "=== Step 3: List available tasks ==="
+TASKS=$(simlab tasks list --env "${ENV_NAME}" --tasks-dir "${TASKS_DIR}" 2>/dev/null | head -n 20 || true)
+echo "${TASKS}"
+
+echo ""
+echo "=== Step 4: Run rollouts to collect trajectories ==="
+# Get task IDs (first column, skip header lines); `|| true` stops set -e/pipefail aborting before the check below
+TASK_IDS=$(simlab tasks list --env "${ENV_NAME}" --tasks-dir "${TASKS_DIR}" 2>/dev/null \
+  | grep -v "^[-=]" | grep -v "^Task" | awk '{print $1}' | head -n "${TASK_COUNT}" || true)
+
+if [ -z "${TASK_IDS}" ]; then
+    echo "ERROR: No tasks found. Check your environment and task bundle." >&2
+    exit 1
+fi
+
+for TASK_ID in ${TASK_IDS}; do
+    echo "  Running task: ${TASK_ID}"
+    simlab tasks run \
+        --env "${ENV_NAME}" \
+        --task "${TASK_ID}" \
+        --tasks-dir "${TASKS_DIR}" \
+        --agent-model "${AGENT_MODEL}" \
+        --agent-api-key "${OPENAI_API_KEY:-}" \
+        --rollout-count "${ROLLOUT_COUNT}" \
+        --max-parallel 2 \
+        --daytona \
+        2>/dev/null || echo "  Warning: task ${TASK_ID} failed, continuing..."
+done
+
+echo ""
+echo "=== Step 5: Convert trajectories to SFT dataset ==="
+python -m prime_rl_training.collect sft \
+    --output-dir "${OUTPUT_DIR}" \
+    --save-path "${DATASET_DIR}" \
+    --min-reward 0.5 \
+    --format jsonl
+
+echo ""
+echo "=== Step 6: Build verifiers environment ==="
+# Create the environment package for prime-rl
+ENV_PKG_DIR="${SCRIPT_DIR}/prime-envs/simlab_tasks"
+mkdir -p "${ENV_PKG_DIR}"
+
+# Copy the environment module
+cp "${SCRIPT_DIR}/src/prime_rl_training/simlab_env.py" "${ENV_PKG_DIR}/simlab_tasks.py"
+
+cat > "${ENV_PKG_DIR}/pyproject.toml" << 'TOML'
+[project]
+name = "simlab-tasks"
+description = "SimLab task environment for prime-rl training"
+tags = ["simlab", "tool-use", "multi-turn", "train", "eval"]
+version = "0.1.0"
+requires-python = ">=3.10"
+dependencies = [
+    "verifiers>=0.1.11",
+    "datasets",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build]
+include = ["simlab_tasks.py", "pyproject.toml"]
+
+[tool.verifiers.eval]
+num_examples = 5
+rollouts_per_example = 3
+TOML
+
+echo "Environment package created at ${ENV_PKG_DIR}"
+
+echo ""
+echo "=== Step 7: Push environment to Prime Intellect hub ==="
+echo "Run: prime env push -p ${ENV_PKG_DIR}"
+echo "(Skipping automatic push — run manually to review first)"
+
+echo ""
+echo "=== Step 8: Run RL training ==="
+echo "Run: prime rl run ${SCRIPT_DIR}/configs/rl.toml"
+echo "(Skipping automatic training — run manually to review config)"
+
+echo ""
+echo "=== Pipeline complete ==="
+echo "Dataset saved to: ${DATASET_DIR}"
+echo "Environment package: ${ENV_PKG_DIR}"
+echo ""
+echo "Next steps:"
+echo "  1. Review the dataset: head ${DATASET_DIR}/train.jsonl"
+echo "  2. Push environment: prime env push -p ${ENV_PKG_DIR}"
+echo "  3. Start RL training: prime rl run ${SCRIPT_DIR}/configs/rl.toml"
+echo "  4. Monitor: prime rl logs <run-id> -f"
diff --git a/cookbook/prime-rl-training/src/prime_rl_training/__init__.py b/cookbook/prime-rl-training/src/prime_rl_training/__init__.py
new file mode 100644
index 0000000..f82bad0
--- /dev/null
+++ b/cookbook/prime-rl-training/src/prime_rl_training/__init__.py
@@ -0,0 +1 @@
+"""Prime-RL training cookbook for SimLab."""
diff --git a/cookbook/prime-rl-training/src/prime_rl_training/collect.py b/cookbook/prime-rl-training/src/prime_rl_training/collect.py
new file mode 100644
index 0000000..8f74b61
--- /dev/null
+++ b/cookbook/prime-rl-training/src/prime_rl_training/collect.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+"""CLI script to collect SimLab trajectories and convert them for prime-rl.
+
+Usage:
+    # Collect from SimLab output directory and save as SFT dataset
+    python -m prime_rl_training.collect sft --output-dir ./output --save-path ./dataset
+
+    # Collect and push to HuggingFace Hub
+    python -m prime_rl_training.collect sft --output-dir ./output --push-to myorg/simlab-sft
+
+    # Build prompt dataset for RL training
+    python -m prime_rl_training.collect rl --output-dir ./output --save-path ./rl-dataset
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import sys
+from pathlib import Path
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+logger = logging.getLogger(__name__)
+
+
+def cmd_sft(args: argparse.Namespace) -> None:
+    """Collect trajectories and convert to SFT dataset."""
+    from prime_rl_training.trajectory_converter import (
+        collect_trajectories,
+        push_to_hub,
+        save_dataset,
+        trajectories_to_sft_dataset,
+    )
+
+    trajectories = collect_trajectories(
+        Path(args.output_dir),
+        min_reward=args.min_reward,
+        include_failed=args.include_failed,
+    )
+
+    if not trajectories:
+        logger.error("No trajectories found in %s", args.output_dir)
+        sys.exit(1)
+
+    logger.info("Collected %d trajectories", len(trajectories))
+
+    # Load tool definitions if provided
+    tool_definitions = None
+    if args.tools_file:
+        with open(args.tools_file) as f:
+            tool_definitions = json.load(f)
+
+    rows = trajectories_to_sft_dataset(
+        trajectories,
+        tool_definitions=tool_definitions,
+    )
+
+    logger.info("Converted to %d SFT dataset rows", len(rows))
+
+    if args.push_to:
+        url = push_to_hub(rows, args.push_to, private=not args.public)
+        logger.info("Pushed to %s", url)
+    else:
+        path = save_dataset(rows, Path(args.save_path), format=args.format)
+        logger.info("Saved to %s", path)
+
+
+def cmd_rl(args: argparse.Namespace) -> None:
+    """Collect trajectories and build RL prompt dataset."""
+    from prime_rl_training.trajectory_converter import collect_trajectories
+
+    trajectories = collect_trajectories(
+        Path(args.output_dir),
+        min_reward=args.min_reward,
+        include_failed=False,
+    )
+
+    if not trajectories:
+        logger.error("No trajectories found in %s", args.output_dir)
+        sys.exit(1)
+
+    logger.info("Collected %d trajectories", len(trajectories))
+
+    save_path = Path(args.save_path)
+    save_path.mkdir(parents=True, exist_ok=True)
+
+    # Build prompt dataset (question/answer/info/task format for verifiers)
+    rows = []
+    for i, traj in enumerate(trajectories):
+        prompt = ""
+        answer = ""
+        for msg in traj["messages"]:
+            if msg.get("role") == "user" and not prompt:
+                prompt = msg["content"]
+            if msg.get("role") == "assistant" and msg.get("content"):
+                answer = msg["content"]
+
+        if not prompt:
+            continue
+
+        rows.append({
+            "question": prompt,
+            "answer": answer,
+            "info": json.dumps({
+                "reward": traj.get("reward", 0.0),
+                "task_id": traj.get("task_id", f"simlab_{i}"),
+            }),
+            "task": "simlab-task",
+        })
+
+    file_path = save_path / "train.jsonl"
+    with open(file_path, "w") as f:
+        for row in rows:
+            f.write(json.dumps(row) + "\n")
+
+    logger.info("Saved %d RL prompt rows to %s", len(rows), file_path)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Collect SimLab trajectories for prime-rl training"
+    )
+    subparsers = parser.add_subparsers(dest="command", required=True)
+
+    # SFT subcommand
+    sft_parser = subparsers.add_parser("sft", help="Build SFT dataset from trajectories")
+    sft_parser.add_argument("--output-dir", required=True, help="SimLab output directory")
+    sft_parser.add_argument("--save-path", default="./dataset", help="Where to save the dataset")
+    sft_parser.add_argument("--push-to", help="HuggingFace repo ID to push to")
+    sft_parser.add_argument("--public", action="store_true", help="Make HF repo public")
+    sft_parser.add_argument("--format", choices=["jsonl", "parquet"], default="jsonl")
+    sft_parser.add_argument("--min-reward", type=float, default=0.5, help="Min reward threshold")
+    sft_parser.add_argument("--include-failed", action="store_true", help="Include failed trajectories")
+    sft_parser.add_argument("--tools-file", help="JSON file with tool definitions")
+
+    # RL subcommand
+    rl_parser = subparsers.add_parser("rl", help="Build RL prompt dataset")
+    rl_parser.add_argument("--output-dir", required=True, help="SimLab output directory")
+    rl_parser.add_argument("--save-path", default="./rl-dataset", help="Where to save")
+    rl_parser.add_argument("--min-reward", type=float, default=0.5, help="Min reward threshold")
+
+    args = parser.parse_args()
+
+    if args.command == "sft":
+        cmd_sft(args)
+    elif args.command == "rl":
+        cmd_rl(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cookbook/prime-rl-training/src/prime_rl_training/simlab_env.py b/cookbook/prime-rl-training/src/prime_rl_training/simlab_env.py
new file mode 100644
index 0000000..0e3cef6
--- /dev/null
+++ b/cookbook/prime-rl-training/src/prime_rl_training/simlab_env.py
@@ -0,0 +1,245 @@
+"""Verifiers environment that wraps SimLab tasks for prime-rl RL training.
+
+This environment bridges SimLab's collected task data with prime-rl's
+verifiers framework. As implemented in this module, it:
+
+1. Loads SimLab tasks as training prompts (task instructions)
+2. Builds a single-turn environment (no tool servers are attached)
+3. Scores completions with a word-overlap reward and a format reward
+
+The dataset can come from two sources:
+
+- **Collected trajectories** (default): Uses pre-collected trajectories from
+  SimLab rollouts in ``output_dir``. The first user message is the prompt and
+  the final assistant message is the reference answer for overlap scoring.
+
+- **Task bundle**: Loads task instructions from the JSON files in
+  ``tasks_dir``. These rows carry no reference answer.
+
+Online rollouts against live SimLab tool servers (Docker or Daytona) are
+not implemented here; this cookbook module only covers offline training.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import Any
+
+from datasets import Dataset
+
+import verifiers as vf
+
+logger = logging.getLogger(__name__)
+
+
+def _load_simlab_trajectories(
+    output_dir: str,
+    min_reward: float = 0.0,
+) -> list[dict[str, Any]]:
+    """Load SimLab output artifacts and build dataset rows.
+
+    Each row has:
+        - question: the task instruction (user prompt)
+        - answer: the successful final response (for reference scoring)
+        - info: metadata dict with full trajectory, reward, task_id
+        - task: "simlab-task"
+    """
+    from prime_rl_training.trajectory_converter import (
+        collect_trajectories,
+    )
+
+    trajectories = collect_trajectories(
+        Path(output_dir),
+        min_reward=min_reward,
+        include_failed=False,
+    )
+
+    rows: list[dict[str, Any]] = []
+    for traj in trajectories:
+        messages = traj["messages"]
+        # Extract the user instruction (first user message)
+        instruction = ""
+        for msg in messages:
+            if msg.get("role") == "user":
+                instruction = msg.get("content", "")
+                break
+
+        # Extract the final assistant response
+        final_response = ""
+        for msg in reversed(messages):
+            if msg.get("role") == "assistant" and msg.get("content"):
+                final_response = msg["content"]
+                break
+
+        if not instruction:
+            continue
+
+        rows.append({
+            "question": instruction,
+            "answer": final_response,
+            "info": {
+                "full_messages": messages,
+                "reward": traj["reward"],
+                "task_id": traj["task_id"],
+                "source_path": traj["source_path"],
+            },
+            "task": "simlab-task",
+        })
+
+    return rows
+
+
+def _load_simlab_task_bundle(
+    tasks_dir: str,
+) -> list[dict[str, Any]]:
+    """Load tasks from a SimLab task bundle directory.
+
+    Task bundles contain JSON files with task definitions including
+    instructions, seed data specs, and verifier references.
+    """
+    tasks_path = Path(tasks_dir)
+    rows: list[dict[str, Any]] = []
+
+    for task_file in sorted(tasks_path.glob("*.json")):
+        try:
+            with open(task_file) as f:
+                task_data = json.load(f)
+        except (json.JSONDecodeError, OSError):
+            continue
+
+        # Handle both single task and list of tasks
+        task_list = task_data if isinstance(task_data, list) else [task_data]
+        for task in task_list:
+            instruction = task.get("instruction", task.get("prompt", ""))
+            if not instruction:
+                continue
+            rows.append({
+                "question": instruction,
+                "answer": "",  # no reference answer for RL
+                "info": {
+                    "task_id": task.get("id", task.get("task_id", "")),
+                    "task_data": task,
+                },
+                "task": "simlab-task",
+            })
+
+    return rows
+
+
+def _trajectory_similarity_reward(
+    completion: str,
+    answer: str,
+    **kwargs,
+) -> float:
+    """Score a completion by similarity to the reference trajectory.
+
+    For offline training, we compare the model's output against the
+    successful trajectory's final response using a simple overlap metric.
+    This provides a dense reward signal for RL training.
+
+    Returns a score in [0, 1].
+    """
+    if not answer:
+        return 0.1 if completion.strip() else 0.0
+
+    # Normalized token overlap (bag-of-words Jaccard similarity)
+    comp_tokens = set(completion.lower().split())
+    ref_tokens = set(answer.lower().split())
+
+    if not ref_tokens:
+        return 0.1 if comp_tokens else 0.0
+
+    intersection = comp_tokens & ref_tokens
+    union = comp_tokens | ref_tokens
+    jaccard = len(intersection) / len(union) if union else 0.0
+
+    return jaccard
+
+
+def _format_reward(completion: str, **kwargs) -> float:
+    """Reward for well-structured responses."""
+    if not completion or not completion.strip():
+        return 0.0
+
+    text = completion.strip()
+
+    if len(text) < 20:
+        return 0.2
+
+    score = 0.5
+    if any(marker in text for marker in ["##", "- ", "1.", "* "]):
+        score += 0.2
+    if len(text) > 100:
+        score += 0.2
+    if text.endswith((".", "!", "?", "```")):
+        score += 0.1
+
+    return min(score, 1.0)
+
+
+def load_environment(
+    output_dir: str = "./output",
+    tasks_dir: str | None = None,
+    min_reward: float = 0.5,
+    system_prompt: str | None = None,
+    **kwargs,
+) -> vf.Environment:
+    """Load a SimLab verifiers environment for prime-rl training.
+
+    This creates a SingleTurnEnv that uses SimLab trajectory data as
+    the training dataset. The rubric scores model outputs based on
+    similarity to successful trajectories and response quality.
+
+    Args:
+        output_dir: Path to SimLab output directory containing artifacts.
+        tasks_dir: Optional path to a SimLab task bundle directory.
+                   If provided, tasks are loaded from the bundle instead
+                   of from collected trajectories.
+        min_reward: Minimum reward threshold for including trajectories
+                    from the output directory.
+        system_prompt: System prompt for the agent. Defaults to a
+                       general-purpose tool-using agent prompt.
+
+    Returns:
+        A verifiers Environment ready for prime-rl training.
+    """
+    if system_prompt is None:
+        system_prompt = (
+            "You are a capable assistant that can use tools to complete tasks. "
+            "Think step by step about what information you need and which tools "
+            "to use. Be thorough and precise in your responses."
+        )
+
+    # Load dataset
+    if tasks_dir:
+        rows = _load_simlab_task_bundle(tasks_dir)
+    else:
+        rows = _load_simlab_trajectories(output_dir, min_reward=min_reward)
+
+    if not rows:
+        raise ValueError(
+            f"No training data found. Check output_dir={output_dir!r} "
+            f"or tasks_dir={tasks_dir!r}"
+        )
+
+    logger.info("Loaded %d training examples from SimLab data", len(rows))
+
+    dataset = Dataset.from_list(rows)
+
+    # Build rubric
+    rubric = vf.Rubric()
+    rubric.add_reward_func(_trajectory_similarity_reward, weight=0.5)
+    rubric.add_reward_func(_format_reward, weight=0.3)
+
+    parser = vf.Parser()
+
+    env = vf.SingleTurnEnv(
+        dataset=dataset,
+        system_prompt=system_prompt,
+        parser=parser,
+        rubric=rubric,
+    )
+
+    return env
diff --git a/cookbook/prime-rl-training/src/prime_rl_training/trajectory_converter.py b/cookbook/prime-rl-training/src/prime_rl_training/trajectory_converter.py
new file mode 100644
index 0000000..6e0c5bc
--- /dev/null
+++ b/cookbook/prime-rl-training/src/prime_rl_training/trajectory_converter.py
@@ -0,0 +1,340 @@
+"""Convert SimLab rollout artifacts into prime-rl compatible datasets.
+
+SimLab artifacts.json contains the full agent trajectory: system prompt,
+user instructions, tool calls, tool results, and final observations.
+This module converts those into the HuggingFace messages format that
+prime-rl expects for SFT training, and optionally into prompt/completion
+pairs for simpler setups.
+
+Prime-RL SFT format (messages):
+    Each row has a "messages" column containing a list of dicts:
+        [{"role": "system", "content": "..."}, {"role": "user", ...}, ...]
+    Only assistant turns contribute to loss by default.
+
+Prime-RL SFT format (prompt/completion):
+    Each row has "prompt" (user instruction) and "completion" (assistant response).
+
+For tool-calling trajectories, tool calls and tool results are interleaved
+as assistant/tool message pairs, matching the OpenAI chat format that
+prime-rl tokenizes via chat templates.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+def load_artifacts(artifacts_path: Path) -> dict[str, Any]:
+    """Load a SimLab artifacts.json file, decoding it as UTF-8 regardless of locale."""
+    with open(artifacts_path, encoding="utf-8") as f:
+        return json.load(f)
+
+
+def load_reward(verifier_dir: Path) -> float:
+    """Read the scalar reward written under a SimLab verifier output directory.
+
+    reward.json wins if present ("reward" key, else "score", else 0.0); otherwise
+    reward.txt is parsed as a float. Returns 0.0 when neither file exists.
+    """
+    reward_json = verifier_dir / "reward.json"
+    reward_txt = verifier_dir / "reward.txt"
+
+    if reward_json.exists():
+        with open(reward_json) as f:
+            data = json.load(f)
+        # reward.json may have "reward" or "score" key
+        return float(data.get("reward", data.get("score", 0.0)))
+    elif reward_txt.exists():
+        return float(reward_txt.read_text().strip())
+    return 0.0
+
+
+def artifacts_to_messages(artifacts: dict[str, Any]) -> list[dict[str, Any]]:
+    """Convert a SimLab artifacts dict into a list of chat messages.
+
+    Two code paths, chosen by whether artifacts["messages"] is a list:
+    1. "messages" list - normalized message by message; other artifact keys ignored
+    2. otherwise built from "instruction", "tool_calls" / "tool_results"
+       (paired by index) and "final_observation" (falling back to "final_output")
+
+    Returns a list of {"role": ..., "content": ...} dicts.
+    """
+    messages: list[dict[str, str]] = []
+
+    # If artifacts already contain a messages list, use it directly
+    if "messages" in artifacts and isinstance(artifacts["messages"], list):
+        for msg in artifacts["messages"]:
+            role = msg.get("role", "user")
+            content = msg.get("content", "")
+            # Normalize tool_calls in assistant messages
+            if role == "assistant" and "tool_calls" in msg:
+                # Include both text content and tool call info
+                tool_calls = msg["tool_calls"]
+                messages.append({"role": "assistant", "content": content, "tool_calls": tool_calls})
+                # Add corresponding tool results if present
+                for tc in tool_calls:
+                    call_id = tc.get("id", "")
+                    # Look for a following tool message with matching id
+                    # (handled below if tool messages follow in the list)
+            elif role == "tool":
+                messages.append({
+                    "role": "tool",
+                    "content": content,
+                    "tool_call_id": msg.get("tool_call_id", ""),
+                })
+            else:
+                messages.append({"role": role, "content": content})
+        return messages
+
+    # Build from structured fields
+    instruction = artifacts.get("instruction", "")
+    if instruction:
+        messages.append({"role": "user", "content": instruction})
+
+    # Pair tool_calls[i] with tool_results[i]; list order is assumed chronological
+    tool_calls = artifacts.get("tool_calls", [])
+    tool_results = artifacts.get("tool_results", [])
+
+    for i, tc in enumerate(tool_calls):
+        # Assistant makes a tool call
+        tool_name = tc.get("name", tc.get("tool_name", "unknown"))
+        tool_args = tc.get("arguments", tc.get("args", tc.get("input", {})))
+        if isinstance(tool_args, str):
+            try:
+                tool_args = json.loads(tool_args)
+            except json.JSONDecodeError:
+                pass  # not JSON: keep the raw string, emitted via str() below
+
+        call_id = tc.get("id", f"call_{i}")
+        messages.append({
+            "role": "assistant",
+            "content": "",
+            "tool_calls": [{
+                "id": call_id,
+                "type": "function",
+                "function": {
+                    "name": tool_name,
+                    "arguments": json.dumps(tool_args) if isinstance(tool_args, dict) else str(tool_args),
+                },
+            }],
+        })
+
+        # Corresponding tool result (none emitted if tool_results is shorter)
+        if i < len(tool_results):
+            result = tool_results[i]
+            result_content = result.get("content", result.get("output", result.get("result", "")))
+            if not isinstance(result_content, str):
+                result_content = json.dumps(result_content)
+            messages.append({
+                "role": "tool",
+                "content": result_content,
+                "tool_call_id": call_id,
+            })
+
+    # Final assistant response
+    final = artifacts.get("final_observation", artifacts.get("final_output", ""))
+    if final:
+        messages.append({"role": "assistant", "content": final})
+
+    return messages
+
+
+def collect_trajectories(
+    output_dir: Path,
+    *,
+    min_reward: float = 0.0,
+    include_failed: bool = False,
+) -> list[dict[str, Any]]:
+    """Scan a SimLab output directory tree and collect trajectory data.
+
+    Looks for the standard SimLab output structure:
+        output_dir/
+            agent_run_<task>_<ts>/
+                artifacts.json
+                verifier/
+                    reward.json | reward.txt
+            parallel_run_<task>_<ts>/
+                rollout_0/
+                    artifacts.json
+                    verifier/reward.json
+                rollout_1/
+                    ...
+
+    Args:
+        output_dir: Root output directory to scan.
+        min_reward: Minimum reward threshold. Trajectories below this are skipped
+                    unless include_failed is True.
+        include_failed: If True, include all trajectories regardless of reward.
+
+    Returns:
+        List of dicts with keys: messages, reward, task_id, source_path. Runs whose
+        reward or artifacts file cannot be read or parsed are logged and skipped.
+    """
+    output_dir = Path(output_dir)
+    trajectories: list[dict[str, Any]] = []
+    for artifacts_path in sorted(output_dir.rglob("artifacts.json")):
+        try:
+            # A malformed reward file must only cost this run, not abort the whole scan.
+            reward = load_reward(artifacts_path.parent / "verifier")
+        except (ValueError, TypeError, AttributeError, OSError) as exc:
+            logger.warning("Failed to load reward for %s: %s", artifacts_path, exc)
+            continue
+
+        if not include_failed and reward < min_reward:
+            logger.debug("Skipping %s (reward=%.2f < %.2f)", artifacts_path, reward, min_reward)
+            continue
+
+        try:
+            artifacts = load_artifacts(artifacts_path)
+        except (json.JSONDecodeError, OSError) as exc:
+            logger.warning("Failed to load %s: %s", artifacts_path, exc)
+            continue
+
+        messages = artifacts_to_messages(artifacts)
+        if not messages:
+            logger.warning("No messages extracted from %s", artifacts_path)
+            continue
+
+        # <prefix>_run_<task_id>_<timestamp>: own dir for agent runs, parent of rollout_N/ for parallel runs
+        task_id = ""
+        for name in (artifacts_path.parent.name, artifacts_path.parent.parent.name):
+            if name.startswith(("agent_run_", "parallel_run_")):
+                parts = name.split("_")
+                if len(parts) >= 3:
+                    task_id = "_".join(parts[2:-1])  # everything between prefix and timestamp
+                    break
+
+        trajectories.append({
+            "messages": messages,
+            "reward": reward,
+            "task_id": task_id,
+            "source_path": str(artifacts_path),
+        })
+
+    logger.info("Collected %d trajectories from %s", len(trajectories), output_dir)
+    return trajectories
+
+
+def trajectories_to_sft_dataset(
+    trajectories: list[dict[str, Any]],
+    *,
+    tool_definitions: list[dict[str, Any]] | None = None,
+) -> list[dict[str, Any]]:
+    """Build prime-rl SFT rows (messages format) from collected trajectories.
+
+    Row layout:
+        - "messages": the trajectory's chat messages, passed through unchanged
+        - "tools": JSON-encoded tool_definitions; key omitted when none are given
+
+    Args:
+        trajectories: Dicts carrying a "messages" entry, as produced by
+                      collect_trajectories(); all other keys are dropped.
+        tool_definitions: Optional tool schemas (OpenAI function format),
+                          serialized into every row.
+
+    Returns:
+        One new row dict per trajectory, in input order.
+    """
+    def _row(messages: Any) -> dict[str, Any]:
+        sft_row: dict[str, Any] = {"messages": messages}
+        # None and an empty list both mean: no "tools" column.
+        if tool_definitions:
+            sft_row["tools"] = json.dumps(tool_definitions)
+        return sft_row
+
+    return [_row(traj["messages"]) for traj in trajectories]
+
+
+def trajectories_to_prompt_completion(
+    trajectories: list[dict[str, Any]],
+) -> list[dict[str, str]]:
+    """Reduce each trajectory to a single prompt/completion pair.
+
+    prompt is the first non-empty user content and completion the last
+    non-empty assistant content; trajectories missing either are dropped.
+    """
+    pairs: list[dict[str, str]] = []
+    for traj in trajectories:
+        first_user = last_assistant = ""
+        for turn in traj["messages"]:
+            role = turn["role"]
+            if role == "user":
+                if not first_user:
+                    first_user = turn["content"]
+            elif role == "assistant":
+                # Assistant turns with empty content leave the previous value in place.
+                last_assistant = turn.get("content") or last_assistant
+        if not (first_user and last_assistant):
+            continue
+        pairs.append({"prompt": first_user, "completion": last_assistant})
+    return pairs
+
+
+def save_dataset(
+    rows: list[dict[str, Any]],
+    output_path: Path,
+    *,
+    format: str = "jsonl",
+) -> Path:
+    """Write dataset rows into output_path as train.jsonl or train.parquet.
+
+    Args:
+        rows: Dataset rows (from trajectories_to_sft_dataset or
+              trajectories_to_prompt_completion).
+        output_path: Target directory; created (with parents) if missing.
+        format: "parquet" writes Parquet via the datasets library; any other
+                value, including the default "jsonl", writes JSON Lines.
+
+    Returns:
+        Path to the file that was written.
+    """
+    target_dir = Path(output_path)
+    target_dir.mkdir(parents=True, exist_ok=True)
+    # Only the exact string "parquet" selects Parquet; everything else is JSONL.
+    as_parquet = format == "parquet"
+    file_path = target_dir / ("train.parquet" if as_parquet else "train.jsonl")
+
+    if as_parquet:
+        # Imported only on this path; parquet output requires the datasets lib.
+        from datasets import Dataset
+
+        Dataset.from_list(rows).to_parquet(str(file_path))
+    else:
+        with open(file_path, "w") as f:
+            for row in rows:
+                f.write(json.dumps(row) + "\n")
+
+    logger.info("Saved %d rows to %s", len(rows), file_path)
+    return file_path
+
+
+def push_to_hub(
+    rows: list[dict[str, Any]],
+    repo_id: str,
+    *,
+    split: str = "train",
+    private: bool = True,
+) -> str:
+    """Upload rows as a HuggingFace Hub dataset that prime-rl can load.
+
+    Args:
+        rows: Dataset rows to upload.
+        repo_id: HuggingFace repo ID (e.g., "myorg/simlab-sft-data").
+        split: Name of the split the rows are pushed under.
+        private: Forwarded to Dataset.push_to_hub(); defaults to a private repo.
+
+    Returns:
+        The dataset page URL, https://huggingface.co/datasets/<repo_id>.
+    """
+    from datasets import Dataset
+
+    hub_url = f"https://huggingface.co/datasets/{repo_id}"
+    # Build the Dataset from the row dicts, then hand it to the Hub upload call.
+    Dataset.from_list(rows).push_to_hub(repo_id, split=split, private=private)
+    logger.info("Pushed %d rows to %s", len(rows), hub_url)
+    return hub_url

From e9251794950ca18468ea74a7499c142bfdc070b8 Mon Sep 17 00:00:00 2001
From: AnandK27 <anand@collinear.ai>
Date: Wed, 1 Apr 2026 13:18:27 -0700
Subject: [PATCH 2/6] Fix mypy errors in prime-rl cookbook

- Import Environment, SingleTurnEnv, Parser, Rubric directly from
  verifiers submodules instead of via lazy vf.* attributes
- Change messages list type to dict[str, Any] to allow tool_calls values

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../prime-envs/simlab_tasks/simlab_tasks.py          | 12 ++++++++----
 .../src/prime_rl_training/simlab_env.py              | 12 ++++++++----
 .../src/prime_rl_training/trajectory_converter.py    |  2 +-
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py b/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py
index 028f039..6ef5fdf 100644
--- a/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py
+++ b/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py
@@ -12,6 +12,10 @@
 from datasets import Dataset, load_dataset
 
 import verifiers as vf
+from verifiers.envs.environment import Environment
+from verifiers.envs.singleturn_env import SingleTurnEnv
+from verifiers.parsers.parser import Parser
+from verifiers.rubrics.rubric import Rubric
 
 
 # --- Embedded dataset of SimLab customer support prompts ---
@@ -133,7 +137,7 @@ def load_environment(
     dataset_split: str = "train",
     system_prompt: str | None = None,
     **kwargs,
-) -> vf.Environment:
+) -> Environment:
     """Load the SimLab customer support environment.
 
     Args:
@@ -160,13 +164,13 @@ def load_environment(
     else:
         train_dataset = Dataset.from_list(SIMLAB_TASKS)
 
-    parser = vf.Parser()
+    parser = Parser()
 
-    rubric = vf.Rubric(parser=parser)
+    rubric = Rubric(parser=parser)
     rubric.add_reward_func(_quality_reward, weight=0.5)
     rubric.add_reward_func(_completeness_reward, weight=0.5)
 
-    env = vf.SingleTurnEnv(
+    env = SingleTurnEnv(
         dataset=train_dataset,
         system_prompt=system_prompt,
         parser=parser,
diff --git a/cookbook/prime-rl-training/src/prime_rl_training/simlab_env.py b/cookbook/prime-rl-training/src/prime_rl_training/simlab_env.py
index 0e3cef6..6e7a7fe 100644
--- a/cookbook/prime-rl-training/src/prime_rl_training/simlab_env.py
+++ b/cookbook/prime-rl-training/src/prime_rl_training/simlab_env.py
@@ -30,6 +30,10 @@
 from datasets import Dataset
 
 import verifiers as vf
+from verifiers.envs.environment import Environment
+from verifiers.envs.singleturn_env import SingleTurnEnv
+from verifiers.parsers.parser import Parser
+from verifiers.rubrics.rubric import Rubric
 
 logger = logging.getLogger(__name__)
 
@@ -185,7 +189,7 @@ def load_environment(
     min_reward: float = 0.5,
     system_prompt: str | None = None,
     **kwargs,
-) -> vf.Environment:
+) -> Environment:
     """Load a SimLab verifiers environment for prime-rl training.
 
     This creates a SingleTurnEnv that uses SimLab trajectory data as
@@ -229,13 +233,13 @@ def load_environment(
     dataset = Dataset.from_list(rows)
 
     # Build rubric
-    rubric = vf.Rubric()
+    rubric = Rubric()
     rubric.add_reward_func(_trajectory_similarity_reward, weight=0.5)
     rubric.add_reward_func(_format_reward, weight=0.3)
 
-    parser = vf.Parser()
+    parser = Parser()
 
-    env = vf.SingleTurnEnv(
+    env = SingleTurnEnv(
         dataset=dataset,
         system_prompt=system_prompt,
         parser=parser,
diff --git a/cookbook/prime-rl-training/src/prime_rl_training/trajectory_converter.py b/cookbook/prime-rl-training/src/prime_rl_training/trajectory_converter.py
index 6e0c5bc..98042f0 100644
--- a/cookbook/prime-rl-training/src/prime_rl_training/trajectory_converter.py
+++ b/cookbook/prime-rl-training/src/prime_rl_training/trajectory_converter.py
@@ -64,7 +64,7 @@ def artifacts_to_messages(artifacts: dict[str, Any]) -> list[dict[str, str]]:
 
     Returns a list of {"role": ..., "content": ...} dicts.
     """
-    messages: list[dict[str, str]] = []
+    messages: list[dict[str, Any]] = []
 
     # If artifacts already contain a messages list, use it directly
     if "messages" in artifacts and isinstance(artifacts["messages"], list):

From e6aa4dca4af7b298c4c93f834f0b430756598903 Mon Sep 17 00:00:00 2001
From: AnandK27 <anand@collinear.ai>
Date: Wed, 1 Apr 2026 13:19:48 -0700
Subject: [PATCH 3/6] Remove unused verifiers import flagged by ruff F401

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py    | 1 -
 cookbook/prime-rl-training/src/prime_rl_training/simlab_env.py   | 1 -
 2 files changed, 2 deletions(-)

diff --git a/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py b/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py
index 6ef5fdf..bc653ad 100644
--- a/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py
+++ b/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py
@@ -11,7 +11,6 @@
 
 from datasets import Dataset, load_dataset
 
-import verifiers as vf
 from verifiers.envs.environment import Environment
 from verifiers.envs.singleturn_env import SingleTurnEnv
 from verifiers.parsers.parser import Parser
diff --git a/cookbook/prime-rl-training/src/prime_rl_training/simlab_env.py b/cookbook/prime-rl-training/src/prime_rl_training/simlab_env.py
index 6e7a7fe..a310020 100644
--- a/cookbook/prime-rl-training/src/prime_rl_training/simlab_env.py
+++ b/cookbook/prime-rl-training/src/prime_rl_training/simlab_env.py
@@ -29,7 +29,6 @@
 
 from datasets import Dataset
 
-import verifiers as vf
 from verifiers.envs.environment import Environment
 from verifiers.envs.singleturn_env import SingleTurnEnv
 from verifiers.parsers.parser import Parser

From 0823469c79950c845141e816fb9868145cc826d7 Mon Sep 17 00:00:00 2001
From: AnandK27 <anand@collinear.ai>
Date: Wed, 1 Apr 2026 13:47:43 -0700
Subject: [PATCH 4/6] Update verifiers env with real SimLab task prompts, use
 Qwen3.5-4B

- Replace placeholder task prompts with actual generated SimLab customer
  support tasks (enterprise escalation, billing dispute, SLA breach)
- Bump env version to 0.2.0
- Switch default RL model to Qwen/Qwen3.5-4B

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 cookbook/prime-rl-training/configs/rl.toml    |  2 +-
 .../prime-envs/simlab_tasks/pyproject.toml    |  2 +-
 .../prime-envs/simlab_tasks/simlab_tasks.py   | 55 ++++++++++---------
 3 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/cookbook/prime-rl-training/configs/rl.toml b/cookbook/prime-rl-training/configs/rl.toml
index 4e85892..b20b2e6 100644
--- a/cookbook/prime-rl-training/configs/rl.toml
+++ b/cookbook/prime-rl-training/configs/rl.toml
@@ -5,7 +5,7 @@
 # using a SimLab verifiers environment for reward scoring.
 
 # Use the SFT-warmed model or start from a base instruct model
-model = "Qwen/Qwen3.5-9B"
+model = "Qwen/Qwen3.5-4B"
 max_steps = 50
 
 # Training hyperparameters
diff --git a/cookbook/prime-rl-training/prime-envs/simlab_tasks/pyproject.toml b/cookbook/prime-rl-training/prime-envs/simlab_tasks/pyproject.toml
index 0928890..398a74f 100644
--- a/cookbook/prime-rl-training/prime-envs/simlab_tasks/pyproject.toml
+++ b/cookbook/prime-rl-training/prime-envs/simlab_tasks/pyproject.toml
@@ -2,7 +2,7 @@
 name = "simlab-tasks"
 description = "SimLab customer support task environment for prime-rl training. Uses pre-collected trajectories from SimLab rollouts as prompts with quality-based reward scoring."
 tags = ["simlab", "tool-use", "customer-support", "train", "eval"]
-version = "0.1.0"
+version = "0.2.0"
 requires-python = ">=3.10"
 dependencies = [
     "verifiers>=0.1.11",
diff --git a/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py b/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py
index bc653ad..540e353 100644
--- a/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py
+++ b/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py
@@ -23,48 +23,49 @@
 SIMLAB_TASKS = [
     {
         "question": (
-            "You've received a billing dispute from Karen Mitchell regarding her "
-            "enterprise renewal invoice. She reports a 40% increase with unexpected "
-            "charges and is threatening to cancel by end of week. Contact Diana Walsh "
-            "to obtain the specific billing details and charge breakdown from the "
-            "invoice. Then contact Carlos Mendez to confirm what was discussed during "
-            "the renewal process and any contract amendments. Once you have the facts "
-            "from both, review the ticket details and determine whether this is a "
-            "genuine billing error or a legitimate contract amendment issue. Provide "
-            "Karen with a clear explanation of the charges and your recommended "
-            "resolution path."
+            "You've received an urgent report from James Wilson at Wilson Retail Group "
+            "about intermittent sync failures in their platform integration during peak "
+            "sales, causing critical disruptions to order processing and inventory sync. "
+            "This is a VIP enterprise account with a 2-hour SLA. First, email Amanda "
+            "Reeves to confirm the current account status and any ongoing issues she's "
+            "aware of. Then create or locate the helpdesk ticket and escalate to Marcus "
+            "Chen via Chat with full business context, including details Amanda provides. "
+            "Keep James informed of progress throughout."
         ),
         "answer": "",
         "info": {},
-        "task": "simlab-billing-dispute",
+        "task": "simlab-enterprise-escalation",
     },
     {
         "question": (
-            "David Park from TechStart Inc has reported persistent API rate limiting "
-            "issues affecting their production environment. His enterprise SLA "
-            "guarantees 99.9% uptime and the current issues are putting them at risk "
-            "of breaching that threshold. Investigate the technical details of the "
-            "rate limiting, coordinate with engineering to identify root cause, and "
-            "provide David with a resolution timeline. Ensure the response meets the "
-            "enterprise SLA first-response requirements."
+            "Review Karen Mitchell's billing dispute for Invoice #INV-2024-5847. Contact "
+            "Diana Walsh to get the exact details of the billing error and confirm the "
+            "correct amount. Then contact James Foster to get his approval for the "
+            "corrected invoice amount and any customer compensation or credit he is "
+            "willing to authorize. Once you have both the error details from Diana and "
+            "the approved corrected amount and credits from James, send Karen an apology "
+            "email that includes the specific explanation of the error, the corrected "
+            "invoice, and details of any credit or compensation approved."
         ),
         "answer": "",
         "info": {},
-        "task": "simlab-api-escalation",
+        "task": "simlab-billing-dispute",
     },
     {
         "question": (
-            "Wilson Retail Group has filed an SLA-critical billing dispute claiming "
-            "they were double-charged for their Q4 platform usage. The account is "
-            "flagged as at-risk for churn. Review the billing records, cross-reference "
-            "with the CRM account history, and determine whether the duplicate charge "
-            "is valid. If confirmed, initiate the refund process and coordinate with "
-            "the account manager to schedule a retention call. Document all findings "
-            "in the support ticket."
+            "Karen Mitchell reported an account access issue 18 hours ago and hasn't "
+            "received a response — we're at risk of breaching the 24-hour SLA. She's "
+            "also posted frustration in the support-escalations channel. Contact Sarah "
+            "Johnson to get her recommendation on which support agent should handle this "
+            "based on current workload and expertise. Then reach out to Marcus Chen to "
+            "determine if the 'Invalid credentials' error requires backend investigation "
+            "or if standard account recovery will resolve it. Once you have their "
+            "guidance, assign the ticket to the appropriate agent and send Karen an "
+            "acknowledgment email that addresses her frustration and outlines next steps."
         ),
         "answer": "",
         "info": {},
-        "task": "simlab-billing-sla",
+        "task": "simlab-sla-breach",
     },
 ]
 

From 573a2bf94c3ab87d7f79ad6ed482b810fea216db Mon Sep 17 00:00:00 2001
From: AnandK27 <anand@collinear.ai>
Date: Wed, 1 Apr 2026 15:58:35 -0700
Subject: [PATCH 5/6] Fix zero-reward bug: extract text from verifiers Messages
 objects

The rubric functions received completion as a Messages list (not a
plain string). Thinking models put output in reasoning_content with
content=null, so the old string-based rubric always scored 0.

Added _extract_text() helper that handles both plain strings and
Messages lists, extracting content + reasoning_content from all
assistant messages. Tested: Step 0 reward went from 0.0 to 0.4813.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../prime-envs/simlab_tasks/pyproject.toml    |  2 +-
 .../prime-envs/simlab_tasks/simlab_tasks.py   | 39 ++++++++++++++++---
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/cookbook/prime-rl-training/prime-envs/simlab_tasks/pyproject.toml b/cookbook/prime-rl-training/prime-envs/simlab_tasks/pyproject.toml
index 398a74f..8127068 100644
--- a/cookbook/prime-rl-training/prime-envs/simlab_tasks/pyproject.toml
+++ b/cookbook/prime-rl-training/prime-envs/simlab_tasks/pyproject.toml
@@ -2,7 +2,7 @@
 name = "simlab-tasks"
 description = "SimLab customer support task environment for prime-rl training. Uses pre-collected trajectories from SimLab rollouts as prompts with quality-based reward scoring."
 tags = ["simlab", "tool-use", "customer-support", "train", "eval"]
-version = "0.2.0"
+version = "0.3.0"
 requires-python = ">=3.10"
 dependencies = [
     "verifiers>=0.1.11",
diff --git a/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py b/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py
index 540e353..b1b11cb 100644
--- a/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py
+++ b/cookbook/prime-rl-training/prime-envs/simlab_tasks/simlab_tasks.py
@@ -70,12 +70,38 @@
 ]
 
 
-def _quality_reward(completion: str, **kwargs) -> float:
+def _extract_text(completion: object) -> str:
+    """Flatten a verifiers completion into one newline-joined string.
+
+    A str is returned untouched. For a list, each message (dict or object,
+    whatever its role) contributes its truthy "content" and then its truthy
+    "reasoning_content", so thinking-model output is scored too. Any other
+    input yields "".
+    """
+    if isinstance(completion, str):
+        return completion
+
+    if not isinstance(completion, list):
+        return ""
+
+    def _field(msg: object, key: str) -> object:
+        # Attribute access first (message objects), then dict lookup.
+        value = getattr(msg, key, None)
+        if not value and isinstance(msg, dict):
+            value = msg.get(key)
+        return value
+
+    pieces = (_field(msg, key) for msg in completion for key in ("content", "reasoning_content"))
+    return "\n".join(str(piece) for piece in pieces if piece)
+
+
+def _quality_reward(completion: object, **kwargs: object) -> float:
     """Reward for well-structured, substantive responses."""
-    if not completion or not completion.strip():
+    text = _extract_text(completion)
+    if not text or not text.strip():
         return 0.0
 
-    text = completion.strip()
+    text = text.strip()
     score = 0.0
 
     # Length-based scoring
@@ -108,9 +134,10 @@ def _quality_reward(completion: str, **kwargs) -> float:
     return min(score, 1.0)
 
 
-def _completeness_reward(completion: str, question: str, **kwargs) -> float:
+def _completeness_reward(completion: object, question: str, **kwargs: object) -> float:
     """Reward for addressing all parts of the task instruction."""
-    if not completion or not question:
+    text = _extract_text(completion)
+    if not text or not question:
         return 0.0
 
     # Extract action items from the question
@@ -126,7 +153,7 @@ def _completeness_reward(completion: str, question: str, **kwargs) -> float:
 
     addressed = sum(
         1 for action in required_actions
-        if action.lower() in completion.lower()
+        if action.lower() in text.lower()
     )
 
     return addressed / len(required_actions)

From 3dbfa51e735ba3556e2c1f182200f3b2f9e05d4d Mon Sep 17 00:00:00 2001
From: AnandK27 <anand@collinear.ai>
Date: Fri, 3 Apr 2026 10:39:46 -0700
Subject: [PATCH 6/6] Fix PR review issues in prime-rl cookbook

- P1: Stop run_pipeline.sh from overwriting committed env files
- P2: Add try/except ValueError for reward.txt parsing
- P2: Remove dead code in artifacts_to_messages tool_calls loop
- P2: Change default agent model from gpt-5.2 to gpt-4.1-mini

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 cookbook/prime-rl-training/SKILL.md           |  2 +-
 .../prime-rl-training/prime-rl-training.md    |  6 ++--
 cookbook/prime-rl-training/run_pipeline.sh    | 35 ++-----------------
 .../prime_rl_training/trajectory_converter.py | 10 +++---
 4 files changed, 11 insertions(+), 42 deletions(-)

diff --git a/cookbook/prime-rl-training/SKILL.md b/cookbook/prime-rl-training/SKILL.md
index 06013a1..f815a8f 100644
--- a/cookbook/prime-rl-training/SKILL.md
+++ b/cookbook/prime-rl-training/SKILL.md
@@ -65,7 +65,7 @@ simlab tasks run \
   --env prime-rl-env \
   --task <task_id> \
   --tasks-dir ./generated-tasks \
-  --agent-model gpt-5.2 \
+  --agent-model gpt-4.1-mini \
   --agent-api-key "$OPENAI_API_KEY"
 ```
 
diff --git a/cookbook/prime-rl-training/prime-rl-training.md b/cookbook/prime-rl-training/prime-rl-training.md
index 2d28425..1b76c41 100644
--- a/cookbook/prime-rl-training/prime-rl-training.md
+++ b/cookbook/prime-rl-training/prime-rl-training.md
@@ -36,7 +36,7 @@ Train an agent model using Prime Intellect's prime-rl with trajectories collecte
   ```
 - **Verifier** configured (for scoring SimLab rollouts):
   ```bash
-  export SIMLAB_VERIFIER_MODEL="gpt-5.2"
+  export SIMLAB_VERIFIER_MODEL="gpt-4.1-mini"
   export SIMLAB_VERIFIER_PROVIDER="openai"
   export SIMLAB_VERIFIER_API_KEY="$OPENAI_API_KEY"
   ```
@@ -85,7 +85,7 @@ simlab tasks run \
   --env prime-rl-env \
   --task <task_id_1> <task_id_2> <task_id_3> \
   --tasks-dir ./generated-tasks \
-  --agent-model gpt-5.2 \
+  --agent-model gpt-4.1-mini \
   --agent-api-key "$OPENAI_API_KEY"
 ```
 
@@ -99,7 +99,7 @@ simlab tasks run \
   --daytona \
   --rollout-count 5 \
   --max-parallel 3 \
-  --agent-model gpt-5.2 \
+  --agent-model gpt-4.1-mini \
   --agent-api-key "$OPENAI_API_KEY"
 ```
 
diff --git a/cookbook/prime-rl-training/run_pipeline.sh b/cookbook/prime-rl-training/run_pipeline.sh
index c34b63c..07487a7 100755
--- a/cookbook/prime-rl-training/run_pipeline.sh
+++ b/cookbook/prime-rl-training/run_pipeline.sh
@@ -20,7 +20,7 @@ ROLLOUT_COUNT="${SIMLAB_ROLLOUT_COUNT:-3}"
 OUTPUT_DIR="${SCRIPT_DIR}/output"
 DATASET_DIR="${SCRIPT_DIR}/dataset"
 TASKS_DIR="${SCRIPT_DIR}/generated-tasks"
-AGENT_MODEL="${SIMLAB_AGENT_MODEL:-gpt-5.2}"
+AGENT_MODEL="${SIMLAB_AGENT_MODEL:-gpt-4.1-mini}"
 
 echo "=== Step 1: Create SimLab environment ==="
 simlab env init "${ENV_NAME}" --template "${TEMPLATE}"
@@ -70,38 +70,9 @@ python -m prime_rl_training.collect sft \
 
 echo ""
 echo "=== Step 6: Build verifiers environment ==="
-# Create the environment package for prime-rl
+# Use the already-committed environment package
 ENV_PKG_DIR="${SCRIPT_DIR}/prime-envs/simlab_tasks"
-mkdir -p "${ENV_PKG_DIR}"
-
-# Copy the environment module
-cp "${SCRIPT_DIR}/src/prime_rl_training/simlab_env.py" "${ENV_PKG_DIR}/simlab_tasks.py"
-
-cat > "${ENV_PKG_DIR}/pyproject.toml" << 'TOML'
-[project]
-name = "simlab-tasks"
-description = "SimLab task environment for prime-rl training"
-tags = ["simlab", "tool-use", "multi-turn", "train", "eval"]
-version = "0.1.0"
-requires-python = ">=3.10"
-dependencies = [
-    "verifiers>=0.1.11",
-    "datasets",
-]
-
-[build-system]
-requires = ["hatchling"]
-build-backend = "hatchling.build"
-
-[tool.hatch.build]
-include = ["simlab_tasks.py", "pyproject.toml"]
-
-[tool.verifiers.eval]
-num_examples = 5
-rollouts_per_example = 3
-TOML
-
-echo "Environment package created at ${ENV_PKG_DIR}"
+echo "Environment package ready at ${ENV_PKG_DIR}"
 
 echo ""
 echo "=== Step 7: Push environment to Prime Intellect hub ==="
diff --git a/cookbook/prime-rl-training/src/prime_rl_training/trajectory_converter.py b/cookbook/prime-rl-training/src/prime_rl_training/trajectory_converter.py
index 98042f0..1694904 100644
--- a/cookbook/prime-rl-training/src/prime_rl_training/trajectory_converter.py
+++ b/cookbook/prime-rl-training/src/prime_rl_training/trajectory_converter.py
@@ -50,7 +50,10 @@ def load_reward(verifier_dir: Path) -> float:
         # reward.json may have "reward" or "score" key
         return float(data.get("reward", data.get("score", 0.0)))
     elif reward_txt.exists():
-        return float(reward_txt.read_text().strip())
+        try:
+            return float(reward_txt.read_text().strip())
+        except ValueError:
+            return 0.0
     return 0.0
 
 
@@ -76,11 +79,6 @@ def artifacts_to_messages(artifacts: dict[str, Any]) -> list[dict[str, str]]:
                 # Include both text content and tool call info
                 tool_calls = msg["tool_calls"]
                 messages.append({"role": "assistant", "content": content, "tool_calls": tool_calls})
-                # Add corresponding tool results if present
-                for tc in tool_calls:
-                    call_id = tc.get("id", "")
-                    # Look for a following tool message with matching id
-                    # (handled below if tool messages follow in the list)
             elif role == "tool":
                 messages.append({
                     "role": "tool",