diff --git a/docs/index.rst b/docs/index.rst
index c59b425..554ce24 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -176,11 +176,11 @@ LitXBench
KnowMat2 (GPT-5.2 High) |
0.52 |
0.43 |
- 0.43 ± 0.29 |
- 0.28 |
- 0.66 |
- 0.66 |
- 0.19 |
+ 0.43 ± 0.06 |
+ 0.27 |
+ 0.67 |
+ 0.65 |
+ 0.17 |
— |
19.40 |
code run paper pr |
diff --git a/scripts/paper/combined_results/process_f1_summary.csv b/scripts/paper/combined_results/process_f1_summary.csv
index cf42329..c06aeb8 100644
--- a/scripts/paper/combined_results/process_f1_summary.csv
+++ b/scripts/paper/combined_results/process_f1_summary.csv
@@ -1,5 +1,5 @@
method process_f1
-KnowMat2 0.6592 +/- 0.6302
+KnowMat2 0.6734 +/- 0.1376
Claude Haiku 4.5 0.8426 +/- 0.0586
GPT 5 Mini Medium 0.8422 +/- 0.0698
Gemini 3 Flash 0.8560 +/- 0.0197
diff --git a/scripts/paper/combined_results/zero_shot_summary_with_ci.csv b/scripts/paper/combined_results/zero_shot_summary_with_ci.csv
index e36c866..fcfda77 100644
--- a/scripts/paper/combined_results/zero_shot_summary_with_ci.csv
+++ b/scripts/paper/combined_results/zero_shot_summary_with_ci.csv
@@ -1,5 +1,5 @@
method overall_precision overall_recall overall_f1 meas_f1 process_f1 material_f1 config_f1 avg_attempts cost_usd
-KnowMat2 0.5231 0.4286 0.4320 +/- 0.2916 0.2758 0.6592 0.6589 0.1876 1.00 0
+KnowMat2 0.5245 0.4258 0.4288 +/- 0.0586 0.2666 0.6734 0.6511 0.1745 1.00 0
Claude Haiku 4.5 0.6417 0.6862 0.6510 +/- 0.0130 0.5089 0.8426 0.9380 0.3840 2.21 1.7177
GPT 5 Mini Medium 0.6706 0.7023 0.6764 +/- 0.0434 0.5222 0.8422 0.9422 0.4051 2.49 3.46597
Gemini 3 Flash 0.7404 0.7560 0.7376 +/- 0.0489 0.6092 0.8560 0.9692 0.5166 2.58 1.73078
diff --git a/scripts/paper/pareto_front.py b/scripts/paper/pareto_front.py
index c2c9bf8..baf14ea 100644
--- a/scripts/paper/pareto_front.py
+++ b/scripts/paper/pareto_front.py
@@ -1,4 +1,4 @@
-# %%
+import math
import re
from dataclasses import dataclass
@@ -20,16 +20,16 @@ class ModelPoint:
models: tuple[ModelPoint, ...] = (
- ModelPoint("Gemini CLI (Gemini-3.1 Pro Preview)", 0.7952, 6.45507, GREEN, "s"),
- ModelPoint("Gemini 3.1 Pro", 0.7701, 4.17163, BLUE, "s"),
- ModelPoint("Gemini 3 Flash", 0.7371, 1.73078, RED, "s"),
- ModelPoint("Claude Code (Opus 4.6)", 0.7754, 26.1145, GREEN, "o"),
- ModelPoint("Claude Opus 4.6", 0.7203, 5.37406, BLUE, "o"),
- ModelPoint("Claude Haiku 4.5", 0.6488, 1.7177, RED, "o"),
- ModelPoint("Codex (GPT 5.2 Codex High)", 0.7246, 4.17361, GREEN, "^"),
- ModelPoint("GPT 5.2 High", 0.7244, 4.99104, BLUE, "^"),
- ModelPoint("GPT 5 Mini Med.", 0.6731, 3.46597, RED, "^"),
- ModelPoint("KnowMat2", 0.4320, 19.40, GREEN, "p"),
+ ModelPoint("Gemini CLI (Gemini-3.1 Pro Preview)", 0.7958, 6.45507, GREEN, "s"),
+ ModelPoint("Gemini 3.1 Pro", 0.7708, 4.17163, BLUE, "s"),
+ ModelPoint("Gemini 3 Flash", 0.7376, 1.73078, RED, "s"),
+ ModelPoint("Claude Code (Opus 4.6)", 0.7766, 26.1145, GREEN, "o"),
+ ModelPoint("Claude Opus 4.6", 0.7231, 5.37406, BLUE, "o"),
+ ModelPoint("Claude Haiku 4.5", 0.6510, 1.7177, RED, "o"),
+ ModelPoint("Codex (GPT 5.2 Codex High)", 0.7268, 4.17361, GREEN, "^"),
+ ModelPoint("GPT 5.2 High", 0.7273, 4.99104, BLUE, "^"),
+ ModelPoint("GPT 5 Mini Med.", 0.6764, 3.46597, RED, "^"),
+ ModelPoint("KnowMat2", 0.4288, 19.40, GREEN, "p"),
)
@@ -44,9 +44,24 @@ def pareto_front(points: tuple[ModelPoint, ...]) -> tuple[ModelPoint, ...]:
return tuple(pareto_points)
-pareto = pareto_front(models)
-pareto_cost = [point.cost for point in pareto]
-pareto_f1 = [point.f1 for point in pareto]
+def upper_convex_front(points: tuple[ModelPoint, ...]) -> tuple[ModelPoint, ...]:
+    """Return the upper convex envelope of the Pareto front in (log10 cost, F1) space.
+
+    Candidates come from ``pareto_front(points)`` and are scanned in the order
+    that function yields them. A kept point is discarded whenever the segment
+    leading into it is not steeper than the segment leading out of it, so the
+    slopes between consecutive kept points strictly decrease.
+
+    Args:
+        points: Model points to consider. ``cost`` is compared on a log10
+            scale and ``f1`` on a linear scale.
+
+    Returns:
+        The surviving points as a tuple, in the order ``pareto_front`` emitted
+        them.
+
+    Raises:
+        ValueError: From ``math.log10`` if a compared point has a non-positive
+            ``cost`` (only evaluated once three candidates are being compared).
+    """
+    # NOTE(review): assumes pareto_front() yields points ordered monotonically
+    # by cost -- its body is not visible here; confirm before relying on it.
+    front: list[ModelPoint] = []
+
+    def lies_under_chord(left: ModelPoint, mid: ModelPoint, right: ModelPoint) -> bool:
+        """Tell whether ``mid`` is on or below the chord from ``left`` to ``right``."""
+        x_left = math.log10(left.cost)
+        x_mid = math.log10(mid.cost)
+        x_right = math.log10(right.cost)
+        # Cross-multiplied form of slope(left, mid) <= slope(mid, right).
+        # It matches the quotient comparison whenever both runs are non-zero
+        # with the same sign, and it never divides, so two candidates with the
+        # same cost cannot raise ZeroDivisionError.
+        return (mid.f1 - left.f1) * (x_right - x_mid) <= (right.f1 - mid.f1) * (x_mid - x_left)
+
+    for point in pareto_front(points):
+        # Drop previously kept points that the new candidate makes redundant.
+        while len(front) >= 2 and lies_under_chord(front[-2], front[-1], point):
+            front.pop()
+        front.append(point)
+    return tuple(front)
+
+
+convex_front = upper_convex_front(models)
+front_cost = [point.cost for point in convex_front]
+front_f1 = [point.f1 for point in convex_front]
plt.rcParams.update(
{
@@ -79,7 +94,7 @@ def clean_text(text: str) -> str:
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_box_aspect(1)
-ax.plot(pareto_cost, pareto_f1, color=LINE_C, linewidth=3.2, linestyle="--", zorder=2, label="Pareto front")
+ax.plot(front_cost, front_f1, color=LINE_C, linewidth=3.2, linestyle="--", zorder=2, label="Convex Pareto front")
for point in models:
ax.scatter(
@@ -131,4 +146,3 @@ def clean_text(text: str) -> str:
plt.savefig("pareto_front.pdf", bbox_inches="tight") # vector output
plt.savefig("pareto_front.png", bbox_inches="tight", dpi=1200) # high-res raster
plt.show()
-# %%