Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -176,11 +176,11 @@ LitXBench
<td><a href="https://github.com/hasan-sayeed/KnowMat2">KnowMat2 (GPT-5.2 High)</a></td>
<td>0.52</td>
<td>0.43</td>
<td>0.43 &plusmn; 0.29</td>
<td>0.28</td>
<td>0.66</td>
<td>0.66</td>
<td>0.19</td>
<td>0.43 &plusmn; 0.06</td>
<td>0.27</td>
<td>0.67</td>
<td>0.65</td>
<td>0.17</td>
<td>&mdash;</td>
<td>19.40</td>
<td class="links-cell"><a href="https://github.com/curtischong/KnowMat2">code</a> <a href="https://github.com/curtischong/KnowMat2/tree/master/std_1_high/knowmat2">run</a> <a href="https://chemrxiv.org/engage/chemrxiv/article-details/6902772ea482cba122c41c14">paper</a> <a href="https://github.com/Radical-AI/litxbench/pull/4">pr</a></td>
Expand Down
2 changes: 1 addition & 1 deletion scripts/paper/combined_results/process_f1_summary.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
method process_f1
KnowMat2 0.6592 +/- 0.6302
KnowMat2 0.6734 +/- 0.1376
Claude Haiku 4.5 0.8426 +/- 0.0586
GPT 5 Mini Medium 0.8422 +/- 0.0698
Gemini 3 Flash 0.8560 +/- 0.0197
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
method overall_precision overall_recall overall_f1 meas_f1 process_f1 material_f1 config_f1 avg_attempts cost_usd
KnowMat2 0.5231 0.4286 0.4320 +/- 0.2916 0.2758 0.6592 0.6589 0.1876 1.00 0
KnowMat2 0.5245 0.4258 0.4288 +/- 0.0586 0.2666 0.6734 0.6511 0.1745 1.00 0
Claude Haiku 4.5 0.6417 0.6862 0.6510 +/- 0.0130 0.5089 0.8426 0.9380 0.3840 2.21 1.7177
GPT 5 Mini Medium 0.6706 0.7023 0.6764 +/- 0.0434 0.5222 0.8422 0.9422 0.4051 2.49 3.46597
Gemini 3 Flash 0.7404 0.7560 0.7376 +/- 0.0489 0.6092 0.8560 0.9692 0.5166 2.58 1.73078
Expand Down
46 changes: 30 additions & 16 deletions scripts/paper/pareto_front.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# %%
import math
import re
from dataclasses import dataclass

Expand All @@ -20,16 +20,16 @@ class ModelPoint:


# Models shown on the cost-vs-F1 plot, one ModelPoint per entry.
# Judging by the summary table earlier on this page, the second positional
# argument is the overall F1 and the third is the cost in USD (e.g. Gemini 3
# Flash: 0.7376 / 1.73078). The last two arguments look like a colour constant
# and a marker code — confirm against the ModelPoint definition.
# NOTE(review): every model is listed twice below with slightly different F1
# values. This looks like the removed and added rows of a diff rendered
# together; confirm that only one set exists in the real file.
models: tuple[ModelPoint, ...] = (
ModelPoint("Gemini CLI (Gemini-3.1 Pro Preview)", 0.7952, 6.45507, GREEN, "s"),
ModelPoint("Gemini 3.1 Pro", 0.7701, 4.17163, BLUE, "s"),
ModelPoint("Gemini 3 Flash", 0.7371, 1.73078, RED, "s"),
ModelPoint("Claude Code (Opus 4.6)", 0.7754, 26.1145, GREEN, "o"),
ModelPoint("Claude Opus 4.6", 0.7203, 5.37406, BLUE, "o"),
ModelPoint("Claude Haiku 4.5", 0.6488, 1.7177, RED, "o"),
ModelPoint("Codex (GPT 5.2 Codex High)", 0.7246, 4.17361, GREEN, "^"),
ModelPoint("GPT 5.2 High", 0.7244, 4.99104, BLUE, "^"),
ModelPoint("GPT 5 Mini Med.", 0.6731, 3.46597, RED, "^"),
ModelPoint("KnowMat2", 0.4320, 19.40, GREEN, "p"),
ModelPoint("Gemini CLI (Gemini-3.1 Pro Preview)", 0.7958, 6.45507, GREEN, "s"),
ModelPoint("Gemini 3.1 Pro", 0.7708, 4.17163, BLUE, "s"),
ModelPoint("Gemini 3 Flash", 0.7376, 1.73078, RED, "s"),
ModelPoint("Claude Code (Opus 4.6)", 0.7766, 26.1145, GREEN, "o"),
ModelPoint("Claude Opus 4.6", 0.7231, 5.37406, BLUE, "o"),
ModelPoint("Claude Haiku 4.5", 0.6510, 1.7177, RED, "o"),
ModelPoint("Codex (GPT 5.2 Codex High)", 0.7268, 4.17361, GREEN, "^"),
ModelPoint("GPT 5.2 High", 0.7273, 4.99104, BLUE, "^"),
ModelPoint("GPT 5 Mini Med.", 0.6764, 3.46597, RED, "^"),
ModelPoint("KnowMat2", 0.4288, 19.40, GREEN, "p"),
)


Expand All @@ -44,9 +44,24 @@ def pareto_front(points: tuple[ModelPoint, ...]) -> tuple[ModelPoint, ...]:
return tuple(pareto_points)


pareto = pareto_front(models)
pareto_cost = [point.cost for point in pareto]
pareto_f1 = [point.f1 for point in pareto]
def upper_convex_front(points: tuple[ModelPoint, ...]) -> tuple[ModelPoint, ...]:
    """Return the upper convex hull of the Pareto front in (log10(cost), F1) space.

    The points produced by ``pareto_front(points)`` are scanned in the order
    that function returns them. Whenever the segment into a new point is at
    least as steep as the segment before it, the middle point lies on or
    below the chord joining its neighbours and is discarded.

    The turn test uses a cross product instead of comparing two slopes. It
    gives the same decision whenever consecutive costs differ, and it never
    divides, so two front points with an identical cost no longer raise
    ``ZeroDivisionError``.

    NOTE(review): assumes ``pareto_front`` returns points ordered
    monotonically by cost — confirm against its implementation.

    Args:
        points: Candidate models. ``cost`` must be strictly positive because
            the x coordinate is ``log10(cost)``.

    Returns:
        The retained points as a tuple, in ``pareto_front`` order.

    Raises:
        ValueError: From ``math.log10`` if a compared point has a zero or
            negative ``cost``.
    """
    front: list[ModelPoint] = []

    def on_or_below_chord(first: ModelPoint, middle: ModelPoint, last: ModelPoint) -> bool:
        # Cost is compared on a log scale, matching the slope definition
        # this replaces.
        x_first = math.log10(first.cost)
        x_middle = math.log10(middle.cost)
        x_last = math.log10(last.cost)
        # Equivalent to slope(first, middle) <= slope(middle, last) when both
        # x steps have the same sign, but well defined when a step is zero.
        cross = (x_middle - x_first) * (last.f1 - middle.f1) - (middle.f1 - first.f1) * (x_last - x_middle)
        return cross >= 0

    for point in pareto_front(points):
        # Drop earlier vertices that the new point makes redundant.
        while len(front) >= 2 and on_or_below_chord(front[-2], front[-1], point):
            front.pop()
        front.append(point)
    return tuple(front)


# Compute the convex front once, then split it into parallel cost (x) and
# F1 (y) lists, one entry per retained point.
convex_front = upper_convex_front(models)
front_cost = [point.cost for point in convex_front]
front_f1 = [point.f1 for point in convex_front]

plt.rcParams.update(
{
Expand Down Expand Up @@ -79,7 +94,7 @@ def clean_text(text: str) -> str:
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_box_aspect(1)

ax.plot(pareto_cost, pareto_f1, color=LINE_C, linewidth=3.2, linestyle="--", zorder=2, label="Pareto front")
ax.plot(front_cost, front_f1, color=LINE_C, linewidth=3.2, linestyle="--", zorder=2, label="Convex Pareto front")

for point in models:
ax.scatter(
Expand Down Expand Up @@ -131,4 +146,3 @@ def clean_text(text: str) -> str:
plt.savefig("pareto_front.pdf", bbox_inches="tight") # vector output
plt.savefig("pareto_front.png", bbox_inches="tight", dpi=1200) # high-res raster
plt.show()
# %%
Loading