From 237eac7897bbf8eaef8ee3889295faaef0fe07d9 Mon Sep 17 00:00:00 2001 From: Curtis Chong Date: Mon, 18 May 2026 11:33:09 -0400 Subject: [PATCH 1/3] knowmat2 results were generated from stale data. this regenerates the summary data --- docs/index.rst | 10 +++++----- .../combined_results/process_f1_summary.csv | 2 +- .../zero_shot_summary_with_ci.csv | 2 +- scripts/paper/pareto_front.py | 20 +++++++++---------- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index c59b425..554ce24 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -176,11 +176,11 @@ LitXBench KnowMat2 (GPT-5.2 High) 0.52 0.43 - 0.43 ± 0.29 - 0.28 - 0.66 - 0.66 - 0.19 + 0.43 ± 0.06 + 0.27 + 0.67 + 0.65 + 0.17 — 19.40 code run paper pr diff --git a/scripts/paper/combined_results/process_f1_summary.csv b/scripts/paper/combined_results/process_f1_summary.csv index cf42329..c06aeb8 100644 --- a/scripts/paper/combined_results/process_f1_summary.csv +++ b/scripts/paper/combined_results/process_f1_summary.csv @@ -1,5 +1,5 @@ method process_f1 -KnowMat2 0.6592 +/- 0.6302 +KnowMat2 0.6734 +/- 0.1376 Claude Haiku 4.5 0.8426 +/- 0.0586 GPT 5 Mini Medium 0.8422 +/- 0.0698 Gemini 3 Flash 0.8560 +/- 0.0197 diff --git a/scripts/paper/combined_results/zero_shot_summary_with_ci.csv b/scripts/paper/combined_results/zero_shot_summary_with_ci.csv index e36c866..fcfda77 100644 --- a/scripts/paper/combined_results/zero_shot_summary_with_ci.csv +++ b/scripts/paper/combined_results/zero_shot_summary_with_ci.csv @@ -1,5 +1,5 @@ method overall_precision overall_recall overall_f1 meas_f1 process_f1 material_f1 config_f1 avg_attempts cost_usd -KnowMat2 0.5231 0.4286 0.4320 +/- 0.2916 0.2758 0.6592 0.6589 0.1876 1.00 0 +KnowMat2 0.5245 0.4258 0.4288 +/- 0.0586 0.2666 0.6734 0.6511 0.1745 1.00 0 Claude Haiku 4.5 0.6417 0.6862 0.6510 +/- 0.0130 0.5089 0.8426 0.9380 0.3840 2.21 1.7177 GPT 5 Mini Medium 0.6706 0.7023 0.6764 +/- 0.0434 0.5222 0.8422 0.9422 0.4051 2.49 3.46597 Gemini 3 Flash 0.7404 0.7560 0.7376 +/- 0.0489 0.6092 0.8560 0.9692 0.5166 2.58 1.73078 diff --git a/scripts/paper/pareto_front.py b/scripts/paper/pareto_front.py index c2c9bf8..8205902 100644 --- a/scripts/paper/pareto_front.py +++ b/scripts/paper/pareto_front.py @@ -20,16 +20,16 @@ class ModelPoint: models: tuple[ModelPoint, ...] = ( - ModelPoint("Gemini CLI (Gemini-3.1 Pro Preview)", 0.7952, 6.45507, GREEN, "s"), - ModelPoint("Gemini 3.1 Pro", 0.7701, 4.17163, BLUE, "s"), - ModelPoint("Gemini 3 Flash", 0.7371, 1.73078, RED, "s"), - ModelPoint("Claude Code (Opus 4.6)", 0.7754, 26.1145, GREEN, "o"), - ModelPoint("Claude Opus 4.6", 0.7203, 5.37406, BLUE, "o"), - ModelPoint("Claude Haiku 4.5", 0.6488, 1.7177, RED, "o"), - ModelPoint("Codex (GPT 5.2 Codex High)", 0.7246, 4.17361, GREEN, "^"), - ModelPoint("GPT 5.2 High", 0.7244, 4.99104, BLUE, "^"), - ModelPoint("GPT 5 Mini Med.", 0.6731, 3.46597, RED, "^"), - ModelPoint("KnowMat2", 0.4320, 19.40, GREEN, "p"), + ModelPoint("Gemini CLI (Gemini-3.1 Pro Preview)", 0.7958, 6.45507, GREEN, "s"), + ModelPoint("Gemini 3.1 Pro", 0.7708, 4.17163, BLUE, "s"), + ModelPoint("Gemini 3 Flash", 0.7376, 1.73078, RED, "s"), + ModelPoint("Claude Code (Opus 4.6)", 0.7766, 26.1145, GREEN, "o"), + ModelPoint("Claude Opus 4.6", 0.7231, 5.37406, BLUE, "o"), + ModelPoint("Claude Haiku 4.5", 0.6510, 1.7177, RED, "o"), + ModelPoint("Codex (GPT 5.2 Codex High)", 0.7268, 4.17361, GREEN, "^"), + ModelPoint("GPT 5.2 High", 0.7273, 4.99104, BLUE, "^"), + ModelPoint("GPT 5 Mini Med.", 0.6764, 3.46597, RED, "^"), + ModelPoint("KnowMat2", 0.4288, 19.40, GREEN, "p"), ) From dc0c85644a6941e691146e04a58a109a556fbb01 Mon Sep 17 00:00:00 2001 From: Curtis Chong Date: Mon, 18 May 2026 12:23:29 -0400 Subject: [PATCH 2/3] fix pareto front function to draw the convex hull instead --- scripts/paper/pareto_front.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/scripts/paper/pareto_front.py b/scripts/paper/pareto_front.py index 8205902..21beaec 100644 --- a/scripts/paper/pareto_front.py +++ b/scripts/paper/pareto_front.py @@ -1,5 +1,5 @@ -# %% import re +import math from dataclasses import dataclass import matplotlib.pyplot as plt @@ -44,9 +44,24 @@ def pareto_front(points: tuple[ModelPoint, ...]) -> tuple[ModelPoint, ...]: return tuple(pareto_points) -pareto = pareto_front(models) -pareto_cost = [point.cost for point in pareto] -pareto_f1 = [point.f1 for point in pareto] +def upper_convex_front(points: tuple[ModelPoint, ...]) -> tuple[ModelPoint, ...]: + front: list[ModelPoint] = [] + + def slope(left: ModelPoint, right: ModelPoint) -> float: + x_left = math.log10(left.cost) + x_right = math.log10(right.cost) + return (right.f1 - left.f1) / (x_right - x_left) + + for point in pareto_front(points): + while len(front) >= 2 and slope(front[-2], front[-1]) <= slope(front[-1], point): + front.pop() + front.append(point) + return tuple(front) + + +convex_front = upper_convex_front(models) +front_cost = [point.cost for point in convex_front] +front_f1 = [point.f1 for point in convex_front] plt.rcParams.update( { @@ -79,7 +94,7 @@ def clean_text(text: str) -> str: fig, ax = plt.subplots(figsize=(8, 8)) ax.set_box_aspect(1) -ax.plot(pareto_cost, pareto_f1, color=LINE_C, linewidth=3.2, linestyle="--", zorder=2, label="Pareto front") +ax.plot(front_cost, front_f1, color=LINE_C, linewidth=3.2, linestyle="--", zorder=2, label="Convex Pareto front") for point in models: ax.scatter( @@ -131,4 +146,3 @@ def clean_text(text: str) -> str: plt.savefig("pareto_front.pdf", bbox_inches="tight") # vector output plt.savefig("pareto_front.png", bbox_inches="tight", dpi=1200) # high-res raster plt.show() -# %% From 82f7ba7b970f039be11fd421347aaeabbcf05b6a Mon Sep 17 00:00:00 2001 From: Curtis Chong Date: Mon, 18 May 2026 12:24:50 -0400 Subject: [PATCH 3/3] fix ruff --- scripts/paper/pareto_front.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/paper/pareto_front.py b/scripts/paper/pareto_front.py index 21beaec..baf14ea 100644 --- a/scripts/paper/pareto_front.py +++ b/scripts/paper/pareto_front.py @@ -1,5 +1,5 @@ -import re import math +import re from dataclasses import dataclass import matplotlib.pyplot as plt