From 2564ea36a3600d43274548ffb01d6dc7013b3754 Mon Sep 17 00:00:00 2001 From: Radin Hamidi Rad Date: Wed, 20 May 2026 03:17:43 -0400 Subject: [PATCH 1/2] leaderboard: derive per-dataset metrics from data; chip filters + metric toggle on every table - per-dataset shard reads metrics from actual runs (no MAP/recall_1000 phantom columns) - shared FilterChips + MatrixCell components reused across home / dataset / method / model / retriever pages - every per-X table gets chip filters (method/model/retriever/metric as applicable) + metric toggle - pretty metric labels (nDCG@10, R@1k, R@100, MAP) everywhere - drop double scrollbar on home + per-dataset tables - /models index renders display label, not provider-prefixed id - /runs page shows method display name; reproduce snippet aligned to example pipeline with correct Pyserini index names and qrels-based trec_eval - /about page no longer claims run.txt/queries.tsv are guaranteed; path includes retriever segment Co-Authored-By: Claude Opus 4.7 --- reproducibility/site/scripts/build-data.ts | 44 ++- .../site/src/components/FilterChips.astro | 131 +++++++++ .../site/src/components/MatrixCell.astro | 62 +++++ reproducibility/site/src/pages/about.astro | 37 +-- .../site/src/pages/datasets/[id].astro | 158 +++++++---- reproducibility/site/src/pages/index.astro | 252 ++++++------------ .../site/src/pages/methods/[id].astro | 84 +++--- .../site/src/pages/models/[id].astro | 86 +++--- .../site/src/pages/models/index.astro | 2 +- .../site/src/pages/retrievers/[id].astro | 83 ++++-- .../site/src/pages/runs/[run_id].astro | 107 ++++---- 11 files changed, 652 insertions(+), 394 deletions(-) create mode 100644 reproducibility/site/src/components/FilterChips.astro create mode 100644 reproducibility/site/src/components/MatrixCell.astro diff --git a/reproducibility/site/scripts/build-data.ts b/reproducibility/site/scripts/build-data.ts index 875da4b..d542cb0 100644 --- a/reproducibility/site/scripts/build-data.ts +++ b/reproducibility/site/scripts/build-data.ts @@ -94,6 +94,7 @@ interface RunDetail { params_hash: string; dataset_id: string; method_id: string; + method_display: string; model: string; retriever_id: string; retriever_display: string; @@ -195,11 +196,16 @@ function readRunDetails(retrievers: Record(); for (const r of dsRows) { const lm = logicalMethod(r.method_id, r.method_params_json); @@ -251,7 +256,7 @@ function buildPerDatasetViews( model_display: displayModel(r.model), retriever_id: r.retriever_id, retriever_display: r.retriever, - run_id: r.run_id, // populated/overwritten by the best cell + run_ids: {} as Record, // metric → run_id of the winning value metrics: {} as Record, best_for: {} as Record, }); @@ -259,13 +264,34 @@ function buildPerDatasetViews( const row = map.get(key); if (row.metrics[r.metric] === undefined || r.value > row.metrics[r.metric]) { row.metrics[r.metric] = r.value; - row.run_id = r.run_id; + row.run_ids[r.metric] = r.run_id; } } + // Discover which metrics actually exist in the data — the registry's + // eval_metrics is aspirational and may over-specify (e.g. MAP on DL, + // recall_1000 on BEIR). Render only what we have. + const present = new Set(); + for (const row of map.values()) { + for (const m of Object.keys(row.metrics)) present.add(m); + } + const allMetrics = Array.from(present); + const primary = present.has("ndcg_cut_10") ? "ndcg_cut_10" : allMetrics[0] ?? null; + const secondary = present.has("recall_1000") + ? "recall_1000" + : present.has("recall_100") + ? "recall_100" + : allMetrics.find((m) => m !== primary) ?? null; + // Order: primary first, secondary second, then anything else. + const orderedMetrics = [ + ...(primary ? [primary] : []), + ...(secondary && secondary !== primary ? [secondary] : []), + ...allMetrics.filter((m) => m !== primary && m !== secondary), + ]; + // best_for flags relative to the rows above. const list = Array.from(map.values()); - for (const m of allowed) { + for (const m of orderedMetrics) { let best = -Infinity; let bestRow: any = null; for (const row of list) { @@ -277,8 +303,10 @@ function buildPerDatasetViews( writeJSON(path.join(VIEWS_DIR, `dataset-${datasetId}.json`), { dataset_id: datasetId, - dataset: datasets[datasetId] ?? { id: datasetId, name: datasetId, eval_metrics: allowed }, - metric_columns: allowed, + dataset: datasets[datasetId] ?? { id: datasetId, name: datasetId, eval_metrics: orderedMetrics }, + metric_columns: orderedMetrics, + primary_metric: primary, + secondary_metric: secondary, runs: list, }); } diff --git a/reproducibility/site/src/components/FilterChips.astro b/reproducibility/site/src/components/FilterChips.astro new file mode 100644 index 0000000..b41fbf3 --- /dev/null +++ b/reproducibility/site/src/components/FilterChips.astro @@ -0,0 +1,131 @@ +--- +/** + * Chip-style filter bar for any leaderboard table. + * + * Each group corresponds to a column on the table's attributes + * (e.g. data-method, data-model). Clicking a chip hides rows whose attribute + * doesn't match, by toggling the .qg-chip-hidden class and dispatching + * "qg-itable-reapply" on the nearest .qg-itable wrapper so InteractiveTable + * re-syncs its row-visibility + shown-count. + * + * The optional `metric` group is special: it swaps .qg-cell-primary / + * .qg-cell-secondary visibility and the matching column-label spans across + * the whole page, then re-keys cells' data-sort-value to the now-visible + * metric so sort follows what's on screen. + */ +interface ChipValue { + value: string; + label: string; +} +interface ChipGroup { + /** "method" | "model" | "retriever" | "metric"; matches */ + key: string; + /** Visible header text. */ + label: string; + /** First item is shown as the active default. For `metric`, use + * [{value:"primary", label:"nDCG@10"}, {value:"secondary", label:"Recall"}]. */ + values: ChipValue[]; +} +interface Props { + /** id of the table to filter (used to scope row queries to this table). */ + tableId: string; + groups: ChipGroup[]; +} +const { tableId, groups } = Astro.props; +--- + +
+ {groups.map((g) => ( +
+ {g.label}: +
+ {g.values.map((v, i) => ( + + ))} +
+
+ ))} +
+ + + + diff --git a/reproducibility/site/src/components/MatrixCell.astro b/reproducibility/site/src/components/MatrixCell.astro new file mode 100644 index 0000000..40d2a24 --- /dev/null +++ b/reproducibility/site/src/components/MatrixCell.astro @@ -0,0 +1,62 @@ +--- +/** + * One cell in any of the leaderboard tables. + * + * Cells render two metric values (primary + optional secondary) layered on top + * of each other; FilterChips' metric-toggle flips visibility via the global + * .qg-cell-primary / .qg-cell-secondary classes. The cell exposes + * data-primary-value and data-secondary-value so InteractiveTable's sort + * picks up whichever metric is currently visible. + */ +interface Cell { + value: number; + best: boolean; +} +interface Props { + primary?: Cell; + secondary?: Cell | null; + runId?: string | null; + digits?: number; +} +const { primary, secondary, runId, digits = 3 } = Astro.props; +const primaryValue = primary?.value ?? ""; +const secondaryValue = secondary?.value ?? ""; +const empty = primary === undefined && (secondary == null || secondary === undefined); +--- + + + {empty ? ( + + ) : runId ? ( + + {primary !== undefined && ( + + {primary.value.toFixed(digits)} + + )} + {secondary != null && ( + + )} + + ) : ( + <> + {primary !== undefined && ( + + {primary.value.toFixed(digits)} + + )} + {secondary != null && ( + + )} + + )} + diff --git a/reproducibility/site/src/pages/about.astro b/reproducibility/site/src/pages/about.astro index 59867f3..4496f45 100644 --- a/reproducibility/site/src/pages/about.astro +++ b/reproducibility/site/src/pages/about.astro @@ -1,5 +1,16 @@ --- import Default from "../layouts/Default.astro"; +import CodeBlock from "../components/CodeBlock.astro"; + +const submitCmd = `python examples/querygym_pyserini/pipeline.py \\ + --dataset msmarco-v1-passage.trecdl2019 \\ + --method query2doc --model gpt-4.1 \\ + --output-dir outputs/dl19_query2doc + +python -m reproducibility.scripts.submit_run --from-dir outputs/dl19_query2doc +make repro-aggregate +git add reproducibility/data/ && git commit -m "..." && git push +gh pr create`; --- @@ -8,16 +19,14 @@ import Default from "../layouts/Default.astro";

The QueryGym Leaderboard tracks reproducible query-reformulation results - across IR benchmarks (BEIR, MS MARCO, TREC DL). Every row is backed by: + across IR benchmarks (BEIR, MS MARCO, TREC DL). Every row is backed by a + JSON file conforming to reproducibility/schema.json v1. + Submissions may also include the reformulated-queries TSV and a + TREC-format .run.txt for full re-evaluation; both are optional.

-
    -
  • a JSON file conforming to reproducibility/schema.json v1,
  • -
  • a TREC-format .run.txt for re-evaluation, and
  • -
  • the reformulated queries TSV used to produce the run file.
  • -

- All three live in the repository - under reproducibility/data/runs/{dataset}/{method}/{model}/. + All artifacts live in the repository + under reproducibility/data/runs/{dataset}/{method}/{model}/{retriever}/. Citing a number is as simple as linking the commit + the run JSON.

@@ -27,15 +36,9 @@ import Default from "../layouts/Default.astro";

Run the example pipeline, then use submit_run.py and open a PR.

-
python examples/querygym_pyserini/pipeline.py \
-    --dataset msmarco-v1-passage.trecdl2019 \
-    --method query2e --model gpt-4.1-mini \
-    --output-dir outputs/dl19_query2e
-
-python -m reproducibility.scripts.submit_run --from-dir outputs/dl19_query2e
-make repro-aggregate
-git add reproducibility/data/ && git commit -m "..." && git push
-gh pr create
+
+ {submitCmd} +

Full guide: Reproducibility User Guide ↗

diff --git a/reproducibility/site/src/pages/datasets/[id].astro b/reproducibility/site/src/pages/datasets/[id].astro index fdf0065..405d4d5 100644 --- a/reproducibility/site/src/pages/datasets/[id].astro +++ b/reproducibility/site/src/pages/datasets/[id].astro @@ -1,17 +1,15 @@ --- import Default from "../../layouts/Default.astro"; import EmptyState from "../../components/EmptyState.astro"; -import MetricCell from "../../components/MetricCell.astro"; import InteractiveTable from "../../components/InteractiveTable.astro"; +import FilterChips from "../../components/FilterChips.astro"; +import MatrixCell from "../../components/MatrixCell.astro"; import datasetsList from "../../data/datasets.json"; -// Eagerly load all per-dataset shards. Astro statically analyzes this so it -// only bundles paths that exist at build time. const shards = import.meta.glob<{ default: any }>( "../../data/views/dataset-*.json", { eager: true }, ); - function shardFor(id: string): any | null { const key = Object.keys(shards).find((k) => k.endsWith(`/dataset-${id}.json`)); return key ? shards[key].default : null; @@ -22,11 +20,59 @@ export async function getStaticPaths() { return datasets.map((d: any) => ({ params: { id: d.id } })); } +const METRIC_LABEL: Record = { + ndcg_cut_10: "nDCG@10", + recall_1000: "R@1k", + recall_100: "R@100", + map: "MAP", +}; + const { id } = Astro.params; const view = shardFor(id!); const datasetMeta = datasetsList.find((d: any) => d.id === id); -const runs = view?.runs ?? []; -const metricCols: string[] = view?.metric_columns ?? datasetMeta?.eval_metrics ?? []; +const runs = (view?.runs ?? []) as any[]; +const primary: string | null = view?.primary_metric ?? null; +const secondary: string | null = view?.secondary_metric ?? null; + +const tableId = "qg-dataset-table"; + +// Distinct values for chip filters, derived from this dataset's actual rows. +const uniq = (xs: any[], key: string, displayKey?: string) => { + const m = new Map(); + for (const r of xs) m.set(r[key], r[displayKey ?? key] ?? r[key]); + return Array.from(m.entries()).sort((a, b) => a[0].localeCompare(b[0])); +}; +const methodChoices = uniq(runs, "method_id", "method_display"); +const modelChoices = uniq(runs, "model", "model_display"); +const retrieverChoices = uniq(runs, "retriever_id", "retriever_display"); + +const chipGroups = [ + { + key: "method", + label: "Method", + values: [{ value: "", label: "All" }, ...methodChoices.map(([v, l]) => ({ value: v, label: l }))], + }, + { + key: "model", + label: "Model", + values: [{ value: "", label: "All" }, ...modelChoices.map(([v, l]) => ({ value: v, label: l }))], + }, + { + key: "retriever", + label: "Retriever", + values: [{ value: "", label: "All" }, ...retrieverChoices.map(([v, l]) => ({ value: v, label: l }))], + }, +]; +if (secondary && secondary !== primary) { + chipGroups.push({ + key: "metric", + label: "Metric", + values: [ + { value: "primary", label: METRIC_LABEL[primary!] ?? primary! }, + { value: "secondary", label: METRIC_LABEL[secondary] ?? secondary }, + ], + }); +} ---
{id}
- { - runs.length === 0 ? ( -
- -
- ) : ( + {runs.length === 0 ? ( +
+ +
+ ) : ( + <>
- -
- - - - - - - {metricCols.map((m) => ( - - ))} - - - - - {runs.map((r: any) => ( - + + + + +
+
MethodModelRetriever{m}Run
+ + + + + + + + + + {runs.map((r: any) => { + const pCell = primary && r.metrics?.[primary] !== undefined + ? { value: r.metrics[primary], best: !!r.best_for?.[primary] } + : undefined; + const sCell = secondary && secondary !== primary && r.metrics?.[secondary] !== undefined + ? { value: r.metrics[secondary], best: !!r.best_for?.[secondary] } + : null; + const runId = (primary && r.run_ids?.[primary]) || (secondary && r.run_ids?.[secondary]); + return ( + - {metricCols.map((m) => ( - - ))} - + - ))} - -
MethodModelRetriever + {METRIC_LABEL[primary!] ?? primary} + {secondary && ( + + )} +
{r.method_display ?? r.method_id} {r.model_display ?? r.model} {r.retriever_display ?? r.retriever_id} - - - - {r.run_id.slice(0, 8)}… - -
-
-
-
- ) - } + ); + })} + + + + + + )}
diff --git a/reproducibility/site/src/pages/index.astro b/reproducibility/site/src/pages/index.astro index 66c3153..f6cd3da 100644 --- a/reproducibility/site/src/pages/index.astro +++ b/reproducibility/site/src/pages/index.astro @@ -2,6 +2,8 @@ import Default from "../layouts/Default.astro"; import Stat from "../components/Stat.astro"; import InteractiveTable from "../components/InteractiveTable.astro"; +import FilterChips from "../components/FilterChips.astro"; +import MatrixCell from "../components/MatrixCell.astro"; import overview from "../data/overview.json"; import matrix from "../data/matrix.json"; import retrievers from "../data/retrievers.json"; @@ -9,7 +11,6 @@ import models from "../data/models.json"; const populated = overview.run_count > 0; -// Short dataset labels for the matrix header. const SHORT: Record = { "msmarco-v1-passage.trecdl2019": "DL 2019", "msmarco-v1-passage.trecdl2020": "DL 2020", @@ -26,6 +27,7 @@ const METRIC_LABEL: Record = { ndcg_cut_10: "nDCG@10", recall_1000: "R@1k", recall_100: "R@100", + map: "MAP", }; const rows = [...matrix.rows].sort((a: any, b: any) => { @@ -58,171 +60,93 @@ const datasetCols = matrix.dataset_columns; - { - populated && ( -
-
-
- Retriever: -
- - {retrievers.map((r: any) => ( - - ))} -
-
-
- Model: -
- - {models.map((m: any) => ( - - ))} -
-
-
- Metric: -
- - -
-
-
-
- ) - } + {populated && ( + ({ value: r.id, label: r.display_name })), + ], + }, + { + key: "model", + label: "Model", + values: [ + { value: "", label: "All" }, + ...models.map((m: any) => ({ value: m.id, label: m.display ?? m.id })), + ], + }, + { + key: "metric", + label: "Metric", + values: [ + { value: "primary", label: "nDCG@10" }, + { value: "secondary", label: "Recall" }, + ], + }, + ]} + /> + )} - { - populated ? ( - -
- - - - - - - {datasetCols.map((d: any) => ( - - ))} - - - - {rows.map((row: any) => ( - +
+
MethodModelRetriever -
{SHORT[d.id] ?? d.name}
-
- {METRIC_LABEL[d.primary_metric] ?? d.primary_metric} -
- -
+ + + + + + {datasetCols.map((d: any) => ( + - - - {datasetCols.map((d: any) => { - const cell = row.values?.[d.id] ?? {}; - const runId = row.run_ids?.[d.id]; - const primary = cell[d.primary_metric]; - const secondary = d.secondary_metric ? cell[d.secondary_metric] : null; - const primaryValue = primary?.value ?? ""; - return ( - - ); - })} - +
{SHORT[d.id] ?? d.name}
+
+ {METRIC_LABEL[d.primary_metric] ?? d.primary_metric} +
+ + ))} - -
MethodModelRetriever - {row.method_display ?? row.method_id}{row.model_display ?? row.model}{row.retriever_display ?? row.retriever_id} - {runId ? ( - - - {primary !== undefined ? primary.value.toFixed(3) : "—"} - - {secondary && ( - - )} - - ) : ( - - )} -
-
-
- ) : ( -
- No runs yet. The matrix will populate when results land. + + + + {rows.map((row: any) => ( + + {row.method_display ?? row.method_id} + {row.model_display ?? row.model} + {row.retriever_display ?? row.retriever_id} + {datasetCols.map((d: any) => { + const cell = row.values?.[d.id] ?? {}; + return ( + + ); + })} + + ))} + +
- ) - } + + ) : ( +
+ No runs yet. The matrix will populate when results land. +
+ )}
- - - - diff --git a/reproducibility/site/src/pages/methods/[id].astro b/reproducibility/site/src/pages/methods/[id].astro index 9c5c4d8..20fa8bb 100644 --- a/reproducibility/site/src/pages/methods/[id].astro +++ b/reproducibility/site/src/pages/methods/[id].astro @@ -2,6 +2,8 @@ import Default from "../../layouts/Default.astro"; import EmptyState from "../../components/EmptyState.astro"; import InteractiveTable from "../../components/InteractiveTable.astro"; +import FilterChips from "../../components/FilterChips.astro"; +import MatrixCell from "../../components/MatrixCell.astro"; import methods from "../../data/methods.json"; const shards = import.meta.glob<{ default: any }>( @@ -21,7 +23,7 @@ export async function getStaticPaths() { const { id } = Astro.params; const view = shardFor(id!); const meta = methods.find((m: any) => m.id === id); -const rows = view?.rows ?? []; +const rows = (view?.rows ?? []) as any[]; const SHORT: Record = { "msmarco-v1-passage.trecdl2019": "DL 2019", @@ -35,12 +37,20 @@ const SHORT: Record = { "beir-v1.0.0-trec-news": "News", }; const METRIC_LABEL: Record = { - ndcg_cut_10: "nDCG@10", recall_1000: "R@1k", recall_100: "R@100", + ndcg_cut_10: "nDCG@10", recall_1000: "R@1k", recall_100: "R@100", map: "MAP", }; const datasetCols = (await import("../../data/matrix.json")).default.dataset_columns; - const title = view?.method_display ?? meta?.display ?? id ?? "Method"; +const tableId = "qg-method-table"; + +const uniq = (xs: any[], key: string, displayKey?: string) => { + const m = new Map(); + for (const r of xs) m.set(r[key], r[displayKey ?? key] ?? r[key]); + return Array.from(m.entries()).sort((a, b) => a[0].localeCompare(b[0])); +}; +const modelChoices = uniq(rows, "model", "model_display"); +const retrieverChoices = uniq(rows, "retriever_id", "retriever_display"); --- @@ -49,50 +59,64 @@ const title = view?.method_display ?? meta?.display ?? id ?? "Method";
{id}
{rows.length} model × retriever combinations
- { - rows.length === 0 ? ( -
- + {rows.length === 0 ? ( +
+ +
+ ) : ( + <> +
+ ({ value: v, label: l }))] }, + { key: "retriever", label: "Retriever", + values: [{ value: "", label: "All" }, ...retrieverChoices.map(([v, l]) => ({ value: v, label: l }))] }, + { key: "metric", label: "Metric", + values: [{ value: "primary", label: "nDCG@10" }, { value: "secondary", label: "Recall" }] }, + ]} + />
- ) : ( -
+ +
- +
{datasetCols.map((d: any) => ( - ))} {rows.map((row: any) => ( - + {datasetCols.map((d: any) => { const cell = row.values?.[d.id] ?? {}; - const runId = row.run_ids?.[d.id]; - const primary = cell[d.primary_metric]; - const v = primary?.value ?? ""; return ( - + ); })} @@ -102,6 +126,6 @@ const title = view?.method_display ?? meta?.display ?? id ?? "Method"; - ) - } + + )} diff --git a/reproducibility/site/src/pages/models/[id].astro b/reproducibility/site/src/pages/models/[id].astro index 9973e86..5cb80ff 100644 --- a/reproducibility/site/src/pages/models/[id].astro +++ b/reproducibility/site/src/pages/models/[id].astro @@ -2,6 +2,8 @@ import Default from "../../layouts/Default.astro"; import EmptyState from "../../components/EmptyState.astro"; import InteractiveTable from "../../components/InteractiveTable.astro"; +import FilterChips from "../../components/FilterChips.astro"; +import MatrixCell from "../../components/MatrixCell.astro"; import models from "../../data/models.json"; const shards = import.meta.glob<{ default: any }>( @@ -21,9 +23,8 @@ export async function getStaticPaths() { const { id } = Astro.params; const view = shardFor(id!); const meta = models.find((m: any) => m.slug === id); -const rows = view?.rows ?? []; +const rows = (view?.rows ?? []) as any[]; -// Same dataset-label/metric-label tables as the home page. const SHORT: Record = { "msmarco-v1-passage.trecdl2019": "DL 2019", "msmarco-v1-passage.trecdl2020": "DL 2020", @@ -36,13 +37,20 @@ const SHORT: Record = { "beir-v1.0.0-trec-news": "News", }; const METRIC_LABEL: Record = { - ndcg_cut_10: "nDCG@10", recall_1000: "R@1k", recall_100: "R@100", + ndcg_cut_10: "nDCG@10", recall_1000: "R@1k", recall_100: "R@100", map: "MAP", }; -// Dataset columns from the shared home matrix shape (re-derive from rows). const datasetCols = (await import("../../data/matrix.json")).default.dataset_columns; - const title = meta?.display ?? view?.model ?? id ?? "Model"; +const tableId = "qg-model-table"; + +const uniq = (xs: any[], key: string, displayKey?: string) => { + const m = new Map(); + for (const r of xs) m.set(r[key], r[displayKey ?? key] ?? r[key]); + return Array.from(m.entries()).sort((a, b) => a[0].localeCompare(b[0])); +}; +const methodChoices = uniq(rows, "method_id", "method_display"); +const retrieverChoices = uniq(rows, "retriever_id", "retriever_display"); --- @@ -50,50 +58,64 @@ const title = meta?.display ?? view?.model ?? id ?? "Model";

{title}

{rows.length} method × retriever combinations
- { - rows.length === 0 ? ( -
- + {rows.length === 0 ? ( +
+ +
+ ) : ( + <> +
+ ({ value: v, label: l }))] }, + { key: "retriever", label: "Retriever", + values: [{ value: "", label: "All" }, ...retrieverChoices.map(([v, l]) => ({ value: v, label: l }))] }, + { key: "metric", label: "Metric", + values: [{ value: "primary", label: "nDCG@10" }, { value: "secondary", label: "Recall" }] }, + ]} + />
- ) : ( -
+ +
-
Model Retriever - {SHORT[d.id] ?? d.name} - / {METRIC_LABEL[d.primary_metric] ?? d.primary_metric} + +
{SHORT[d.id] ?? d.name}
+
+ {METRIC_LABEL[d.primary_metric] ?? d.primary_metric} +
+
{row.model_display ?? row.model} {row.retriever_display ?? row.retriever_id} - {runId && primary !== undefined ? ( - - - {primary.value.toFixed(3)} - - - ) : ( - - )} -
+
{datasetCols.map((d: any) => ( - ))} {rows.map((row: any) => ( - + {datasetCols.map((d: any) => { const cell = row.values?.[d.id] ?? {}; - const runId = row.run_ids?.[d.id]; - const primary = cell[d.primary_metric]; - const v = primary?.value ?? ""; return ( - + ); })} @@ -103,6 +125,6 @@ const title = meta?.display ?? view?.model ?? id ?? "Model"; - ) - } + + )} diff --git a/reproducibility/site/src/pages/models/index.astro b/reproducibility/site/src/pages/models/index.astro index b0a7f60..39b5203 100644 --- a/reproducibility/site/src/pages/models/index.astro +++ b/reproducibility/site/src/pages/models/index.astro @@ -18,7 +18,7 @@ import models from "../../data/models.json"; {models.map((m: any) => (
  • -
    {m.id}
    +
    {m.display ?? m.id}
    {m.run_count} run{m.run_count === 1 ? "" : "s"}
    diff --git a/reproducibility/site/src/pages/retrievers/[id].astro b/reproducibility/site/src/pages/retrievers/[id].astro index d089dd9..0ac3d3a 100644 --- a/reproducibility/site/src/pages/retrievers/[id].astro +++ b/reproducibility/site/src/pages/retrievers/[id].astro @@ -2,6 +2,8 @@ import Default from "../../layouts/Default.astro"; import EmptyState from "../../components/EmptyState.astro"; import InteractiveTable from "../../components/InteractiveTable.astro"; +import FilterChips from "../../components/FilterChips.astro"; +import MatrixCell from "../../components/MatrixCell.astro"; import retrievers from "../../data/retrievers.json"; const shards = import.meta.glob<{ default: any }>( @@ -21,7 +23,7 @@ export async function getStaticPaths() { const { id } = Astro.params; const view = shardFor(id!); const meta = retrievers.find((r: any) => r.id === id); -const rows = view?.rows ?? []; +const rows = (view?.rows ?? []) as any[]; const SHORT: Record = { "msmarco-v1-passage.trecdl2019": "DL 2019", @@ -35,11 +37,20 @@ const SHORT: Record = { "beir-v1.0.0-trec-news": "News", }; const METRIC_LABEL: Record = { - ndcg_cut_10: "nDCG@10", recall_1000: "R@1k", recall_100: "R@100", + ndcg_cut_10: "nDCG@10", recall_1000: "R@1k", recall_100: "R@100", map: "MAP", }; const datasetCols = (await import("../../data/matrix.json")).default.dataset_columns; const title = meta?.display_name ?? id ?? "Retriever"; +const tableId = "qg-retriever-table"; + +const uniq = (xs: any[], key: string, displayKey?: string) => { + const m = new Map(); + for (const r of xs) m.set(r[key], r[displayKey ?? key] ?? r[key]); + return Array.from(m.entries()).sort((a, b) => a[0].localeCompare(b[0])); +}; +const methodChoices = uniq(rows, "method_id", "method_display"); +const modelChoices = uniq(rows, "model", "model_display"); --- @@ -48,50 +59,64 @@ const title = meta?.display_name ?? id ?? "Retriever";
    {id} · {meta?.paradigm}
    {rows.length} method × model combinations
    - { - rows.length === 0 ? ( -
    - + {rows.length === 0 ? ( +
    + +
    + ) : ( + <> +
    + ({ value: v, label: l }))] }, + { key: "model", label: "Model", + values: [{ value: "", label: "All" }, ...modelChoices.map(([v, l]) => ({ value: v, label: l }))] }, + { key: "metric", label: "Metric", + values: [{ value: "primary", label: "nDCG@10" }, { value: "secondary", label: "Recall" }] }, + ]} + />
    - ) : ( -
    + +
    -
  • Method Retriever - {SHORT[d.id] ?? d.name} - / {METRIC_LABEL[d.primary_metric] ?? d.primary_metric} + +
    {SHORT[d.id] ?? d.name}
    +
    + {METRIC_LABEL[d.primary_metric] ?? d.primary_metric} +
    +
    {row.method_display ?? row.method_id} {row.retriever_display ?? row.retriever_id} - {runId && primary !== undefined ? ( - - - {primary.value.toFixed(3)} - - - ) : ( - - )} -
    +
    {datasetCols.map((d: any) => ( - ))} {rows.map((row: any) => ( - + {datasetCols.map((d: any) => { const cell = row.values?.[d.id] ?? {}; - const runId = row.run_ids?.[d.id]; - const primary = cell[d.primary_metric]; - const v = primary?.value ?? ""; return ( - + ); })} @@ -101,6 +126,6 @@ const title = meta?.display_name ?? id ?? "Retriever"; - ) - } + + )} diff --git a/reproducibility/site/src/pages/runs/[run_id].astro b/reproducibility/site/src/pages/runs/[run_id].astro index 77dae55..25970ab 100644 --- a/reproducibility/site/src/pages/runs/[run_id].astro +++ b/reproducibility/site/src/pages/runs/[run_id].astro @@ -12,14 +12,16 @@ export async function getStaticPaths() { const { run_id } = Astro.params; const run = (runs as Record)[run_id!]; -// Compose the reproduce snippets from the run's config. const cfg = (run?.config ?? {}) as any; const retrieval = (cfg.retrieval ?? {}) as any; const dsCfg = (cfg.dataset_config ?? {}) as any; const methodParams = (cfg.method_params ?? {}) as Record; const llm = (cfg.llm_config ?? {}) as Record; -// Pretty-format method_params for the Python snippet (skipping noise/secrets). +// method_params we surface in the reproduce snippet — strip locally-pathy noise +// (collection_path / train_queries_path etc.) that won't apply on a fresh +// checkout. Mode + num_examples + train_split are the knobs that change +// behavior across runs. const FILTERED_PARAM_KEYS = new Set([ "judge_rel_mode", "collection_path", @@ -31,66 +33,67 @@ const cleanParams: Record = {}; for (const [k, v] of Object.entries(methodParams)) { if (!FILTERED_PARAM_KEYS.has(k)) cleanParams[k] = v; } -const paramKwargs = Object.entries(cleanParams) - .map(([k, v]) => ` ${k}=${JSON.stringify(v)},`) - .join("\n"); - -const pyReformulate = `import querygym as qg - -# 1. Build the reformulator with this run's method + model + params. -reformulator = qg.create_reformulator( - "${run.method_id}", - model="${run.model}", - temperature=${llm.temperature ?? 1.0}, - max_tokens=${llm.max_tokens ?? 128},${paramKwargs ? "\n" + paramKwargs : ""} -) - -# 2. Reformulate queries from the dataset's topics file. -# (Pyserini topic name: ${dsCfg.topics ?? ""} ; ${dsCfg.num_queries ?? "?"} queries) -queries = qg.loaders.load_pyserini_topics("${dsCfg.topics ?? ""}") -reformulated = reformulator.reformulate_batch(queries) - -# 3. Write the reformulated queries TSV (this becomes --topics for retrieval). -qg.loaders.save_topics_tsv(reformulated, "reformulated_queries.tsv") -`; - -// Retrieval command depends on retriever paradigm. -const retrId = retrieval.retriever_id ?? ""; +const methodParamsJson = Object.keys(cleanParams).length + ? JSON.stringify(cleanParams) + : null; + +// Reproduce step 1: run the example pipeline to reformulate queries. +// This is the same script that produced the leaderboard row, so the resulting +// reformulated_queries.tsv is byte-identical given the same model + seed. +const pipelineCmd = `python examples/querygym_pyserini/pipeline.py \\ + --dataset ${run.dataset_id} \\ + --method ${run.method_id} \\ + --model ${run.model} \\ + --steps reformulate \\ + --temperature ${llm.temperature ?? 1.0} \\ + --max-tokens ${llm.max_tokens ?? 128} \\${methodParamsJson ? ` + --method-params '${methodParamsJson}' \\` : ""} + --output-dir outputs/reproduce`; + +// Reproduce step 2: retrieval. Index names differ per paradigm — Pyserini +// publishes a base BM25 index plus suffixed SPLADE/BGE variants. const paradigm = retrieval.paradigm ?? ""; const params = (retrieval.params ?? {}) as any; -const queriesPath = run.artifacts.queries_url - ? `# ↑ download from the leaderboard's queries link, or regenerate with the Python above\nreformulated_queries.tsv` - : "reformulated_queries.tsv"; +// Strip the trailing ".flat" segment BEIR BM25 indexes carry; SPLADE/BGE +// indexes for the same dataset live under . not .flat.. +const baseIndex = String(dsCfg.index ?? "").replace(/\.flat$/, ""); let retrievalCmd = ""; if (paradigm === "lexical") { retrievalCmd = `python -m pyserini.search.lucene \\ --threads 16 --batch-size 128 \\ --index ${dsCfg.index ?? ""} \\ - --topics reformulated_queries.tsv \\ + --topics outputs/reproduce/queries/reformulated_queries.tsv \\ --bm25 --k1 ${params.k1 ?? 0.9} --b ${params.b ?? 0.4} \\ --output run.txt \\ --hits 1000`; } else if (paradigm === "learned_sparse") { retrievalCmd = `python -m pyserini.search.lucene \\ --threads 16 --batch-size 128 \\ - --index ${dsCfg.index ?? ""}.splade-pp-ed \\ - --topics reformulated_queries.tsv \\ + --index ${baseIndex || ""}.splade-pp-ed \\ + --topics outputs/reproduce/queries/reformulated_queries.tsv \\ --encoder ${params.model ?? "naver/splade-cocondenser-ensembledistil"} \\ --output run.txt \\ --hits 1000 --impact`; } else if (paradigm === "dense") { retrievalCmd = `python -m pyserini.search.faiss \\ --threads 16 --batch-size 128 \\ - --index ${dsCfg.index ?? ""}.bge-base-en-v1.5 \\ - --topics reformulated_queries.tsv \\ + --index ${baseIndex || ""}.bge-base-en-v1.5 \\ + --topics outputs/reproduce/queries/reformulated_queries.tsv \\ --encoder ${params.encoder ?? "BAAI/bge-base-en-v1.5"} \\ --output run.txt \\ --hits 1000`; } -const trecEvalCmd = `python -m pyserini.eval.trec_eval -c -m ${run.metrics ? Object.keys(run.metrics).join(" -m ").replace(/_/g, ".") : "ndcg_cut.10"} \\ - ${dsCfg.topics ?? ""} run.txt`; +// Reproduce step 3: evaluate. trec_eval reads the prebuilt qrels key from the +// dataset registry (qrels.name), which Pyserini resolves to the canonical +// qrels file. Metric flags map from this run's stored metric ids. +const trecMetrics = Object.keys(run.metrics ?? {}) + .map((m) => m.replace(/_/g, ".")) + .join(" -m "); +const qrelsName = dsCfg.qrels?.name ?? dsCfg.qrels ?? dsCfg.topics?.name ?? dsCfg.topics ?? ""; +const trecEvalCmd = `python -m pyserini.eval.trec_eval -c -m ${trecMetrics || "ndcg_cut.10"} \\ + ${qrelsName} run.txt`; --- @@ -100,7 +103,7 @@ const trecEvalCmd = `python -m pyserini.eval.trec_eval -c -m ${run.metrics ? Obj
    - + @@ -124,14 +127,15 @@ const trecEvalCmd = `python -m pyserini.eval.trec_eval -c -m ${run.metrics ? Obj

    Reproduce this run

    - Two steps: (1) reformulate the queries with QueryGym, (2) run retrieval with Pyserini. + Three steps: (1) reformulate the queries with QueryGym's example pipeline, + (2) run retrieval with Pyserini, (3) evaluate with trec_eval.

    - {pyReformulate} + {pipelineCmd} {retrievalCmd && ( - {retrievalCmd} + {retrievalCmd} )} - {trecEvalCmd} + {trecEvalCmd}
    @@ -162,23 +166,6 @@ const trecEvalCmd = `python -m pyserini.eval.trec_eval -c -m ${run.metrics ? Obj

    Config

    -
    {JSON.stringify(run.config, null, 2)}
    + {JSON.stringify(run.config, null, 2)}
    - - From 2f6b74fdcae6cb2fa038ffe8ba749156f766bd2c Mon Sep 17 00:00:00 2001 From: Radin Hamidi Rad Date: Wed, 20 May 2026 14:36:49 -0400 Subject: [PATCH 2/2] leaderboard: sticky thead + sticky axis cols, themed scrollbar, unified filter card - Wrap every table in a fixed-height card with a styled 8px thin scrollbar so the page chrome stays in view while rows scroll - Sticky thead inside the scroll container; sticky leftmost axis columns (Method/Model/Retriever, varies per page) with CSS-var-driven widths and a mobile fallback - Inline sort arrows on stacked dataset/metric column headers via a slot the table wires into - Filter chips moved into a dedicated card; metric toggle now also re-fires the current sort so row order matches the visible metric - MatrixCell always renders both metric spans (em-dash for missing) and uses the new .qg-cell-best highlight (accent + dark-mode glow) - Decimal precision unified at 4 across MatrixCell, side-by-side dataset cells, and the run-detail metrics table - /datasets/[id] renders both metrics side by side instead of a single-column toggle - /datasets/ index drops the stale eval_metrics badge - /runs/[run_id] reproduce snippet simplifies the qrels lookup - Stat cards gain hover:border-qg-accent; InteractiveTable search input restyled with magnifier icon; MetricCell removed (dead code) Co-Authored-By: Claude Opus 4.7 --- .../site/src/components/FilterChips.astro | 4 +- .../src/components/InteractiveTable.astro | 54 ++++-- .../site/src/components/MatrixCell.astro | 45 ++--- .../site/src/components/MetricCell.astro | 20 --- .../site/src/components/Stat.astro | 2 +- .../site/src/pages/datasets/[id].astro | 130 +++++++------- .../site/src/pages/datasets/index.astro | 3 - reproducibility/site/src/pages/index.astro | 158 +++++++++--------- .../site/src/pages/methods/[id].astro | 30 ++-- .../site/src/pages/models/[id].astro | 30 ++-- .../site/src/pages/retrievers/[id].astro | 30 ++-- .../site/src/pages/runs/[run_id].astro | 3 +- reproducibility/site/src/styles/global.css | 116 +++++++++++++ 13 files changed, 370 insertions(+), 255 deletions(-) delete mode 100644 reproducibility/site/src/components/MetricCell.astro diff --git a/reproducibility/site/src/components/FilterChips.astro b/reproducibility/site/src/components/FilterChips.astro index b41fbf3..a9aa251 100644 --- a/reproducibility/site/src/components/FilterChips.astro +++ b/reproducibility/site/src/components/FilterChips.astro @@ -34,7 +34,7 @@ interface Props { const { tableId, groups } = Astro.props; --- -
    +
    {groups.map((g) => (
    {g.label}: @@ -109,6 +109,8 @@ const { tableId, groups } = Astro.props; const v = primaryShown ? td.dataset.primaryValue : td.dataset.secondaryValue; td.dataset.sortValue = v ?? ""; }); + // If a sort is active, re-fire it on the now-visible metric values. + itableRoot?.dispatchEvent(new CustomEvent("qg-itable-resort")); } bar.querySelectorAll("[data-group]").forEach((g) => { diff --git a/reproducibility/site/src/components/InteractiveTable.astro b/reproducibility/site/src/components/InteractiveTable.astro index f124377..03dc988 100644 --- a/reproducibility/site/src/components/InteractiveTable.astro +++ b/reproducibility/site/src/components/InteractiveTable.astro @@ -27,13 +27,28 @@ const initialSortAttr = initialSort
    - - +
    + + +
    + 0 / 0 rows
    @@ -48,16 +63,6 @@ const initialSortAttr = initialSort .qg-itable table thead th[data-sort-skip] { cursor: default; } - .qg-itable table thead th .qg-sort-arrow { - opacity: 0.35; - margin-left: 0.25rem; - font-size: 0.7rem; - } - .qg-itable table thead th[data-sort-dir="asc"] .qg-sort-arrow, - .qg-itable table thead th[data-sort-dir="desc"] .qg-sort-arrow { - opacity: 1; - color: var(--qg-accent); - }
    Method Model - {SHORT[d.id] ?? d.name} - / {METRIC_LABEL[d.primary_metric] ?? d.primary_metric} + +
    {SHORT[d.id] ?? d.name}
    +
    + {METRIC_LABEL[d.primary_metric] ?? d.primary_metric} +
    +
    {row.method_display ?? row.method_id} {row.model_display ?? row.model} - {runId && primary !== undefined ? ( - - - {primary.value.toFixed(3)} - - - ) : ( - - )} -