yngvem · yngvem · Mar 1, 2026 · Feb 28, 2026
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -9,10 +9,6 @@ repos:
     -   id: check-yaml
     -   id: check-added-large-files
         args: ['--maxkb=4096']
--   repo: https://github.com/shellcheck-py/shellcheck-py
-    rev: v0.10.0.1
-    hooks:
-    -   id: shellcheck
 -   repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.12.2
     hooks:

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/examples/compare_transcription_eval_tools/README.md b/examples/compare_transcription_eval_tools/README.md
@@ -0,0 +1,9 @@
+# Code for comparing the WER and CER across tools
+
+This analysis is inspired by [https://dl.acm.org/doi/epdf/10.1145/3476887.3476888](https://dl.acm.org/doi/epdf/10.1145/3476887.3476888). Use the following code to run it.
+
+```
+bash synthetic_emoji_run_analysis.sh && bash hip21_run_analysis.sh
+```
+
+Note that running the analysis requires pipx and Docker to be installed on your system.
diff --git a/examples/compare_transcription_eval_tools/calamari/evaluate.py b/examples/compare_transcription_eval_tools/calamari/evaluate.py
diff --git a/examples/compare_transcription_eval_tools/calamari/run_calamari.Dockerfile b/examples/compare_transcription_eval_tools/calamari/run_calamari.Dockerfile
@@ -0,0 +1,11 @@
+FROM python:3.14-trixie
+
+# Pin the third-party packages so that the image is reproducible.
+RUN pip install edit_distance==1.0.7 numpy==2.4.2
+
+WORKDIR /analysis
+COPY ocr_results/ ./ocr_results/
+# With more than one source, the destination must be a directory ending in "/".
+COPY evaluate.py textprocessors.py run_calamari.py ./
+
+CMD ["python", "run_calamari.py"]
diff --git a/examples/compare_transcription_eval_tools/calamari/run_calamari.py b/examples/compare_transcription_eval_tools/calamari/run_calamari.py
@@ -0,0 +1,30 @@
+import json
+import pathlib
+
+import evaluate
+
+input_dir = pathlib.Path("ocr_results")
+out_dir = pathlib.Path("output")
+
+char_report_files = []
+word_report_files = []
+
+all_ref = {}
+all_pred = {}
+
+result = {}
+for i, ref_file in enumerate(input_dir.glob("*.ref.txt")):
+    print(f"Analysing file {i}")
+    name = ref_file.name.partition(".")[0]
+    ref = {name: (input_dir / f"{name}.ref.txt").read_text(encoding="utf-8")}
+    pred = {name: (input_dir / f"{name}.pred.txt").read_text(encoding="utf-8")}
+
+    all_ref |= ref
+    all_pred |= pred
+
+    result[name] = evaluate.Evaluator().evaluate(gt_data=ref, pred_data=pred)
+
+result["overall"] = evaluate.Evaluator().evaluate(gt_data=all_ref, pred_data=all_pred)
+
+with open(out_dir / "result.json", "w") as f:
+    json.dump(result, f, indent=2)
diff --git a/examples/compare_transcription_eval_tools/calamari/run_calamari.sh b/examples/compare_transcription_eval_tools/calamari/run_calamari.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Build a throwaway image that contains the OCR results, run the Calamari
+# evaluation in it and collect the output in ./calamari_results.
+set -euo pipefail
+
+# Stage the data inside the build context. Remove any copy left behind by an
+# aborted run first, otherwise cp would nest it as ./ocr_results/ocr_results.
+rm -rf ./ocr_results/
+cp -r ../ocr_results ./ocr_results/
+docker build -f run_calamari.Dockerfile -t stringalign_run_calamari .
+rm -rf ./ocr_results/
+mkdir -p calamari_results
+
+# Quote the mount so that a working directory containing spaces stays one argument.
+docker run --rm --volume "$(pwd)/calamari_results:/analysis/output" stringalign_run_calamari
+docker image rm stringalign_run_calamari
diff --git a/examples/compare_transcription_eval_tools/calamari/textprocessors.py b/examples/compare_transcription_eval_tools/calamari/textprocessors.py
diff --git a/examples/compare_transcription_eval_tools/check_for_multi_codepoint_grapheme_clusters.py b/examples/compare_transcription_eval_tools/check_for_multi_codepoint_grapheme_clusters.py
@@ -0,0 +1,15 @@
+# /// script
+# requires-python = ">=3.13"
+# dependencies = ["regex"]
+# ///
+
+import unicodedata
+from pathlib import Path
+
+import regex
+
+text = Path("data.txt").read_text()
+matches = regex.findall(r"\X", text)
+for match in matches:
+    if len(unicodedata.normalize("NFC", match)) > 1:
+        print(match)
diff --git a/examples/compare_transcription_eval_tools/dinglehopper/container_script.sh b/examples/compare_transcription_eval_tools/dinglehopper/container_script.sh
@@ -0,0 +1,11 @@
+mkdir pred ref && \
+    bash -c 'for f in ocr_results/*ref.txt; do cp "$f" "ref/$(basename "$f" .ref.txt).txt"; done' && \
+    bash -c 'for f in ocr_results/*pred.txt; do cp "$f" "pred/$(basename "$f" .pred.txt).txt"; done'
+
+
+dinglehopper ref/ \
+    pred/ \
+    report \
+    output/ \
+    --plain-encoding utf-8 && \
+    dinglehopper-summarize output/
diff --git a/examples/compare_transcription_eval_tools/dinglehopper/run_dinglehopper.Dockerfile b/examples/compare_transcription_eval_tools/dinglehopper/run_dinglehopper.Dockerfile
@@ -0,0 +1,14 @@
+FROM python:3.14-trixie
+
+RUN pip install dinglehopper==0.11.0
+
+WORKDIR /analysis
+COPY ocr_results/ ./ocr_results/
+# Split the paired files into ref/ and pred/ so that both halves of a pair
+# end up with the same <page>.txt name.
+RUN mkdir pred ref && \
+    for f in ocr_results/*ref.txt; do cp "$f" "ref/$(basename "$f" .ref.txt).txt"; done && \
+    for f in ocr_results/*pred.txt; do cp "$f" "pred/$(basename "$f" .pred.txt).txt"; done
+
+# Run the report step into output/ and summarise that directory afterwards.
+CMD ["bash", "-c", "dinglehopper ref/ pred/ report output/  --plain-encoding utf-8 && dinglehopper-summarize output/"]
diff --git a/examples/compare_transcription_eval_tools/dinglehopper/run_dinglehopper.sh b/examples/compare_transcription_eval_tools/dinglehopper/run_dinglehopper.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Build a throwaway image that contains the OCR results, run dinglehopper
+# in it and collect the output in ./dinglehopper_results.
+set -euo pipefail
+
+# Stage the data inside the build context. Remove any copy left behind by an
+# aborted run first, otherwise cp would nest it as ./ocr_results/ocr_results.
+rm -rf ./ocr_results/
+cp -r ../ocr_results ./ocr_results/
+docker build -f run_dinglehopper.Dockerfile -t stringalign_run_dinglehopper .
+rm -rf ./ocr_results/
+mkdir -p dinglehopper_results
+
+# Quote the mount so that a working directory containing spaces stays one argument.
+docker run --rm --volume "$(pwd)/dinglehopper_results:/analysis/output" stringalign_run_dinglehopper
+docker image rm stringalign_run_dinglehopper
diff --git a/examples/compare_transcription_eval_tools/hip21_get_data.py b/examples/compare_transcription_eval_tools/hip21_get_data.py
@@ -0,0 +1,56 @@
+import csv
+import xml.etree.ElementTree as ET
+from collections.abc import Generator
+from pathlib import Path
+
+from tqdm import tqdm
+
+
+def get_page_text(file: Path) -> Generator[str]:
+    root = ET.parse(file)
+    ns = {"PAGE": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2010-03-19"}
+    unicode_tags = root.findall(".//PAGE:TextRegion/PAGE:TextEquiv/PAGE:Unicode", ns)
+    for unicode_tag in unicode_tags:
+        if text := unicode_tag.text.strip():  # type: ignore
+            yield text + "\n"
+
+
+def get_alto_text(file: Path) -> Generator[str]:
+    root = ET.parse(file)
+    ns = "http://www.loc.gov/standards/alto/ns-v3#"
+    text_line_tags = root.findall(".//ALTO:TextLine", {"ALTO": ns})
+    for text_line_tag in text_line_tags:
+        out = []
+        for tag in text_line_tag:
+            tag_type = tag.tag.casefold()
+            if tag_type == f"{{{ns}}}string":
+                out.append(tag.attrib["CONTENT"])
+            elif tag_type == f"{{{ns}}}sp":
+                out.append(" ")
+            elif tag_type == f"{{{ns}}}hyp":
+                out.append(tag.attrib["CONTENT"])
+            else:
+                raise ValueError()
+
+        if text := "".join(out).strip():
+            yield text + "\n"
+
+
+repo_parent = Path(__file__).parent / "hip21_ocrevaluation"
+data_parent = repo_parent / "data"
+output_dir = Path(__file__).parent / "ocr_results"
+output_dir.mkdir(exist_ok=True)
+
+with open(repo_parent / "primaID.csv") as f:
+    reader = csv.DictReader(f)
+    stems = [row["\ufeffprimaID"] for row in reader if row["dataset"] == "impact"]
+
+for stem in tqdm(sorted(stems)):
+    gt_xml = data_parent / f"{stem}.gt.xml"
+    gt4hist_xml = data_parent / f"{stem}.gt4hist.xml"
+
+    with open(output_dir / f"{stem}.ref.txt", "w", encoding="utf-8") as f:
+        f.writelines(get_page_text(gt_xml))
+
+    with open(output_dir / f"{stem}.pred.txt", "w", encoding="utf-8") as f:
+        f.writelines(get_alto_text(gt4hist_xml))
diff --git a/examples/compare_transcription_eval_tools/hip21_get_results.py b/examples/compare_transcription_eval_tools/hip21_get_results.py
@@ -0,0 +1,140 @@
+# /// script
+# requires-python = ">=3.13"
+# dependencies = ["pandas", "jinja2"]
+# ///
+"""
+This script gets the results from all the string comparison tools and creates a single DataFrame to display them
+"""
+
+import json
+import pathlib
+import re
+
+import pandas as pd
+
+pages = [f.stem.partition(".")[0] for f in pathlib.Path("ocr_results").iterdir()]
+
+results = {}
+
+
+# Load Calamari results
+with open("calamari/calamari_results/result.json", "r") as f:
+    calamari = json.load(f)
+results["Calamari"] = {key: {"cer": value["avg_ler"]} for key, value in calamari.items()}
+
+
+# Load Dinglehopper results
+dinglehopper_path = pathlib.Path("dinglehopper/dinglehopper_results")
+results["Dinglehopper"] = {}
+
+for page in pages:
+    with open(dinglehopper_path / f"{page}.txt-report.json", "r") as f:
+        page_data = json.load(f)
+
+    results["Dinglehopper"][page] = {"cer": page_data["cer"], "wer": page_data["wer"]}
+
+with open(dinglehopper_path / "summary.json", "r") as f:
+    summary_data = json.load(f)
+results["Dinglehopper"]["overall"] = {"cer": summary_data["cer_avg"], "wer": summary_data["wer_avg"]}
+
+# Load ISRI results
+isri_path = pathlib.Path("isri/isri_results")
+results["ISRI"] = {}
+
+
+def get_isri_error_rate(file: pathlib.Path) -> float:
+    with open(file, "r") as f:
+        report = f.readlines()
+
+    accuracy_line = report[4]
+    (accuracy_match,) = re.findall(r"(\d+(?:\.\d+)?)%\s+Accuracy", accuracy_line)
+    return 1 - float(accuracy_match) / 100
+
+
+results["ISRI"]["overall"] = {
+    "cer": get_isri_error_rate(isri_path / "character_report.txt"),
+    "wer": get_isri_error_rate(isri_path / "word_report.txt"),
+}
+
+for page in pages:
+    results["ISRI"][page] = {
+        "cer": get_isri_error_rate(isri_path / f"{page}.char_report.txt"),
+        "wer": get_isri_error_rate(isri_path / f"{page}.word_report.txt"),
+    }
+
+# Load jiwer
+with open("jiwer/jiwer_results/result.json", "r") as f:
+    results["Jiwer"] = json.load(f)
+
+# Load meeteval
+with open("meeteval/meeteval_results/result.json", "r") as f:
+    results["Meeteval"] = json.load(f)
+
+
+# Load ocrevalUAtion
+def parse_ocrevalUAtion_report(file: pathlib.Path) -> dict[str, float]:
+    with open(file, "r") as f:
+        report = f.readlines()
+    cer_line = report[10]
+    (cer,) = re.findall(r"<td>CER</td><td>(\d+(?:\.\d+)?)</td>", cer_line)
+
+    wer_line = report[13]
+    (wer,) = re.findall(r"<td>WER</td><td>(\d+(?:\.\d+)?)</td>", wer_line)
+
+    return {"cer": float(cer) / 100, "wer": float(wer) / 100}
+
+
+ocrevalUAtion_path = pathlib.Path("ocrevalUAtion/ocrevalUAtion_results")
+results["ocrevalUAtion"] = {
+    page: parse_ocrevalUAtion_report(ocrevalUAtion_path / f"{page}.report.html") for page in pages
+}
+
+# Load stringalign
+with open("stringalign/stringalign_results/result.json", "r") as f:
+    results["Stringalign"] = json.load(f)
+# Load stringalign
+with open("stringalign/stringalign_results/result_dinglehopper_processing.json", "r") as f:
+    results["Stringalign (Dinglehopper)"] = json.load(f)
+
+# Convert results to records
+result_records = []
+for method, single_method_results in results.items():
+    for sample, sample_result in single_method_results.items():
+        result_records.append({"method": method, "sample": sample, **sample_result})
+
+# Assemble in DataFrame
+df = pd.DataFrame(result_records).sort_values(["sample", "method"]).set_index(["sample", "method"])
+df_no_overall = df.drop("overall", level="sample", errors="ignore")
+df.loc[("overall", "ocrevalUAtion"),] = df.loc[(slice(None), "ocrevalUAtion"),].mean()
+
+dinglehopper_diff_df = df.loc[(slice(None), "Dinglehopper"),].reset_index(level="method", drop=True) - df.loc[
+    (slice(None), "Stringalign (Dinglehopper)"),
+].reset_index(level="method", drop=True)
+df = df.drop("Stringalign (Dinglehopper)", level="method", errors="ignore")
+
+# Get dispersion measures
+mean_absolute_deviation_from_mean = (
+    df.drop("overall", level="sample", errors="ignore")  # This time without the Dinglehopper data as well
+    .groupby("sample")  # Get dataframes for each sample with all methods
+    .transform(
+        lambda s: abs(s - s.median())
+    )  # For each method, compute its absolute deviation from the mean in that sample
+    .groupby("method")  # Get dataframes for each tools with all samples
+    .mean()
+)
+
+overall_res = df.loc["overall"].copy()
+overall_res.columns = pd.MultiIndex.from_tuples([("Overall measure", metric.upper()) for metric in df.columns])
+mean_absolute_deviation_from_mean.columns = pd.MultiIndex.from_tuples(
+    [("MADM", metric.upper()) for metric in df.columns]
+)
+
+out_df = pd.merge(overall_res, mean_absolute_deviation_from_mean, how="outer", on="method")
+with open("hip21_table.tex", "w", encoding="utf-8") as f:
+    f.write(out_df.to_latex(na_rep="--", float_format="%.4f"))
+
+print(out_df)
+print(
+    "Diff between Stringalign and Dinglehopper when confusables are resolved equally\n",
+    dinglehopper_diff_df.drop("overall", errors="ignore").max(),
+)
diff --git a/examples/compare_transcription_eval_tools/hip21_run_analysis.sh b/examples/compare_transcription_eval_tools/hip21_run_analysis.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# shellcheck disable=all
+run_analysis() {
+
+
+    cd isri
+    rm -rf isri_results
+    time bash run_isri.sh
+    cd ..
+
+    cd jiwer
+    rm -rf jiwer_results
+    time bash run_jiwer.sh
+    cd ..
+
+    cd meeteval
+    rm -rf meeteval_results
+    time bash run_meeteval.sh
+    cd ..
+
+    cd ocrevalUAtion
+    rm -rf ocrevalUAtion_results
+    time bash run_ocrevalUAtion.sh
+    cd ..
+
+    cd stringalign
+    rm -rf stringalign_results
+    time bash run_stringalign.sh
+    cd ..
+
+    cd dinglehopper
+    rm -rf dinglehopper_results
+    time bash run_dinglehopper.sh
+    cd ..
+
+    cd calamari
+    rm -rf calamari_results
+    time bash run_calamari.sh
+    cd ..
+}
+
+rm -rf ocr_results
+
+git clone https://github.com/cneud/hip21_ocrevaluation.git
+cd hip21_ocrevaluation
+git checkout 9979dacfeebef65b419a44ea3f12a0bcba153c6f
+cd ..
+
+pipx run hip21_get_data.py
+run_analysis
+pipx run hip21_get_results.py
diff --git a/examples/compare_transcription_eval_tools/isri/run_isri.Dockerfile b/examples/compare_transcription_eval_tools/isri/run_isri.Dockerfile
@@ -0,0 +1,17 @@
+FROM python:3.14-trixie
+
+# Packages installed before fetching and building ocreval below.
+RUN apt-get update && \
+    apt-get install -y build-essential libutf8proc-dev git
+
+# Build and install ocreval from a pinned commit.
+RUN git clone https://github.com/eddieantonio/ocreval && \
+    cd ocreval && \
+    git checkout 873a0de5796c0b9ccf07a549afdd30159a9e0b3e && \
+    make && make install
+
+WORKDIR /analysis
+COPY ocr_results/ ./ocr_results/
+COPY run_isri_analysis.py .
+
+CMD ["python3", "run_isri_analysis.py"]