Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@ repos:
- id: check-yaml
- id: check-added-large-files
args: ['--maxkb=4096']
- repo: https://github.com/shellcheck-py/shellcheck-py
rev: v0.10.0.1
hooks:
- id: shellcheck
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.12.2
hooks:
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions examples/compare_transcription_eval_tools/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Code for comparing the WER and CER across tools

This analysis is inspired by [https://dl.acm.org/doi/epdf/10.1145/3476887.3476888](https://dl.acm.org/doi/epdf/10.1145/3476887.3476888). Use the following code to run it.

```
bash synthetic_emoji_run_analysis.sh && bash hip21_run_analysis.sh
```

Note that running the analysis requires pipx and Docker to be installed on your system.
835 changes: 835 additions & 0 deletions examples/compare_transcription_eval_tools/calamari/evaluate.py

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Image that runs run_calamari.py on the OCR results copied into the build context.
FROM python:3.14-trixie

# Dependencies are pinned to exact versions.
RUN pip install edit_distance==1.0.7 numpy==2.4.2

WORKDIR /analysis
COPY ocr_results/ ./ocr_results/
# With several source files the destination must be a directory ending in "/".
COPY evaluate.py textprocessors.py run_calamari.py ./

CMD ["python", "run_calamari.py"]
30 changes: 30 additions & 0 deletions examples/compare_transcription_eval_tools/calamari/run_calamari.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import json
import pathlib

import evaluate

# Evaluate every reference/prediction pair in ``ocr_results`` individually and
# once more with all pages pooled together, then dump everything as JSON.
input_dir = pathlib.Path("ocr_results")
out_dir = pathlib.Path("output")
# The directory is normally provided as a Docker volume; create it so the
# script also works (and does not fail after all the work is done) without one.
out_dir.mkdir(parents=True, exist_ok=True)

all_ref = {}
all_pred = {}

result = {}
# Sort the glob result so the progress output is reproducible between runs.
for i, ref_file in enumerate(sorted(input_dir.glob("*.ref.txt"))):
    print(f"Analysing file {i}")
    # The page name is everything before the first dot of the file name.
    name = ref_file.name.partition(".")[0]
    ref = {name: (input_dir / f"{name}.ref.txt").read_text(encoding="utf-8")}
    pred = {name: (input_dir / f"{name}.pred.txt").read_text(encoding="utf-8")}

    all_ref |= ref
    all_pred |= pred

    result[name] = evaluate.Evaluator().evaluate(gt_data=ref, pred_data=pred)

result["overall"] = evaluate.Evaluator().evaluate(gt_data=all_ref, pred_data=all_pred)

with open(out_dir / "result.json", "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2)
10 changes: 10 additions & 0 deletions examples/compare_transcription_eval_tools/calamari/run_calamari.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash

# The Dockerfile copies ocr_results/ from the build context ("."), so stage the
# shared results here for the build and remove the copy again afterwards.
cp -r ../ocr_results ./ocr_results/
docker build -f run_calamari.Dockerfile -t stringalign_run_calamari .
rm -rf ./ocr_results/
mkdir -p calamari_results


# Quote the mount specification so a working directory containing spaces is not word-split.
docker run --rm --volume "$(pwd)/calamari_results:/analysis/output" stringalign_run_calamari
docker image rm stringalign_run_calamari
831 changes: 831 additions & 0 deletions examples/compare_transcription_eval_tools/calamari/textprocessors.py

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# /// script
# requires-python = ">=3.13"
# dependencies = ["regex"]
# ///

import unicodedata
from pathlib import Path

import regex

# Print every extended grapheme cluster in data.txt that still consists of more
# than one code point after NFC normalisation.
# Read explicitly as UTF-8: the locale default encoding could otherwise garble
# exactly the characters this script is meant to inspect.
text = Path("data.txt").read_text(encoding="utf-8")
# r"\X" (supported by the third-party ``regex`` module) matches one grapheme cluster.
for match in regex.findall(r"\X", text):
    if len(unicodedata.normalize("NFC", match)) > 1:
        print(match)
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Split ocr_results/ into ref/ and pred/ so that each "<name>.ref.txt" and
# "<name>.pred.txt" pair ends up as two files that are both called "<name>.txt".
mkdir pred ref && \
bash -c 'for f in ocr_results/*ref.txt; do cp "$f" "ref/$(basename "$f" .ref.txt).txt"; done' && \
bash -c 'for f in ocr_results/*pred.txt; do cp "$f" "pred/$(basename "$f" .pred.txt).txt"; done'


# Run Dinglehopper on ref/ and pred/ with output/ as the target directory, then
# summarise output/ (NOTE(review): "report" is presumably the report name prefix - confirm).
dinglehopper ref/ \
pred/ \
report \
output/ \
--plain-encoding utf-8 && \
dinglehopper-summarize output/
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Image that runs Dinglehopper on the OCR results copied into the build context.
FROM python:3.14-trixie

# Dinglehopper is pinned to an exact version.
RUN pip install dinglehopper==0.11.0

WORKDIR /analysis
COPY ocr_results/ ./ocr_results/
# Split ocr_results/ into ref/ and pred/ so that each "<name>.ref.txt" and
# "<name>.pred.txt" pair ends up as two files that are both called "<name>.txt".
RUN mkdir pred ref && \
bash -c 'for f in ocr_results/*ref.txt; do cp "$f" "ref/$(basename "$f" .ref.txt).txt"; done' && \
bash -c 'for f in ocr_results/*pred.txt; do cp "$f" "pred/$(basename "$f" .pred.txt).txt"; done'

# Run Dinglehopper on ref/ and pred/ with output/ as the target directory, then summarise output/.
CMD ["bash", "-c", "dinglehopper ref/ pred/ report output/ --plain-encoding utf-8 && dinglehopper-summarize output/"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash

# The Dockerfile copies ocr_results/ from the build context ("."), so stage the
# shared results here for the build and remove the copy again afterwards.
cp -r ../ocr_results ./ocr_results/
docker build -f run_dinglehopper.Dockerfile -t stringalign_run_dinglehopper .
rm -rf ./ocr_results/
mkdir -p dinglehopper_results


# Quote the mount specification so a working directory containing spaces is not word-split.
docker run --rm --volume "$(pwd)/dinglehopper_results:/analysis/output" stringalign_run_dinglehopper
docker image rm stringalign_run_dinglehopper
56 changes: 56 additions & 0 deletions examples/compare_transcription_eval_tools/hip21_get_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import csv
import xml.etree.ElementTree as ET
from collections.abc import Generator
from pathlib import Path

from tqdm import tqdm


def get_page_text(file: Path) -> Generator[str]:
    """Yield the text of each non-empty text region in a PAGE XML file.

    Looks up every ``TextRegion/TextEquiv/Unicode`` element in the 2010-03-19
    PAGE namespace and yields its stripped text followed by a newline.
    Elements that are empty or contain only whitespace are skipped.
    """
    root = ET.parse(file)
    ns = {"PAGE": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2010-03-19"}
    unicode_tags = root.findall(".//PAGE:TextRegion/PAGE:TextEquiv/PAGE:Unicode", ns)
    for unicode_tag in unicode_tags:
        # ``.text`` is None for an element without content (e.g. ``<Unicode/>``).
        if text := (unicode_tag.text or "").strip():
            yield text + "\n"


def get_alto_text(file: Path) -> Generator[str]:
    """Yield the text of each non-empty ``TextLine`` in an ALTO v3 XML file.

    The children of every text line are concatenated: ``String`` and ``HYP``
    elements contribute their ``CONTENT`` attribute and ``SP`` elements
    contribute a single space. The joined line is stripped and yielded with a
    trailing newline; lines that end up empty are skipped.

    Raises:
        ValueError: If a text line contains any other kind of child element.
    """
    root = ET.parse(file)
    ns = "http://www.loc.gov/standards/alto/ns-v3#"
    # Tags are casefolded before comparison, so these names are lower-case.
    content_tags = {f"{{{ns}}}string", f"{{{ns}}}hyp"}
    space_tag = f"{{{ns}}}sp"

    for text_line_tag in root.findall(".//ALTO:TextLine", {"ALTO": ns}):
        out = []
        for tag in text_line_tag:
            tag_type = tag.tag.casefold()
            if tag_type in content_tags:
                out.append(tag.attrib["CONTENT"])
            elif tag_type == space_tag:
                out.append(" ")
            else:
                # Name the offending element so the bad input can be located.
                raise ValueError(f"Unexpected element {tag.tag!r} in a TextLine of {file}")

        if text := "".join(out).strip():
            yield text + "\n"


# Convert the "impact" pages of the hip21_ocrevaluation checkout into plain-text
# reference/prediction pairs under ocr_results/.
repo_parent = Path(__file__).parent / "hip21_ocrevaluation"
data_parent = repo_parent / "data"
output_dir = Path(__file__).parent / "ocr_results"
output_dir.mkdir(exist_ok=True)

# "utf-8-sig" strips the byte-order mark at the start of the CSV, so the first
# column can be addressed as "primaID" regardless of the platform's default
# encoding. newline="" is what the csv module expects for its input files.
with open(repo_parent / "primaID.csv", encoding="utf-8-sig", newline="") as f:
    reader = csv.DictReader(f)
    stems = [row["primaID"] for row in reader if row["dataset"] == "impact"]

for stem in tqdm(sorted(stems)):
    gt_xml = data_parent / f"{stem}.gt.xml"
    gt4hist_xml = data_parent / f"{stem}.gt4hist.xml"

    # The PAGE ground truth becomes the reference text ...
    with open(output_dir / f"{stem}.ref.txt", "w", encoding="utf-8") as f:
        f.writelines(get_page_text(gt_xml))

    # ... and the ALTO "gt4hist" file becomes the prediction.
    with open(output_dir / f"{stem}.pred.txt", "w", encoding="utf-8") as f:
        f.writelines(get_alto_text(gt4hist_xml))
140 changes: 140 additions & 0 deletions examples/compare_transcription_eval_tools/hip21_get_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# /// script
# requires-python = ">=3.13"
# dependencies = ["pandas", "jinja2"]
# ///
"""
This script gets the results from all the string comparison tools and creates a single DataFrame to display them
"""

import json
import pathlib
import re

import pandas as pd

# Page names are the part of each file name before the first dot. A set is used
# because one page can have several files in ocr_results (hip21_get_data.py
# writes both "<page>.ref.txt" and "<page>.pred.txt"); without deduplication
# every per-page report below would be parsed once per file.
pages = sorted({f.stem.partition(".")[0] for f in pathlib.Path("ocr_results").iterdir()})

# Maps tool name -> sample name (a page or "overall") -> {"cer": ..., "wer": ...}
results = {}


# Load Calamari results
with open("calamari/calamari_results/result.json", "r", encoding="utf-8") as f:
    calamari = json.load(f)
# Only "avg_ler" is taken from the Calamari output and reported as CER.
results["Calamari"] = {key: {"cer": value["avg_ler"]} for key, value in calamari.items()}


# Load Dinglehopper results
dinglehopper_path = pathlib.Path("dinglehopper/dinglehopper_results")
results["Dinglehopper"] = {}

for page in pages:
    with open(dinglehopper_path / f"{page}.txt-report.json", "r", encoding="utf-8") as f:
        page_data = json.load(f)

    results["Dinglehopper"][page] = {"cer": page_data["cer"], "wer": page_data["wer"]}

with open(dinglehopper_path / "summary.json", "r", encoding="utf-8") as f:
    summary_data = json.load(f)
results["Dinglehopper"]["overall"] = {"cer": summary_data["cer_avg"], "wer": summary_data["wer_avg"]}

# Load ISRI results
isri_path = pathlib.Path("isri/isri_results")
results["ISRI"] = {}


def get_isri_error_rate(file: pathlib.Path) -> float:
    """Return the error rate stated in an ISRI accuracy report.

    The fifth line of the report must contain exactly one
    ``<number>%  Accuracy`` entry; the result is one minus that percentage
    expressed as a fraction.
    """
    with open(file, "r") as handle:
        lines = list(handle)

    # Unpacking into a one-element tuple fails unless there is exactly one match.
    (percentage,) = re.findall(r"(\d+(?:\.\d+)?)%\s+Accuracy", lines[4])
    return 1 - float(percentage) / 100


# ISRI: the corpus-level reports come first, followed by one pair of reports per page.
results["ISRI"].update(
    overall={
        "cer": get_isri_error_rate(isri_path / "character_report.txt"),
        "wer": get_isri_error_rate(isri_path / "word_report.txt"),
    }
)
results["ISRI"].update(
    {
        page: {
            "cer": get_isri_error_rate(isri_path / f"{page}.char_report.txt"),
            "wer": get_isri_error_rate(isri_path / f"{page}.word_report.txt"),
        }
        for page in pages
    }
)

# jiwer and meeteval results are stored as JSON and are used exactly as loaded.
for tool, result_file in (
    ("Jiwer", "jiwer/jiwer_results/result.json"),
    ("Meeteval", "meeteval/meeteval_results/result.json"),
):
    with open(result_file, "r") as f:
        results[tool] = json.load(f)


# Load ocrevalUAtion
def parse_ocrevalUAtion_report(file: pathlib.Path) -> dict[str, float]:
    """Extract CER and WER from an ocrevalUAtion HTML report.

    The CER table row is read from the eleventh line of the file and the WER
    row from the fourteenth. Both values are percentages in the report and are
    returned as fractions under the keys ``"cer"`` and ``"wer"``.
    """
    with open(file, "r") as handle:
        lines = list(handle)

    # (result key, zero-based line index, pattern capturing the percentage)
    fields = (
        ("cer", 10, r"<td>CER</td><td>(\d+(?:\.\d+)?)</td>"),
        ("wer", 13, r"<td>WER</td><td>(\d+(?:\.\d+)?)</td>"),
    )

    percentages = {}
    for key, line_index, pattern in fields:
        # Unpacking into a one-element tuple fails unless there is exactly one match.
        (percentages[key],) = re.findall(pattern, lines[line_index])

    return {key: float(value) / 100 for key, value in percentages.items()}


ocrevalUAtion_path = pathlib.Path("ocrevalUAtion/ocrevalUAtion_results")
results["ocrevalUAtion"] = {
    page: parse_ocrevalUAtion_report(ocrevalUAtion_path / f"{page}.report.html") for page in pages
}

# Load stringalign
with open("stringalign/stringalign_results/result.json", "r") as f:
    results["Stringalign"] = json.load(f)
# Load the second stringalign result file ("dinglehopper_processing"); it is only
# used for the direct comparison with Dinglehopper below.
with open("stringalign/stringalign_results/result_dinglehopper_processing.json", "r") as f:
    results["Stringalign (Dinglehopper)"] = json.load(f)

# Convert results to records: one row per (method, sample) pair
result_records = [
    {"method": method, "sample": sample, **sample_result}
    for method, single_method_results in results.items()
    for sample, sample_result in single_method_results.items()
]

# Assemble in DataFrame
df = pd.DataFrame(result_records).sort_values(["sample", "method"]).set_index(["sample", "method"])
# Only per-page ocrevalUAtion reports are loaded above, so its "overall" row is
# filled in with the mean of its per-page scores.
df.loc[("overall", "ocrevalUAtion"),] = df.loc[(slice(None), "ocrevalUAtion"),].mean()

# Per-sample difference between Dinglehopper and the matching stringalign run.
dinglehopper_diff_df = df.loc[(slice(None), "Dinglehopper"),].reset_index(level="method", drop=True) - df.loc[
    (slice(None), "Stringalign (Dinglehopper)"),
].reset_index(level="method", drop=True)
# The comparison run is not a tool of its own, so keep it out of the final table.
df = df.drop("Stringalign (Dinglehopper)", level="method", errors="ignore")

# Get dispersion measures
# NOTE(review): the variable name says "from mean" but the deviation is taken
# from the per-sample *median* across methods - confirm which one is intended.
mean_absolute_deviation_from_mean = (
    df.drop("overall", level="sample", errors="ignore")  # Per-page rows only, without "Stringalign (Dinglehopper)"
    .groupby("sample")  # Get dataframes for each sample with all methods
    .transform(
        lambda s: abs(s - s.median())
    )  # For each method, compute its absolute deviation from the median in that sample
    .groupby("method")  # Get dataframes for each tool with all samples
    .mean()
)

overall_res = df.loc["overall"].copy()
overall_res.columns = pd.MultiIndex.from_tuples([("Overall measure", metric.upper()) for metric in df.columns])
mean_absolute_deviation_from_mean.columns = pd.MultiIndex.from_tuples(
    [("MADM", metric.upper()) for metric in df.columns]
)

out_df = pd.merge(overall_res, mean_absolute_deviation_from_mean, how="outer", on="method")
with open("hip21_table.tex", "w", encoding="utf-8") as f:
    f.write(out_df.to_latex(na_rep="--", float_format="%.4f"))

print(out_df)
print(
    "Diff between Stringalign and Dinglehopper when confusables are resolved equally\n",
    dinglehopper_diff_df.drop("overall", errors="ignore").max(),
)
51 changes: 51 additions & 0 deletions examples/compare_transcription_eval_tools/hip21_run_analysis.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash
# shellcheck disable=all
# Run every evaluation tool in turn. Each tool lives in a directory named after
# it, which contains "run_<tool>.sh" and receives its output in "<tool>_results".
run_analysis() {
    local tool
    for tool in isri jiwer meeteval ocrevalUAtion stringalign dinglehopper calamari; do
        # A subshell keeps the directory change local: if `cd` fails, neither
        # `rm -rf` nor a trailing `cd ..` can act on the wrong directory, and
        # the remaining tools still start from the right place.
        (
            cd "$tool" || exit 1
            rm -rf "${tool}_results"
            time bash "run_${tool}.sh"
        ) || echo "Analysis for ${tool} failed" >&2
    done
}

rm -rf ocr_results

# Clone only when the checkout is missing so the script can be re-run.
if [ ! -d hip21_ocrevaluation ]; then
    git clone https://github.com/cneud/hip21_ocrevaluation.git || exit 1
fi
# `git -C` pins the dataset revision without changing directory, so a missing
# clone can never make the checkout run against the enclosing repository.
git -C hip21_ocrevaluation checkout 9979dacfeebef65b419a44ea3f12a0bcba153c6f || exit 1

pipx run hip21_get_data.py
run_analysis
pipx run hip21_get_results.py
14 changes: 14 additions & 0 deletions examples/compare_transcription_eval_tools/isri/run_isri.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Image that builds ocreval from source and runs run_isri_analysis.py on the OCR results.
FROM python:3.14-trixie

# Build and install ocreval from a pinned commit of eddieantonio/ocreval.
RUN apt-get update && \
apt-get install -y build-essential libutf8proc-dev git && \
git clone https://github.com/eddieantonio/ocreval && \
cd ocreval && \
git checkout 873a0de5796c0b9ccf07a549afdd30159a9e0b3e && \
make && make install

WORKDIR /analysis
COPY ocr_results/ ./ocr_results/
COPY run_isri_analysis.py .

CMD ["python3", "run_isri_analysis.py"]
Loading