From 12cd6f3616b65e152e12e7cd05c9102e8637c323 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Sat, 13 Dec 2025 13:48:00 -0500
Subject: [PATCH 01/33] Test on Python 3.14 and move actions to Ubuntu 24.04

---
 .github/workflows/main.yml | 4 ++--
 README.md                  | 2 +-
 setup.py                   | 1 +
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index fb51447..1170954 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -12,12 +12,12 @@ on:
 
 jobs:
   build:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
 
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"]
 
     steps:
       - uses: actions/checkout@v4
diff --git a/README.md b/README.md
index 8b560dc..e894734 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ This will install the package and add the command `seqscore` in your Python
 environment.
 
 SeqScore requires Python 3.9 or higher. It is tested on Python 3.9,
-3.10, 3.11, 3.12, and 3.13.
+3.10, 3.11, 3.12, 3.13, and 3.14.
 
 ## License
 
diff --git a/setup.py b/setup.py
index 0c227b3..90b2082 100755
--- a/setup.py
+++ b/setup.py
@@ -39,6 +39,7 @@ def setup_package() -> None:
             "Programming Language :: Python :: 3.11",
             "Programming Language :: Python :: 3.12",
             "Programming Language :: Python :: 3.13",
+            "Programming Language :: Python :: 3.14",
             "Topic :: Scientific/Engineering :: Artificial Intelligence",
         ],
         url="https://github.com/bltlab/seqscore",

From efdffada7cdfe6f3dce4a8e24f35db6b01102e0a Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Sun, 14 Dec 2025 08:36:16 -0500
Subject: [PATCH 02/33] Increment version to 0.8.0

---
 seqscore/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/seqscore/__init__.py b/seqscore/__init__.py
index 49e0fc1..777f190 100644
--- a/seqscore/__init__.py
+++ b/seqscore/__init__.py
@@ -1 +1 @@
-__version__ = "0.7.0"
+__version__ = "0.8.0"

From fb0f7453b8ab0f5681fcc238e5a8525672a3a84b Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Sun, 14 Dec 2025 08:39:45 -0500
Subject: [PATCH 03/33] Update mypy and ruff to latest versions

---
 requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 0c54998..e976d18 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,8 +10,8 @@ pytest==8.3.5
 pytest-cov==5.0.0
 
 # For development
-mypy==1.14.1
-ruff==0.9.10
+mypy==1.19.0
+ruff==0.14.9
 
 # Documentation build
 # Disabled for now since we don't need them

From 37d031b7d2dfb3b41ad7ec3ff18e48d04d766895 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Sun, 14 Dec 2025 12:12:31 -0500
Subject: [PATCH 04/33] Add more scoring tests and rename error counting flag

---
 seqscore/conll.py     |  2 +-
 seqscore/scoring.py   | 19 ++++++---
 tests/test_scoring.py | 96 +++++++++++++++++++++++++++++++++----------
 3 files changed, 89 insertions(+), 28 deletions(-)

diff --git a/seqscore/conll.py b/seqscore/conll.py
index d60bb8b..05ad864 100644
--- a/seqscore/conll.py
+++ b/seqscore/conll.py
@@ -514,7 +514,7 @@ def score_conll_files(
         )
 
         class_scores, acc_scores = compute_scores(
-            pred_docs, ref_docs, count_fp_fn=error_counts
+            pred_docs, ref_docs, count_fp_fn_examples=error_counts
         )
         all_class_scores.append(class_scores)
         all_acc_scores.append(class_scores)
diff --git a/seqscore/scoring.py b/seqscore/scoring.py
index 1622613..c0c8079 100644
--- a/seqscore/scoring.py
+++ b/seqscore/scoring.py
@@ -133,7 +133,7 @@ def compute_scores(
     pred_docs: Sequence[Sequence[LabeledSequence]],
     ref_docs: Sequence[Sequence[LabeledSequence]],
     *,
-    count_fp_fn: bool = False,
+    count_fp_fn_examples: bool = False,
 ) -> tuple[ClassificationScore, AccuracyScore]:
     accuracy = AccuracyScore()
     classification = ClassificationScore()
@@ -174,7 +174,7 @@ def compute_scores(
                 ref_sequence.mentions,
                 classification,
                 tokens=ref_sequence.tokens,
-                count_fp_fn=count_fp_fn,
+                count_fp_fn_examples=count_fp_fn_examples,
             )
 
     return classification, accuracy
@@ -205,13 +205,20 @@ def score_sequence_mentions(
     score: ClassificationScore,
     *,
     tokens: Optional[Sequence[str]] = (),
-    count_fp_fn: bool = False,
+    count_fp_fn_examples: bool = False,
 ) -> None:
     """Update a ClassificationScore for a single sequence's mentions.
 
     Since mentions are defined per-sequence, the behavior is not defined
-    if you provide mentions corresponding to multiple sequences.
+    if you provide mentions corresponding to multiple sequences. Tokens
+    must be provided if you want false positive and false negative
+    examples to be counted.
     """
+    if count_fp_fn_examples and not tokens:
+        raise ValueError(
+            "Tokens must be provided to count false positive/negative examples"
+        )
+
     # Compute span accuracy
     pred_mentions_set = set(pred_mentions)
     ref_mentions_set = set(ref_mentions)
@@ -226,7 +233,7 @@ def score_sequence_mentions(
             # False positive
             score.false_pos += 1
             score.type_scores[pred.type].false_pos += 1
-            if count_fp_fn:
+            if count_fp_fn_examples:
                 error_tokens = tokens[pred.span.start : pred.span.end]
                 score.count_false_positive(error_tokens, pred.type)
 
@@ -235,7 +242,7 @@ def score_sequence_mentions(
         if ref not in pred_mentions_set:
             score.false_neg += 1
             score.type_scores[ref.type].false_neg += 1
-            if count_fp_fn:
+            if count_fp_fn_examples:
                 error_tokens = tokens[ref.span.start : ref.span.end]
                 score.count_false_negative(error_tokens, ref.type)
 
diff --git a/tests/test_scoring.py b/tests/test_scoring.py
index ebb8881..a74558d 100644
--- a/tests/test_scoring.py
+++ b/tests/test_scoring.py
@@ -1,3 +1,4 @@
+from collections import Counter
 from decimal import Decimal
 
 import pytest
@@ -8,6 +9,7 @@
     AccuracyScore,
     ClassificationScore,
     TokenCountError,
+    TokensWithType,
     compute_scores,
     convert_score,
     score_label_sequences,
@@ -45,7 +47,7 @@ def test_score_sentence_labels_invalid() -> None:
         score_sequence_label_accuracy(pred_labels, ref_labels, AccuracyScore())
 
 
-def test_score_sentence_mentions_correct() -> None:
+def test_score_sequence_mentions_correct() -> None:
     ref_mentions = [Mention(Span(0, 2), "PER"), Mention(Span(4, 5), "ORG")]
     pred_mentions = [Mention(Span(0, 2), "PER"), Mention(Span(4, 5), "ORG")]
     score = ClassificationScore()
@@ -63,8 +65,14 @@ def test_score_sentence_mentions_correct() -> None:
     assert score.recall == 1.0
     assert score.f1 == 1.0
 
+    # Test that tokens are required for counting FP/FN
+    with pytest.raises(ValueError):
+        score_sequence_mentions(
+            pred_mentions, ref_mentions, score, count_fp_fn_examples=True
+        )
+
 
-def test_score_sentence_mentions_incorrect1() -> None:
+def test_score_sequence_mentions_incorrect1() -> None:
     ref_mentions = [
         Mention(Span(0, 2), "LOC"),
         Mention(Span(4, 5), "PER"),
@@ -100,6 +108,28 @@ def test_score_sentence_mentions_incorrect1() -> None:
         2 * (score.precision * score.recall) / (score.precision + score.recall)
     )
 
+    # Run again and check counted fp/fn examples. We do this in a second pass so
+    # we can cover both True/False cases for count_fp_fn_examples.
+    score2 = ClassificationScore()
+    tokens = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"]
+    score_sequence_mentions(
+        pred_mentions, ref_mentions, score2, count_fp_fn_examples=True, tokens=tokens
+    )
+    expected_false_pos = Counter(
+        [
+            TokensWithType(("a", "b"), "ORG"),
+            TokensWithType(("g",), "SPURIOUS"),
+        ]
+    )
+    expected_false_neg = Counter(
+        [
+            TokensWithType(("a", "b"), "LOC"),
+            TokensWithType(("h",), "MISC"),
+        ]
+    )
+    assert score2.false_pos_examples == expected_false_pos
+    assert score2.false_neg_examples == expected_false_neg
+
 
 def test_score_label_sequences_correct() -> None:
     ref_labels = [["O", "B-ORG", "I-ORG", "O"], ["B-PER", "I-PER"]]
@@ -192,60 +222,84 @@ def test_accuracy_score_empty() -> None:
     assert score.accuracy == 0.0
 
 
+def test_compute_scores() -> None:
+    ref_labels = ("O", "B-ORG", "I-ORG", "O", "B-LOC")
+    ref_mentions = (
+        Mention(Span(1, 3), "ORG"),
+        Mention(Span(4, 5), "LOC"),
+    )
+    pred_labels = ("O", "B-ORG", "I-ORG", "O", "B-ORG")
+    pred_mentions = (
+        Mention(Span(1, 3), "ORG"),
+        Mention(Span(4, 5), "ORG"),
+    )
+    tokens = ("a", "b", "c", "d", "e")
+    ref_sequence = LabeledSequence(tokens, ref_labels, ref_mentions)
+    pred_sequence = LabeledSequence(tokens, pred_labels, pred_mentions)
+    class_score, acc_score = compute_scores([[pred_sequence]], [[ref_sequence]])
+    assert acc_score.accuracy == 4 / 5
+    print(class_score)
+    assert class_score.true_pos == 1
+    assert class_score.false_pos == 1
+    assert class_score.false_neg == 1
+
+
 def test_token_count_error() -> None:
-    ref_labels = ["O", "B-ORG", "I-ORG", "O"]
-    pred_labels = ["O", "B-ORG", "I-ORG", "O", "O"]
+    ref_labels = ("O", "B-ORG", "I-ORG", "O")
+    pred_labels = ("O", "B-ORG", "I-ORG", "O", "O")
     ref_sequence = LabeledSequence(
-        ["a", "b", "c", "d"], ref_labels, provenance=SequenceProvenance(0, "test")
+        ("a", "b", "c", "d"), ref_labels, provenance=SequenceProvenance(0, "test")
     )
     pred_sequence = LabeledSequence(
-        ["a", "b", "c", "d", "e"], pred_labels, provenance=SequenceProvenance(0, "test")
+        ("a", "b", "c", "d", "e"), pred_labels, provenance=SequenceProvenance(0, "test")
     )
     with pytest.raises(TokenCountError):
         compute_scores([[pred_sequence]], [[ref_sequence]])
 
 
-def test_provenance_none_raises_error() -> None:
-    labels = ["O", "B-ORG"]
-    sequence = LabeledSequence(["a", "b"], labels, provenance=None)
+def test_token_count_error_provenance_none_raises_error() -> None:
+    labels = ("O", "B-ORG")
+    sequence = LabeledSequence(("a", "b"), labels, provenance=None)
     with pytest.raises(ValueError):
         TokenCountError.from_predicted_sequence(2, sequence)
 
 
 def test_differing_num_docs() -> None:
-    ref_labels = ["O", "B-ORG"]
-    pred_labels = ["O", "B-LOC"]
+    ref_labels = ("O", "B-ORG")
+    pred_labels = ("O", "B-LOC")
+    tokens = ("a", "b")
     ref_sequence = LabeledSequence(
-        ["a", "b"], ref_labels, provenance=SequenceProvenance(0, "test")
+        tokens, ref_labels, provenance=SequenceProvenance(0, "test")
     )
     pred_sequence = LabeledSequence(
-        ["a", "b"], pred_labels, provenance=SequenceProvenance(0, "test")
+        tokens, pred_labels, provenance=SequenceProvenance(0, "test")
     )
     with pytest.raises(ValueError):
         compute_scores([[pred_sequence]], [[ref_sequence], [ref_sequence]])
 
 
 def test_differing_doc_length() -> None:
-    ref_labels = ["O", "B-ORG"]
-    pred_labels = ["O", "B-LOC"]
+    ref_labels = ("O", "B-ORG")
+    pred_labels = ("O", "B-LOC")
+    tokens = ("a", "b")
     ref_sequence = LabeledSequence(
-        ["a", "b"], ref_labels, provenance=SequenceProvenance(0, "test")
+        tokens, ref_labels, provenance=SequenceProvenance(0, "test")
     )
     pred_sequence = LabeledSequence(
-        ["a", "b"], pred_labels, provenance=SequenceProvenance(0, "test")
+        tokens, pred_labels, provenance=SequenceProvenance(0, "test")
     )
     with pytest.raises(ValueError):
         compute_scores([[pred_sequence]], [[ref_sequence, ref_sequence]])
 
 
 def test_differing_pred_and_ref_tokens() -> None:
-    ref_labels = ["O", "B-ORG"]
-    pred_labels = ["O", "B-LOC"]
+    ref_labels = ("O", "B-ORG")
+    pred_labels = ("O", "B-LOC")
     ref_sequence = LabeledSequence(
-        ["a", "b"], ref_labels, provenance=SequenceProvenance(0, "test")
+        ("a", "b"), ref_labels, provenance=SequenceProvenance(0, "test")
     )
     pred_sequence = LabeledSequence(
-        ["a", "c"], pred_labels, provenance=SequenceProvenance(0, "test")
+        ("a", "c"), pred_labels, provenance=SequenceProvenance(0, "test")
     )
     with pytest.raises(ValueError):
         compute_scores([[pred_sequence]], [[ref_sequence]])

From 9328926805531119db2b605bf86a895b0ccfbf7a Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Tue, 16 Dec 2025 05:02:17 -0500
Subject: [PATCH 05/33] Add total line to summarize subcommand

---
 seqscore/scripts/seqscore.py  | 10 +++++----
 tests/test_summarize_click.py | 40 +++++++++++++++++++++--------------
 2 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py
index 859ee24..1fcc01e 100644
--- a/seqscore/scripts/seqscore.py
+++ b/seqscore/scripts/seqscore.py
@@ -2,10 +2,10 @@
 import sys
 from collections import Counter
 from contextlib import nullcontext
-from typing import Callable, Optional
+from typing import Callable, Optional, Union
 
 import click
-from tabulate import tabulate
+from tabulate import SEPARATING_LINE, tabulate
 
 import seqscore
 from seqscore.conll import (
@@ -389,8 +389,10 @@ def summarize(
         print(f"Total {total_documents} document(s) and {total_sentences} sentences")
 
     header = ["Entity Type", "Count"]
-    rows = sorted(type_counts.items())
-    print(tabulate(rows, header, tablefmt="github", floatfmt="6.2f"))
+    rows: list[Union[tuple[str, int], str]] = sorted(type_counts.items())
+    rows.append(SEPARATING_LINE)
+    rows.append(("TOTAL", sum(type_counts.values())))
+    print(tabulate(rows, header, intfmt=","))
 
 
 @cli.command(help="score a file and report performance or an error count table")
diff --git a/tests/test_summarize_click.py b/tests/test_summarize_click.py
index f957e59..4088093 100644
--- a/tests/test_summarize_click.py
+++ b/tests/test_summarize_click.py
@@ -19,10 +19,12 @@ def test_summarize_bio_onedoc() -> None:
     assert (
         result.output
         == """File 'tests/conll_annotation/minimal.bio' contains 1 document(s) and 2 sentences
-| Entity Type   |   Count |
-|---------------|---------|
-| LOC           |       2 |
-| ORG           |       1 |
+Entity Type      Count
+-------------  -------
+LOC                  2
+ORG                  1
+-------------  -------
+TOTAL                3
 """
     )
 
@@ -41,10 +43,12 @@ def test_summarize_bio_onedoc_quiet() -> None:
     assert result.exit_code == 0
     assert (
         result.output
-        == """| Entity Type   |   Count |
-|---------------|---------|
-| LOC           |       2 |
-| ORG           |       1 |
+        == """Entity Type      Count
+-------------  -------
+LOC                  2
+ORG                  1
+-------------  -------
+TOTAL                3
 """
     )
 
@@ -63,10 +67,12 @@ def test_summarize_iob_twodoc() -> None:
     assert (
         result.output
         == """File 'tests/conll_annotation/minimal_fields.iob' contains 2 document(s) and 2 sentences
-| Entity Type   |   Count |
-|---------------|---------|
-| LOC           |       2 |
-| ORG           |       1 |
+Entity Type      Count
+-------------  -------
+LOC                  2
+ORG                  1
+-------------  -------
+TOTAL                3
 """
     )
 
@@ -88,9 +94,11 @@ def test_summarize_bio_twofiles() -> None:
         == """File 'tests/conll_annotation/minimal.bio' contains 1 document(s) and 2 sentences
 File 'tests/conll_annotation/minimal2.bio' contains 1 document(s) and 2 sentences
 Total 2 document(s) and 4 sentences
-| Entity Type   |   Count |
-|---------------|---------|
-| LOC           |       5 |
-| ORG           |       2 |
+Entity Type      Count
+-------------  -------
+LOC                  5
+ORG                  2
+-------------  -------
+TOTAL                7
 """
     )

From 9b85b43b8f520ed07d7a1713b423fa4795d4b593 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Tue, 16 Dec 2025 05:02:40 -0500
Subject: [PATCH 06/33] Change default output delimiter for CoNLL files to tab

---
 seqscore/scripts/seqscore.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py
index 1fcc01e..4726a8d 100644
--- a/seqscore/scripts/seqscore.py
+++ b/seqscore/scripts/seqscore.py
@@ -98,6 +98,14 @@ def _labels_option_default_bio() -> Callable:
     )
 
 
+def _output_delim_option() -> Callable:
+    return click.option(
+        "--output-delim",
+        default="\t",
+        help="the delimiter to be used for output (has no effect on input) [default: tab]",
+    )
+
+
 def _quiet_option() -> Callable:
     return click.option(
         "--quiet",
@@ -151,7 +159,7 @@ def validate(
 @click.argument("output_file")
 @_repair_required_option()
 @_labels_option()
-@click.option("--output-delim", default=" ", help="[default: space]")
+@_output_delim_option()
 @_quiet_option()
 def repair(
     file: str,
@@ -165,6 +173,7 @@ def repair(
     parse_comment_lines: bool,
     quiet: bool,
 ) -> None:
+    output_delim = _normalize_tab(output_delim)
     if repair_method == REPAIR_NONE:
         raise ValueError(f"Cannot repair with repair strategy {repr(repair_method)}")
 
@@ -184,7 +193,7 @@ def repair(
 @cli.command(help="convert between mention encodings")
 @_single_input_file_arguments
 @click.argument("output_file")
-@click.option("--output-delim", default=" ", help="[default: space]")
+@_output_delim_option()
 @click.option("--input-labels", required=True, type=click.Choice(SUPPORTED_ENCODINGS))
 @click.option("--output-labels", required=True, type=click.Choice(SUPPORTED_ENCODINGS))
 def convert(
@@ -198,6 +207,7 @@ def convert(
     ignore_document_boundaries: bool,
     parse_comment_lines: bool,
 ) -> None:
+    output_delim = _normalize_tab(output_delim)
     if input_labels == output_labels:
         raise ValueError("Conversion requires different input and output labels")
 
@@ -233,7 +243,7 @@ def convert(
     type=click.Path(dir_okay=False),
     help="a JSON file containing types to be modified, in the format of a dict with keys as the target type and values as the source type [example file: {'MISC': ['WorkOfArt', 'Event']}]",
 )
-@click.option("--output-delim", default=" ", help="[default: space]")
+@_output_delim_option()
 def process(
     file: str,
     output_file: str,
@@ -247,6 +257,7 @@ def process(
     ignore_document_boundaries: bool,
     parse_comment_lines: bool,
 ) -> None:
+    output_delim = _normalize_tab(output_delim)
     keep_types_set = _parse_type_list(keep_types)
     remove_types_set = _parse_type_list(remove_types)
     type_map_dict: dict[str, list[str]] = _load_type_map(type_map, file_encoding)
@@ -281,11 +292,7 @@ def process(
 )
 @_repair_option()
 @_labels_option_default_bio()
-@click.option(
-    "--output-delim",
-    default="\t",
-    help="the delimiter to be used for output (has no effect on input) [default: tab]",
-)
+@_output_delim_option()
 @_quiet_option()
 def count(
     file: list[str],  # Name is "file" to make sense on the command line, but it's a list

From 0cfedcfd1f63ce1344f68a95bffda1a30b887d89 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Fri, 27 Feb 2026 04:34:51 -0500
Subject: [PATCH 07/33] Add TODOs

---
 seqscore/scripts/seqscore.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py
index 4726a8d..cdcd8c0 100644
--- a/seqscore/scripts/seqscore.py
+++ b/seqscore/scripts/seqscore.py
@@ -346,6 +346,8 @@ def count(
             )
 
 
+# TODO: Add support for delimited file output
+# TODO: Take format argument for tabulate from command line
 @cli.command(help="show counts of the documents, sentences, and entity types")
 @_multi_input_file_arguments
 @_repair_option()

From 39234f13006ad1701e2d1813ea943c0afe179736 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Fri, 27 Feb 2026 04:39:12 -0500
Subject: [PATCH 08/33] Check for empty tokens during validation

---
 seqscore/conll.py                            | 26 ++++++++++++--------
 seqscore/model.py                            | 10 ++++----
 seqscore/validation.py                       |  8 ++++++
 tests/test_conll_format.py                   | 12 +++++++++
 tests/test_files/minimal_bio_empty_token.txt | 17 +++++++++++++
 tests/test_model.py                          |  6 +++--
 tests/test_repair_click.py                   |  4 +--
 tests/test_validation.py                     | 11 +++++++++
 8 files changed, 75 insertions(+), 19 deletions(-)
 create mode 100644 tests/test_files/minimal_bio_empty_token.txt

diff --git a/seqscore/conll.py b/seqscore/conll.py
index 05ad864..127617a 100644
--- a/seqscore/conll.py
+++ b/seqscore/conll.py
@@ -156,7 +156,7 @@ def ingest(
                     if not quiet:
                         msg = (
                             [
-                                f"Validation errors in sequence at line {line_nums[0]} of {source_name}:"
+                                f"Validation errors in sequence beginning at line {line_nums[0]} of {source_name}:"
                             ]
                             + [error.msg for error in validation.errors]
                             + [
@@ -182,14 +182,20 @@ def ingest(
                     + " ".join(labels),
                 ) from e
 
-            sequences = LabeledSequence(
-                tokens,
-                labels,
-                mentions,
-                other_fields=other_fields,
-                provenance=SequenceProvenance(line_nums[0], source_name),
-                comment=comment,
-            )
+            try:
+                sequences = LabeledSequence(
+                    tokens,
+                    labels,
+                    mentions,
+                    other_fields=other_fields,
+                    provenance=SequenceProvenance(line_nums[0], source_name),
+                    comment=comment,
+                )
+            except ValueError as e:  # pragma: no cover
+                # Unreachable unless there is a bug in validation
+                raise ValueError(
+                    f"Invalid sequence error in sequence beginning at line {line_nums[0]} of {source_name}"
+                ) from e
             document.append(sequences)
 
         # Yield final document if non-empty
@@ -211,7 +217,7 @@ def validate(
                 # But we check anyway to be absolutely sure we aren't throwing away a sequence.
                 assert len(source_sequence) == 1
 
-                # If we care about document boundaries and we have results for this document,
+                # If we care about document boundaries and have results for this document,
                 # add it and move on.
                 if not self.ignore_document_boundaries and document_results:
                     all_results.append(document_results)
diff --git a/seqscore/model.py b/seqscore/model.py
index 09e67e2..6df73b3 100644
--- a/seqscore/model.py
+++ b/seqscore/model.py
@@ -85,15 +85,15 @@ def __attrs_post_init__(self) -> None:
                 "must be of the same length"
             )
 
-        for label in self.labels:
+        for idx, label in enumerate(self.labels):
             # Labels cannot be None or an empty string
             if not label:
-                raise ValueError(f"Invalid label: {repr(label)}")
+                raise ValueError(f"Invalid label at sequence index {idx}: {repr(label)}")
 
-        for token in self.tokens:
-            # Labels cannot be None or an empty string
+        for idx, token in enumerate(self.tokens):
+            # Tokens cannot be None or an empty string
             if not token:
-                raise ValueError(f"Invalid token: {repr(token)}")
+                raise ValueError(f"Invalid token at sequence index {idx}: {repr(token)}")
 
     def with_mentions(self, mentions: Sequence[Mention]) -> "LabeledSequence":
         return LabeledSequence(
diff --git a/seqscore/validation.py b/seqscore/validation.py
index 24bff39..6d3c756 100644
--- a/seqscore/validation.py
+++ b/seqscore/validation.py
@@ -81,6 +81,14 @@ def validate_labels(
         "Line numbers and labels must be the same length"
     )
 
+    # Validate tokens if supplied
+    if tokens:
+        for idx, tok in enumerate(tokens):
+            if not tok:
+                line_msg = f" on line {line_nums[idx]}" if line_nums else ""
+                source_msg = f" of {source_name}" if source_name else ""
+                raise ValueError(f"Invalid token {repr(tok)}{line_msg}{source_msg}")
+
     errors: list[ValidationError] = []
     outside = encoding.dialect.outside
 
diff --git a/tests/test_conll_format.py b/tests/test_conll_format.py
index 0ca1d08..bc4d196 100644
--- a/tests/test_conll_format.py
+++ b/tests/test_conll_format.py
@@ -70,3 +70,15 @@ def test_parse_comments_false() -> None:
             str(err.value)
             == "Could not parse label 'fields' on line 1 of test during validation: Label 'fields' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '<STATE>-<ENTITY_TYPE>'. The first token '#' of this sentence starts with '#'. If it's a comment, consider enabling --parse-comment-lines."
         )
+
+
+def test_invalid_token_leading_space() -> None:
+    mention_encoding = get_encoding("BIO")
+    ingester = CoNLLIngester(mention_encoding)
+
+    path = Path("tests") / "test_files" / "minimal_bio_empty_token.txt"
+    with path.open(encoding="utf8") as file:
+        with pytest.raises(ValueError) as err:
+            list(ingester.ingest(file, "test", REPAIR_NONE))
+
+    assert str(err.value) == "Invalid token '' on line 9 of test"
diff --git a/tests/test_files/minimal_bio_empty_token.txt b/tests/test_files/minimal_bio_empty_token.txt
new file mode 100644
index 0000000..a497c92
--- /dev/null
+++ b/tests/test_files/minimal_bio_empty_token.txt
@@ -0,0 +1,17 @@
+This	O
+is	O
+a	O
+sentence	O
+.	O
+
+University	B-ORG
+of	I-ORG
+	I-ORG
+is	O
+in	O
+West	B-LOC
+Philadelphia	I-LOC
+,	O
+Pennsylvania	B-LOC
+.	O
+
diff --git a/tests/test_model.py b/tests/test_model.py
index b2e4732..50dcee1 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -58,13 +58,15 @@ def test_labeled_sentence() -> None:
         # Empty
         LabeledSequence([], [])
 
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError) as err:
         # Bad label
         LabeledSequence(["a"], [""])
+    assert str(err.value) == "Invalid label at sequence index 0: ''"
 
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError) as err:
         # Bad token
         LabeledSequence([""], ["B-PER"])
+    assert str(err.value) == "Invalid token at sequence index 0: ''"
 
     s2 = s1.with_mentions([Mention(Span(0, 2), "PER")])
     assert s2.mentions == (Mention(Span(0, 2), "PER"),)
diff --git a/tests/test_repair_click.py b/tests/test_repair_click.py
index 2eb9c78..81ac443 100644
--- a/tests/test_repair_click.py
+++ b/tests/test_repair_click.py
@@ -37,7 +37,7 @@ def test_repair_BIO_conlleval() -> None:
     assert result.exit_code == 0
     assert (
         normalize_str_with_path(
-            "Validation errors in sequence at line 7 of tests/conll_annotation/invalid1.bio:"
+            "Validation errors in sequence beginning at line 7 of tests/conll_annotation/invalid1.bio:"
         )
         in result.output
     )
@@ -83,7 +83,7 @@ def test_repair_BIO_discard() -> None:
     assert result.exit_code == 0
     assert (
         normalize_str_with_path(
-            "Validation errors in sequence at line 7 of tests/conll_annotation/invalid1.bio:"
+            "Validation errors in sequence beginning at line 7 of tests/conll_annotation/invalid1.bio:"
         )
         in result.output
     )
diff --git a/tests/test_validation.py b/tests/test_validation.py
index 614a928..c8f121e 100644
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -344,3 +344,14 @@ def test_validation_bad_label() -> None:
         str(err.value)
         == "Could not parse label 'PER' on line 8 during validation: Label 'PER' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '<STATE>-<ENTITY_TYPE>'."
     )
+
+
+def test_validation_bad_token() -> None:
+    encoding = get_encoding("BIO")
+
+    tokens = ["Dr.", "", "Salk"]
+    line_nums = [7, 8, 9]
+    labels = ["O", "PER", "PER"]
+    with pytest.raises(ValueError) as err:
+        validate_labels(labels, encoding, tokens=tokens, line_nums=line_nums)
+    assert str(err.value) == "Invalid token '' on line 8"

From 9cd080d9f855af1d204662a50b7c106dae4b0b8a Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Fri, 27 Feb 2026 05:57:48 -0500
Subject: [PATCH 09/33] Update actions to latest versions

---
 .github/workflows/main.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 1170954..6cd5e32 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -20,10 +20,10 @@ jobs:
         python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"]
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
 
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
           python-version: ${{ matrix.python-version }}
 

From 65a2fc913d669aaef44065ce06759ff899bca808 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Mon, 30 Mar 2026 09:53:13 -0400
Subject: [PATCH 10/33] Remove old documentation build files

---
 .readthedocs.yaml | 22 ---------------
 docs/Makefile     | 20 -------------
 docs/conf.py      | 72 -----------------------------------------------
 docs/index.rst    | 20 -------------
 docs/make.bat     | 35 -----------------------
 5 files changed, 169 deletions(-)
 delete mode 100644 .readthedocs.yaml
 delete mode 100644 docs/Makefile
 delete mode 100644 docs/conf.py
 delete mode 100644 docs/index.rst
 delete mode 100644 docs/make.bat

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
deleted file mode 100644
index 9f23ef8..0000000
--- a/.readthedocs.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-# .readthedocs.yaml
-# Read the Docs configuration file
-# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
-
-# Required
-version: 2
-
-# Set the version of Python and other tools you might need
-build:
-  os: ubuntu-20.04
-  tools:
-    python: "3.8"
-
-# Build documentation in the docs/ directory with Sphinx
-sphinx:
-   configuration: docs/conf.py
-
-# Optionally declare the Python requirements required to build your docs
-python:
-   install:
-     - method: pip
-       path: .
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index d4bb2cb..0000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS    ?=
-SPHINXBUILD   ?= sphinx-build
-SOURCEDIR     = .
-BUILDDIR      = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/conf.py b/docs/conf.py
deleted file mode 100644
index 2497af9..0000000
--- a/docs/conf.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# This file only contains a selection of the most common options. For a full
-# list see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-from seqscore import __version__
-
-# -- Path setup --------------------------------------------------------------
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
-
-
-# -- Project information -----------------------------------------------------
-
-project = "SeqScore"
-copyright = "2021, Constantine Lignos, Chester Palen-Michel, and Nolan Holley"
-author = "Constantine Lignos, Chester Palen-Michel, and Nolan Holley"
-
-version = __version__
-# The full version, including alpha/beta/rc tags
-release = version
-
-
-# -- General configuration ---------------------------------------------------
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
-    "sphinx.ext.duration",
-    "sphinx.ext.doctest",
-    "sphinx.ext.autodoc",
-    "sphinx.ext.autosummary",
-    "sphinx.ext.intersphinx",
-]
-
-intersphinx_mapping = {
-    "python": ("https://docs.python.org/3/", None),
-    "sphinx": ("https://www.sphinx-doc.org/en/master/", None),
-}
-intersphinx_disabled_domains = ["std"]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ["_templates"]
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = []
-
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
-html_theme = "sphinx_rtd_theme"
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ["_static"]
-
-# -- Options for EPUB output
-epub_show_urls = "footnote"
diff --git a/docs/index.rst b/docs/index.rst
deleted file mode 100644
index fac6cf6..0000000
--- a/docs/index.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-.. SeqScore documentation master file, created by
-   sphinx-quickstart on Wed Nov 10 05:11:47 2021.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
-SeqScore
-========
-
-.. toctree::
-   :maxdepth: 2
-   :caption: Contents:
-
-
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
diff --git a/docs/make.bat b/docs/make.bat
deleted file mode 100644
index 153be5e..0000000
--- a/docs/make.bat
+++ /dev/null
@@ -1,35 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=.
-set BUILDDIR=_build
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-	echo.installed, then set the SPHINXBUILD environment variable to point
-	echo.to the full path of the 'sphinx-build' executable. Alternatively you
-	echo.may add the Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.https://www.sphinx-doc.org/
-	exit /b 1
-)
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd

From 5f415fae685324c4e1b4a15a7f576b4f7f85c51b Mon Sep 17 00:00:00 2001
From: sunshower7 <claire.yq@proton.me>
Date: Mon, 1 Jun 2026 13:19:02 -0400
Subject: [PATCH 11/33] Add --ner-label-index option to validate and disable
 Python 3.9 build

---
 .github/workflows/main.yml                    |  2 +-
 pyproject.toml                                |  2 +-
 seqscore/conll.py                             | 20 +++++++++----
 seqscore/scripts/seqscore.py                  | 12 ++++++++
 .../conll_annotation/labels_not_last_col.bio  | 16 ++++++++++
 tests/test_validation_click.py                | 30 +++++++++++++++++++
 6 files changed, 74 insertions(+), 8 deletions(-)
 create mode 100644 tests/conll_annotation/labels_not_last_col.bio

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 6cd5e32..ebf8a1e 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"]
+        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
 
     steps:
       - uses: actions/checkout@v6
diff --git a/pyproject.toml b/pyproject.toml
index e3493a6..f8d810e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [tool.mypy]
-python_version = 3.9
+python_version = 3.10
 strict_optional = false
 disallow_untyped_defs = true
 disallow_untyped_calls = true
diff --git a/seqscore/conll.py b/seqscore/conll.py
index 127617a..b429a7f 100644
--- a/seqscore/conll.py
+++ b/seqscore/conll.py
@@ -52,7 +52,7 @@ class _CoNLLToken:
     other_fields: tuple[str, ...] = attrib()
 
     @classmethod
-    def from_line(cls, line: str, line_num: int, source_name: str) -> "_CoNLLToken":
+    def from_line(cls, line: str, line_num: int, source_name: str, ner_label_index: int) -> "_CoNLLToken":
         # Note: The caller must strip the line of any trailing whitespace
         # TODO: Sense the file rather than the line so we get consistency across lines
         # Try tab first since it's safer, then space
@@ -72,9 +72,14 @@ def from_line(cls, line: str, line_num: int, source_name: str) -> "_CoNLLToken":
                     f"Line {line_num} of {source_name} is not delimited by space or tab: {repr(line)}"
                 )
 
+        if ner_label_index == 0:
+            raise ValueError("ner_label_index cannot be 0")
+
         text = splits[0]
-        label = splits[-1]
-        other_fields = tuple(splits[1:-1])
+        label = splits[ner_label_index]
+        other_fields = tuple(splits[1:ner_label_index])
+        if ner_label_index != -1:
+            other_fields += tuple(splits[ner_label_index + 1:])
         is_docstart = text == DOCSTART
         return cls(text, label, is_docstart, line_num, other_fields)
 
@@ -84,6 +89,7 @@ class CoNLLIngester:
     encoding: Encoding = attrib()
     parse_comment_lines: bool = attrib(default=False, kw_only=True)
     ignore_document_boundaries: bool = attrib(default=True, kw_only=True)
+    ner_label_index: int = attrib(default=-1, kw_only=True)
 
     def ingest(
         self,
@@ -210,7 +216,7 @@ def validate(
         document_results: list[SequenceValidationResult] = []
 
         for source_sequence, _ in self._parse_file(
-            source, source_name, parse_comments=self.parse_comment_lines
+            source, source_name, parse_comments=self.parse_comment_lines, ner_label_index=self.ner_label_index
         ):
             if source_sequence[0].is_docstart:
                 # We can ony receive DOCSTART in a sequence by itself, see _parse_file.
@@ -253,7 +259,7 @@ def _decompose_sequence(
 
     @classmethod
     def _parse_file(
-        cls, input_file: TextIO, source_name: str, *, parse_comments: bool = False
+        cls, input_file: TextIO, source_name: str, *, parse_comments: bool = False, ner_label_index: int = -1
     ) -> Iterable[tuple[tuple[_CoNLLToken, ...], Optional[str]]]:
         sequence: list = []
         comment: Optional[str] = None
@@ -285,7 +291,7 @@ def _parse_file(
                 # Always skip empty lines
                 continue
 
-            token = _CoNLLToken.from_line(line, line_num, source_name)
+            token = _CoNLLToken.from_line(line, line_num, source_name, ner_label_index)
             # Skip document starts, but ensure sequence is empty when we reach them
             if token.is_docstart:
                 if sequence:
@@ -352,12 +358,14 @@ def validate_conll_file(
     *,
     ignore_document_boundaries: bool,
     parse_comment_lines: bool,
+    ner_label_index: int,
 ) -> ValidationResult:
     encoding = get_encoding(mention_encoding_name)
     ingester = CoNLLIngester(
         encoding,
         parse_comment_lines=parse_comment_lines,
         ignore_document_boundaries=ignore_document_boundaries,
+        ner_label_index=ner_label_index,
     )
     with open(input_path, encoding=file_encoding) as input_file:
         results = ingester.validate(input_file, input_path)
diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py
index cdcd8c0..20606f3 100644
--- a/seqscore/scripts/seqscore.py
+++ b/seqscore/scripts/seqscore.py
@@ -106,6 +106,15 @@ def _output_delim_option() -> Callable:
     )
 
 
+def _ner_label_index_option() -> Callable:
+    return click.option(
+        "--ner-label-index",
+        default=-1,
+        show_default=True,
+        type=int,
+    )
+
+
 def _quiet_option() -> Callable:
     return click.option(
         "--quiet",
@@ -118,6 +127,7 @@ def _quiet_option() -> Callable:
 @cli.command(help="validate labels")
 @_multi_input_file_arguments
 @_labels_option()
+@_ner_label_index_option()
 @_quiet_option()
 def validate(
     file: list[str],  # Name is "file" to make sense on the command line, but it's a list
@@ -126,6 +136,7 @@ def validate(
     *,
     ignore_document_boundaries: bool,
     parse_comment_lines: bool,
+    ner_label_index: int,
     quiet: bool,
 ) -> None:
     error = False
@@ -136,6 +147,7 @@ def validate(
             file_encoding,
             ignore_document_boundaries=ignore_document_boundaries,
             parse_comment_lines=parse_comment_lines,
+            ner_label_index=ner_label_index,
         )
         if result.errors:
             print(
diff --git a/tests/conll_annotation/labels_not_last_col.bio b/tests/conll_annotation/labels_not_last_col.bio
new file mode 100644
index 0000000..cc91cf4
--- /dev/null
+++ b/tests/conll_annotation/labels_not_last_col.bio
@@ -0,0 +1,16 @@
+This	O	DET
+is	O	VERB
+a	O	DET
+sentence	O	NOUN
+.	O	PUNCT
+
+University	B-ORG	NOUN
+of	I-ORG	ADP
+Pennsylvania	I-ORG	NOUN
+is	O	VERB
+in	O	ADP
+West	B-LOC	NOUN
+Philadelphia	I-LOC	NOUN
+,	O	PUNCT
+Pennsylvania	B-LOC	NOUN
+.	O	PUNCT
diff --git a/tests/test_validation_click.py b/tests/test_validation_click.py
index e688ecc..3cb4a98 100644
--- a/tests/test_validation_click.py
+++ b/tests/test_validation_click.py
@@ -214,3 +214,33 @@ def test_bad_label() -> None:
         str(result.exception)
         == "Could not parse label 'GPE' on line 4 during validation: Label 'GPE' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '<STATE>-<ENTITY_TYPE>'."
     )
+
+
+def test_ner_label_index_pos() -> None:
+    runner = CliRunner()
+    result = runner.invoke(
+        validate,
+        ["--labels", "BIO", "--ner-label-index", "1", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")],
+    )
+    assert result.output == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n"
+    assert result.exit_code == 0
+
+
+def test_ner_label_index_neg() -> None:
+    runner = CliRunner()
+    result = runner.invoke(
+        validate,
+        ["--labels", "BIO", "--ner-label-index", "-2", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")],
+    )
+    assert result.output == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n"
+    assert result.exit_code == 0
+
+
+def test_ner_label_index_zero() -> None:
+    runner = CliRunner()
+    result = runner.invoke(
+        validate,
+        ["--labels", "BIO", "--ner-label-index", "0", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")],
+    )
+    assert result.exit_code != 0
+    assert "ner_label_index cannot be 0" in str(result.exception)

From 868074cab508a815bfd1e102628dea7dc47ede24 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Tue, 2 Jun 2026 10:06:39 -0400
Subject: [PATCH 12/33] Remove Python 3.9 support

---
 README.md      | 6 +++---
 pyproject.toml | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index e894734..3f6c148 100644
--- a/README.md
+++ b/README.md
@@ -21,8 +21,8 @@ To install the latest official release of SeqScore, run: `pip install seqscore`.
 This will install the package and add the command `seqscore` in your Python
 environment.
 
-SeqScore requires Python 3.9 or higher. It is tested on Python 3.9,
-3.10, 3.11, 3.12, 3.13, and 3.14.
+SeqScore requires Python 3.10 or higher. It is tested on Python 3.10, 3.11, 3.12,
+3.13, and 3.14.
 
 ## License
 
@@ -600,7 +600,7 @@ To install from a clone of this repository, use:
 
 ## Setting up an environment for development
 
-1. Create an environment: `conda create -yn seqscore python=3.9`
+1. Create an environment: `conda create -yn seqscore python=3.10`
 2. Activate the environment: `conda activate seqscore`
 3. Install seqscore: `pip install -e .`
 4. Install development dependencies: `pip install -r requirements.txt`
diff --git a/pyproject.toml b/pyproject.toml
index f8d810e..212b567 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [tool.mypy]
-python_version = 3.10
+python_version = "3.10"
 strict_optional = false
 disallow_untyped_defs = true
 disallow_untyped_calls = true
@@ -13,4 +13,4 @@ ignore_missing_imports = true
 
 [tool.ruff]
 line-length = 90
-target-version = "py39"
+target-version = "py310"

From a4d0b5cd57128b4a68696e315e257013eef8ccc9 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Tue, 2 Jun 2026 10:42:05 -0400
Subject: [PATCH 13/33] Update development dependencies

---
 requirements.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index e976d18..8ac2e1b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,12 +6,12 @@
 types-tabulate
 
 # For testing
-pytest==8.3.5
-pytest-cov==5.0.0
+pytest==9.0.3
+pytest-cov>=7.1.0
 
 # For development
-mypy==1.19.0
-ruff==0.14.9
+mypy==2.1.0
+ruff==0.15.15
 
 # Documentation build
 # Disabled for now since we don't need them

From 65d7ef24e23f41d7806c84c66387f37af00c59f8 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Tue, 2 Jun 2026 13:21:13 -0400
Subject: [PATCH 14/33] Support setting token and label index across more
 commands

---
 seqscore/conll.py                             | 127 ++++++++++++------
 seqscore/model.py                             |  12 +-
 seqscore/scripts/seqscore.py                  |  59 +++++++-
 seqscore/util.py                              |   4 +-
 seqscore/validation.py                        |   3 +-
 .../diff_token_label_indices.bio              |  17 +++
 .../diff_token_label_indices.bioes            |  17 +++
 .../conll_annotation/labels_not_last_col.bio  |   1 +
 .../labels_not_last_col.bioes                 |  17 +++
 tests/test_conll_format.py                    |  29 ++--
 tests/test_conversion_click.py                |  40 +++++-
 tests/test_model.py                           |   2 +-
 tests/test_validation.py                      |   5 +-
 tests/test_validation_click.py                |  43 ++++--
 14 files changed, 283 insertions(+), 93 deletions(-)
 create mode 100644 tests/conll_annotation/diff_token_label_indices.bio
 create mode 100644 tests/conll_annotation/diff_token_label_indices.bioes
 create mode 100644 tests/conll_annotation/labels_not_last_col.bioes

diff --git a/seqscore/conll.py b/seqscore/conll.py
index b429a7f..9b9fedd 100644
--- a/seqscore/conll.py
+++ b/seqscore/conll.py
@@ -43,16 +43,36 @@ class CoNLLFormatError(Exception):
     pass
 
 
+@attrs(frozen=True)
+class LineSpec:
+    """Defines the fields and delimiters for a CoNLL-format line"""
+
+    token_index: int = attrib()
+    ner_label_index: int = attrib()
+
+    def __attrs_post_init__(self) -> None:
+        # This will only catch cases where the indices are identical, not
+        # when they refer to the same position, such as 1 and -1 in a
+        # sequence of length two
+        if self.token_index == self.ner_label_index:
+            raise ValueError(
+                f"Token index ({self.token_index}) and "
+                f"label index ({self.ner_label_index}) cannot be the same"
+            )
+
+
 @attrs(frozen=True)
 class _CoNLLToken:
     text: str = attrib()
     label: str = attrib()
     is_docstart: bool = attrib()
     line_num: int = attrib()
-    other_fields: tuple[str, ...] = attrib()
+    orig_fields: tuple[str, ...] = attrib()
 
     @classmethod
-    def from_line(cls, line: str, line_num: int, source_name: str, ner_label_index: int) -> "_CoNLLToken":
+    def from_line(
+        cls, line: str, line_num: int, source_name: str, line_spec: LineSpec
+    ) -> "_CoNLLToken":
         # Note: The caller must strip the line of any trailing whitespace
         # TODO: Sense the file rather than the line so we get consistency across lines
         # Try tab first since it's safer, then space
@@ -72,24 +92,19 @@ def from_line(cls, line: str, line_num: int, source_name: str, ner_label_index:
                     f"Line {line_num} of {source_name} is not delimited by space or tab: {repr(line)}"
                 )
 
-        if ner_label_index == 0:
-            raise ValueError("ner_label_index cannot be 0")
-
-        text = splits[0]
-        label = splits[ner_label_index]
-        other_fields = tuple(splits[1:ner_label_index])
-        if ner_label_index != -1:
-            other_fields += tuple(splits[ner_label_index + 1:])
+        text = splits[line_spec.token_index]
+        label = splits[line_spec.ner_label_index]
+        orig_fields = tuple(splits)
         is_docstart = text == DOCSTART
-        return cls(text, label, is_docstart, line_num, other_fields)
+        return cls(text, label, is_docstart, line_num, orig_fields)
 
 
 @attrs(frozen=True)
 class CoNLLIngester:
     encoding: Encoding = attrib()
+    line_spec: LineSpec = attrib()
     parse_comment_lines: bool = attrib(default=False, kw_only=True)
     ignore_document_boundaries: bool = attrib(default=True, kw_only=True)
-    ner_label_index: int = attrib(default=-1, kw_only=True)
 
     def ingest(
         self,
@@ -119,7 +134,7 @@ def ingest(
                 continue
 
             # Create mentions from tokens in sequence
-            tokens, labels, line_nums, other_fields = self._decompose_sequence(
+            tokens, labels, line_nums, orig_fields = self._decompose_sequence(
                 source_sequence
             )
 
@@ -193,7 +208,7 @@ def ingest(
                     tokens,
                     labels,
                     mentions,
-                    other_fields=other_fields,
+                    orig_fields=orig_fields,
                     provenance=SequenceProvenance(line_nums[0], source_name),
                     comment=comment,
                 )
@@ -210,13 +225,17 @@ def ingest(
             yield document
 
     def validate(
-        self, source: TextIO, source_name: str
+        self,
+        source: TextIO,
+        source_name: str,
     ) -> list[list[SequenceValidationResult]]:
         all_results: list[list[SequenceValidationResult]] = []
         document_results: list[SequenceValidationResult] = []
 
         for source_sequence, _ in self._parse_file(
-            source, source_name, parse_comments=self.parse_comment_lines, ner_label_index=self.ner_label_index
+            source,
+            source_name,
+            parse_comments=self.parse_comment_lines,
         ):
             if source_sequence[0].is_docstart:
                 # We can ony receive DOCSTART in a sequence by itself, see _parse_file.
@@ -254,12 +273,15 @@ def _decompose_sequence(
         tokens = tuple(tok.text for tok in source_sequence)
         labels = tuple(tok.label for tok in source_sequence)
         line_nums = tuple(tok.line_num for tok in source_sequence)
-        other_fields = tuple(tok.other_fields for tok in source_sequence)
-        return tokens, labels, line_nums, other_fields
+        orig_fields = tuple(tok.orig_fields for tok in source_sequence)
+        return tokens, labels, line_nums, orig_fields
 
-    @classmethod
     def _parse_file(
-        cls, input_file: TextIO, source_name: str, *, parse_comments: bool = False, ner_label_index: int = -1
+        self,
+        input_file: TextIO,
+        source_name: str,
+        *,
+        parse_comments: bool = False,
     ) -> Iterable[tuple[tuple[_CoNLLToken, ...], Optional[str]]]:
         sequence: list = []
         comment: Optional[str] = None
@@ -284,14 +306,14 @@ def _parse_file(
             if not line.strip():
                 # Clear out sequence if there's anything in it
                 if sequence:
-                    cls._check_sequence(sequence)
+                    self._check_sequence(sequence)
                     yield tuple(sequence), comment
                     sequence = []
                     comment = None
                 # Always skip empty lines
                 continue
 
-            token = _CoNLLToken.from_line(line, line_num, source_name, ner_label_index)
+            token = _CoNLLToken.from_line(line, line_num, source_name, self.line_spec)
             # Skip document starts, but ensure sequence is empty when we reach them
             if token.is_docstart:
                 if sequence:
@@ -301,7 +323,7 @@ def _parse_file(
                 else:
                     # Yield it by itself. Since the sequence variable is empty, leave it unchanged.
                     tmp_sent = (token,)
-                    cls._check_sequence(tmp_sent)
+                    self._check_sequence(tmp_sent)
                     # Don't return the comment yet, it will be returned with the sequence
                     yield tmp_sent, None
             else:
@@ -309,7 +331,7 @@ def _parse_file(
 
         # Finish the last sequence if needed
         if sequence:
-            cls._check_sequence(sequence)
+            self._check_sequence(sequence)
             yield tuple(sequence), comment
 
     @staticmethod
@@ -327,6 +349,7 @@ def ingest_conll_file(
     input_path: PathType,
     mention_encoding_name: str,
     file_encoding: str,
+    line_spec: LineSpec,
     *,
     repair: Optional[str] = None,
     ignore_document_boundaries: bool,
@@ -343,6 +366,7 @@ def ingest_conll_file(
 
     ingester = CoNLLIngester(
         mention_encoding,
+        line_spec,
         parse_comment_lines=parse_comment_lines,
         ignore_document_boundaries=ignore_document_boundaries,
     )
@@ -355,17 +379,17 @@ def validate_conll_file(
     input_path: str,
     mention_encoding_name: str,
     file_encoding: str,
+    line_spec: LineSpec,
     *,
     ignore_document_boundaries: bool,
     parse_comment_lines: bool,
-    ner_label_index: int,
 ) -> ValidationResult:
     encoding = get_encoding(mention_encoding_name)
     ingester = CoNLLIngester(
         encoding,
+        line_spec,
         parse_comment_lines=parse_comment_lines,
         ignore_document_boundaries=ignore_document_boundaries,
-        ner_label_index=ner_label_index,
     )
     with open(input_path, encoding=file_encoding) as input_file:
         results = ingester.validate(input_file, input_path)
@@ -388,6 +412,7 @@ def repair_conll_file(
     mention_encoding_name: str,
     repair: Optional[str],
     file_encoding: str,
+    line_spec: LineSpec,
     output_delim: str,
     *,
     ignore_document_boundaries: bool,
@@ -398,6 +423,7 @@ def repair_conll_file(
         input_file,
         mention_encoding_name,
         file_encoding,
+        line_spec,
         repair=repair,
         ignore_document_boundaries=ignore_document_boundaries,
         parse_comment_lines=parse_comment_lines,
@@ -429,6 +455,7 @@ def write_docs_using_encoding(
     mention_encoding_name: str,
     file_encoding: str,
     delim: str,
+    line_spec: LineSpec,
     output_path: PathType,
 ) -> None:
     mention_encoding = get_encoding(mention_encoding_name)
@@ -437,7 +464,12 @@ def write_docs_using_encoding(
     with open(output_path, "w", encoding=file_encoding) as file:
         for doc in docs:
             write_doc_using_encoding(
-                doc, mention_encoding, delim, file, output_docstart=output_docstart
+                doc,
+                mention_encoding,
+                delim,
+                file,
+                line_spec,
+                output_docstart=output_docstart,
             )
 
 
@@ -446,32 +478,42 @@ def write_doc_using_encoding(
     encoding: Encoding,
     delim: str,
     file: TextIO,
+    line_spec: LineSpec,
     *,
     output_docstart: bool,
 ) -> None:
     if output_docstart:
-        # Get a single token to figure out how many other_fields entries it has
-        sequence_other_fields = doc[0].other_fields
-        fields = [DOCSTART]
-        if sequence_other_fields:
-            fields.extend([EMPTY_OTHER_FIELD for _ in sequence_other_fields[0]])
-        fields.append(encoding.dialect.outside)
-
+        # Get the fields of the first token of the first sentence
+        if doc[0].orig_fields:
+            # to figure out how many fields there are
+            sequence_orig_fields = doc[0].orig_fields[0]
+            # Create the right number of fields
+            fields = [EMPTY_OTHER_FIELD] * len(sequence_orig_fields)
+            # Fill in the token and label
+            fields[line_spec.token_index] = DOCSTART
+            fields[line_spec.ner_label_index] = encoding.dialect.outside
+        else:
+            fields = [DOCSTART, encoding.dialect.outside]
+        # Write output
         print(delim.join(fields), file=file)
         print(file=file)
 
     for sequence in doc:
         labels = encoding.encode_sequence(sequence)
-        # Lengths of labels and other_fields have previously been checked to match tokens
-        for (token, other_fields), label in zip(
-            sequence.tokens_with_other_fields(), labels
+        # Lengths of labels and orig_fields have previously been checked to match tokens
+        for (token, orig_fields), label in zip(
+            sequence.tokens_with_orig_fields(), labels
         ):
-            fields = [token]
-            if other_fields:
-                fields.extend(other_fields)
-            fields.append(label)
+            if orig_fields:
+                fields = list(orig_fields)
+                fields[line_spec.token_index] = token
+                fields[line_spec.ner_label_index] = label
+            else:
+                fields = [token, label]
+            # Write output
             print(delim.join(fields), file=file)
 
+        # Print an empty line after each sequence
         print(file=file)
 
 
@@ -482,6 +524,7 @@ def score_conll_files(
     mention_encoding_name: str,
     repair: Optional[str],
     file_encoding: str,
+    line_spec: LineSpec,
     *,
     ignore_document_boundaries: bool,
     parse_comment_lines: bool,
@@ -497,6 +540,7 @@ def score_conll_files(
         reference_file,
         mention_encoding_name,
         file_encoding,
+        line_spec,
         repair=repair,
         ignore_document_boundaries=ignore_document_boundaries,
         parse_comment_lines=parse_comment_lines,
@@ -521,6 +565,7 @@ def score_conll_files(
             pred_file,
             mention_encoding_name,
             file_encoding,
+            line_spec,
             repair=repair,
             ignore_document_boundaries=ignore_document_boundaries,
             parse_comment_lines=parse_comment_lines,
diff --git a/seqscore/model.py b/seqscore/model.py
index 6df73b3..d554371 100644
--- a/seqscore/model.py
+++ b/seqscore/model.py
@@ -60,7 +60,7 @@ class LabeledSequence(Sequence[str]):
     tokens: tuple[str, ...] = attrib(converter=tuplify_strs)
     labels: tuple[str, ...] = attrib(converter=tuplify_strs)
     mentions: tuple[Mention, ...] = attrib(default=(), converter=_tuplify_mentions)
-    other_fields: Optional[tuple[tuple[str, ...], ...]] = attrib(
+    orig_fields: Optional[tuple[tuple[str, ...], ...]] = attrib(
         default=None, kw_only=True, converter=tuplify_optional_nested_strs
     )
     provenance: Optional[SequenceProvenance] = attrib(
@@ -79,9 +79,9 @@ def __attrs_post_init__(self) -> None:
         if not self.tokens:
             raise ValueError("Tokens and labels must be non-empty")
 
-        if self.other_fields and len(self.tokens) != len(self.other_fields):
+        if self.orig_fields and len(self.tokens) != len(self.orig_fields):
             raise ValueError(
-                f"Tokens ({len(self.tokens)}) and other_fields ({len(self.other_fields)}) "
+                f"Tokens ({len(self.tokens)}) and orig_fields ({len(self.orig_fields)}) "
                 "must be of the same length"
             )
 
@@ -126,11 +126,11 @@ def __str__(self) -> str:
     def tokens_with_labels(self) -> tuple[tuple[str, str], ...]:
         return tuple(zip(self.tokens, self.labels))
 
-    def tokens_with_other_fields(
+    def tokens_with_orig_fields(
         self,
     ) -> tuple[tuple[str, Optional[tuple[str, ...]]], ...]:
-        if self.other_fields:
-            return tuple(zip(self.tokens, self.other_fields))
+        if self.orig_fields:
+            return tuple(zip(self.tokens, self.orig_fields))
         else:
             return tuple(zip(self.tokens, repeat(None)))
 
diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py
index 20606f3..c2e9265 100644
--- a/seqscore/scripts/seqscore.py
+++ b/seqscore/scripts/seqscore.py
@@ -11,6 +11,7 @@
 from seqscore.conll import (
     FORMAT_DELIM,
     SUPPORTED_SCORE_FORMATS,
+    LineSpec,
     ingest_conll_file,
     repair_conll_file,
     score_conll_files,
@@ -44,6 +45,20 @@ def _input_file_options() -> list[Callable]:
         click.option(
             "--ignore-document-boundaries/--use-document-boundaries", default=False
         ),
+        click.option(
+            "--token-index",
+            default=0,
+            show_default=True,
+            type=int,
+            help="Index of the input field to use for the token",
+        ),
+        click.option(
+            "--label-index",
+            default=-1,
+            show_default=True,
+            type=int,
+            help="Index of the input field to use for the label",
+        ),
     ]
 
 
@@ -127,7 +142,6 @@ def _quiet_option() -> Callable:
 @cli.command(help="validate labels")
 @_multi_input_file_arguments
 @_labels_option()
-@_ner_label_index_option()
 @_quiet_option()
 def validate(
     file: list[str],  # Name is "file" to make sense on the command line, but it's a list
@@ -136,18 +150,20 @@ def validate(
     *,
     ignore_document_boundaries: bool,
     parse_comment_lines: bool,
-    ner_label_index: int,
+    token_index: int,
+    label_index: int,
     quiet: bool,
 ) -> None:
+    line_spec = LineSpec(token_index, label_index)
     error = False
     for each_file in file:
         result = validate_conll_file(
             each_file,
             labels,
             file_encoding,
+            line_spec,
             ignore_document_boundaries=ignore_document_boundaries,
             parse_comment_lines=parse_comment_lines,
-            ner_label_index=ner_label_index,
         )
         if result.errors:
             print(
@@ -178,6 +194,8 @@ def repair(
     output_file: str,
     labels: str,
     file_encoding: str,
+    token_index: int,
+    label_index: int,
     repair_method: str,
     output_delim: str,
     *,
@@ -188,6 +206,7 @@ def repair(
     output_delim = _normalize_tab(output_delim)
     if repair_method == REPAIR_NONE:
         raise ValueError(f"Cannot repair with repair strategy {repr(repair_method)}")
+    line_spec = LineSpec(token_index, label_index)
 
     repair_conll_file(
         file,
@@ -195,6 +214,7 @@ def repair(
         labels,
         repair_method,
         file_encoding,
+        line_spec,
         output_delim,
         ignore_document_boundaries=ignore_document_boundaries,
         parse_comment_lines=parse_comment_lines,
@@ -212,6 +232,8 @@ def convert(
     file: str,
     output_file: str,
     file_encoding: str,
+    token_index: int,
+    label_index: int,
     output_delim: str,
     input_labels: str,
     output_labels: str,
@@ -220,19 +242,19 @@ def convert(
     parse_comment_lines: bool,
 ) -> None:
     output_delim = _normalize_tab(output_delim)
-    if input_labels == output_labels:
-        raise ValueError("Conversion requires different input and output labels")
+    line_spec = LineSpec(token_index, label_index)
 
     docs = ingest_conll_file(
         file,
         input_labels,
         file_encoding,
+        line_spec,
         ignore_document_boundaries=ignore_document_boundaries,
         parse_comment_lines=parse_comment_lines,
     )
 
     write_docs_using_encoding(
-        docs, output_labels, file_encoding, output_delim, output_file
+        docs, output_labels, file_encoding, output_delim, line_spec, output_file
     )
 
 
@@ -260,6 +282,8 @@ def process(
     file: str,
     output_file: str,
     file_encoding: str,
+    token_index: int,
+    label_index: int,
     output_delim: str,
     labels: str,
     keep_types: str,
@@ -270,6 +294,7 @@ def process(
     parse_comment_lines: bool,
 ) -> None:
     output_delim = _normalize_tab(output_delim)
+    line_spec = LineSpec(token_index, label_index)
     keep_types_set = _parse_type_list(keep_types)
     remove_types_set = _parse_type_list(remove_types)
     type_map_dict: dict[str, list[str]] = _load_type_map(type_map, file_encoding)
@@ -286,13 +311,16 @@ def process(
         file,
         labels,
         file_encoding,
+        line_spec,
         ignore_document_boundaries=ignore_document_boundaries,
         parse_comment_lines=parse_comment_lines,
     )
 
     mod_docs = modify_types(docs, keep_types_set, remove_types_set, type_map_dict)
 
-    write_docs_using_encoding(mod_docs, labels, file_encoding, output_delim, output_file)
+    write_docs_using_encoding(
+        mod_docs, labels, file_encoding, output_delim, line_spec, output_file
+    )
 
 
 @cli.command(help="show counts for all the mentions contained in a file")
@@ -309,6 +337,8 @@ def process(
 def count(
     file: list[str],  # Name is "file" to make sense on the command line, but it's a list
     file_encoding: str,
+    token_index: int,
+    label_index: int,
     output_file: Optional[str],
     labels: str,
     *,
@@ -318,6 +348,7 @@ def count(
     repair_method: str,
     quiet: bool,
 ) -> None:
+    line_spec = LineSpec(token_index, label_index)
     if repair_method == REPAIR_NONE:
         repair_method = None
 
@@ -334,6 +365,7 @@ def count(
             each_file,
             labels,
             file_encoding,
+            line_spec,
             ignore_document_boundaries=ignore_document_boundaries,
             parse_comment_lines=parse_comment_lines,
             repair=repair_method,
@@ -368,6 +400,8 @@ def count(
 def summarize(
     file: list[str],  # Name is "file" to make sense on the command line, but it's a list
     file_encoding: str,
+    token_index: int,
+    label_index: int,
     labels: str,
     *,
     ignore_document_boundaries: bool,
@@ -375,6 +409,7 @@ def summarize(
     repair_method: str,
     quiet: bool,
 ) -> None:
+    line_spec = LineSpec(token_index, label_index)
     if repair_method == REPAIR_NONE:
         repair_method = None
 
@@ -386,6 +421,7 @@ def summarize(
             each_file,
             labels,
             file_encoding,
+            line_spec,
             ignore_document_boundaries=ignore_document_boundaries,
             parse_comment_lines=parse_comment_lines,
             repair=repair_method,
@@ -453,11 +489,15 @@ def score(
     reference: str,
     score_format: str,
     delim: str,
+    token_index: int,
+    label_index: int,
     repair_method: str,
     error_counts: bool,
     full_precision: bool,
     quiet: bool,
 ) -> None:
+    line_spec = LineSpec(token_index, label_index)
+
     if repair_method == REPAIR_NONE:
         repair_method = None
 
@@ -475,6 +515,7 @@ def score(
         labels,
         repair_method,
         file_encoding,
+        line_spec,
         ignore_document_boundaries=ignore_document_boundaries,
         parse_comment_lines=parse_comment_lines,
         output_format=score_format,
@@ -492,18 +533,22 @@ def score(
 def extract_text(
     file: list[str],  # Name is "file" to make sense on the command line, but it's a list
     file_encoding: str,
+    token_index: int,
+    label_index: int,
     labels: str,
     output_file: str,
     *,
     ignore_document_boundaries: bool,
     parse_comment_lines: bool,
 ) -> None:
+    line_spec = LineSpec(token_index, label_index)
     all_docs = []
     for each_file in file:
         docs = ingest_conll_file(
             each_file,
             labels,
             file_encoding,
+            line_spec,
             ignore_document_boundaries=ignore_document_boundaries,
             parse_comment_lines=parse_comment_lines,
         )
diff --git a/seqscore/util.py b/seqscore/util.py
index 70677fd..666ead2 100644
--- a/seqscore/util.py
+++ b/seqscore/util.py
@@ -30,13 +30,15 @@ def tuplify_optional_nested_strs(
 def file_fields_match(path1: PathType, path2: PathType, *, debug: bool = False) -> bool:
     """Return whether the whitespace-delimited fields of two files are identical."""
     with open(path1, encoding="utf8") as f1, open(path2, encoding="utf8") as f2:
+        line_count = 1
         for l1, l2 in zip_longest(f1, f2):
             if l1 is None or l2 is None or l1.split() != l2.split():
                 if debug:  # pragma: no cover
-                    print("Non-matching lines:")
+                    print(f"Failed to match at line {line_count}:")
                     print(repr(l1))
                     print(repr(l2))
                 return False
+            line_count += 1
         return True
 
 
diff --git a/seqscore/validation.py b/seqscore/validation.py
index 6d3c756..9593713 100644
--- a/seqscore/validation.py
+++ b/seqscore/validation.py
@@ -106,7 +106,8 @@ def validate_labels(
             raise InvalidLabelError(
                 label,
                 f"Could not parse label {repr(label)}{line_msg}{source_msg} during validation: "
-                + str(e),
+                + str(e)
+                + " Use the --label-index argument if the label is not the last field.",
             ) from e
 
         if not encoding.is_valid_state(state):
diff --git a/tests/conll_annotation/diff_token_label_indices.bio b/tests/conll_annotation/diff_token_label_indices.bio
new file mode 100644
index 0000000..ba68d3f
--- /dev/null
+++ b/tests/conll_annotation/diff_token_label_indices.bio
@@ -0,0 +1,17 @@
+1	This	O	DET
+2	is	O	VERB
+3	a	O	DET
+4	sentence	O	NOUN
+5	.	O	PUNCT
+
+6	University	B-ORG	NOUN
+7	of	I-ORG	ADP
+8	Pennsylvania	I-ORG	NOUN
+9	is	O	VERB
+10	in	O	ADP
+11	West	B-LOC	NOUN
+12	Philadelphia	I-LOC	NOUN
+13	,	O	PUNCT
+14	Pennsylvania	B-LOC	NOUN
+15	.	O	PUNCT
+
diff --git a/tests/conll_annotation/diff_token_label_indices.bioes b/tests/conll_annotation/diff_token_label_indices.bioes
new file mode 100644
index 0000000..46a6398
--- /dev/null
+++ b/tests/conll_annotation/diff_token_label_indices.bioes
@@ -0,0 +1,17 @@
+1	This	O	DET
+2	is	O	VERB
+3	a	O	DET
+4	sentence	O	NOUN
+5	.	O	PUNCT
+
+6	University	B-ORG	NOUN
+7	of	I-ORG	ADP
+8	Pennsylvania	E-ORG	NOUN
+9	is	O	VERB
+10	in	O	ADP
+11	West	B-LOC	NOUN
+12	Philadelphia	E-LOC	NOUN
+13	,	O	PUNCT
+14	Pennsylvania	S-LOC	NOUN
+15	.	O	PUNCT
+
diff --git a/tests/conll_annotation/labels_not_last_col.bio b/tests/conll_annotation/labels_not_last_col.bio
index cc91cf4..67d4b7f 100644
--- a/tests/conll_annotation/labels_not_last_col.bio
+++ b/tests/conll_annotation/labels_not_last_col.bio
@@ -14,3 +14,4 @@ Philadelphia	I-LOC	NOUN
 ,	O	PUNCT
 Pennsylvania	B-LOC	NOUN
 .	O	PUNCT
+
diff --git a/tests/conll_annotation/labels_not_last_col.bioes b/tests/conll_annotation/labels_not_last_col.bioes
new file mode 100644
index 0000000..0e55cd2
--- /dev/null
+++ b/tests/conll_annotation/labels_not_last_col.bioes
@@ -0,0 +1,17 @@
+This	O	DET
+is	O	VERB
+a	O	DET
+sentence	O	NOUN
+.	O	PUNCT
+
+University	B-ORG	NOUN
+of	I-ORG	ADP
+Pennsylvania	E-ORG	NOUN
+is	O	VERB
+in	O	ADP
+West	B-LOC	NOUN
+Philadelphia	E-LOC	NOUN
+,	O	PUNCT
+Pennsylvania	S-LOC	NOUN
+.	O	PUNCT
+
diff --git a/tests/test_conll_format.py b/tests/test_conll_format.py
index bc4d196..f32506b 100644
--- a/tests/test_conll_format.py
+++ b/tests/test_conll_format.py
@@ -2,14 +2,15 @@
 
 import pytest
 
-from seqscore.conll import CoNLLFormatError, CoNLLIngester
+from seqscore.conll import CoNLLFormatError, CoNLLIngester, LineSpec
 from seqscore.encoding import REPAIR_NONE, get_encoding
 from seqscore.validation import InvalidLabelError
 
 
 def test_parse_comments_true() -> None:
     mention_encoding = get_encoding("BIO")
-    ingester = CoNLLIngester(mention_encoding, parse_comment_lines=True)
+    line_spec = LineSpec(0, 1)
+    ingester = CoNLLIngester(mention_encoding, line_spec, parse_comment_lines=True)
     comments_path = Path("tests") / "test_files" / "minimal_comments.bio"
     with comments_path.open(encoding="utf8") as file:
         documents = list(ingester.ingest(file, "test", REPAIR_NONE))
@@ -32,7 +33,8 @@ def test_parse_comments_true() -> None:
 
 def test_parse_comments_false() -> None:
     mention_encoding = get_encoding("BIO")
-    ingester = CoNLLIngester(mention_encoding)
+    line_spec = LineSpec(0, 1)
+    ingester = CoNLLIngester(mention_encoding, line_spec)
 
     comments_path = Path("tests") / "test_files" / "minimal_comments_1.bio"
     with comments_path.open(encoding="utf8") as file:
@@ -46,35 +48,24 @@ def test_parse_comments_false() -> None:
 
     comments_path = Path("tests") / "test_files" / "minimal_comments_2.bio"
     with comments_path.open(encoding="utf8") as file:
-        with pytest.raises(InvalidLabelError) as err:
+        with pytest.raises(InvalidLabelError):
             list(ingester.ingest(file, "test", REPAIR_NONE))
-        assert (
-            str(err.value)
-            == "Could not parse label 'Comment' on line 1 of test during validation: Label 'Comment' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '<STATE>-<ENTITY_TYPE>'. The first token '#' of this sentence starts with '#'. If it's a comment, consider enabling --parse-comment-lines."
-        )
 
     comments_path = Path("tests") / "test_files" / "minimal_comments_3.bio"
     with comments_path.open(encoding="utf8") as file:
-        with pytest.raises(InvalidLabelError) as err:
+        with pytest.raises(InvalidLabelError):
             list(ingester.ingest(file, "test", REPAIR_NONE))
-        assert (
-            str(err.value)
-            == "Could not parse label 'fields' on line 1 of test during validation: Label 'fields' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '<STATE>-<ENTITY_TYPE>'. The first token '#' of this sentence starts with '#'. If it's a comment, consider enabling --parse-comment-lines."
-        )
 
     comments_path = Path("tests") / "test_files" / "minimal_comments_4.bio"
     with comments_path.open(encoding="utf8") as file:
-        with pytest.raises(InvalidLabelError) as err:
+        with pytest.raises(InvalidLabelError):
             list(ingester.ingest(file, "test", REPAIR_NONE))
-        assert (
-            str(err.value)
-            == "Could not parse label 'fields' on line 1 of test during validation: Label 'fields' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '<STATE>-<ENTITY_TYPE>'. The first token '#' of this sentence starts with '#'. If it's a comment, consider enabling --parse-comment-lines."
-        )
 
 
 def test_invalid_token_leading_space() -> None:
     mention_encoding = get_encoding("BIO")
-    ingester = CoNLLIngester(mention_encoding)
+    line_spec = LineSpec(0, -1)
+    ingester = CoNLLIngester(mention_encoding, line_spec)
 
     path = Path("tests") / "test_files" / "minimal_bio_empty_token.txt"
     with path.open(encoding="utf8") as file:
diff --git a/tests/test_conversion_click.py b/tests/test_conversion_click.py
index 8fafcf6..b41a32b 100644
--- a/tests/test_conversion_click.py
+++ b/tests/test_conversion_click.py
@@ -174,6 +174,28 @@ def test_IOB_to_BIO_fields() -> None:
     )
 
 
+def test_IOB_to_BIO_fields_and_specified_indices() -> None:
+    runner = CliRunner()
+    result = runner.invoke(
+        convert,
+        [
+            "--input-labels",
+            "BIO",
+            "--output-labels",
+            "BIOES",
+            "--label-index",
+            "1",
+            os.path.join("tests", "conll_annotation", "labels_not_last_col.bio"),
+            os.path.join(TMP_DIR.name, "labels_not_last_col.bioes"),
+        ],
+    )
+    assert result.exit_code == 0
+    assert file_fields_match(
+        os.path.join(TMP_DIR.name, "labels_not_last_col.bioes"),
+        os.path.join("tests", "conll_annotation", "labels_not_last_col.bioes"),
+    )
+
+
 def test_IO_to_BIOES() -> None:
     runner = CliRunner()
     result = runner.invoke(
@@ -215,7 +237,7 @@ def test_BIOES_to_IO() -> None:
     )
 
 
-def test_same_input_and_output_labels_raises_error() -> None:
+def test_diff_token_label_indices() -> None:
     runner = CliRunner()
     result = runner.invoke(
         convert,
@@ -223,9 +245,17 @@ def test_same_input_and_output_labels_raises_error() -> None:
             "--input-labels",
             "BIO",
             "--output-labels",
-            "BIO",
-            os.path.join("tests", "conll_annotation", "minimal.bio"),
-            os.path.join(TMP_DIR.name, "temp.txt"),
+            "BIOES",
+            "--token-index",
+            "1",
+            "--label-index",
+            "2",
+            os.path.join("tests", "conll_annotation", "diff_token_label_indices.bio"),
+            os.path.join(TMP_DIR.name, "diff_token_label_indices_BIOES.txt"),
         ],
     )
-    assert result.exit_code != 0
+    assert result.exit_code == 0
+    assert file_fields_match(
+        os.path.join(TMP_DIR.name, "diff_token_label_indices_BIOES.txt"),
+        os.path.join("tests", "conll_annotation", "diff_token_label_indices.bioes"),
+    )
diff --git a/tests/test_model.py b/tests/test_model.py
index 50dcee1..0853d16 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -73,4 +73,4 @@ def test_labeled_sentence() -> None:
 
     with pytest.raises(ValueError):
         # Mismatched length between tokens and other_fields
-        LabeledSequence(["a", "b"], ["B-PER", "I-PER"], other_fields=[["DT"]])
+        LabeledSequence(["a", "b"], ["B-PER", "I-PER"], orig_fields=[["DT"]])
diff --git a/tests/test_validation.py b/tests/test_validation.py
index c8f121e..9fbde19 100644
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -340,9 +340,8 @@ def test_validation_bad_label() -> None:
     labels = ["O", "PER", "PER"]
     with pytest.raises(EncodingError) as err:
         validate_labels(labels, encoding, tokens=tokens, line_nums=line_nums)
-    assert (
-        str(err.value)
-        == "Could not parse label 'PER' on line 8 during validation: Label 'PER' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '<STATE>-<ENTITY_TYPE>'."
+    assert str(err.value).startswith(
+        "Could not parse label 'PER' on line 8 during validation: Label 'PER' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '<STATE>-<ENTITY_TYPE>'."
     )
 
 
diff --git a/tests/test_validation_click.py b/tests/test_validation_click.py
index 3cb4a98..5cf201c 100644
--- a/tests/test_validation_click.py
+++ b/tests/test_validation_click.py
@@ -210,9 +210,8 @@ def test_bad_label() -> None:
         ["--labels", "BIO", os.path.join("tests", "conll_annotation", "bad_label2.bio")],
     )
     assert result.exit_code != 0
-    assert (
-        str(result.exception)
-        == "Could not parse label 'GPE' on line 4 during validation: Label 'GPE' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '<STATE>-<ENTITY_TYPE>'."
+    assert str(result.exception).startswith(
+        "Could not parse label 'GPE' on line 4 during validation: Label 'GPE' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '<STATE>-<ENTITY_TYPE>'."
     )
 
 
@@ -220,9 +219,18 @@ def test_ner_label_index_pos() -> None:
     runner = CliRunner()
     result = runner.invoke(
         validate,
-        ["--labels", "BIO", "--ner-label-index", "1", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")],
+        [
+            "--labels",
+            "BIO",
+            "--label-index",
+            "1",
+            os.path.join("tests", "conll_annotation", "labels_not_last_col.bio"),
+        ],
+    )
+    assert (
+        result.output
+        == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n"
     )
-    assert result.output == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n"
     assert result.exit_code == 0
 
 
@@ -230,9 +238,18 @@ def test_ner_label_index_neg() -> None:
     runner = CliRunner()
     result = runner.invoke(
         validate,
-        ["--labels", "BIO", "--ner-label-index", "-2", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")],
+        [
+            "--labels",
+            "BIO",
+            "--label-index",
+            "-2",
+            os.path.join("tests", "conll_annotation", "labels_not_last_col.bio"),
+        ],
+    )
+    assert (
+        result.output
+        == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n"
     )
-    assert result.output == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n"
     assert result.exit_code == 0
 
 
@@ -240,7 +257,15 @@ def test_ner_label_index_zero() -> None:
     runner = CliRunner()
     result = runner.invoke(
         validate,
-        ["--labels", "BIO", "--ner-label-index", "0", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")],
+        [
+            "--labels",
+            "BIO",
+            "--label-index",
+            "0",
+            os.path.join("tests", "conll_annotation", "labels_not_last_col.bio"),
+        ],
     )
     assert result.exit_code != 0
-    assert "ner_label_index cannot be 0" in str(result.exception)
+    assert "Token index (0) and label index (0) cannot be the same" in str(
+        result.exception
+    )

From 0e2050f663f1842bc8581d761813a417b388d343 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Wed, 3 Jun 2026 12:13:10 -0400
Subject: [PATCH 15/33] Update contributors in README

---
 README.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 3f6c148..6fc280d 100644
--- a/README.md
+++ b/README.md
@@ -608,6 +608,7 @@ To install from a clone of this repository, use:
 # Contributors
 
 SeqScore was developed by the BLT Lab at Brandeis University under the
-direction of PI and lead developer Constantine Lignos. Chester Palen-Michel
-and Nolan Holley contributed to its development. Gordon Dou, Maya Kruse, and
-Andrew Rueda gave feedback on its features and assisted in README writing.
+direction of PI and lead developer Constantine Lignos. Chester
+Palen-Michel, Nolan Holley, and Claire Wang contributed to its
+development.  Gordon Dou, Maya Kruse, and Andrew Rueda gave feedback
+on its features and assisted in README writing.

From 3e9b46bae506435b69748ec75d368b5838d49b17 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Wed, 3 Jun 2026 12:16:36 -0400
Subject: [PATCH 16/33] Drop Python 3.9 support in setup.py

---
 setup.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 90b2082..fbb7f4a 100755
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@ def setup_package() -> None:
         packages=find_packages(include=("seqscore", "seqscore.*")),
         # Package type information
         package_data={"seqscore": ["py.typed"]},
-        python_requires=">=3.9",
+        python_requires=">=3.10",
         license="MIT",
         description="SeqScore: Scoring for named entity recognition and other sequence labeling tasks",
         long_description=long_description,
@@ -34,7 +34,6 @@ def setup_package() -> None:
         classifiers=[
             "Development Status :: 4 - Beta",
             "License :: OSI Approved :: MIT License",
-            "Programming Language :: Python :: 3.9",
             "Programming Language :: Python :: 3.10",
             "Programming Language :: Python :: 3.11",
             "Programming Language :: Python :: 3.12",

From 208a933acb06db16b9f851b4d1d6a1887506934d Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Wed, 3 Jun 2026 12:19:43 -0400
Subject: [PATCH 17/33] Update year and contributors in license

---
 LICENSE | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/LICENSE b/LICENSE
index 6e764b2..d0f5cdd 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,7 @@
 MIT License
 
-Copyright (c) 2023 Constantine Lignos, Chester Palen-Michel, and Nolan Holley
+Copyright (c) 2026 Constantine Lignos, Chester Palen-Michel, Nolan Holley,
+and Claire Wang.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

From f038db07f3b21b8977095b2626e1c8ca906a9cbf Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Wed, 3 Jun 2026 13:41:32 -0400
Subject: [PATCH 18/33] Fix aggregation of accuracy scores across files

---
 seqscore/conll.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/seqscore/conll.py b/seqscore/conll.py
index 9b9fedd..24d863a 100644
--- a/seqscore/conll.py
+++ b/seqscore/conll.py
@@ -576,7 +576,7 @@ def score_conll_files(
             pred_docs, ref_docs, count_fp_fn_examples=error_counts
         )
         all_class_scores.append(class_scores)
-        all_acc_scores.append(class_scores)
+        all_acc_scores.append(acc_scores)
 
         if error_counts:
             if multi_files:

From 023e950b746ffff024bf536bea5786e80de48371 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Wed, 3 Jun 2026 13:42:58 -0400
Subject: [PATCH 19/33] Improve test coverage of conll.py and clean up some
 tests

---
 seqscore/conll.py                             |  24 ++--
 tests/conll_annotation/bad_label1.bio         |   5 -
 tests/test_conll_format.py                    | 125 +++++++++++++++---
 .../bad_label1.bio}                           |   0
 .../bad_label2.bio}                           |   2 +-
 tests/test_files/minimal_bad_docstart.bio     |  19 +++
 tests/test_files/minimal_docstart.bio         |  19 +++
 tests/test_files/minimal_no_delims.bio        |  17 +++
 tests/test_summarize_click.py                 |  25 ++++
 tests/test_validation_click.py                |   2 +-
 10 files changed, 201 insertions(+), 37 deletions(-)
 delete mode 100644 tests/conll_annotation/bad_label1.bio
 rename tests/{conll_annotation/bad_label2.bio => test_files/bad_label1.bio} (100%)
 rename tests/{conll_annotation/bad_label3.bio => test_files/bad_label2.bio} (68%)
 create mode 100644 tests/test_files/minimal_bad_docstart.bio
 create mode 100644 tests/test_files/minimal_docstart.bio
 create mode 100644 tests/test_files/minimal_no_delims.bio

diff --git a/seqscore/conll.py b/seqscore/conll.py
index 24d863a..a6fb155 100644
--- a/seqscore/conll.py
+++ b/seqscore/conll.py
@@ -104,7 +104,7 @@ class CoNLLIngester:
     encoding: Encoding = attrib()
     line_spec: LineSpec = attrib()
     parse_comment_lines: bool = attrib(default=False, kw_only=True)
-    ignore_document_boundaries: bool = attrib(default=True, kw_only=True)
+    ignore_document_boundaries: bool = attrib(default=False, kw_only=True)
 
     def ingest(
         self,
@@ -113,8 +113,8 @@ def ingest(
         repair: Optional[str],
         *,
         quiet: bool = False,
-    ) -> Iterable[list[LabeledSequence]]:
-        document_counter = 0
+    ) -> list[list[LabeledSequence]]:
+        all_documents: list[list[LabeledSequence]] = []
         document: list[LabeledSequence] = []
 
         for source_sequence, comment in self._parse_file(
@@ -128,8 +128,7 @@ def ingest(
                 # We skip this if the builder is empty, which will happen for the very
                 # first document in the corpus (as there is no previous document to end).
                 if not self.ignore_document_boundaries and document:
-                    document_counter += 1
-                    yield document
+                    all_documents.append(document)
                     document = []
                 continue
 
@@ -219,10 +218,11 @@ def ingest(
                 ) from e
             document.append(sequences)
 
-        # Yield final document if non-empty
+        # Add final document if non-empty
         if document:
-            document_counter += 1
-            yield document
+            all_documents.append(document)
+
+        return all_documents
 
     def validate(
         self,
@@ -317,8 +317,8 @@ def _parse_file(
             # Skip document starts, but ensure sequence is empty when we reach them
             if token.is_docstart:
                 if sequence:
-                    raise ValueError(
-                        f"Encountered DOCSTART at line {line_num} while still in sequence"
+                    raise CoNLLFormatError(
+                        f"Encountered {DOCSTART} at line {line_num} of {source_name} in the middle of a sequence"
                     )
                 else:
                     # Yield it by itself. Since the sequence variable is empty, leave it unchanged.
@@ -341,7 +341,7 @@ def _check_sequence(sequence: Sequence[_CoNLLToken]) -> None:
         # get document boundaries as their own sequences.
         if sequence[0].is_docstart and len(sequence) > 1:
             raise ValueError(
-                f"Returned -DOCSTART- as part of a sequence at line {sequence[0].line_num}"
+                f"Returned {DOCSTART} as part of a sequence at line {sequence[0].line_num}"
             )
 
 
@@ -371,7 +371,7 @@ def ingest_conll_file(
         ignore_document_boundaries=ignore_document_boundaries,
     )
     with open(input_path, encoding=file_encoding) as input_file:
-        docs = list(ingester.ingest(input_file, str(input_path), repair, quiet=quiet))
+        docs = ingester.ingest(input_file, str(input_path), repair, quiet=quiet)
     return docs
 
 
diff --git a/tests/conll_annotation/bad_label1.bio b/tests/conll_annotation/bad_label1.bio
deleted file mode 100644
index dcda560..0000000
--- a/tests/conll_annotation/bad_label1.bio
+++ /dev/null
@@ -1,5 +0,0 @@
-This	O
-is
-a	O
-sentence	O
-.	O
diff --git a/tests/test_conll_format.py b/tests/test_conll_format.py
index f32506b..b5b3fa5 100644
--- a/tests/test_conll_format.py
+++ b/tests/test_conll_format.py
@@ -2,18 +2,26 @@
 
 import pytest
 
-from seqscore.conll import CoNLLFormatError, CoNLLIngester, LineSpec
+from seqscore.conll import (
+    DOCSTART,
+    CoNLLFormatError,
+    CoNLLIngester,
+    LineSpec,
+    _CoNLLToken,
+    ingest_conll_file,
+)
 from seqscore.encoding import REPAIR_NONE, get_encoding
 from seqscore.validation import InvalidLabelError
 
+BIO = get_encoding("BIO")
+LINE_SPEC = LineSpec(0, -1)
+
 
 def test_parse_comments_true() -> None:
-    mention_encoding = get_encoding("BIO")
-    line_spec = LineSpec(0, 1)
-    ingester = CoNLLIngester(mention_encoding, line_spec, parse_comment_lines=True)
+    ingester = CoNLLIngester(BIO, LINE_SPEC, parse_comment_lines=True)
     comments_path = Path("tests") / "test_files" / "minimal_comments.bio"
     with comments_path.open(encoding="utf8") as file:
-        documents = list(ingester.ingest(file, "test", REPAIR_NONE))
+        documents = ingester.ingest(file, "test", REPAIR_NONE)
 
     assert len(documents) == 1
     sequences = documents[0]
@@ -32,15 +40,12 @@ def test_parse_comments_true() -> None:
 
 
 def test_parse_comments_false() -> None:
-    mention_encoding = get_encoding("BIO")
-    line_spec = LineSpec(0, 1)
-    ingester = CoNLLIngester(mention_encoding, line_spec)
-
+    ingester = CoNLLIngester(BIO, LINE_SPEC)
     comments_path = Path("tests") / "test_files" / "minimal_comments_1.bio"
     with comments_path.open(encoding="utf8") as file:
         # err1 needs to not be reused below because the exception is a different type
         with pytest.raises(CoNLLFormatError) as err1:
-            list(ingester.ingest(file, "test", REPAIR_NONE))
+            ingester.ingest(file, "test", REPAIR_NONE)
         assert (
             str(err1.value)
             == "Line 1 of test does not appear to be delimited and begins with #. Perhaps you want to use the --parse-comment-lines flag? Line contents: '#'"
@@ -49,27 +54,111 @@ def test_parse_comments_false() -> None:
     comments_path = Path("tests") / "test_files" / "minimal_comments_2.bio"
     with comments_path.open(encoding="utf8") as file:
         with pytest.raises(InvalidLabelError):
-            list(ingester.ingest(file, "test", REPAIR_NONE))
+            ingester.ingest(file, "test", REPAIR_NONE)
 
     comments_path = Path("tests") / "test_files" / "minimal_comments_3.bio"
     with comments_path.open(encoding="utf8") as file:
         with pytest.raises(InvalidLabelError):
-            list(ingester.ingest(file, "test", REPAIR_NONE))
+            ingester.ingest(file, "test", REPAIR_NONE)
 
     comments_path = Path("tests") / "test_files" / "minimal_comments_4.bio"
     with comments_path.open(encoding="utf8") as file:
         with pytest.raises(InvalidLabelError):
-            list(ingester.ingest(file, "test", REPAIR_NONE))
+            ingester.ingest(file, "test", REPAIR_NONE)
 
 
 def test_invalid_token_leading_space() -> None:
-    mention_encoding = get_encoding("BIO")
-    line_spec = LineSpec(0, -1)
-    ingester = CoNLLIngester(mention_encoding, line_spec)
-
+    ingester = CoNLLIngester(BIO, LINE_SPEC)
     path = Path("tests") / "test_files" / "minimal_bio_empty_token.txt"
     with path.open(encoding="utf8") as file:
         with pytest.raises(ValueError) as err:
-            list(ingester.ingest(file, "test", REPAIR_NONE))
+            ingester.ingest(file, "test", REPAIR_NONE)
 
     assert str(err.value) == "Invalid token '' on line 9 of test"
+
+
+def test_bad_docstart() -> None:
+    ingester = CoNLLIngester(BIO, LINE_SPEC)
+    path = Path("tests") / "test_files" / "minimal_bad_docstart.bio"
+    with path.open(encoding="utf8") as file:
+        with pytest.raises(CoNLLFormatError) as err:
+            ingester.ingest(file, str(path), REPAIR_NONE)
+
+    assert (
+        str(err.value)
+        == "Encountered -DOCSTART- at line 4 of tests/test_files/minimal_bad_docstart.bio in the middle of a sequence"
+    )
+
+
+def test_check_sequence() -> None:
+    tokens = [
+        _CoNLLToken(DOCSTART, "O", True, 0, ()),
+        _CoNLLToken("Hello", "O", True, 0, ()),
+    ]
+    with pytest.raises(ValueError):
+        CoNLLIngester._check_sequence(tokens)
+
+
+def test_no_delims() -> None:
+    ingester = CoNLLIngester(BIO, LINE_SPEC)
+    path = Path("tests") / "test_files" / "minimal_no_delims.bio"
+    with path.open(encoding="utf8") as file:
+        with pytest.raises(CoNLLFormatError) as err:
+            ingester.ingest(file, str(path), REPAIR_NONE)
+
+    assert (
+        str(err.value)
+        == "Line 1 of tests/test_files/minimal_no_delims.bio is not delimited by space or tab: 'ThisO'"
+    )
+
+
+def test_validate_with_docstart() -> None:
+    ingester = CoNLLIngester(BIO, LINE_SPEC, ignore_document_boundaries=False)
+    path = Path("tests") / "test_files" / "minimal_docstart.bio"
+    with path.open(encoding="utf8") as file:
+        ingester.validate(
+            file,
+            str(path),
+        )
+
+
+def test_repair_bad_name() -> None:
+    path = Path("tests") / "conll_annotation" / "minimal.bio"
+    with pytest.raises(ValueError) as err:
+        ingest_conll_file(
+            str(path),
+            "BIOES",
+            "UTF-8",
+            LINE_SPEC,
+            repair="conlleval",
+            ignore_document_boundaries=False,
+            parse_comment_lines=False,
+        )
+
+    assert str(err.value).startswith(
+        "Cannot repair mention encoding BIOES using method conlleval."
+    )
+
+
+def test_bad_label1() -> None:
+    ingester = CoNLLIngester(BIO, LINE_SPEC)
+    path = Path("tests") / "test_files" / "bad_label1.bio"
+    with path.open(encoding="utf8") as file:
+        with pytest.raises(InvalidLabelError) as err:
+            ingester.ingest(file, str(path), repair=REPAIR_NONE)
+
+    assert str(err.value).startswith(
+        "Could not parse label 'GPE' on line 4 of tests/test_files/bad_label1.bio during validation"
+    )
+
+
+def test_bad_label2() -> None:
+    ingester = CoNLLIngester(BIO, LINE_SPEC)
+    path = Path("tests") / "test_files" / "bad_label2.bio"
+    with path.open(encoding="utf8") as file:
+        with pytest.raises(InvalidLabelError) as err:
+            ingester.ingest(file, str(path), repair=REPAIR_NONE)
+
+    assert str(err.value).startswith(
+        "Could not parse label 'OUT' on line 1 of tests/test_files/bad_label2.bio during validation"
+    )
diff --git a/tests/conll_annotation/bad_label2.bio b/tests/test_files/bad_label1.bio
similarity index 100%
rename from tests/conll_annotation/bad_label2.bio
rename to tests/test_files/bad_label1.bio
diff --git a/tests/conll_annotation/bad_label3.bio b/tests/test_files/bad_label2.bio
similarity index 68%
rename from tests/conll_annotation/bad_label3.bio
rename to tests/test_files/bad_label2.bio
index 95d093e..e67150c 100644
--- a/tests/conll_annotation/bad_label3.bio
+++ b/tests/test_files/bad_label2.bio
@@ -1,5 +1,5 @@
 This	OUT
 is	OUT
 a	OUT
-sentence	OUT
+sentence	GPE
 .	OUT
diff --git a/tests/test_files/minimal_bad_docstart.bio b/tests/test_files/minimal_bad_docstart.bio
new file mode 100644
index 0000000..3284f2c
--- /dev/null
+++ b/tests/test_files/minimal_bad_docstart.bio
@@ -0,0 +1,19 @@
+-DOCSTART-	O
+This	O
+is	O
+-DOCSTART-	O
+a	O
+sentence	O
+.	O
+
+University	B-ORG
+of	I-ORG
+Pennsylvania	I-ORG
+is	O
+in	O
+West	B-LOC
+Philadelphia	I-LOC
+,	O
+Pennsylvania	B-LOC
+.	O
+
diff --git a/tests/test_files/minimal_docstart.bio b/tests/test_files/minimal_docstart.bio
new file mode 100644
index 0000000..c46e330
--- /dev/null
+++ b/tests/test_files/minimal_docstart.bio
@@ -0,0 +1,19 @@
+-DOCSTART-	O
+This	O
+is	O
+a	O
+sentence	O
+.	O
+
+-DOCSTART-	O
+University	B-ORG
+of	I-ORG
+Pennsylvania	I-ORG
+is	O
+in	O
+West	B-LOC
+Philadelphia	I-LOC
+,	O
+Pennsylvania	B-LOC
+.	O
+
diff --git a/tests/test_files/minimal_no_delims.bio b/tests/test_files/minimal_no_delims.bio
new file mode 100644
index 0000000..d86767e
--- /dev/null
+++ b/tests/test_files/minimal_no_delims.bio
@@ -0,0 +1,17 @@
+ThisO
+isO
+aO
+sentenceO
+.O
+
+UniversityB-ORG
+ofI-ORG
+PennsylvaniaI-ORG
+isO
+inO
+WestB-LOC
+PhiladelphiaI-LOC
+,O
+PennsylvaniaB-LOC
+.O
+
diff --git a/tests/test_summarize_click.py b/tests/test_summarize_click.py
index 4088093..8a9dec8 100644
--- a/tests/test_summarize_click.py
+++ b/tests/test_summarize_click.py
@@ -77,6 +77,31 @@ def test_summarize_iob_twodoc() -> None:
     )
 
 
+def test_summarize_iob_twodoc_ignore_doc_boundaries() -> None:
+    runner = CliRunner()
+    result = runner.invoke(
+        summarize,
+        [
+            "--labels",
+            "IOB",
+            "--ignore-document-boundaries",
+            os.path.join("tests", "conll_annotation", "minimal_fields.iob"),
+        ],
+    )
+    assert result.exit_code == 0
+    assert (
+        result.output
+        == """File 'tests/conll_annotation/minimal_fields.iob' contains 1 document(s) and 2 sentences
+Entity Type      Count
+-------------  -------
+LOC                  2
+ORG                  1
+-------------  -------
+TOTAL                3
+"""
+    )
+
+
 def test_summarize_bio_twofiles() -> None:
     runner = CliRunner()
     result = runner.invoke(
diff --git a/tests/test_validation_click.py b/tests/test_validation_click.py
index 5cf201c..6f72a7d 100644
--- a/tests/test_validation_click.py
+++ b/tests/test_validation_click.py
@@ -207,7 +207,7 @@ def test_bad_label() -> None:
     runner = CliRunner()
     result = runner.invoke(
         validate,
-        ["--labels", "BIO", os.path.join("tests", "conll_annotation", "bad_label2.bio")],
+        ["--labels", "BIO", os.path.join("tests", "test_files", "bad_label1.bio")],
     )
     assert result.exit_code != 0
     assert str(result.exception).startswith(

From 15b17a0d104fee54077175e9eeaf149c19cea26c Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Wed, 3 Jun 2026 14:02:53 -0400
Subject: [PATCH 20/33] Remove repair-specific deprecated file writing function

---
 seqscore/conll.py | 22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/seqscore/conll.py b/seqscore/conll.py
index a6fb155..15f8edc 100644
--- a/seqscore/conll.py
+++ b/seqscore/conll.py
@@ -429,25 +429,9 @@ def repair_conll_file(
         parse_comment_lines=parse_comment_lines,
         quiet=quiet,
     )
-
-    output_docstart = len(docs) > 1
-
-    with open(output_file, "w", encoding=file_encoding) as file:
-        for doc in docs:
-            _write_doc_labels(doc, output_delim, file, output_docstart=output_docstart)
-
-
-def _write_doc_labels(
-    doc: Sequence[LabeledSequence], delim: str, file: TextIO, *, output_docstart: bool
-) -> None:
-    if output_docstart:
-        print(f"{DOCSTART}{delim}O", file=file)
-        print(file=file)
-
-    for sequence in doc:
-        for token, label in sequence.tokens_with_labels():
-            print(f"{token}{delim}{label}", file=file)
-        print(file=file)
+    write_docs_using_encoding(
+        docs, mention_encoding_name, file_encoding, output_delim, line_spec, output_file
+    )
 
 
 def write_docs_using_encoding(

From a888abcc01218348fcb72bd6f2a92b9ef8b9ed64 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Wed, 3 Jun 2026 14:41:50 -0400
Subject: [PATCH 21/33] Add LabeledSequence.from_tokens_and_labels utility
 method

---
 seqscore/model.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/seqscore/model.py b/seqscore/model.py
index d554371..2921b50 100644
--- a/seqscore/model.py
+++ b/seqscore/model.py
@@ -1,6 +1,6 @@
 from collections.abc import Iterable, Iterator, Sequence
 from itertools import repeat
-from typing import Any, Optional, Union, overload
+from typing import TYPE_CHECKING, Any, Optional, Union, overload
 
 from attr import Attribute, attrib, attrs
 
@@ -10,6 +10,9 @@
     validator_nonempty_str,
 )
 
+if TYPE_CHECKING:
+    from seqscore.encoding import Encoding  # pragma: no cover
+
 
 def _validator_nonnegative(_inst: Any, _attr: Attribute, value: Any) -> None:
     if value < 0:
@@ -139,3 +142,14 @@ def span_tokens(self, span: Span) -> tuple[str, ...]:
 
     def mention_tokens(self, mention: Mention) -> tuple[str, ...]:
         return self.span_tokens(mention.span)
+
+    @classmethod
+    def from_tokens_and_labels(
+        cls,
+        tokens: Sequence[str],
+        labels: Sequence[str],
+        encoding: "Encoding",
+        **kwargs: Any,
+    ) -> "LabeledSequence":
+        mentions = encoding.decode_labels(labels)
+        return cls(tokens, labels, mentions, **kwargs)

From 2296b4c69811310e8aade3efa82aa1e130954131 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Wed, 3 Jun 2026 14:43:55 -0400
Subject: [PATCH 22/33] Improve test coverage of conll.py

---
 tests/test_conll_format.py                    | 48 ++++++++++++++---
 tests/test_files/minimal_docstart1.bio        | 21 ++++++++
 ...mal_docstart.bio => minimal_docstart2.bio} |  0
 tests/test_scoring_click.py                   | 54 +++++++++++++++++++
 4 files changed, 117 insertions(+), 6 deletions(-)
 create mode 100644 tests/test_files/minimal_docstart1.bio
 rename tests/test_files/{minimal_docstart.bio => minimal_docstart2.bio} (100%)

diff --git a/tests/test_conll_format.py b/tests/test_conll_format.py
index b5b3fa5..2b19a71 100644
--- a/tests/test_conll_format.py
+++ b/tests/test_conll_format.py
@@ -9,8 +9,11 @@
     LineSpec,
     _CoNLLToken,
     ingest_conll_file,
+    write_docs_using_encoding,
 )
 from seqscore.encoding import REPAIR_NONE, get_encoding
+from seqscore.model import LabeledSequence
+from seqscore.util import file_fields_match
 from seqscore.validation import InvalidLabelError
 
 BIO = get_encoding("BIO")
@@ -114,12 +117,15 @@ def test_no_delims() -> None:
 
 def test_validate_with_docstart() -> None:
     ingester = CoNLLIngester(BIO, LINE_SPEC, ignore_document_boundaries=False)
-    path = Path("tests") / "test_files" / "minimal_docstart.bio"
-    with path.open(encoding="utf8") as file:
-        ingester.validate(
-            file,
-            str(path),
-        )
+    # Check two variants, one with docstart in its own sentence and another with
+    # docstart at the start of the sentence
+    for filename in ("minimal_docstart1.bio", "minimal_docstart2.bio"):
+        path = Path("tests") / "test_files" / filename
+        with path.open(encoding="utf8") as file:
+            ingester.validate(
+                file,
+                str(path),
+            )
 
 
 def test_repair_bad_name() -> None:
@@ -152,6 +158,36 @@ def test_bad_label1() -> None:
     )
 
 
+def test_write_docs_no_orig_fields(tmp_path: Path) -> None:
+    sent1 = LabeledSequence(
+        tokens=("This", "is", "a", "sentence", "."),
+        labels=("O", "O", "O", "O", "O"),
+        mentions=(),
+    )
+    sent2 = LabeledSequence.from_tokens_and_labels(
+        (
+            "University",
+            "of",
+            "Pennsylvania",
+            "is",
+            "in",
+            "West",
+            "Philadelphia",
+            ",",
+            "Pennsylvania",
+            ".",
+        ),
+        ("B-ORG", "I-ORG", "I-ORG", "O", "O", "B-LOC", "I-LOC", "O", "B-LOC", "O"),
+        BIO,
+    )
+    docs = [[sent1], [sent2]]
+    output_file = tmp_path / "out.bio"
+    write_docs_using_encoding(docs, "BIO", "utf-8", "\t", LINE_SPEC, output_file)
+    assert file_fields_match(
+        output_file, Path("tests") / "test_files" / "minimal_docstart1.bio", debug=True
+    )
+
+
 def test_bad_label2() -> None:
     ingester = CoNLLIngester(BIO, LINE_SPEC)
     path = Path("tests") / "test_files" / "bad_label2.bio"
diff --git a/tests/test_files/minimal_docstart1.bio b/tests/test_files/minimal_docstart1.bio
new file mode 100644
index 0000000..cc0b7e0
--- /dev/null
+++ b/tests/test_files/minimal_docstart1.bio
@@ -0,0 +1,21 @@
+-DOCSTART-	O
+
+This	O
+is	O
+a	O
+sentence	O
+.	O
+
+-DOCSTART-	O
+
+University	B-ORG
+of	I-ORG
+Pennsylvania	I-ORG
+is	O
+in	O
+West	B-LOC
+Philadelphia	I-LOC
+,	O
+Pennsylvania	B-LOC
+.	O
+
diff --git a/tests/test_files/minimal_docstart.bio b/tests/test_files/minimal_docstart2.bio
similarity index 100%
rename from tests/test_files/minimal_docstart.bio
rename to tests/test_files/minimal_docstart2.bio
diff --git a/tests/test_scoring_click.py b/tests/test_scoring_click.py
index 69b2573..c336f9b 100644
--- a/tests/test_scoring_click.py
+++ b/tests/test_scoring_click.py
@@ -27,6 +27,60 @@ def test_score_correct_labels() -> None:
     assert "ORG\t100.00\t100.00\t100.00\t1\t1\t1" in result.output
 
 
+def test_score_incorrect_default_format() -> None:
+    runner = CliRunner()
+    result = runner.invoke(
+        score,
+        [
+            "--labels",
+            "BIO",
+            "--reference",
+            os.path.join("tests", "conll_annotation", "minimal.bio"),
+            os.path.join("tests", "conll_predictions", "incorrect1.bio"),
+        ],
+    )
+    assert result.exit_code == 0
+    assert (
+        "| ALL    |       50.00 |    66.67 |  57.14 |           3 |           4 |         2 |"
+        in result.output
+    )
+    assert (
+        "| LOC    |       33.33 |    50.00 |  40.00 |           2 |           3 |         1 |"
+        in result.output
+    )
+    assert (
+        "| ORG    |      100.00 |   100.00 | 100.00 |           1 |           1 |         1 |"
+        in result.output
+    )
+
+
+def test_score_incorrect_conlleval_format() -> None:
+    runner = CliRunner()
+    result = runner.invoke(
+        score,
+        [
+            "--labels",
+            "BIO",
+            "--reference",
+            os.path.join("tests", "conll_annotation", "minimal.bio"),
+            "--score-format",
+            "conlleval",
+            os.path.join("tests", "conll_predictions", "incorrect1.bio"),
+        ],
+    )
+    assert result.exit_code == 0
+    assert (
+        "processed 15 tokens with 3 phrases; found: 4 phrases; correct: 2."
+        in result.output
+    )
+    assert (
+        "accuracy:  93.33%; precision:  50.00%; recall:  66.67%; FB1:  57.14"
+        in result.output
+    )
+    assert "LOC: precision:  33.33%; recall:  50.00%; FB1:  40.00  3" in result.output
+    assert "ORG: precision: 100.00%; recall: 100.00%; FB1: 100.00  1" in result.output
+
+
 def test_score_invalid_sequence_conlleval() -> None:
     runner = CliRunner()
     result = runner.invoke(

From ccf3172e0d6148f4e78cbf25abd4965c928e44a8 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Wed, 3 Jun 2026 15:02:44 -0400
Subject: [PATCH 23/33] Change multi-file scoring to compute std. error and
 allow pretty output

---
 seqscore/conll.py                             | 85 +++++++++++++------
 seqscore/scripts/seqscore.py                  |  4 +-
 .../incorrect1_nopredictions.bio              | 30 +++----
 tests/test_scoring_click.py                   | 70 ++++++++++++++-
 4 files changed, 144 insertions(+), 45 deletions(-)

diff --git a/seqscore/conll.py b/seqscore/conll.py
index 15f8edc..6c75ecc 100644
--- a/seqscore/conll.py
+++ b/seqscore/conll.py
@@ -2,6 +2,7 @@
 from collections import Counter, defaultdict
 from collections.abc import Iterable, Sequence
 from itertools import chain
+from math import sqrt
 from statistics import mean, stdev
 from typing import (
     Any,
@@ -38,6 +39,8 @@
 FORMAT_DELIM = "delim"
 SUPPORTED_SCORE_FORMATS = (FORMAT_PRETTY, FORMAT_CONLLEVAL, FORMAT_DELIM)
 
+ALL_TYPES = "ALL"
+
 
 class CoNLLFormatError(Exception):
     pass
@@ -611,7 +614,8 @@ def score_conll_files(
         elif output_format in (FORMAT_PRETTY, FORMAT_DELIM):
             header, rows = format_output_table(class_scores, full_precision)
             if output_format == FORMAT_PRETTY:
-                # TODO: Should we raise an error for pretty output with full precision specified?
+                if full_precision:
+                    raise ValueError("Cannot use full_precision with pretty formatting")
                 # We don't allow full_precision in this case so we can use the usual float format
                 score_summaries.append(
                     tabulate(rows, header, tablefmt="github", floatfmt="6.2f")
@@ -640,31 +644,33 @@ def score_conll_files(
         else:
             raise ValueError(f"Unrecognized output format: {output_format}")
 
+    # Compute summary statistics across files when multiple files are scored
+    if multi_files:
+        type_scores: DefaultDict[str, list] = defaultdict(list)
+        for class_score in all_class_scores:
+            for entity_type, entity_score in class_score.type_scores.items():
+                type_scores[entity_type].append(entity_score.f1)
+
+        entity_type_means = {
+            entity_type: mean(scores) for entity_type, scores in type_scores.items()
+        }
+        entity_type_means[ALL_TYPES] = mean(score.f1 for score in all_class_scores)
+
+        entity_type_stderrs = {
+            entity_type: stdev(scores) / sqrt(len(scores))
+            for entity_type, scores in type_scores.items()
+        }
+        all_f1s = [score.f1 for score in all_class_scores]
+        entity_type_stderrs[ALL_TYPES] = stdev(all_f1s) / sqrt(len(all_f1s))
+
     # For delimited, just join all the rows
     if output_format == FORMAT_DELIM:
         if multi_files:
-            # Compute summary statistics
-            type_scores: DefaultDict[str, list] = defaultdict(list)
-            for class_score in all_class_scores:
-                for entity_type, entity_score in class_score.type_scores.items():
-                    type_scores[entity_type].append(entity_score.f1)
-
-            entity_type_means = {
-                entity_type: mean(scores) for entity_type, scores in type_scores.items()
-            }
-            entity_type_means["ALL"] = mean(score.f1 for score in all_class_scores)
-            # TODO: This should be standard error of the mean, not standard deviation
-            entity_type_sds = {
-                entity_type: stdev(scores) for entity_type, scores in type_scores.items()
-            }
-            entity_type_sds["ALL"] = stdev(score.f1 for score in all_class_scores)
-
-            for entity_type, num in entity_type_sds.items():
+            for entity_type, num in entity_type_stderrs.items():
                 score_summaries.append(
-                    # TODO: Change SD precision
                     _join_delim(
                         [
-                            "SD",
+                            "SE",
                             entity_type,
                             "NA",
                             "NA",
@@ -676,7 +682,6 @@ def score_conll_files(
                         delim,
                     )
                 )
-            # Add aggregates
             for entity_type, num in entity_type_means.items():
                 score_summaries.append(
                     _join_delim(
@@ -698,8 +703,7 @@ def score_conll_files(
         if not multi_files:
             print(score_summaries[0])
         else:
-            # TODO: Sort out aggregates here?
-            # Index because we care about when we're at the last entry
+            # Use the index because we care whether we're at the last entry
             for idx, (filename, summary) in enumerate(zip(pred_files, score_summaries)):
                 print(filename)
                 print(summary)
@@ -707,6 +711,39 @@ def score_conll_files(
                 if idx != len(pred_files) - 1:
                     print()
 
+            # Print mean ± SE summary table
+            ref_scores = all_class_scores[0]
+            summary_header = ["Type", "Mean F1", "SE", "Reference"]
+            summary_rows = [
+                [
+                    ALL_TYPES,
+                    entity_type_means[ALL_TYPES] * 100,
+                    entity_type_stderrs[ALL_TYPES] * 100,
+                    ref_scores.total_ref,
+                ]
+            ]
+            for entity_type in sorted(entity_type_means):
+                if entity_type == ALL_TYPES:
+                    continue
+                summary_rows.append(
+                    [
+                        entity_type,
+                        entity_type_means[entity_type] * 100,
+                        entity_type_stderrs[entity_type] * 100,
+                        ref_scores.type_scores[entity_type].total_ref,
+                    ]
+                )
+            print()
+            print("Summary")
+            print(
+                tabulate(
+                    summary_rows,
+                    summary_header,
+                    tablefmt="github",
+                    floatfmt="6.2f",
+                )
+            )
+
 
 def format_output_conlleval(
     class_scores: ClassificationScore,
@@ -757,7 +794,7 @@ def format_output_table(
     ]
     rows = [
         [
-            "ALL",
+            ALL_TYPES,
             convert_score(class_scores.precision, full_precision),
             convert_score(class_scores.recall, full_precision),
             convert_score(class_scores.f1, full_precision),
diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py
index c2e9265..2c24334 100644
--- a/seqscore/scripts/seqscore.py
+++ b/seqscore/scripts/seqscore.py
@@ -476,7 +476,7 @@ def summarize(
 @click.option(
     "--full-precision",
     is_flag=True,
-    help="whether to output floating values at full precision instead of rounding half even at two decimal places",
+    help="whether to output floating values at full precision instead of multiplying by 100 and rounding half even at two decimal places",
 )
 @_quiet_option()
 def score(
@@ -505,7 +505,7 @@ def score(
         raise ValueError(f"Can only use full-precision with score-format {FORMAT_DELIM}")
 
     if error_counts and len(file) > 1:
-        raise ValueError("Cannot use error-counts with multiple files to be scored")
+        raise click.UsageError("Cannot use error-counts with multiple files to be scored")
 
     delim = _normalize_tab(delim)
 
diff --git a/tests/conll_predictions/incorrect1_nopredictions.bio b/tests/conll_predictions/incorrect1_nopredictions.bio
index 8146bb7..6a93472 100644
--- a/tests/conll_predictions/incorrect1_nopredictions.bio
+++ b/tests/conll_predictions/incorrect1_nopredictions.bio
@@ -1,16 +1,16 @@
-This O O
-is O O
-a O O
-sentence O O
-. O O
+This O
+is O
+a O
+sentence O
+. O
 
-University B-ORG O
-of I-ORG O
-Pennsylvania I-ORG O
-is O O
-in O O
-West B-LOC O
-Philadelphia I-LOC O
-, O O
-Pennsylvania B-LOC O
-. O O
+University O
+of O
+Pennsylvania O
+is O
+in O
+West O
+Philadelphia O
+, O
+Pennsylvania O
+. O
diff --git a/tests/test_scoring_click.py b/tests/test_scoring_click.py
index c336f9b..5ee7c9a 100644
--- a/tests/test_scoring_click.py
+++ b/tests/test_scoring_click.py
@@ -1,4 +1,3 @@
-import glob
 import os
 
 from click.testing import CliRunner
@@ -232,9 +231,72 @@ def test_score_multiple_files() -> None:
             os.path.join("tests", "conll_annotation", "minimal.bio"),
             "--score-format",
             "delim",
-        ]
-        + glob.glob(os.path.join("tests", "conll_predictions", "*1.bio")),
+            os.path.join("tests", "conll_predictions", "correct1.bio"),
+            os.path.join("tests", "conll_predictions", "incorrect1.bio"),
+        ],
     )
     assert result.exit_code == 0
-    assert "SD\tALL\tNA\tNA\t30.30\tNA\tNA\tNA" in result.output
+    assert "SE\tALL\tNA\tNA\t21.43\tNA\tNA\tNA" in result.output
     assert "Mean\tALL\tNA\tNA\t78.57\tNA\tNA\tNA" in result.output
+
+
+def test_score_multiple_files_pretty() -> None:
+    runner = CliRunner()
+    result = runner.invoke(
+        score,
+        [
+            "--labels",
+            "BIO",
+            "--reference",
+            os.path.join("tests", "conll_annotation", "minimal.bio"),
+            os.path.join("tests", "conll_predictions", "correct1.bio"),
+            os.path.join("tests", "conll_predictions", "incorrect1.bio"),
+        ],
+    )
+    assert result.exit_code == 0
+    assert os.path.join("tests", "conll_predictions", "correct1.bio") in result.output
+    assert os.path.join("tests", "conll_predictions", "incorrect1.bio") in result.output
+    assert "Summary" in result.output
+    assert "| ALL    |     78.57 |  21.43 |           3 |" in result.output
+    assert "| LOC    |     70.00 |  30.00 |           2 |" in result.output
+    assert "| ORG    |    100.00 |   0.00 |           1 |" in result.output
+
+
+def test_score_error_counts_multiple_files() -> None:
+    # Cannot use error-counts with multiple files
+    runner = CliRunner()
+    result = runner.invoke(
+        score,
+        [
+            "--labels",
+            "BIO",
+            "--reference",
+            os.path.join("tests", "conll_annotation", "minimal.bio"),
+            os.path.join("tests", "conll_predictions", "correct1.bio"),
+            os.path.join("tests", "conll_predictions", "incorrect1.bio"),
+            "--error-counts",
+        ],
+    )
+    assert result.exit_code != 0
+    assert "Cannot use error-counts with multiple files to be scored" in result.output
+
+
+def test_score_error_counts_conlleval_format() -> None:
+    # Cannot use error-counts with multiple files, even with conlleval format
+    runner = CliRunner()
+    result = runner.invoke(
+        score,
+        [
+            "--labels",
+            "BIO",
+            "--reference",
+            os.path.join("tests", "conll_annotation", "minimal.bio"),
+            "--score-format",
+            "conlleval",
+            os.path.join("tests", "conll_predictions", "correct1.bio"),
+            os.path.join("tests", "conll_predictions", "incorrect1.bio"),
+            "--error-counts",
+        ],
+    )
+    assert result.exit_code != 0
+    assert "Cannot use error-counts with multiple files to be scored" in result.output

From 8dbd007326ccf61c0aaba2b4e1e70a7025e18ec1 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Wed, 3 Jun 2026 15:47:57 -0400
Subject: [PATCH 24/33] Improve test coverage of conll.py

---
 tests/test_conll_scoring.py | 69 +++++++++++++++++++++++++++++++++++++
 tests/test_scoring_click.py | 42 ++++++++++++++++++++++
 2 files changed, 111 insertions(+)
 create mode 100644 tests/test_conll_scoring.py

diff --git a/tests/test_conll_scoring.py b/tests/test_conll_scoring.py
new file mode 100644
index 0000000..1aaabf1
--- /dev/null
+++ b/tests/test_conll_scoring.py
@@ -0,0 +1,69 @@
+import os
+
+import pytest
+
+from seqscore.conll import (
+    FORMAT_CONLLEVAL,
+    FORMAT_DELIM,
+    FORMAT_PRETTY,
+    LineSpec,
+    score_conll_files,
+)
+
+REFERENCE = os.path.join("tests", "conll_annotation", "minimal.bio")
+CORRECT1 = os.path.join("tests", "conll_predictions", "correct1.bio")
+INCORRECT1 = os.path.join("tests", "conll_predictions", "incorrect1.bio")
+
+
+def _score(
+    pred_files: list[str],
+    output_format: str,
+    error_counts: bool = False,
+    full_precision: bool = False,
+) -> None:
+    score_conll_files(
+        pred_files,
+        REFERENCE,
+        mention_encoding_name="BIO",
+        repair=None,
+        file_encoding="utf-8",
+        line_spec=LineSpec(0, -1),
+        ignore_document_boundaries=False,
+        parse_comment_lines=False,
+        delim="\t",
+        output_format=output_format,
+        error_counts=error_counts,
+        full_precision=full_precision,
+    )
+
+
+def test_score_error_counts_multiple_files() -> None:
+    with pytest.raises(
+        ValueError,
+        match="Outputting error counts is only available for a single prediction file",
+    ):
+        _score([CORRECT1, INCORRECT1], FORMAT_DELIM, error_counts=True)
+
+
+def test_score_error_counts_conlleval_format() -> None:
+    with pytest.raises(
+        ValueError,
+        match=f"Format {repr(FORMAT_CONLLEVAL)} is not supported with error counts",
+    ):
+        _score([CORRECT1], FORMAT_CONLLEVAL, error_counts=True)
+
+
+def test_score_full_precision_pretty_format() -> None:
+    with pytest.raises(
+        ValueError,
+        match="Cannot use full_precision with pretty formatting",
+    ):
+        _score([CORRECT1], FORMAT_PRETTY, full_precision=True)
+
+
+def test_score_unrecognized_format() -> None:
+    with pytest.raises(
+        ValueError,
+        match="Unrecognized output format: bogus",
+    ):
+        _score([CORRECT1], "bogus")
diff --git a/tests/test_scoring_click.py b/tests/test_scoring_click.py
index 5ee7c9a..0ef943a 100644
--- a/tests/test_scoring_click.py
+++ b/tests/test_scoring_click.py
@@ -262,6 +262,48 @@ def test_score_multiple_files_pretty() -> None:
     assert "| ORG    |    100.00 |   0.00 |           1 |" in result.output
 
 
+def test_score_error_counts_single_file() -> None:
+    runner = CliRunner()
+    result = runner.invoke(
+        score,
+        [
+            "--labels",
+            "BIO",
+            "--reference",
+            os.path.join("tests", "conll_annotation", "minimal.bio"),
+            "--error-counts",
+            os.path.join("tests", "conll_predictions", "incorrect1.bio"),
+        ],
+    )
+    assert result.exit_code == 0
+    assert "|   Count | Error   | Type   | Tokens            |" in result.output
+    assert "|       1 | FP      | LOC    | West              |" in result.output
+    assert "|       1 | FP      | LOC    | Philadelphia      |" in result.output
+    assert "|       1 | FN      | LOC    | West Philadelphia |" in result.output
+
+
+def test_score_error_counts_delim_format() -> None:
+    runner = CliRunner()
+    result = runner.invoke(
+        score,
+        [
+            "--labels",
+            "BIO",
+            "--reference",
+            os.path.join("tests", "conll_annotation", "minimal.bio"),
+            "--score-format",
+            "delim",
+            "--error-counts",
+            os.path.join("tests", "conll_predictions", "incorrect1.bio"),
+        ],
+    )
+    assert result.exit_code == 0
+    assert "Count\tError\tType\tTokens" in result.output
+    assert "1\tFP\tLOC\tWest" in result.output
+    assert "1\tFP\tLOC\tPhiladelphia" in result.output
+    assert "1\tFN\tLOC\tWest Philadelphia" in result.output
+
+
 def test_score_error_counts_multiple_files() -> None:
     # Cannot use error-counts with multiple files
     runner = CliRunner()

From 105f83738046b99f7f3d7974a00f020d9c479e08 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Wed, 3 Jun 2026 16:33:31 -0400
Subject: [PATCH 25/33] Improve test coverage of seqscore.py

---
 seqscore/scripts/seqscore.py                  |  46 ++--
 .../correct1_improper_sequence.bio            |  16 --
 .../correct1_improper_sequence_ref.txt        |  16 --
 tests/test_conversion_click.py                | 127 ++++++++--
 tests/test_files/map_empty_key.json           |   3 +
 tests/test_files/map_empty_value.json         |   3 +
 tests/test_files/map_invalid_json.json        |   1 +
 tests/test_files/map_not_dict.json            |   1 +
 tests/test_files/map_outside_key.json         |   3 +
 tests/test_files/map_outside_value.json       |   3 +
 ...pace_delim.txt => minimal_space_delim.txt} |   0
 tests/test_process_click.py                   | 236 +++++++++++++++++-
 tests/test_scoring_click.py                   |  50 +++-
 tests/test_utils.py                           |   4 +-
 14 files changed, 416 insertions(+), 93 deletions(-)
 delete mode 100644 tests/conll_predictions/correct1_improper_sequence.bio
 delete mode 100644 tests/conll_predictions/correct1_improper_sequence_ref.txt
 create mode 100644 tests/test_files/map_empty_key.json
 create mode 100644 tests/test_files/map_empty_value.json
 create mode 100644 tests/test_files/map_invalid_json.json
 create mode 100644 tests/test_files/map_not_dict.json
 create mode 100644 tests/test_files/map_outside_key.json
 create mode 100644 tests/test_files/map_outside_value.json
 rename tests/test_files/{space_delim.txt => minimal_space_delim.txt} (100%)

diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py
index 2c24334..053cab2 100644
--- a/seqscore/scripts/seqscore.py
+++ b/seqscore/scripts/seqscore.py
@@ -121,15 +121,6 @@ def _output_delim_option() -> Callable:
     )
 
 
-def _ner_label_index_option() -> Callable:
-    return click.option(
-        "--ner-label-index",
-        default=-1,
-        show_default=True,
-        type=int,
-    )
-
-
 def _quiet_option() -> Callable:
     return click.option(
         "--quiet",
@@ -205,7 +196,9 @@ def repair(
 ) -> None:
     output_delim = _normalize_tab(output_delim)
     if repair_method == REPAIR_NONE:
-        raise ValueError(f"Cannot repair with repair strategy {repr(repair_method)}")
+        raise click.UsageError(
+            f"Cannot repair with repair strategy {repr(repair_method)}"
+        )
     line_spec = LineSpec(token_index, label_index)
 
     repair_conll_file(
@@ -300,10 +293,10 @@ def process(
     type_map_dict: dict[str, list[str]] = _load_type_map(type_map, file_encoding)
 
     if keep_types_set and remove_types_set:
-        raise ValueError("Cannot specify both keep-types and remove-types")
+        raise click.UsageError("Cannot specify both keep-types and remove-types")
 
     if not keep_types_set and not remove_types_set and not type_map:
-        raise ValueError(
+        raise click.UsageError(
             "Must specify at least one of keep-types, remove-types, or type-map"
         )
 
@@ -316,7 +309,10 @@ def process(
         parse_comment_lines=parse_comment_lines,
     )
 
-    mod_docs = modify_types(docs, keep_types_set, remove_types_set, type_map_dict)
+    try:
+        mod_docs = modify_types(docs, keep_types_set, remove_types_set, type_map_dict)
+    except ValueError as err:
+        raise click.UsageError(str(err)) from err
 
     write_docs_using_encoding(
         mod_docs, labels, file_encoding, output_delim, line_spec, output_file
@@ -502,7 +498,9 @@ def score(
         repair_method = None
 
     if full_precision and score_format != FORMAT_DELIM:
-        raise ValueError(f"Can only use full-precision with score-format {FORMAT_DELIM}")
+        raise click.UsageError(
+            f"Can only use full-precision with score-format {FORMAT_DELIM}"
+        )
 
     if error_counts and len(file) > 1:
         raise click.UsageError("Cannot use error-counts with multiple files to be scored")
@@ -580,7 +578,7 @@ def _parse_type_list(types: str) -> set[str]:
     # Check for outside type
     for entity_type in split_types:
         if entity_type == DEFAULT_OUTSIDE:
-            raise ValueError(
+            raise click.UsageError(
                 f"Cannot specify the outside type {DEFAULT_OUTSIDE} in keep/remove types"
             )
     return set(split_types)
@@ -596,40 +594,42 @@ def _load_type_map(
         with open(type_map_path, encoding=file_encoding) as file:
             type_map = json.load(file)
     except FileNotFoundError as err:
-        raise ValueError(f"Could not open type map file {repr(type_map_path)}") from err
+        raise click.UsageError(
+            f"Could not open type map file {repr(type_map_path)}"
+        ) from err
     except json.decoder.JSONDecodeError as err:
-        raise ValueError(
+        raise click.UsageError(
             f"Type map provided in file {repr(type_map_path)} is not valid JSON"
         ) from err
 
     # Validate types
     if not isinstance(type_map, dict):
-        raise ValueError(
+        raise click.UsageError(
             f"Type map provided in file {repr(type_map_path)} is not a dictionary"
         )
 
     for from_type, to_types in type_map.items():
         if not isinstance(from_type, str) or not from_type:
-            raise ValueError(
+            raise click.UsageError(
                 f"Key {repr(from_type)} in type map {repr(type_map_path)} is not a non-empty string"
             )
         if from_type == DEFAULT_OUTSIDE:
-            raise ValueError(
+            raise click.UsageError(
                 f"Key {repr(from_type)} in type map {repr(type_map_path)} is the outside type {DEFAULT_OUTSIDE}"
             )
 
         if not isinstance(to_types, list):
-            raise ValueError(
+            raise click.UsageError(
                 f"Value {repr(to_types)} in type map {repr(type_map_path)} is not a list"
             )
 
         for to_type in to_types:
             if not isinstance(to_type, str) or not to_type:
-                raise ValueError(
+                raise click.UsageError(
                     f"Value {repr(to_type)} in type map {repr(type_map_path)} is not a non-empty string"
                 )
             if to_type == DEFAULT_OUTSIDE:
-                raise ValueError(
+                raise click.UsageError(
                     f"Value {repr(to_type)} in type map {repr(type_map_path)} is the outside type {DEFAULT_OUTSIDE}"
                 )
 
diff --git a/tests/conll_predictions/correct1_improper_sequence.bio b/tests/conll_predictions/correct1_improper_sequence.bio
deleted file mode 100644
index ecc2f1a..0000000
--- a/tests/conll_predictions/correct1_improper_sequence.bio
+++ /dev/null
@@ -1,16 +0,0 @@
-This O O
-is O O
-a O O
-sentence O O
-. O O
-
-University B-ORG I-ORG
-of I-ORG I-ORG
-Pennsylvania I-ORG I-ORG
-is O O
-in O O
-West B-LOC B-LOC
-Philadelphia I-LOC I-LOC
-, O O
-Pennsylvania B-LOC B-LOC
-. O O
diff --git a/tests/conll_predictions/correct1_improper_sequence_ref.txt b/tests/conll_predictions/correct1_improper_sequence_ref.txt
deleted file mode 100644
index 2ff8768..0000000
--- a/tests/conll_predictions/correct1_improper_sequence_ref.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-This	O
-is	O
-a	O
-sentence	O
-.	O
-
-University	B-ORG
-of	I-ORG
-Pennsylvania	I-ORG
-is	O
-in	O
-West	B-LOC
-Philadelphia	I-LOC
-,	O
-Pennsylvania	B-LOC
-.	O
diff --git a/tests/test_conversion_click.py b/tests/test_conversion_click.py
index b41a32b..b827d0c 100644
--- a/tests/test_conversion_click.py
+++ b/tests/test_conversion_click.py
@@ -5,7 +5,7 @@
 from click.testing import CliRunner
 
 from seqscore.scripts.seqscore import convert
-from seqscore.util import file_fields_match
+from seqscore.util import file_fields_match, file_lines_match
 
 TMP_DIR: Optional[tempfile.TemporaryDirectory] = None
 
@@ -23,6 +23,7 @@ def teardown_module() -> None:
 
 def test_invalid_conversion_BIO() -> None:
     runner = CliRunner()
+    output_path = os.path.join(TMP_DIR.name, "temp.txt")
     result = runner.invoke(
         convert,
         [
@@ -31,7 +32,7 @@ def test_invalid_conversion_BIO() -> None:
             "--output-labels",
             "BIOES",
             os.path.join("tests", "conll_annotation", "invalid1.bio"),
-            os.path.join(TMP_DIR.name, "temp.txt"),
+            output_path,
         ],
     )
     assert result.exit_code != 0
@@ -39,6 +40,7 @@ def test_invalid_conversion_BIO() -> None:
 
 def test_invalid_conversion_BIOES() -> None:
     runner = CliRunner()
+    output_path = os.path.join(TMP_DIR.name, "temp.txt")
     result = runner.invoke(
         convert,
         [
@@ -47,7 +49,7 @@ def test_invalid_conversion_BIOES() -> None:
             "--output-labels",
             "BIO",
             os.path.join("tests", "conll_annotation", "invalid1.bioes"),
-            os.path.join(TMP_DIR.name, "temp.txt"),
+            output_path,
         ],
     )
     assert result.exit_code != 0
@@ -55,6 +57,7 @@ def test_invalid_conversion_BIOES() -> None:
 
 def test_BIO_to_BIOES() -> None:
     runner = CliRunner()
+    output_path = os.path.join(TMP_DIR.name, "BIOtoBIOES.txt")
     result = runner.invoke(
         convert,
         [
@@ -63,18 +66,19 @@ def test_BIO_to_BIOES() -> None:
             "--output-labels",
             "BIOES",
             os.path.join("tests", "conll_annotation", "minimal.bio"),
-            os.path.join(TMP_DIR.name, "BIOtoBIOES.txt"),
+            output_path,
         ],
     )
     assert result.exit_code == 0
     assert file_fields_match(
-        os.path.join(TMP_DIR.name, "BIOtoBIOES.txt"),
+        output_path,
         os.path.join("tests", "conll_annotation", "minimal.bioes"),
     )
 
 
 def test_BIOES_to_BIO() -> None:
     runner = CliRunner()
+    output_path = os.path.join(TMP_DIR.name, "BIOEStoBIO.txt")
     result = runner.invoke(
         convert,
         [
@@ -83,18 +87,19 @@ def test_BIOES_to_BIO() -> None:
             "--output-labels",
             "BIO",
             os.path.join("tests", "conll_annotation", "minimal.bioes"),
-            os.path.join(TMP_DIR.name, "BIOEStoBIO.txt"),
+            output_path,
         ],
     )
     assert result.exit_code == 0
     assert file_fields_match(
-        os.path.join(TMP_DIR.name, "BIOEStoBIO.txt"),
+        output_path,
         os.path.join("tests", "conll_annotation", "minimal.bio"),
     )
 
 
 def test_BIO_to_IO() -> None:
     runner = CliRunner()
+    output_path = os.path.join(TMP_DIR.name, "BIOtoIO.txt")
     result = runner.invoke(
         convert,
         [
@@ -103,18 +108,19 @@ def test_BIO_to_IO() -> None:
             "--output-labels",
             "IO",
             os.path.join("tests", "conll_annotation", "minimal.bio"),
-            os.path.join(TMP_DIR.name, "BIOtoIO.txt"),
+            output_path,
         ],
     )
     assert result.exit_code == 0
     assert file_fields_match(
-        os.path.join(TMP_DIR.name, "BIOtoIO.txt"),
+        output_path,
         os.path.join("tests", "conll_annotation", "minimal.io"),
     )
 
 
 def test_IO_to_BIO() -> None:
     runner = CliRunner()
+    output_path = os.path.join(TMP_DIR.name, "IOtoBIO.txt")
     result = runner.invoke(
         convert,
         [
@@ -123,19 +129,20 @@ def test_IO_to_BIO() -> None:
             "--output-labels",
             "BIO",
             os.path.join("tests", "conll_annotation", "minimal.io"),
-            os.path.join(TMP_DIR.name, "IOtoBIO.txt"),
+            output_path,
         ],
     )
     assert result.exit_code == 0
     # conversion will not necessarily reproduce BIO correctly but does in this case
     assert file_fields_match(
-        os.path.join(TMP_DIR.name, "IOtoBIO.txt"),
+        output_path,
         os.path.join("tests", "conll_annotation", "minimal.bio"),
     )
 
 
 def test_BIO_to_IOB_fields() -> None:
     runner = CliRunner()
+    output_path = os.path.join(TMP_DIR.name, "BIOtoIOB.txt")
     result = runner.invoke(
         convert,
         [
@@ -144,18 +151,19 @@ def test_BIO_to_IOB_fields() -> None:
             "--output-labels",
             "IOB",
             os.path.join("tests", "conll_annotation", "minimal_fields.bio"),
-            os.path.join(TMP_DIR.name, "BIOtoIOB.txt"),
+            output_path,
         ],
     )
     assert result.exit_code == 0
     assert file_fields_match(
-        os.path.join(TMP_DIR.name, "BIOtoIOB.txt"),
+        output_path,
         os.path.join("tests", "conll_annotation", "minimal_fields.iob"),
     )
 
 
 def test_IOB_to_BIO_fields() -> None:
     runner = CliRunner()
+    output_path = os.path.join(TMP_DIR.name, "IOBtoBIO.txt")
     result = runner.invoke(
         convert,
         [
@@ -164,18 +172,19 @@ def test_IOB_to_BIO_fields() -> None:
             "--output-labels",
             "BIO",
             os.path.join("tests", "conll_annotation", "minimal_fields.iob"),
-            os.path.join(TMP_DIR.name, "IOBtoBIO.txt"),
+            output_path,
         ],
     )
     assert result.exit_code == 0
     assert file_fields_match(
-        os.path.join(TMP_DIR.name, "IOBtoBIO.txt"),
+        output_path,
         os.path.join("tests", "conll_annotation", "minimal_fields.bio"),
     )
 
 
 def test_IOB_to_BIO_fields_and_specified_indices() -> None:
     runner = CliRunner()
+    output_path = os.path.join(TMP_DIR.name, "labels_not_last_col.bioes")
     result = runner.invoke(
         convert,
         [
@@ -186,18 +195,19 @@ def test_IOB_to_BIO_fields_and_specified_indices() -> None:
             "--label-index",
             "1",
             os.path.join("tests", "conll_annotation", "labels_not_last_col.bio"),
-            os.path.join(TMP_DIR.name, "labels_not_last_col.bioes"),
+            output_path,
         ],
     )
     assert result.exit_code == 0
     assert file_fields_match(
-        os.path.join(TMP_DIR.name, "labels_not_last_col.bioes"),
+        output_path,
         os.path.join("tests", "conll_annotation", "labels_not_last_col.bioes"),
     )
 
 
 def test_IO_to_BIOES() -> None:
     runner = CliRunner()
+    output_path = os.path.join(TMP_DIR.name, "IOtoBIOES.txt")
     result = runner.invoke(
         convert,
         [
@@ -206,19 +216,20 @@ def test_IO_to_BIOES() -> None:
             "--output-labels",
             "BIOES",
             os.path.join("tests", "conll_annotation", "minimal.io"),
-            os.path.join(TMP_DIR.name, "IOtoBIOES.txt"),
+            output_path,
         ],
     )
     assert result.exit_code == 0
     # conversion will not necessarily reproduce BIOES correctly but does in this case
     assert file_fields_match(
-        os.path.join(TMP_DIR.name, "IOtoBIOES.txt"),
+        output_path,
         os.path.join("tests", "conll_annotation", "minimal.bioes"),
     )
 
 
 def test_BIOES_to_IO() -> None:
     runner = CliRunner()
+    output_path = os.path.join(TMP_DIR.name, "BIOEStoIO.txt")
     result = runner.invoke(
         convert,
         [
@@ -227,18 +238,88 @@ def test_BIOES_to_IO() -> None:
             "--output-labels",
             "IO",
             os.path.join("tests", "conll_annotation", "minimal.bioes"),
-            os.path.join(TMP_DIR.name, "BIOEStoIO.txt"),
+            output_path,
         ],
     )
     assert result.exit_code == 0
     assert file_fields_match(
-        os.path.join(TMP_DIR.name, "BIOEStoIO.txt"),
+        output_path,
         os.path.join("tests", "conll_annotation", "minimal.io"),
     )
 
 
+def test_BIO_to_BIO_space_delim() -> None:
+    runner = CliRunner()
+    output_path = os.path.join(TMP_DIR.name, "BIOtoBIO_space.txt")
+    result = runner.invoke(
+        convert,
+        [
+            "--input-labels",
+            "BIO",
+            "--output-labels",
+            "BIO",
+            "--output-delim",
+            " ",
+            os.path.join("tests", "conll_annotation", "minimal.bio"),
+            output_path,
+        ],
+    )
+    assert result.exit_code == 0
+    assert file_lines_match(
+        output_path,
+        os.path.join("tests", "test_files", "minimal_space_delim.txt"),
+    )
+
+
+def test_BIO_to_BIO_tab_spelled_out() -> None:
+    runner = CliRunner()
+    output_path = os.path.join(TMP_DIR.name, "BIOtoBIO_tab_spelled_out.txt")
+    result = runner.invoke(
+        convert,
+        [
+            "--input-labels",
+            "BIO",
+            "--output-labels",
+            "BIO",
+            "--output-delim",
+            "tab",
+            os.path.join("tests", "conll_annotation", "minimal.bio"),
+            output_path,
+        ],
+    )
+    assert result.exit_code == 0
+    assert file_lines_match(
+        output_path,
+        os.path.join("tests", "conll_annotation", "minimal.bio"),
+    )
+
+
+def test_BIO_to_BIO_tab_backslash_t() -> None:
+    runner = CliRunner()
+    output_path = os.path.join(TMP_DIR.name, "BIOtoBIO_tab_backslash_t.txt")
+    result = runner.invoke(
+        convert,
+        [
+            "--input-labels",
+            "BIO",
+            "--output-labels",
+            "BIO",
+            "--output-delim",
+            "\\t",
+            os.path.join("tests", "conll_annotation", "minimal.bio"),
+            output_path,
+        ],
+    )
+    assert result.exit_code == 0
+    assert file_lines_match(
+        output_path,
+        os.path.join("tests", "conll_annotation", "minimal.bio"),
+    )
+
+
 def test_diff_token_label_indices() -> None:
     runner = CliRunner()
+    output_path = os.path.join(TMP_DIR.name, "diff_token_label_indices_BIOES.txt")
     result = runner.invoke(
         convert,
         [
@@ -251,11 +332,11 @@ def test_diff_token_label_indices() -> None:
             "--label-index",
             "2",
             os.path.join("tests", "conll_annotation", "diff_token_label_indices.bio"),
-            os.path.join(TMP_DIR.name, "diff_token_label_indices_BIOES.txt"),
+            output_path,
         ],
     )
     assert result.exit_code == 0
     assert file_fields_match(
-        os.path.join(TMP_DIR.name, "diff_token_label_indices_BIOES.txt"),
+        output_path,
         os.path.join("tests", "conll_annotation", "diff_token_label_indices.bioes"),
     )
diff --git a/tests/test_files/map_empty_key.json b/tests/test_files/map_empty_key.json
new file mode 100644
index 0000000..1eeffa0
--- /dev/null
+++ b/tests/test_files/map_empty_key.json
@@ -0,0 +1,3 @@
+{
+  "": ["LOC"]
+}
diff --git a/tests/test_files/map_empty_value.json b/tests/test_files/map_empty_value.json
new file mode 100644
index 0000000..fa87826
--- /dev/null
+++ b/tests/test_files/map_empty_value.json
@@ -0,0 +1,3 @@
+{
+  "GPE": [""]
+}
diff --git a/tests/test_files/map_invalid_json.json b/tests/test_files/map_invalid_json.json
new file mode 100644
index 0000000..828e21d
--- /dev/null
+++ b/tests/test_files/map_invalid_json.json
@@ -0,0 +1 @@
+{ invalid json
diff --git a/tests/test_files/map_not_dict.json b/tests/test_files/map_not_dict.json
new file mode 100644
index 0000000..c941e94
--- /dev/null
+++ b/tests/test_files/map_not_dict.json
@@ -0,0 +1 @@
+["LOC", "GPE"]
diff --git a/tests/test_files/map_outside_key.json b/tests/test_files/map_outside_key.json
new file mode 100644
index 0000000..ed0263d
--- /dev/null
+++ b/tests/test_files/map_outside_key.json
@@ -0,0 +1,3 @@
+{
+  "O": ["LOC"]
+}
diff --git a/tests/test_files/map_outside_value.json b/tests/test_files/map_outside_value.json
new file mode 100644
index 0000000..e3b6ef0
--- /dev/null
+++ b/tests/test_files/map_outside_value.json
@@ -0,0 +1,3 @@
+{
+  "GPE": ["O"]
+}
diff --git a/tests/test_files/space_delim.txt b/tests/test_files/minimal_space_delim.txt
similarity index 100%
rename from tests/test_files/space_delim.txt
rename to tests/test_files/minimal_space_delim.txt
diff --git a/tests/test_process_click.py b/tests/test_process_click.py
index 5fd251c..f59cbe5 100644
--- a/tests/test_process_click.py
+++ b/tests/test_process_click.py
@@ -232,6 +232,27 @@ def test_map_types_keep_types() -> None:
     assert file_fields_match(TEST_FILES_DIR / "minimal_no_names.bio", output_path)
 
 
+def test_keep_and_remove_types() -> None:
+    runner = CliRunner()
+    input_path = str(ANNOTATION_DIR / "minimal.bio")
+    output_path = str(Path(TMP_DIR.name) / "out.bio")
+    result = runner.invoke(
+        process,
+        [
+            "--keep-types",
+            "LOC,ORG",
+            "--remove-types",
+            "MISC",
+            "--labels",
+            "BIO",
+            input_path,
+            output_path,
+        ],
+    )
+    assert result.exit_code == 2
+    assert "Cannot specify both keep-types and remove-types" in result.output
+
+
 def test_map_types_invalid_map() -> None:
     runner = CliRunner()
     map_path = str(TEST_FILES_DIR / "map_bad_value.json")
@@ -248,8 +269,11 @@ def test_map_types_invalid_map() -> None:
             output_path,
         ],
     )
-    # Malformed map, dictionary value is a string and not a list
-    assert result.exit_code != 0
+    assert result.exit_code == 2
+    assert (
+        "Value 'LOC' in type map 'tests/test_files/map_bad_value.json' is not a list"
+        in result.output
+    )
 
 
 def test_map_types_duplicate_mapping() -> None:
@@ -268,11 +292,31 @@ def test_map_types_duplicate_mapping() -> None:
             output_path,
         ],
     )
-    # Malformed map, dictionary value is a string and not a list
-    assert result.exit_code != 0
+    assert result.exit_code == 2
+    assert "Multiple mappings specified for type 'LOC' in type map" in result.output
 
 
-def test_keep_and_remove_types() -> None:
+def test_no_operation() -> None:
+    runner = CliRunner()
+    input_path = str(ANNOTATION_DIR / "minimal.bio")
+    output_path = str(Path(TMP_DIR.name) / "out.bio")
+    result = runner.invoke(
+        process,
+        [
+            "--labels",
+            "BIO",
+            input_path,
+            output_path,
+        ],
+    )
+    assert result.exit_code == 2
+    assert (
+        "Must specify at least one of keep-types, remove-types, or type-map"
+        in result.output
+    )
+
+
+def test_keep_outside_type() -> None:
     runner = CliRunner()
     input_path = str(ANNOTATION_DIR / "minimal.bio")
     output_path = str(Path(TMP_DIR.name) / "out.bio")
@@ -280,14 +324,188 @@ def test_keep_and_remove_types() -> None:
         process,
         [
             "--keep-types",
-            "LOC,ORG",
+            "O",
+            "--labels",
+            "BIO",
+            input_path,
+            output_path,
+        ],
+    )
+    assert result.exit_code == 2
+    assert "Cannot specify the outside type O in keep/remove types" in result.output
+
+
+def test_remove_outside_type() -> None:
+    runner = CliRunner()
+    input_path = str(ANNOTATION_DIR / "minimal.bio")
+    output_path = str(Path(TMP_DIR.name) / "out.bio")
+    result = runner.invoke(
+        process,
+        [
             "--remove-types",
-            "MISC",
+            "O",
+            "--labels",
+            "BIO",
+            input_path,
+            output_path,
+        ],
+    )
+    assert result.exit_code == 2
+    assert "Cannot specify the outside type O in keep/remove types" in result.output
+
+
+def test_type_map_missing_file() -> None:
+    runner = CliRunner()
+    input_path = str(ANNOTATION_DIR / "minimal.bio")
+    output_path = str(Path(TMP_DIR.name) / "out.bio")
+    result = runner.invoke(
+        process,
+        [
+            "--type-map",
+            "nonexistent_map.json",
             "--labels",
             "BIO",
             input_path,
             output_path,
         ],
     )
-    # Can't specify both keep and remove
-    assert result.exit_code != 0
+    assert result.exit_code == 2
+    assert "Could not open type map file 'nonexistent_map.json'" in result.output
+
+
+def test_type_map_invalid_json() -> None:
+    runner = CliRunner()
+    map_path = str(TEST_FILES_DIR / "map_invalid_json.json")
+    input_path = str(ANNOTATION_DIR / "minimal.bio")
+    output_path = str(Path(TMP_DIR.name) / "out.bio")
+    result = runner.invoke(
+        process,
+        [
+            "--type-map",
+            map_path,
+            "--labels",
+            "BIO",
+            input_path,
+            output_path,
+        ],
+    )
+    assert result.exit_code == 2
+    assert (
+        "Type map provided in file 'tests/test_files/map_invalid_json.json' is not valid JSON"
+        in result.output
+    )
+
+
+def test_type_map_not_dict() -> None:
+    runner = CliRunner()
+    map_path = str(TEST_FILES_DIR / "map_not_dict.json")
+    input_path = str(ANNOTATION_DIR / "minimal.bio")
+    output_path = str(Path(TMP_DIR.name) / "out.bio")
+    result = runner.invoke(
+        process,
+        [
+            "--type-map",
+            map_path,
+            "--labels",
+            "BIO",
+            input_path,
+            output_path,
+        ],
+    )
+    assert result.exit_code == 2
+    assert (
+        "Type map provided in file 'tests/test_files/map_not_dict.json' is not a dictionary"
+        in result.output
+    )
+
+
+def test_type_map_empty_key() -> None:
+    runner = CliRunner()
+    map_path = str(TEST_FILES_DIR / "map_empty_key.json")
+    input_path = str(ANNOTATION_DIR / "minimal.bio")
+    output_path = str(Path(TMP_DIR.name) / "out.bio")
+    result = runner.invoke(
+        process,
+        [
+            "--type-map",
+            map_path,
+            "--labels",
+            "BIO",
+            input_path,
+            output_path,
+        ],
+    )
+    assert result.exit_code == 2
+    assert (
+        "Key '' in type map 'tests/test_files/map_empty_key.json' is not a non-empty string"
+        in result.output
+    )
+
+
+def test_type_map_outside_key() -> None:
+    runner = CliRunner()
+    map_path = str(TEST_FILES_DIR / "map_outside_key.json")
+    input_path = str(ANNOTATION_DIR / "minimal.bio")
+    output_path = str(Path(TMP_DIR.name) / "out.bio")
+    result = runner.invoke(
+        process,
+        [
+            "--type-map",
+            map_path,
+            "--labels",
+            "BIO",
+            input_path,
+            output_path,
+        ],
+    )
+    assert result.exit_code == 2
+    assert (
+        "Key 'O' in type map 'tests/test_files/map_outside_key.json' is the outside type O"
+        in result.output
+    )
+
+
+def test_type_map_empty_value() -> None:
+    runner = CliRunner()
+    map_path = str(TEST_FILES_DIR / "map_empty_value.json")
+    input_path = str(ANNOTATION_DIR / "minimal.bio")
+    output_path = str(Path(TMP_DIR.name) / "out.bio")
+    result = runner.invoke(
+        process,
+        [
+            "--type-map",
+            map_path,
+            "--labels",
+            "BIO",
+            input_path,
+            output_path,
+        ],
+    )
+    assert result.exit_code == 2
+    assert (
+        "Value '' in type map 'tests/test_files/map_empty_value.json' is not a non-empty string"
+        in result.output
+    )
+
+
+def test_type_map_outside_value() -> None:
+    runner = CliRunner()
+    map_path = str(TEST_FILES_DIR / "map_outside_value.json")
+    input_path = str(ANNOTATION_DIR / "minimal.bio")
+    output_path = str(Path(TMP_DIR.name) / "out.bio")
+    result = runner.invoke(
+        process,
+        [
+            "--type-map",
+            map_path,
+            "--labels",
+            "BIO",
+            input_path,
+            output_path,
+        ],
+    )
+    assert result.exit_code == 2
+    assert (
+        "Value 'O' in type map 'tests/test_files/map_outside_value.json' is the outside type O"
+        in result.output
+    )
diff --git a/tests/test_scoring_click.py b/tests/test_scoring_click.py
index 0ef943a..9a97ab4 100644
--- a/tests/test_scoring_click.py
+++ b/tests/test_scoring_click.py
@@ -26,6 +26,27 @@ def test_score_correct_labels() -> None:
     assert "ORG\t100.00\t100.00\t100.00\t1\t1\t1" in result.output
 
 
+def test_score_no_predictions() -> None:
+    runner = CliRunner()
+    result = runner.invoke(
+        score,
+        [
+            "--labels",
+            "BIO",
+            "--reference",
+            os.path.join("tests", "conll_annotation", "minimal.bio"),
+            "--score-format",
+            "delim",
+            os.path.join("tests", "conll_predictions", "incorrect1_nopredictions.bio"),
+        ],
+    )
+    assert result.exit_code == 0
+    assert "Type\tPrecision\tRecall\tF1\tReference\tPredicted\tCorrect" in result.output
+    assert "ALL\t0.00\t0.00\t0.00\t3\t0\t0" in result.output
+    assert "LOC\t0.00\t0.00\t0.00\t2\t0\t0" in result.output
+    assert "ORG\t0.00\t0.00\t0.00\t1\t0\t0" in result.output
+
+
 def test_score_incorrect_default_format() -> None:
     runner = CliRunner()
     result = runner.invoke(
@@ -150,7 +171,8 @@ def test_score_invalid_sequence_none() -> None:
             ),
         ],
     )
-    assert result.exit_code != 0
+    assert result.exit_code == 1
+    assert "Invalid transition 'O' -> 'I-ORG'" in str(result.exception)
 
 
 def test_score_valid_incorrect_sequence() -> None:
@@ -217,7 +239,10 @@ def test_score_invalid_labels() -> None:
             os.path.join("tests", "conll_predictions", "incorrect1.bio"),
         ],
     )
-    assert result.exit_code != 0
+    assert result.exit_code == 1
+    assert "The above labels are not valid for the chunk encoding BIO." in str(
+        result.exception
+    )
 
 
 def test_score_multiple_files() -> None:
@@ -319,10 +344,27 @@ def test_score_error_counts_multiple_files() -> None:
             "--error-counts",
         ],
     )
-    assert result.exit_code != 0
+    assert result.exit_code == 2
     assert "Cannot use error-counts with multiple files to be scored" in result.output
 
 
+def test_score_full_precision_not_delim() -> None:
+    runner = CliRunner()
+    result = runner.invoke(
+        score,
+        [
+            "--labels",
+            "BIO",
+            "--reference",
+            os.path.join("tests", "conll_annotation", "minimal.bio"),
+            "--full-precision",
+            os.path.join("tests", "conll_predictions", "correct1.bio"),
+        ],
+    )
+    assert result.exit_code == 2
+    assert "Can only use full-precision with score-format delim" in result.output
+
+
 def test_score_error_counts_conlleval_format() -> None:
     # Cannot use error-counts with conlleval format
     runner = CliRunner()
@@ -340,5 +382,5 @@ def test_score_error_counts_conlleval_format() -> None:
             "--error-counts",
         ],
     )
-    assert result.exit_code != 0
+    assert result.exit_code == 2
     assert "Cannot use error-counts with multiple files to be scored" in result.output
diff --git a/tests/test_utils.py b/tests/test_utils.py
index d1bbf7c..f5b6721 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -33,11 +33,11 @@ def test_empty_file() -> None:
 
 def test_differing_whitespace() -> None:
     assert file_fields_match(
-        os.path.join("tests", "test_files", "space_delim.txt"),
+        os.path.join("tests", "test_files", "minimal_space_delim.txt"),
         os.path.join("tests", "conll_annotation", "minimal.bio"),
     )
     assert not file_lines_match(
-        os.path.join("tests", "test_files", "space_delim.txt"),
+        os.path.join("tests", "test_files", "minimal_space_delim.txt"),
         os.path.join("tests", "conll_annotation", "minimal.bio"),
     )
 

From fafc12fe0cc8d01024c3e76f532c3767afac5662 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Thu, 4 Jun 2026 10:41:01 -0400
Subject: [PATCH 26/33] Move all setup to pyproject.toml and add uv
 instructions

---
 README.md           | 12 ++++++----
 pyproject.toml      | 55 ++++++++++++++++++++++++++++++++++++++++++++-
 requirements.txt    | 19 ----------------
 setup.py            | 52 ------------------------------------------
 tests/pre_commit.sh |  2 +-
 5 files changed, 63 insertions(+), 77 deletions(-)
 delete mode 100644 requirements.txt
 delete mode 100755 setup.py

diff --git a/README.md b/README.md
index 6fc280d..43b68ad 100644
--- a/README.md
+++ b/README.md
@@ -595,15 +595,19 @@ The following instructions are for the project maintainers only.
 For development, check out the `dev` branch (latest, but less tested
 than `main`).
 
-To install from a clone of this repository, use:
-`pip install -e .`
-
 ## Setting up an environment for development
 
+### Using uv
+
+1. Create an environment: `uv venv --python 3.10 .venv`
+2. Install seqscore and development dependencies: `uv pip install -e ".[dev]"`
+
+### Using conda
+
 1. Create an environment: `conda create -yn seqscore python=3.10`
 2. Activate the environment: `conda activate seqscore`
 3. Install seqscore: `pip install -e .`
-4. Install development dependencies: `pip install -r requirements.txt`
+4. Install development dependencies: `pip install -e ".[dev]"`
 
 # Contributors
 
diff --git a/pyproject.toml b/pyproject.toml
index 212b567..23866eb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,57 @@
+[build-system]
+requires = ["setuptools>=61"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "seqscore"
+dynamic = ["version"]
+description = "SeqScore: Scoring for named entity recognition and other sequence labeling tasks"
+readme = "README.md"
+license = {text = "MIT"}
+authors = [
+    {name = "Constantine Lignos", email = "lignos@brandeis.edu"},
+]
+requires-python = ">=3.10"
+dependencies = [
+    "attrs>=19.2.0",
+    "click",
+    "tabulate",
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+
+[project.urls]
+Homepage = "https://github.com/bltlab/seqscore"
+
+[project.scripts]
+seqscore = "seqscore.scripts.seqscore:cli"
+
+[project.optional-dependencies]
+dev = [
+    "types-tabulate",
+    "pytest==9.0.3",
+    "pytest-cov>=7.1.0",
+    "mypy==2.1.0",
+    "ruff==0.15.15",
+]
+
+[tool.setuptools.dynamic]
+version = {attr = "seqscore.__version__"}
+
+[tool.setuptools.packages.find]
+include = ["seqscore", "seqscore.*"]
+
+[tool.setuptools.package-data]
+seqscore = ["py.typed"]
+
 [tool.mypy]
 python_version = "3.10"
 strict_optional = false
@@ -6,7 +60,6 @@ disallow_untyped_calls = true
 
 [[tool.mypy.overrides]]
 module = [
-    "setuptools",
     "click.*",
 ]
 ignore_missing_imports = true
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 8ac2e1b..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-# This file only contains dependencies needed for development.
-# setup.py contains the actual package dependencies, and the package
-# should be installed before these requirements.
-
-# Type annotations for tabulate
-types-tabulate
-
-# For testing
-pytest==9.0.3
-pytest-cov>=7.1.0
-
-# For development
-mypy==2.1.0
-ruff==0.15.15
-
-# Documentation build
-# Disabled for now since we don't need them
-# sphinx
-# sphinx-rtd-theme
diff --git a/setup.py b/setup.py
deleted file mode 100755
index fbb7f4a..0000000
--- a/setup.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#! /usr/bin/env python
-
-from os import path
-
-from setuptools import find_packages, setup
-
-from seqscore import __version__
-
-
-def setup_package() -> None:
-    root = path.abspath(path.dirname(__file__))
-    with open(path.join(root, "README.md"), encoding="utf-8") as f:
-        long_description = f.read()
-
-    setup(
-        name="seqscore",
-        version=__version__,
-        packages=find_packages(include=("seqscore", "seqscore.*")),
-        # Package type information
-        package_data={"seqscore": ["py.typed"]},
-        python_requires=">=3.10",
-        license="MIT",
-        description="SeqScore: Scoring for named entity recognition and other sequence labeling tasks",
-        long_description=long_description,
-        install_requires=[
-            "attrs>=19.2.0",
-            "click",
-            "tabulate",
-        ],
-        entry_points="""
-            [console_scripts]
-            seqscore=seqscore.scripts.seqscore:cli
-        """,
-        classifiers=[
-            "Development Status :: 4 - Beta",
-            "License :: OSI Approved :: MIT License",
-            "Programming Language :: Python :: 3.10",
-            "Programming Language :: Python :: 3.11",
-            "Programming Language :: Python :: 3.12",
-            "Programming Language :: Python :: 3.13",
-            "Programming Language :: Python :: 3.14",
-            "Topic :: Scientific/Engineering :: Artificial Intelligence",
-        ],
-        url="https://github.com/bltlab/seqscore",
-        long_description_content_type="text/markdown",
-        author="Constantine Lignos",
-        author_email="lignos@brandeis.edu",
-    )
-
-
-if __name__ == "__main__":
-    setup_package()
diff --git a/tests/pre_commit.sh b/tests/pre_commit.sh
index 411b6f3..3b76d84 100755
--- a/tests/pre_commit.sh
+++ b/tests/pre_commit.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 set -euxo pipefail
 
-files=(seqscore/ tests/ *.py)
+files=(seqscore/ tests/)
 ruff check --fix "${files[@]}"
 ruff check --select I --fix "${files[@]}"  # Organize imports
 ruff format "${files[@]}"

From 5d8456e6dc65f4ef60609a1e802d83808073e30a Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Thu, 4 Jun 2026 10:50:14 -0400
Subject: [PATCH 27/33] Add flowmark for markdown autoformatting

---
 pyproject.toml      | 1 +
 tests/pre_commit.sh | 1 +
 2 files changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 23866eb..a6256ca 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,7 @@ dev = [
     "pytest-cov>=7.1.0",
     "mypy==2.1.0",
     "ruff==0.15.15",
+    "flowmark",
 ]
 
 [tool.setuptools.dynamic]
diff --git a/tests/pre_commit.sh b/tests/pre_commit.sh
index 3b76d84..1967c0a 100755
--- a/tests/pre_commit.sh
+++ b/tests/pre_commit.sh
@@ -1,6 +1,7 @@
 #!/usr/bin/env bash
 set -euxo pipefail
 
+flowmark -i --nobackup *.md
 files=(seqscore/ tests/)
 ruff check --fix "${files[@]}"
 ruff check --select I --fix "${files[@]}"  # Organize imports

From 8c52701c52a2a3945e98323e39d05707d7f37ab2 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Thu, 4 Jun 2026 10:50:29 -0400
Subject: [PATCH 28/33] Autoformat README

---
 README.md           | 212 ++++++++++++++++++++------------------------
 tests/pre_commit.sh |   2 +-
 2 files changed, 95 insertions(+), 119 deletions(-)

diff --git a/README.md b/README.md
index 43b68ad..afb832d 100644
--- a/README.md
+++ b/README.md
@@ -5,24 +5,22 @@
 [![image](https://img.shields.io/pypi/l/seqscore.svg)](https://pypi.python.org/pypi/seqscore)
 [![image](https://img.shields.io/pypi/pyversions/seqscore.svg)](https://pypi.python.org/pypi/seqscore)
 
-SeqScore provides scoring for named entity recognition and other
-chunking tasks evaluated over sequence labels.
+SeqScore provides scoring for named entity recognition and other chunking tasks
+evaluated over sequence labels.
 
-SeqScore is maintained by the BLT Lab at Brandeis University. Please
-open an issue if you find incorrect behavior or features you would like
-to see added. Due to the risk of introducing regressions or incorrect
-scoring behavior, *we generally do not accept pull requests*. Please do not
-open a pull request unless you are asked to do so by a maintainer in an
-issue.
+SeqScore is maintained by the BLT Lab at Brandeis University. Please open an issue if
+you find incorrect behavior or features you would like to see added. Due to the risk of
+introducing regressions or incorrect scoring behavior, *we generally do not accept pull
+requests*. Please do not open a pull request unless you are asked to do so by a
+maintainer in an issue.
 
 ## Installation
 
-To install the latest official release of SeqScore, run: `pip install seqscore`.
-This will install the package and add the command `seqscore` in your Python
-environment.
+To install the latest official release of SeqScore, run: `pip install seqscore`. This
+will install the package and add the command `seqscore` in your Python environment.
 
-SeqScore requires Python 3.10 or higher. It is tested on Python 3.10, 3.11, 3.12,
-3.13, and 3.14.
+SeqScore requires Python 3.10 or higher. It is tested on Python 3.10, 3.11, 3.12, 3.13,
+and 3.14.
 
 ## License
 
@@ -78,7 +76,6 @@ Other papers related to SeqScore include:
 * [Toward More Meaningful Resources for Lower-resourced Languages](https://aclanthology.org/2022.findings-acl.44/)
 * [CoNLL#: Fine-grained Error Analysis and a Corrected Test Set for CoNLL-03 English](https://aclanthology.org/2024.lrec-main.330/)
 
-
 # Usage
 
 ## Overview
@@ -108,10 +105,9 @@ Commands:
 
 ## Scoring
 
-The most common application of SeqScore is scoring CoNLL-format NER
-predictions. Let's assume you have two files, one containing the
-correct labels (annotation) and the other containing the predictions
-(system output).
+The most common application of SeqScore is scoring CoNLL-format NER predictions. Let's
+assume you have two files, one containing the correct labels (annotation) and the other
+containing the predictions (system output).
 
 The correct labels are in the file [samples/reference.bio](samples/reference.bio):
 
@@ -132,7 +128,6 @@ Philadelphia I-LOC
 , O
 Pennsylvania B-LOC
 . O
-
 ```
 
 The predictions are in the file [samples/predicted.bio](samples/predicted.bio):
@@ -154,7 +149,6 @@ Philadelphia B-LOC
 , O
 Pennsylvania B-LOC
 . O
-
 ```
 
 To score the predictions, run:
@@ -171,27 +165,23 @@ To score the predictions, run:
 A few things to note:
 
 * The reference file must be specified with the `--reference` flag.
-* The chunk encoding (BIO, BIOES, etc.) must be specified using the
-  `--labels` flag.
-* Both files need to use the same chunk encoding. If you have
-  files that use different chunk encodings, use the `convert` command.
-* You can get output in different formats using the `--score-format`
-  flag. Using `--score-format delim` will produce tab-delimited
-  output. In the delimited format, you can specify the `--full-precision`
-  flag to output higher numerical precision.
-* In the default (pretty) output format, numbers are rounded "half up"
-  at two decimal places. In other words, 57.124 will round to 57.12,
-  and 57.125 will round to 57.13. This is different than the "half even"
-  rounding used by `conlleval` and other libraries that rely on `printf`
-  behavior for rounding. Half up rounding is used as it is more likely to
-  match the rounding a user would perform if shown three decimal places.
-  If you request `conlleval` output format, the same rounding used by
+* The chunk encoding (BIO, BIOES, etc.) must be specified using the `--labels` flag.
+* Both files need to use the same chunk encoding. If you have files that use different
+  chunk encodings, use the `convert` command.
+* You can get output in different formats using the `--score-format` flag. Using
+  `--score-format delim` will produce tab-delimited output. In the delimited format, you
+  can specify the `--full-precision` flag to output higher numerical precision.
+* In the default (pretty) output format, numbers are rounded "half up" at two decimal
+  places. In other words, 57.124 will round to 57.12, and 57.125 will round to 57.13.
+  This is different than the "half even" rounding used by `conlleval` and other
+  libraries that rely on `printf` behavior for rounding. Half up rounding is used as it
+  is more likely to match the rounding a user would perform if shown three decimal
+  places. If you request `conlleval` output format, the same rounding used by
   `conlleval` will be used.
 
-The above scoring command will work for files that do not have any
-invalid transitions, that is, those that perfectly follow what the
-encoding allows. However, consider this BIO-encoded file,
-[samples/invalid.bio](samples/invalid.bio):
+The above scoring command will work for files that do not have any invalid transitions,
+that is, those that perfectly follow what the encoding allows. However, consider this
+BIO-encoded file, [samples/invalid.bio](samples/invalid.bio):
 
 ```
 This O
@@ -210,11 +200,10 @@ Philadelphia I-LOC
 , O
 Pennsylvania B-LOC
 . O
-
 ```
 
-Note that the token `University` has the label `I-ORG`, but there is
-no preceding `B-ORG`. If we score it as before with
+Note that the token `University` has the label `I-ORG`, but there is no preceding
+`B-ORG`. If we score it as before with
 `seqscore score --labels BIO --reference samples/reference.bio samples/invalid.bio`,
 scoring will fail:
 
@@ -223,10 +212,9 @@ seqscore.encoding.EncodingError: Stopping due to validation errors in invalid.bi
 Invalid transition 'O' -> 'I-ORG' for token 'University' on line 7
 ```
 
-To score output with invalid transitions, we need to specify a repair
-method which can correct them. We can tell SeqScore to use the same
-approach that conlleval uses (which we refer to as "begin" repair in our
-paper):
+To score output with invalid transitions, we need to specify a repair method which can
+correct them. We can tell SeqScore to use the same approach that conlleval uses (which
+we refer to as "begin" repair in our paper):
 `seqscore score --labels BIO --repair-method conlleval --reference samples/reference.bio samples/invalid.bio`:
 
 ```
@@ -242,8 +230,8 @@ New: ('B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'O')
 | ORG    |      100.00 |   100.00 | 100.00 |           1 |           1 |         1 |
 ```
 
-You can use the `-q` flag to suppress the logging of all of the repairs
-applied. For example, running the command
+You can use the `-q` flag to suppress the logging of all of the repairs applied. For
+example, running the command
 `seqscore score -q --labels BIO --repair-method conlleval --reference samples/reference.bio samples/invalid.bio`
 will hide the repairs:
 
@@ -255,13 +243,12 @@ will hide the repairs:
 | ORG    |      100.00 |   100.00 | 100.00 |           1 |           1 |         1 |
 ```
 
-You may want to also explore the `discard` repair, which can
-produce higher scores for output from models without a CRF/constrained
-decoding as they are more likely to produce invalid transitions.
+You may want to also explore the `discard` repair, which can produce higher scores for
+output from models without a CRF/constrained decoding as they are more likely to produce
+invalid transitions.
 
-SeqScore can also display all errors (false positives and false negatives)
-encountered in scoring using the `--error-counts` flag. For example, running the
-command
+SeqScore can also display all errors (false positives and false negatives) encountered
+in scoring using the `--error-counts` flag. For example, running the command
 `seqscore score --labels BIO --error-counts --reference samples/reference.bio samples/predicted.bio`
 will produce the following output:
 
@@ -273,10 +260,10 @@ will produce the following output:
 |       1 | FN      | LOC    | West Philadelphia |
 ```
 
-The output shows that the system produced two false positives and missed one
-mention in the reference (false negative). The most frequent errors appear at
-the top. The `--error-counts` flag can be combined with `--score-format delim`
-to write a delimited table that can be read as a spreadsheet.
+The output shows that the system produced two false positives and missed one mention in
+the reference (false negative). The most frequent errors appear at the top. The
+`--error-counts` flag can be combined with `--score-format delim` to write a delimited
+table that can be read as a spreadsheet.
 
 ## Validation
 
@@ -290,7 +277,7 @@ No errors found in 0 tokens, 2 sequences, and 1 documents in reference.bio
 For the example of the [samples/invalid.bio](samples/invalid.bio), we can run
 `seqscore validate --labels BIO samples/invalid.bio`:
 
- ```
+```
 Encountered 1 errors in 1 tokens, 2 sequences, and 1 documents in invalid.bio
 Invalid transition 'O' -> 'I-ORG' for token 'University' on line 7
 ```
@@ -299,8 +286,8 @@ Invalid transition 'O' -> 'I-ORG' for token 'University' on line 7
 
 We can convert a file from one chunk encoding to another. For example,
 `seqscore convert --input-labels BIO --output-labels BIOES samples/reference.bio samples/reference.bioes`
-will read [samples/reference.bio](samples/reference.bio) in BIO
-encoding and write the BIOES-converted file to [samples/reference.bioes](samples/reference.bioes):
+will read [samples/reference.bio](samples/reference.bio) in BIO encoding and write the
+BIOES-converted file to [samples/reference.bioes](samples/reference.bioes):
 
 ```
 This O
@@ -319,7 +306,6 @@ Philadelphia E-LOC
 , O
 Pennsylvania S-LOC
 . O
-
 ```
 
 We can get a list of available chunk encodings by running `seqscore convert --help`:
@@ -341,12 +327,11 @@ Options:
 
 ## Repair
 
-We can also apply repair methods to a file, creating an output file
-with only valid transitions. For example, we can run
+We can also apply repair methods to a file, creating an output file with only valid
+transitions. For example, we can run
 `seqscore repair --labels BIO --repair-method conlleval samples/invalid.bio samples/invalid_repair_conlleval.bio`,
 which will apply the conlleval repair method to the
-[samples/invalid.bio](samples/invalid.bio) and write the repaired
-labels to
+[samples/invalid.bio](samples/invalid.bio) and write the repaired labels to
 [samples/invalid_repair_conlleval.bio](samples/invalid_repair_conlleval.bio):
 
 ```
@@ -366,12 +351,12 @@ Philadelphia I-LOC
 , O
 Pennsylvania B-LOC
 . O
-
 ```
 
 If we want to apply the discard repair method, we can run
 `seqscore repair --labels BIO --repair-method discard samples/invalid.bio samples/invalid_repair_discard.bio`
-and the output will be written to [samples/invalid_repair_discard.bio](samples/invalid_repair_discard.bio):
+and the output will be written to
+[samples/invalid_repair_discard.bio](samples/invalid_repair_discard.bio):
 
 ```
 This O
@@ -390,18 +375,16 @@ Philadelphia I-LOC
 , O
 Pennsylvania B-LOC
 . O
-
 ```
 
-Repairing the file before performing other operations is available in the
-`count` and `summarize` subcommands.
+Repairing the file before performing other operations is available in the `count` and
+`summarize` subcommands.
 
 ## Summarize
 
-The `summarize` subcommand can produce counts of the types of chunks
-in the input file. For example, if we run
-`seqscore summarize --labels BIO samples/reference.bio`
-we get the following output:
+The `summarize` subcommand can produce counts of the types of chunks in the input file.
+For example, if we run `seqscore summarize --labels BIO samples/reference.bio` we get
+the following output:
 
 ```
 File 'samples/reference.bio' contains 1 document(s) with the following mentions:
@@ -411,14 +394,13 @@ File 'samples/reference.bio' contains 1 document(s) with the following mentions:
 | ORG           |       1 |
 ```
 
-If the quiet (`-q`) flag is provided, the first line giving the filename
-and document count is not printed.
+If the quiet (`-q`) flag is provided, the first line giving the filename and document
+count is not printed.
 
 ## Count
 
-The `count` subcommand can produce the counts of chunks in the input
-file. Unlike `summarize`, it counts chunk-type pairs, not just types.
-For example, if we run
+The `count` subcommand can produce the counts of chunks in the input file. Unlike
+`summarize`, it counts chunk-type pairs, not just types. For example, if we run
 `seqscore count --labels BIO samples/reference.bio --output-file counts.csv`,
 tab-delimited counts would be written to `counts.csv` as follows:
 
@@ -433,18 +415,18 @@ standard output. However, you may encounter Unicode issues if your terminal is n
 configured properly.
 
 You can use the `--output-delim` argument to change the delimiter used in the counts.
-The default delimiter of tab is strongly recommended, as there is no escaping or
-quoting of the names in the output.
+The default delimiter of tab is strongly recommended, as there is no escaping or quoting
+of the names in the output.
 
 ## Process
 
-The `process` subcommand can remove entity types from a file or map them to
-other types. Removing types can be performed by specifying one of `--keep-types`
-or `--remove-types`.
+The `process` subcommand can remove entity types from a file or map them to other types.
+Removing types can be performed by specifying one of `--keep-types` or `--remove-types`.
 
 For example, if we wanted to keep only the ORG type, we could run:
 `seqscore process --labels BIO --keep-types ORG samples/reference.bio samples/keep_ORG.bio`,
-and the following output will be written to [samples/keep_ORG.bio](samples/keep_ORG.bio):
+and the following output will be written to
+[samples/keep_ORG.bio](samples/keep_ORG.bio):
 
 ```
 This O
@@ -468,11 +450,12 @@ Pennsylvania O
 You can also keep multiple types by specifying a comma-separated list of types:
 `--keep-types LOC,ORG`.
 
-Instead of specifying which types to keep, we can also specify which types to
-remove using `--remove-types`. For example, if we wanted to remove only the
-ORG type, we could run:
+Instead of specifying which types to keep, we can also specify which types to remove
+using `--remove-types`. For example, if we wanted to remove only the ORG type, we could
+run:
 `seqscore process --labels BIO --remove-types ORG samples/reference.bio samples/remove_ORG.bio`,
-and the following output will be written to [samples/remove_ORG.bio](samples/remove_ORG.bio):
+and the following output will be written to
+[samples/remove_ORG.bio](samples/remove_ORG.bio):
 
 ```
 This O
@@ -496,10 +479,9 @@ Pennsylvania B-LOC
 As with keep, you can specify multiple tags to remove, for example
 `--remove-types LOC,ORG`.
 
-The `--type-map` argument allows you to specify a JSON file that specifies a
-mapping between types and other types. Suppose you want to collapse several
-types into a more generic NAME type. In that case, the type map would be
-specified as follows:
+The `--type-map` argument allows you to specify a JSON file that specifies a mapping
+between types and other types. Suppose you want to collapse several types into a more
+generic NAME type. In that case, the type map would be specified as follows:
 
 ```
 {
@@ -507,9 +489,9 @@ specified as follows:
 }
 ```
 
-The type map must be a JSON dictionary. The keys are the types to be mapped to,
-while the value for each key is a list of types to be mapped from. Note that
-the value must always be a list, even if it would only contain one element.
+The type map must be a JSON dictionary. The keys are the types to be mapped to, while
+the value for each key is a list of types to be mapped from. Note that the value must
+always be a list, even if it would only contain one element.
 
 We can apply the above type map to a file using the following command:
 `seqscore process --labels BIO --type-map samples/type_map_NAME.json samples/reference.bio samples/all_NAME.bio`,
@@ -534,9 +516,8 @@ Pennsylvania B-NAME
 . O
 ```
 
-When `--type-map` is specified at the same time as `--keep-types` or
-`--remove-types`, the type mapping is applied **before** the keep/remove
-filtering is applied.
+When `--type-map` is specified at the same time as `--keep-types` or `--remove-types`,
+the type mapping is applied **before** the keep/remove filtering is applied.
 
 ## Text extraction
 
@@ -555,14 +536,12 @@ University of Pennsylvania is in West Philadelphia , Pennsylvania .
 
 Each sentence is written on one line with space-delimited tokens.
 
-
 # FAQ
 
 ## Why can't I score output files that are in the format `conlleval` expects?
 
-SeqScore intentionally does not support the "merged"
-format used by `conlleval` where each line contains a token, correct
-tag, and predicted tag:
+SeqScore intentionally does not support the "merged" format used by `conlleval` where
+each line contains a token, correct tag, and predicted tag:
 
 ```
 University B-ORG B-ORG
@@ -577,23 +556,21 @@ Pennsylvania B-LOC B-LOC
 . O O
 ```
 
-We do not support this format because we have found that creating
-predictions in this format is a common source of errors in scoring
-pipelines.
+We do not support this format because we have found that creating predictions in this
+format is a common source of errors in scoring pipelines.
 
 ## When do I need to specify the `--labels` argument?
 
-The `--labels` argument must be specified for commands where knowing the label
-encoding is essential to getting correct answers. These commands are `validate`,
-`repair`, and `score`. For all other commands, `--labels BIO` is assumed by
-default but can be overridden.
+The `--labels` argument must be specified for commands where knowing the label encoding
+is essential to getting correct answers. These commands are `validate`, `repair`, and
+`score`. For all other commands, `--labels BIO` is assumed by default but can be
+overridden.
 
 # Development
 
 The following instructions are for the project maintainers only.
 
-For development, check out the `dev` branch (latest, but less tested
-than `main`).
+For development, check out the `dev` branch (latest, but less tested than `main`).
 
 ## Setting up an environment for development
 
@@ -611,8 +588,7 @@ than `main`).
 
 # Contributors
 
-SeqScore was developed by the BLT Lab at Brandeis University under the
-direction of PI and lead developer Constantine Lignos. Chester
-Palen-Michel, Nolan Holley, and Claire Wang contributed to its
-development.  Gordon Dou, Maya Kruse, and Andrew Rueda gave feedback
-on its features and assisted in README writing.
+SeqScore was developed by the BLT Lab at Brandeis University under the direction of PI
+and lead developer Constantine Lignos. Chester Palen-Michel, Nolan Holley, and Claire
+Wang contributed to its development. Gordon Dou, Maya Kruse, and Andrew Rueda gave
+feedback on its features and assisted in README writing.
diff --git a/tests/pre_commit.sh b/tests/pre_commit.sh
index 1967c0a..abd3ec1 100755
--- a/tests/pre_commit.sh
+++ b/tests/pre_commit.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 set -euxo pipefail
 
-flowmark -i --nobackup *.md
+flowmark -i --nobackup ./*.md
 files=(seqscore/ tests/)
 ruff check --fix "${files[@]}"
 ruff check --select I --fix "${files[@]}"  # Organize imports

From a63548be90ee414925821514c4da97f1316feabd Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Thu, 4 Jun 2026 10:58:06 -0400
Subject: [PATCH 29/33] Enable build on dev* branches

---
 .github/workflows/main.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index ebf8a1e..d380885 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -4,11 +4,11 @@ on:
   push:
     branches:
     - main
-    - dev
+    - dev*
   pull_request:
     branches:
     - main
-    - dev
+    - dev*
 
 jobs:
   build:

From f79203ad2d1bd30ee222fc76dcfcf9c93fe6a389 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Thu, 4 Jun 2026 11:02:03 -0400
Subject: [PATCH 30/33] Update build to use pyproject.toml

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index d380885..0bc5ba2 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -43,7 +43,7 @@ jobs:
 
       - name: Install quality check dependencies
         run: |
-          pip install -r requirements.txt
+          pip install ".[dev]"
 
       - name: Run quality checks
         run: |

From 5de23c2c6e2015247e85d43859bf796fb4b7ec45 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Thu, 4 Jun 2026 11:06:10 -0400
Subject: [PATCH 31/33] Pin version of pytest-cov

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index a6256ca..94f89c1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,7 +38,7 @@ seqscore = "seqscore.scripts.seqscore:cli"
 dev = [
     "types-tabulate",
     "pytest==9.0.3",
-    "pytest-cov>=7.1.0",
+    "pytest-cov==7.1.0",
     "mypy==2.1.0",
     "ruff==0.15.15",
     "flowmark",

From d540196008cb84a01e0e0bf527506d82634cf5e6 Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Thu, 4 Jun 2026 11:08:44 -0400
Subject: [PATCH 32/33] Update check.sh for removal of setup.py

---
 tests/check.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/check.sh b/tests/check.sh
index 1f05a83..7fb6ba5 100755
--- a/tests/check.sh
+++ b/tests/check.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 set -euxo pipefail
 
-files=(seqscore/ tests/ setup.py)
+files=(seqscore/ tests/)
 ruff check "${files[@]}"
 mypy "${files[@]}"

From 598f066594dbe0f551c1e98e3ea87dfac1625a9e Mon Sep 17 00:00:00 2001
From: Constantine Lignos <lignos@brandeis.edu>
Date: Thu, 4 Jun 2026 11:22:49 -0400
Subject: [PATCH 33/33] Add release script

---
 README.md          | 10 ++++++++
 pyproject.toml     |  2 ++
 scripts/release.sh | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 tests/check.sh     |  1 +
 4 files changed, 70 insertions(+)
 create mode 100755 scripts/release.sh

diff --git a/README.md b/README.md
index afb832d..9b1320c 100644
--- a/README.md
+++ b/README.md
@@ -586,6 +586,16 @@ For development, check out the `dev` branch (latest, but less tested than `main`
 3. Install seqscore: `pip install -e .`
 4. Install development dependencies: `pip install -e ".[dev]"`
 
+## Release
+
+The release script is located at `scripts/release.sh` and can only be used by project
+maintainers. To make a release:
+
+1. Make sure `__version__` is up to date in `seqscore/__init__.py`.
+2. Make sure you are on the main branch with no uncommitted changes.
+3. Run `scripts/release.sh`. If anything goes wrong between tagging and releasing, you
+   will have to delete the tag both locally and on GitHub and try again.
+
 # Contributors
 
 SeqScore was developed by the BLT Lab at Brandeis University under the direction of PI
diff --git a/pyproject.toml b/pyproject.toml
index 94f89c1..38cb8e9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,6 +42,8 @@ dev = [
     "mypy==2.1.0",
     "ruff==0.15.15",
     "flowmark",
+    "build",
+    "twine",
 ]
 
 [tool.setuptools.dynamic]
diff --git a/scripts/release.sh b/scripts/release.sh
new file mode 100755
index 0000000..13fa608
--- /dev/null
+++ b/scripts/release.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+# Builds the package, tags and pushes the release, then uploads to PyPI.
+# Should only be run by project maintainers, from the repository root.
+set -euo pipefail
+
+VENV=".venv/bin"
+
+# Run pre-commit checks
+bash tests/check.sh
+
+# Must be on main with a clean working tree
+current_branch=$(git rev-parse --abbrev-ref HEAD)
+if [[ "$current_branch" != "main" ]]; then
+    echo "Error: must be on main branch (currently on '$current_branch')"
+    exit 1
+fi
+
+if ! git diff --quiet || ! git diff --cached --quiet; then
+    echo "Error: working tree is not clean"
+    exit 1
+fi
+
+# Read version from the package as seen by the virtual environment's Python
+version=$("$VENV/python" -c "import seqscore; print(seqscore.__version__)")
+tag="v$version"
+
+# Abort if tag already exists
+if git rev-parse "$tag" >/dev/null 2>&1; then
+    echo "Error: tag $tag already exists. Update __version__ in seqscore/__init__.py."
+    exit 1
+fi
+
+echo "Releasing $tag"
+
+# Build into a fresh dist/ so only this release's artifacts get uploaded
+rm -rf dist/
+"$VENV/python" -m build
+
+# Tag and push
+git tag "$tag"
+git push origin "$tag"
+
+# Prompt to verify tag before uploading
+echo ""
+echo "Tag $tag pushed. Check the release on GitHub before uploading to PyPI:"
+echo "  https://github.com/bltlab/seqscore/releases/tag/$tag"
+echo ""
+read -r -p "Upload to PyPI? [y/N] " confirm
+if [[ "$confirm" != [yY] ]]; then  # glob match; ${confirm,,} needs bash 4+
+    echo "Aborted. Delete tag $tag locally and on GitHub before re-running this script."
+    exit 1
+fi
+
+# Upload to PyPI
+"$VENV/twine" upload dist/*
+
+echo "Done: $tag released and pushed"
diff --git a/tests/check.sh b/tests/check.sh
index 7fb6ba5..e544ad2 100755
--- a/tests/check.sh
+++ b/tests/check.sh
@@ -1,6 +1,7 @@
 #!/usr/bin/env bash
 set -euxo pipefail
 
+flowmark --check ./*.md
 files=(seqscore/ tests/)
 ruff check "${files[@]}"
 mypy "${files[@]}"