From 12cd6f3616b65e152e12e7cd05c9102e8637c323 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Sat, 13 Dec 2025 13:48:00 -0500 Subject: [PATCH 01/33] Test on Python 3.14 and move actions to Ubuntu 24.04 --- .github/workflows/main.yml | 4 ++-- README.md | 2 +- setup.py | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index fb51447..1170954 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -12,12 +12,12 @@ on: jobs: build: - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] steps: - uses: actions/checkout@v4 diff --git a/README.md b/README.md index 8b560dc..e894734 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ This will install the package and add the command `seqscore` in your Python environment. SeqScore requires Python 3.9 or higher. It is tested on Python 3.9, -3.10, 3.11, 3.12, and 3.13. +3.10, 3.11, 3.12, 3.13, and 3.14. ## License diff --git a/setup.py b/setup.py index 0c227b3..90b2082 100755 --- a/setup.py +++ b/setup.py @@ -39,6 +39,7 @@ def setup_package() -> None: "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], url="https://github.com/bltlab/seqscore", From efdffada7cdfe6f3dce4a8e24f35db6b01102e0a Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Sun, 14 Dec 2025 08:36:16 -0500 Subject: [PATCH 02/33] Increment version to 0.8.0 --- seqscore/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqscore/__init__.py b/seqscore/__init__.py index 49e0fc1..777f190 100644 --- a/seqscore/__init__.py +++ b/seqscore/__init__.py @@ -1 +1 @@ -__version__ = "0.7.0" +__version__ = "0.8.0" From fb0f7453b8ab0f5681fcc238e5a8525672a3a84b Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Sun, 14 Dec 2025 08:39:45 -0500 Subject: [PATCH 03/33] Update mypy and ruff to latest versions --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0c54998..e976d18 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,8 +10,8 @@ pytest==8.3.5 pytest-cov==5.0.0 # For development -mypy==1.14.1 -ruff==0.9.10 +mypy==1.19.0 +ruff==0.14.9 # Documentation build # Disabled for now since we don't need them From 37d031b7d2dfb3b41ad7ec3ff18e48d04d766895 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Sun, 14 Dec 2025 12:12:31 -0500 Subject: [PATCH 04/33] Add more scoring tests and rename error counting flag --- seqscore/conll.py | 2 +- seqscore/scoring.py | 19 ++++++--- tests/test_scoring.py | 96 +++++++++++++++++++++++++++++++++---------- 3 files changed, 89 insertions(+), 28 deletions(-) diff --git a/seqscore/conll.py b/seqscore/conll.py index d60bb8b..05ad864 100644 --- a/seqscore/conll.py +++ b/seqscore/conll.py @@ -514,7 +514,7 @@ def score_conll_files( ) class_scores, acc_scores = compute_scores( - pred_docs, ref_docs, count_fp_fn=error_counts + pred_docs, ref_docs, count_fp_fn_examples=error_counts ) all_class_scores.append(class_scores) all_acc_scores.append(class_scores) diff --git a/seqscore/scoring.py b/seqscore/scoring.py index 1622613..c0c8079 100644 --- a/seqscore/scoring.py +++ b/seqscore/scoring.py @@ -133,7 +133,7 @@ def compute_scores( pred_docs: Sequence[Sequence[LabeledSequence]], ref_docs: Sequence[Sequence[LabeledSequence]], *, - count_fp_fn: bool = False, + count_fp_fn_examples: bool = False, ) -> tuple[ClassificationScore, AccuracyScore]: accuracy = AccuracyScore() classification = ClassificationScore() @@ -174,7 +174,7 @@ def compute_scores( ref_sequence.mentions, classification, tokens=ref_sequence.tokens, - count_fp_fn=count_fp_fn, + count_fp_fn_examples=count_fp_fn_examples, ) return classification, accuracy @@ -205,13 +205,20 @@ def score_sequence_mentions( score: ClassificationScore, *, tokens: Optional[Sequence[str]] = (), - count_fp_fn: bool = False, + count_fp_fn_examples: bool = False, ) -> None: """Update a ClassificationScore for a single sequence's mentions. Since mentions are defined per-sequence, the behavior is not defined - if you provide mentions corresponding to multiple sequences. + if you provide mentions corresponding to multiple sequences. Tokens + must be provided if you want false positives and negative examples + to be counted. """ + if count_fp_fn_examples and not tokens: + raise ValueError( + "Tokens must be provided to count false positive/negative examples" + ) + # Compute span accuracy pred_mentions_set = set(pred_mentions) ref_mentions_set = set(ref_mentions) @@ -226,7 +233,7 @@ def score_sequence_mentions( # False positive score.false_pos += 1 score.type_scores[pred.type].false_pos += 1 - if count_fp_fn: + if count_fp_fn_examples: error_tokens = tokens[pred.span.start : pred.span.end] score.count_false_positive(error_tokens, pred.type) @@ -235,7 +242,7 @@ def score_sequence_mentions( if ref not in pred_mentions_set: score.false_neg += 1 score.type_scores[ref.type].false_neg += 1 - if count_fp_fn: + if count_fp_fn_examples: error_tokens = tokens[ref.span.start : ref.span.end] score.count_false_negative(error_tokens, ref.type) diff --git a/tests/test_scoring.py b/tests/test_scoring.py index ebb8881..a74558d 100644 --- a/tests/test_scoring.py +++ b/tests/test_scoring.py @@ -1,3 +1,4 @@ +from collections import Counter from decimal import Decimal import pytest @@ -8,6 +9,7 @@ AccuracyScore, ClassificationScore, TokenCountError, + TokensWithType, compute_scores, convert_score, score_label_sequences, @@ -45,7 +47,7 @@ def test_score_sentence_labels_invalid() -> None: score_sequence_label_accuracy(pred_labels, ref_labels, AccuracyScore()) -def test_score_sentence_mentions_correct() -> None: +def test_score_sequence_mentions_correct() -> None: ref_mentions = [Mention(Span(0, 2), "PER"), Mention(Span(4, 5), "ORG")] pred_mentions = [Mention(Span(0, 2), "PER"), Mention(Span(4, 5), "ORG")] score = ClassificationScore() @@ -63,8 +65,14 @@ def test_score_sentence_mentions_correct() -> None: assert score.recall == 1.0 assert score.f1 == 1.0 + # Test that tokens are required for counting FP/FN + with pytest.raises(ValueError): + score_sequence_mentions( + pred_mentions, ref_mentions, score, count_fp_fn_examples=True + ) + -def test_score_sentence_mentions_incorrect1() -> None: +def test_score_sequence_mentions_incorrect1() -> None: ref_mentions = [ Mention(Span(0, 2), "LOC"), Mention(Span(4, 5), "PER"), @@ -100,6 +108,28 @@ def test_score_sentence_mentions_incorrect1() -> None: 2 * (score.precision * score.recall) / (score.precision + score.recall) ) + # Run again and check counted fp/fn examples. We do this in a second pass so + # we can cover both True/False cases for count_fp_fn_examples. + score2 = ClassificationScore() + tokens = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"] + score_sequence_mentions( + pred_mentions, ref_mentions, score2, count_fp_fn_examples=True, tokens=tokens + ) + expected_false_pos = Counter( + [ + TokensWithType(("a", "b"), "ORG"), + TokensWithType(("g",), "SPURIOUS"), + ] + ) + expected_false_neg = Counter( + [ + TokensWithType(("a", "b"), "LOC"), + TokensWithType(("h",), "MISC"), + ] + ) + assert score2.false_pos_examples == expected_false_pos + assert score2.false_neg_examples == expected_false_neg + def test_score_label_sequences_correct() -> None: ref_labels = [["O", "B-ORG", "I-ORG", "O"], ["B-PER", "I-PER"]] @@ -192,60 +222,84 @@ def test_accuracy_score_empty() -> None: assert score.accuracy == 0.0 +def test_compute_scores() -> None: + ref_labels = ("O", "B-ORG", "I-ORG", "O", "B-LOC") + ref_mentions = ( + Mention(Span(1, 3), "ORG"), + Mention(Span(4, 5), "LOC"), + ) + pred_labels = ("O", "B-ORG", "I-ORG", "O", "B-ORG") + pred_mentions = ( + Mention(Span(1, 3), "ORG"), + Mention(Span(4, 5), "ORG"), + ) + tokens = ("a", "b", "c", "d", "e") + ref_sequence = LabeledSequence(tokens, ref_labels, ref_mentions) + pred_sequence = LabeledSequence(tokens, pred_labels, pred_mentions) + class_score, acc_score = compute_scores([[pred_sequence]], [[ref_sequence]]) + assert acc_score.accuracy == 4 / 5 + print(class_score) + assert class_score.true_pos == 1 + assert class_score.false_pos == 1 + assert class_score.false_neg == 1 + + def test_token_count_error() -> None: - ref_labels = ["O", "B-ORG", "I-ORG", "O"] - pred_labels = ["O", "B-ORG", "I-ORG", "O", "O"] + ref_labels = ("O", "B-ORG", "I-ORG", "O") + pred_labels = ("O", "B-ORG", "I-ORG", "O", "O") ref_sequence = LabeledSequence( - ["a", "b", "c", "d"], ref_labels, provenance=SequenceProvenance(0, "test") + ("a", "b", "c", "d"), ref_labels, provenance=SequenceProvenance(0, "test") ) pred_sequence = LabeledSequence( - ["a", "b", "c", "d", "e"], pred_labels, provenance=SequenceProvenance(0, "test") + ("a", "b", "c", "d", "e"), pred_labels, provenance=SequenceProvenance(0, "test") ) with pytest.raises(TokenCountError): compute_scores([[pred_sequence]], [[ref_sequence]]) -def test_provenance_none_raises_error() -> None: - labels = ["O", "B-ORG"] - sequence = LabeledSequence(["a", "b"], labels, provenance=None) +def test_token_count_error_provenance_none_raises_error() -> None: + labels = ("O", "B-ORG") + sequence = LabeledSequence(("a", "b"), labels, provenance=None) with pytest.raises(ValueError): TokenCountError.from_predicted_sequence(2, sequence) def test_differing_num_docs() -> None: - ref_labels = ["O", "B-ORG"] - pred_labels = ["O", "B-LOC"] + ref_labels = ("O", "B-ORG") + pred_labels = ("O", "B-LOC") + tokens = ("a", "b") ref_sequence = LabeledSequence( - ["a", "b"], ref_labels, provenance=SequenceProvenance(0, "test") + tokens, ref_labels, provenance=SequenceProvenance(0, "test") ) pred_sequence = LabeledSequence( - ["a", "b"], pred_labels, provenance=SequenceProvenance(0, "test") + tokens, pred_labels, provenance=SequenceProvenance(0, "test") ) with pytest.raises(ValueError): compute_scores([[pred_sequence]], [[ref_sequence], [ref_sequence]]) def test_differing_doc_length() -> None: - ref_labels = ["O", "B-ORG"] - pred_labels = ["O", "B-LOC"] + ref_labels = ("O", "B-ORG") + pred_labels = ("O", "B-LOC") + tokens = ("a", "b") ref_sequence = LabeledSequence( - ["a", "b"], ref_labels, provenance=SequenceProvenance(0, "test") + tokens, ref_labels, provenance=SequenceProvenance(0, "test") ) pred_sequence = LabeledSequence( - ["a", "b"], pred_labels, provenance=SequenceProvenance(0, "test") + tokens, pred_labels, provenance=SequenceProvenance(0, "test") ) with pytest.raises(ValueError): compute_scores([[pred_sequence]], [[ref_sequence, ref_sequence]]) def test_differing_pred_and_ref_tokens() -> None: - ref_labels = ["O", "B-ORG"] - pred_labels = ["O", "B-LOC"] + ref_labels = ("O", "B-ORG") + pred_labels = ("O", "B-LOC") ref_sequence = LabeledSequence( - ["a", "b"], ref_labels, provenance=SequenceProvenance(0, "test") + ("a", "b"), ref_labels, provenance=SequenceProvenance(0, "test") ) pred_sequence = LabeledSequence( - ["a", "c"], pred_labels, provenance=SequenceProvenance(0, "test") + ("a", "c"), pred_labels, provenance=SequenceProvenance(0, "test") ) with pytest.raises(ValueError): compute_scores([[pred_sequence]], [[ref_sequence]]) From 9328926805531119db2b605bf86a895b0ccfbf7a Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Tue, 16 Dec 2025 05:02:17 -0500 Subject: [PATCH 05/33] Add total line to count subcommand --- seqscore/scripts/seqscore.py | 10 +++++---- tests/test_summarize_click.py | 40 +++++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py index 859ee24..1fcc01e 100644 --- a/seqscore/scripts/seqscore.py +++ b/seqscore/scripts/seqscore.py @@ -2,10 +2,10 @@ import sys from collections import Counter from contextlib import nullcontext -from typing import Callable, Optional +from typing import Callable, Optional, Union import click -from tabulate import tabulate +from tabulate import SEPARATING_LINE, tabulate import seqscore from seqscore.conll import ( @@ -389,8 +389,10 @@ def summarize( print(f"Total {total_documents} document(s) and {total_sentences} sentences") header = ["Entity Type", "Count"] - rows = sorted(type_counts.items()) - print(tabulate(rows, header, tablefmt="github", floatfmt="6.2f")) + rows: list[Union[tuple[str, int], str]] = sorted(type_counts.items()) + rows.append(SEPARATING_LINE) + rows.append(("TOTAL", sum(type_counts.values()))) + print(tabulate(rows, header, intfmt=",")) @cli.command(help="score a file and report performance or an error count table") diff --git a/tests/test_summarize_click.py b/tests/test_summarize_click.py index f957e59..4088093 100644 --- a/tests/test_summarize_click.py +++ b/tests/test_summarize_click.py @@ -19,10 +19,12 @@ def test_summarize_bio_onedoc() -> None: assert ( result.output == """File 'tests/conll_annotation/minimal.bio' contains 1 document(s) and 2 sentences -| Entity Type | Count | -|---------------|---------| -| LOC | 2 | -| ORG | 1 | +Entity Type Count +------------- ------- +LOC 2 +ORG 1 +------------- ------- +TOTAL 3 """ ) @@ -41,10 +43,12 @@ def test_summarize_bio_onedoc_quiet() -> None: assert result.exit_code == 0 assert ( result.output - == """| Entity Type | Count | -|---------------|---------| -| LOC | 2 | -| ORG | 1 | + == """Entity Type Count +------------- ------- +LOC 2 +ORG 1 +------------- ------- +TOTAL 3 """ ) @@ -63,10 +67,12 @@ def test_summarize_iob_twodoc() -> None: assert ( result.output == """File 'tests/conll_annotation/minimal_fields.iob' contains 2 document(s) and 2 sentences -| Entity Type | Count | -|---------------|---------| -| LOC | 2 | -| ORG | 1 | +Entity Type Count +------------- ------- +LOC 2 +ORG 1 +------------- ------- +TOTAL 3 """ ) @@ -88,9 +94,11 @@ def test_summarize_bio_twofiles() -> None: == """File 'tests/conll_annotation/minimal.bio' contains 1 document(s) and 2 sentences File 'tests/conll_annotation/minimal2.bio' contains 1 document(s) and 2 sentences Total 2 document(s) and 4 sentences -| Entity Type | Count | -|---------------|---------| -| LOC | 5 | -| ORG | 2 | +Entity Type Count +------------- ------- +LOC 5 +ORG 2 +------------- ------- +TOTAL 7 """ ) From 9b85b43b8f520ed07d7a1713b423fa4795d4b593 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Tue, 16 Dec 2025 05:02:40 -0500 Subject: [PATCH 06/33] Change default output delimiter for CoNLL files to tab --- seqscore/scripts/seqscore.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py index 1fcc01e..4726a8d 100644 --- a/seqscore/scripts/seqscore.py +++ b/seqscore/scripts/seqscore.py @@ -98,6 +98,14 @@ def _labels_option_default_bio() -> Callable: ) +def _output_delim_option() -> Callable: + return click.option( + "--output-delim", + default="\t", + help="the delimiter to be used for output (has no effect on input) [default: tab]", + ) + + def _quiet_option() -> Callable: return click.option( "--quiet", @@ -151,7 +159,7 @@ def validate( @click.argument("output_file") @_repair_required_option() @_labels_option() -@click.option("--output-delim", default=" ", help="[default: space]") +@_output_delim_option() @_quiet_option() def repair( file: str, @@ -165,6 +173,7 @@ def repair( parse_comment_lines: bool, quiet: bool, ) -> None: + output_delim = _normalize_tab(output_delim) if repair_method == REPAIR_NONE: raise ValueError(f"Cannot repair with repair strategy {repr(repair_method)}") @@ -184,7 +193,7 @@ def repair( @cli.command(help="convert between mention encodings") @_single_input_file_arguments @click.argument("output_file") -@click.option("--output-delim", default=" ", help="[default: space]") +@_output_delim_option() @click.option("--input-labels", required=True, type=click.Choice(SUPPORTED_ENCODINGS)) @click.option("--output-labels", required=True, type=click.Choice(SUPPORTED_ENCODINGS)) def convert( @@ -198,6 +207,7 @@ def convert( ignore_document_boundaries: bool, parse_comment_lines: bool, ) -> None: + output_delim = _normalize_tab(output_delim) if input_labels == output_labels: raise ValueError("Conversion requires different input and output labels") @@ -233,7 +243,7 @@ def convert( type=click.Path(dir_okay=False), help="a JSON file containing types to be modified, in the format of a dict with keys as the target type and values as the source type [example file: {'MISC': ['WorkOfArt', 'Event']}]", ) -@click.option("--output-delim", default=" ", help="[default: space]") +@_output_delim_option() def process( file: str, output_file: str, @@ -247,6 +257,7 @@ def process( ignore_document_boundaries: bool, parse_comment_lines: bool, ) -> None: + output_delim = _normalize_tab(output_delim) keep_types_set = _parse_type_list(keep_types) remove_types_set = _parse_type_list(remove_types) type_map_dict: dict[str, list[str]] = _load_type_map(type_map, file_encoding) @@ -281,11 +292,7 @@ def process( ) @_repair_option() @_labels_option_default_bio() -@click.option( - "--output-delim", - default="\t", - help="the delimiter to be used for output (has no effect on input) [default: tab]", -) +@_output_delim_option() @_quiet_option() def count( file: list[str], # Name is "file" to make sense on the command line, but it's a list From 0cfedcfd1f63ce1344f68a95bffda1a30b887d89 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Fri, 27 Feb 2026 04:34:51 -0500 Subject: [PATCH 07/33] Add TODOs --- seqscore/scripts/seqscore.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py index 4726a8d..cdcd8c0 100644 --- a/seqscore/scripts/seqscore.py +++ b/seqscore/scripts/seqscore.py @@ -346,6 +346,8 @@ def count( ) +# TODO: Add support for delimited file output +# TODO: Take format argument for tabulate from command line @cli.command(help="show counts of the documents, sentences, and entity types") @_multi_input_file_arguments @_repair_option() From 39234f13006ad1701e2d1813ea943c0afe179736 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Fri, 27 Feb 2026 04:39:12 -0500 Subject: [PATCH 08/33] Check for empty tokens during validation --- seqscore/conll.py | 26 ++++++++++++-------- seqscore/model.py | 10 ++++---- seqscore/validation.py | 8 ++++++ tests/test_conll_format.py | 12 +++++++++ tests/test_files/minimal_bio_empty_token.txt | 17 +++++++++++++ tests/test_model.py | 6 +++-- tests/test_repair_click.py | 4 +-- tests/test_validation.py | 11 +++++++++ 8 files changed, 75 insertions(+), 19 deletions(-) create mode 100644 tests/test_files/minimal_bio_empty_token.txt diff --git a/seqscore/conll.py b/seqscore/conll.py index 05ad864..127617a 100644 --- a/seqscore/conll.py +++ b/seqscore/conll.py @@ -156,7 +156,7 @@ def ingest( if not quiet: msg = ( [ - f"Validation errors in sequence at line {line_nums[0]} of {source_name}:" + f"Validation errors in sequence beginning at line {line_nums[0]} of {source_name}:" ] + [error.msg for error in validation.errors] + [ @@ -182,14 +182,20 @@ def ingest( + " ".join(labels), ) from e - sequences = LabeledSequence( - tokens, - labels, - mentions, - other_fields=other_fields, - provenance=SequenceProvenance(line_nums[0], source_name), - comment=comment, - ) + try: + sequences = LabeledSequence( + tokens, + labels, + mentions, + other_fields=other_fields, + provenance=SequenceProvenance(line_nums[0], source_name), + comment=comment, + ) + except ValueError as e: # pragma: no cover + # Unreachable unless there is a bug in validation + raise ValueError( + f"Invalid sequence error in sequence beginning at line {line_nums[0]} of {source_name}" + ) from e document.append(sequences) # Yield final document if non-empty @@ -211,7 +217,7 @@ def validate( # But we check anyway to be absolutely sure we aren't throwing away a sequence. assert len(source_sequence) == 1 - # If we care about document boundaries and we have results for this document, + # If we care about document boundaries and have results for this document, # add it and move on. if not self.ignore_document_boundaries and document_results: all_results.append(document_results) diff --git a/seqscore/model.py b/seqscore/model.py index 09e67e2..6df73b3 100644 --- a/seqscore/model.py +++ b/seqscore/model.py @@ -85,15 +85,15 @@ def __attrs_post_init__(self) -> None: "must be of the same length" ) - for label in self.labels: + for idx, label in enumerate(self.labels): # Labels cannot be None or an empty string if not label: - raise ValueError(f"Invalid label: {repr(label)}") + raise ValueError(f"Invalid label at sequence index {idx}: {repr(label)}") - for token in self.tokens: - # Labels cannot be None or an empty string + for idx, token in enumerate(self.tokens): + # Tokens cannot be None or an empty string if not token: - raise ValueError(f"Invalid token: {repr(token)}") + raise ValueError(f"Invalid token at sequence index {idx}: {repr(token)}") def with_mentions(self, mentions: Sequence[Mention]) -> "LabeledSequence": return LabeledSequence( diff --git a/seqscore/validation.py b/seqscore/validation.py index 24bff39..6d3c756 100644 --- a/seqscore/validation.py +++ b/seqscore/validation.py @@ -81,6 +81,14 @@ def validate_labels( "Line numbers and labels must be the same length" ) + # Validate tokens if supplied + if tokens: + for idx, tok in enumerate(tokens): + if not tok: + line_msg = f" on line {line_nums[idx]}" if line_nums else "" + source_msg = f" of {source_name}" if source_name else "" + raise ValueError(f"Invalid token {repr(tok)}{line_msg}{source_msg}") + errors: list[ValidationError] = [] outside = encoding.dialect.outside diff --git a/tests/test_conll_format.py b/tests/test_conll_format.py index 0ca1d08..bc4d196 100644 --- a/tests/test_conll_format.py +++ b/tests/test_conll_format.py @@ -70,3 +70,15 @@ def test_parse_comments_false() -> None: str(err.value) == "Could not parse label 'fields' on line 1 of test during validation: Label 'fields' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'. The first token '#' of this sentence starts with '#'. If it's a comment, consider enabling --parse-comment-lines." ) + + +def test_invalid_token_leading_space() -> None: + mention_encoding = get_encoding("BIO") + ingester = CoNLLIngester(mention_encoding) + + path = Path("tests") / "test_files" / "minimal_bio_empty_token.txt" + with path.open(encoding="utf8") as file: + with pytest.raises(ValueError) as err: + list(ingester.ingest(file, "test", REPAIR_NONE)) + + assert str(err.value) == "Invalid token '' on line 9 of test" diff --git a/tests/test_files/minimal_bio_empty_token.txt b/tests/test_files/minimal_bio_empty_token.txt new file mode 100644 index 0000000..a497c92 --- /dev/null +++ b/tests/test_files/minimal_bio_empty_token.txt @@ -0,0 +1,17 @@ +This O +is O +a O +sentence O +. O + +University B-ORG +of I-ORG + I-ORG +is O +in O +West B-LOC +Philadelphia I-LOC +, O +Pennsylvania B-LOC +. O + diff --git a/tests/test_model.py b/tests/test_model.py index b2e4732..50dcee1 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -58,13 +58,15 @@ def test_labeled_sentence() -> None: # Empty LabeledSequence([], []) - with pytest.raises(ValueError): + with pytest.raises(ValueError) as err: # Bad label LabeledSequence(["a"], [""]) + assert str(err.value) == "Invalid label at sequence index 0: ''" - with pytest.raises(ValueError): + with pytest.raises(ValueError) as err: # Bad token LabeledSequence([""], ["B-PER"]) + assert str(err.value) == "Invalid token at sequence index 0: ''" s2 = s1.with_mentions([Mention(Span(0, 2), "PER")]) assert s2.mentions == (Mention(Span(0, 2), "PER"),) diff --git a/tests/test_repair_click.py b/tests/test_repair_click.py index 2eb9c78..81ac443 100644 --- a/tests/test_repair_click.py +++ b/tests/test_repair_click.py @@ -37,7 +37,7 @@ def test_repair_BIO_conlleval() -> None: assert result.exit_code == 0 assert ( normalize_str_with_path( - "Validation errors in sequence at line 7 of tests/conll_annotation/invalid1.bio:" + "Validation errors in sequence beginning at line 7 of tests/conll_annotation/invalid1.bio:" ) in result.output ) @@ -83,7 +83,7 @@ def test_repair_BIO_discard() -> None: assert result.exit_code == 0 assert ( normalize_str_with_path( - "Validation errors in sequence at line 7 of tests/conll_annotation/invalid1.bio:" + "Validation errors in sequence beginning at line 7 of tests/conll_annotation/invalid1.bio:" ) in result.output ) diff --git a/tests/test_validation.py b/tests/test_validation.py index 614a928..c8f121e 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -344,3 +344,14 @@ def test_validation_bad_label() -> None: str(err.value) == "Could not parse label 'PER' on line 8 during validation: Label 'PER' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'." ) + + +def test_validation_bad_token() -> None: + encoding = get_encoding("BIO") + + tokens = ["Dr.", "", "Salk"] + line_nums = [7, 8, 9] + labels = ["O", "PER", "PER"] + with pytest.raises(ValueError) as err: + validate_labels(labels, encoding, tokens=tokens, line_nums=line_nums) + assert str(err.value) == "Invalid token '' on line 8" From 9cd080d9f855af1d204662a50b7c106dae4b0b8a Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Fri, 27 Feb 2026 05:57:48 -0500 Subject: [PATCH 09/33] Update actions to latest versions --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1170954..6cd5e32 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,10 +20,10 @@ jobs: python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} From 65a2fc913d669aaef44065ce06759ff899bca808 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Mon, 30 Mar 2026 09:53:13 -0400 Subject: [PATCH 10/33] Remove old documentation build files --- .readthedocs.yaml | 22 --------------- docs/Makefile | 20 ------------- docs/conf.py | 72 ----------------------------------------------- docs/index.rst | 20 ------------- docs/make.bat | 35 ----------------------- 5 files changed, 169 deletions(-) delete mode 100644 .readthedocs.yaml delete mode 100644 docs/Makefile delete mode 100644 docs/conf.py delete mode 100644 docs/index.rst delete mode 100644 docs/make.bat diff --git a/.readthedocs.yaml b/.readthedocs.yaml deleted file mode 100644 index 9f23ef8..0000000 --- a/.readthedocs.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# .readthedocs.yaml -# Read the Docs configuration file -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details - -# Required -version: 2 - -# Set the version of Python and other tools you might need -build: - os: ubuntu-20.04 - tools: - python: "3.8" - -# Build documentation in the docs/ directory with Sphinx -sphinx: - configuration: docs/conf.py - -# Optionally declare the Python requirements required to build your docs -python: - install: - - method: pip - path: . diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index d4bb2cb..0000000 --- a/docs/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 2497af9..0000000 --- a/docs/conf.py +++ /dev/null @@ -1,72 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -from seqscore import __version__ - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - - -# -- Project information ----------------------------------------------------- - -project = "SeqScore" -copyright = "2021, Constantine Lignos, Chester Palen-Michel, and Nolan Holley" -author = "Constantine Lignos, Chester Palen-Michel, and Nolan Holley" - -version = __version__ -# The full version, including alpha/beta/rc tags -release = version - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.duration", - "sphinx.ext.doctest", - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", - "sphinx.ext.intersphinx", -] - -intersphinx_mapping = { - "python": ("https://docs.python.org/3/", None), - "sphinx": ("https://www.sphinx-doc.org/en/master/", None), -} -intersphinx_disabled_domains = ["std"] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [] - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "sphinx_rtd_theme" - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] - -# -- Options for EPUB output -epub_show_urls = "footnote" diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index fac6cf6..0000000 --- a/docs/index.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. SeqScore documentation master file, created by - sphinx-quickstart on Wed Nov 10 05:11:47 2021. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -SeqScore -======== - -.. toctree:: - :maxdepth: 2 - :caption: Contents: - - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 153be5e..0000000 --- a/docs/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=. -set BUILDDIR=_build - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd From 5f415fae685324c4e1b4a15a7f576b4f7f85c51b Mon Sep 17 00:00:00 2001 From: sunshower7 Date: Mon, 1 Jun 2026 13:19:02 -0400 Subject: [PATCH 11/33] Add --ner-label-index option to validate and disable Python 3.9 build --- .github/workflows/main.yml | 2 +- pyproject.toml | 2 +- seqscore/conll.py | 20 +++++++++---- seqscore/scripts/seqscore.py | 12 ++++++++ .../conll_annotation/labels_not_last_col.bio | 16 ++++++++++ tests/test_validation_click.py | 30 +++++++++++++++++++ 6 files changed, 74 insertions(+), 8 deletions(-) create mode 100644 tests/conll_annotation/labels_not_last_col.bio diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6cd5e32..ebf8a1e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] steps: - uses: actions/checkout@v6 diff --git a/pyproject.toml b/pyproject.toml index e3493a6..f8d810e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.mypy] -python_version = 3.9 +python_version = 3.10 strict_optional = false disallow_untyped_defs = true disallow_untyped_calls = true diff --git a/seqscore/conll.py b/seqscore/conll.py index 127617a..b429a7f 100644 --- a/seqscore/conll.py +++ b/seqscore/conll.py @@ -52,7 +52,7 @@ class _CoNLLToken: other_fields: tuple[str, ...] = attrib() @classmethod - def from_line(cls, line: str, line_num: int, source_name: str) -> "_CoNLLToken": + def from_line(cls, line: str, line_num: int, source_name: str, ner_label_index: int) -> "_CoNLLToken": # Note: The caller must strip the line of any trailing whitespace # TODO: Sense the file rather than the line so we get consistency across lines # Try tab first since it's safer, then space @@ -72,9 +72,14 @@ def from_line(cls, line: str, line_num: int, source_name: str) -> "_CoNLLToken": f"Line {line_num} of {source_name} is not delimited by space or tab: {repr(line)}" ) + if ner_label_index == 0: + raise ValueError("ner_label_index cannot be 0") + text = splits[0] - label = splits[-1] - other_fields = tuple(splits[1:-1]) + label = splits[ner_label_index] + other_fields = tuple(splits[1:ner_label_index]) + if ner_label_index != -1: + other_fields += tuple(splits[ner_label_index + 1:]) is_docstart = text == DOCSTART return cls(text, label, is_docstart, line_num, other_fields) @@ -84,6 +89,7 @@ class CoNLLIngester: encoding: Encoding = attrib() parse_comment_lines: bool = attrib(default=False, kw_only=True) ignore_document_boundaries: bool = attrib(default=True, kw_only=True) + ner_label_index: int = attrib(default=-1, kw_only=True) def ingest( self, @@ -210,7 +216,7 @@ def validate( document_results: list[SequenceValidationResult] = [] for source_sequence, _ in self._parse_file( - source, source_name, parse_comments=self.parse_comment_lines + source, source_name, parse_comments=self.parse_comment_lines, ner_label_index=self.ner_label_index ): if source_sequence[0].is_docstart: # We can ony receive DOCSTART in a sequence by itself, see _parse_file. @@ -253,7 +259,7 @@ def _decompose_sequence( @classmethod def _parse_file( - cls, input_file: TextIO, source_name: str, *, parse_comments: bool = False + cls, input_file: TextIO, source_name: str, *, parse_comments: bool = False, ner_label_index: int = -1 ) -> Iterable[tuple[tuple[_CoNLLToken, ...], Optional[str]]]: sequence: list = [] comment: Optional[str] = None @@ -285,7 +291,7 @@ def _parse_file( # Always skip empty lines continue - token = _CoNLLToken.from_line(line, line_num, source_name) + token = _CoNLLToken.from_line(line, line_num, source_name, ner_label_index) # Skip document starts, but ensure sequence is empty when we reach them if token.is_docstart: if sequence: @@ -352,12 +358,14 @@ def validate_conll_file( *, ignore_document_boundaries: bool, parse_comment_lines: bool, + ner_label_index: int, ) -> ValidationResult: encoding = get_encoding(mention_encoding_name) ingester = CoNLLIngester( encoding, parse_comment_lines=parse_comment_lines, ignore_document_boundaries=ignore_document_boundaries, + ner_label_index=ner_label_index, ) with open(input_path, encoding=file_encoding) as input_file: results = ingester.validate(input_file, input_path) diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py index cdcd8c0..20606f3 100644 --- a/seqscore/scripts/seqscore.py +++ b/seqscore/scripts/seqscore.py @@ -106,6 +106,15 @@ def _output_delim_option() -> Callable: ) +def _ner_label_index_option() -> Callable: + return click.option( + "--ner-label-index", + default=-1, + show_default=True, + type=int, + ) + + def _quiet_option() -> Callable: return click.option( "--quiet", @@ -118,6 +127,7 @@ def _quiet_option() -> Callable: @cli.command(help="validate labels") @_multi_input_file_arguments @_labels_option() +@_ner_label_index_option() @_quiet_option() def validate( file: list[str], # Name is "file" to make sense on the command line, but it's a list @@ -126,6 +136,7 @@ def validate( *, ignore_document_boundaries: bool, parse_comment_lines: bool, + ner_label_index: int, quiet: bool, ) -> None: error = False @@ -136,6 +147,7 @@ def validate( file_encoding, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, + ner_label_index=ner_label_index, ) if result.errors: print( diff --git a/tests/conll_annotation/labels_not_last_col.bio b/tests/conll_annotation/labels_not_last_col.bio new file mode 100644 index 0000000..cc91cf4 --- /dev/null +++ b/tests/conll_annotation/labels_not_last_col.bio @@ -0,0 +1,16 @@ +This O DET +is O VERB +a O DET +sentence O NOUN +. O PUNCT + +University B-ORG NOUN +of I-ORG ADP +Pennsylvania I-ORG NOUN +is O VERB +in O ADP +West B-LOC NOUN +Philadelphia I-LOC NOUN +, O PUNCT +Pennsylvania B-LOC NOUN +. O PUNCT diff --git a/tests/test_validation_click.py b/tests/test_validation_click.py index e688ecc..3cb4a98 100644 --- a/tests/test_validation_click.py +++ b/tests/test_validation_click.py @@ -214,3 +214,33 @@ def test_bad_label() -> None: str(result.exception) == "Could not parse label 'GPE' on line 4 during validation: Label 'GPE' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'." ) + + +def test_ner_label_index_pos() -> None: + runner = CliRunner() + result = runner.invoke( + validate, + ["--labels", "BIO", "--ner-label-index", "1", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")], + ) + assert result.output == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n" + assert result.exit_code == 0 + + +def test_ner_label_index_neg() -> None: + runner = CliRunner() + result = runner.invoke( + validate, + ["--labels", "BIO", "--ner-label-index", "-2", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")], + ) + assert result.output == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n" + assert result.exit_code == 0 + + +def test_ner_label_index_zero() -> None: + runner = CliRunner() + result = runner.invoke( + validate, + ["--labels", "BIO", "--ner-label-index", "0", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")], + ) + assert result.exit_code != 0 + assert "ner_label_index cannot be 0" in str(result.exception) From 868074cab508a815bfd1e102628dea7dc47ede24 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Tue, 2 Jun 2026 10:06:39 -0400 Subject: [PATCH 12/33] Remove Python 3.9 support --- README.md | 6 +++--- pyproject.toml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e894734..3f6c148 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,8 @@ To install the latest official release of SeqScore, run: `pip install seqscore`. This will install the package and add the command `seqscore` in your Python environment. -SeqScore requires Python 3.9 or higher. It is tested on Python 3.9, -3.10, 3.11, 3.12, 3.13, and 3.14. +SeqScore requires Python 3.10 or higher. It is tested on Python 3.10, 3.11, 3.12, +3.13, and 3.14. ## License @@ -600,7 +600,7 @@ To install from a clone of this repository, use: ## Setting up an environment for development -1. Create an environment: `conda create -yn seqscore python=3.9` +1. Create an environment: `conda create -yn seqscore python=3.10` 2. Activate the environment: `conda activate seqscore` 3. Install seqscore: `pip install -e .` 4. Install development dependencies: `pip install -r requirements.txt` diff --git a/pyproject.toml b/pyproject.toml index f8d810e..212b567 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.mypy] -python_version = 3.10 +python_version = "3.10" strict_optional = false disallow_untyped_defs = true disallow_untyped_calls = true @@ -13,4 +13,4 @@ ignore_missing_imports = true [tool.ruff] line-length = 90 -target-version = "py39" +target-version = "py310" From a4d0b5cd57128b4a68696e315e257013eef8ccc9 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Tue, 2 Jun 2026 10:42:05 -0400 Subject: [PATCH 13/33] Update development dependencies --- requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index e976d18..8ac2e1b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,12 +6,12 @@ types-tabulate # For testing -pytest==8.3.5 -pytest-cov==5.0.0 +pytest==9.0.3 +pytest-cov>=7.1.0 # For development -mypy==1.19.0 -ruff==0.14.9 +mypy==2.1.0 +ruff==0.15.15 # Documentation build # Disabled for now since we don't need them From 65d7ef24e23f41d7806c84c66387f37af00c59f8 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Tue, 2 Jun 2026 13:21:13 -0400 Subject: [PATCH 14/33] Support setting token and label index across more commands --- seqscore/conll.py | 127 ++++++++++++------ seqscore/model.py | 12 +- seqscore/scripts/seqscore.py | 59 +++++++- seqscore/util.py | 4 +- seqscore/validation.py | 3 +- .../diff_token_label_indices.bio | 17 +++ .../diff_token_label_indices.bioes | 17 +++ .../conll_annotation/labels_not_last_col.bio | 1 + .../labels_not_last_col.bioes | 17 +++ tests/test_conll_format.py | 29 ++-- tests/test_conversion_click.py | 40 +++++- tests/test_model.py | 2 +- tests/test_validation.py | 5 +- tests/test_validation_click.py | 43 ++++-- 14 files changed, 283 insertions(+), 93 deletions(-) create mode 100644 tests/conll_annotation/diff_token_label_indices.bio create mode 100644 tests/conll_annotation/diff_token_label_indices.bioes create mode 100644 tests/conll_annotation/labels_not_last_col.bioes diff --git a/seqscore/conll.py b/seqscore/conll.py index b429a7f..9b9fedd 100644 --- a/seqscore/conll.py +++ b/seqscore/conll.py @@ -43,16 +43,36 @@ class CoNLLFormatError(Exception): pass +@attrs(frozen=True) +class LineSpec: + """Defines the fields and delimiters for a CoNLL-format line""" + + token_index: int = attrib() + ner_label_index: int = attrib() + + def __attrs_post_init__(self) -> None: + # This will only catch cases where the indices are identical, not + # when they refer to the same position, such as 1 and -1 in a + # sequence of length two + if self.token_index == self.ner_label_index: + raise ValueError( + f"Token index ({self.token_index}) and " + f"label index ({self.ner_label_index}) cannot be the same" + ) + + @attrs(frozen=True) class _CoNLLToken: text: str = attrib() label: str = attrib() is_docstart: bool = attrib() line_num: int = attrib() - other_fields: tuple[str, ...] = attrib() + orig_fields: tuple[str, ...] = attrib() @classmethod - def from_line(cls, line: str, line_num: int, source_name: str, ner_label_index: int) -> "_CoNLLToken": + def from_line( + cls, line: str, line_num: int, source_name: str, line_spec: LineSpec + ) -> "_CoNLLToken": # Note: The caller must strip the line of any trailing whitespace # TODO: Sense the file rather than the line so we get consistency across lines # Try tab first since it's safer, then space @@ -72,24 +92,19 @@ def from_line(cls, line: str, line_num: int, source_name: str, ner_label_index: f"Line {line_num} of {source_name} is not delimited by space or tab: {repr(line)}" ) - if ner_label_index == 0: - raise ValueError("ner_label_index cannot be 0") - - text = splits[0] - label = splits[ner_label_index] - other_fields = tuple(splits[1:ner_label_index]) - if ner_label_index != -1: - other_fields += tuple(splits[ner_label_index + 1:]) + text = splits[line_spec.token_index] + label = splits[line_spec.ner_label_index] + orig_fields = tuple(splits) is_docstart = text == DOCSTART - return cls(text, label, is_docstart, line_num, other_fields) + return cls(text, label, is_docstart, line_num, orig_fields) @attrs(frozen=True) class CoNLLIngester: encoding: Encoding = attrib() + line_spec: LineSpec = attrib() parse_comment_lines: bool = attrib(default=False, kw_only=True) ignore_document_boundaries: bool = attrib(default=True, kw_only=True) - ner_label_index: int = attrib(default=-1, kw_only=True) def ingest( self, @@ -119,7 +134,7 @@ def ingest( continue # Create mentions from tokens in sequence - tokens, labels, line_nums, other_fields = self._decompose_sequence( + tokens, labels, line_nums, orig_fields = self._decompose_sequence( source_sequence ) @@ -193,7 +208,7 @@ def ingest( tokens, labels, mentions, - other_fields=other_fields, + orig_fields=orig_fields, provenance=SequenceProvenance(line_nums[0], source_name), comment=comment, ) @@ -210,13 +225,17 @@ def ingest( yield document def validate( - self, source: TextIO, source_name: str + self, + source: TextIO, + source_name: str, ) -> list[list[SequenceValidationResult]]: all_results: list[list[SequenceValidationResult]] = [] document_results: list[SequenceValidationResult] = [] for source_sequence, _ in self._parse_file( - source, source_name, parse_comments=self.parse_comment_lines, ner_label_index=self.ner_label_index + source, + source_name, + parse_comments=self.parse_comment_lines, ): if source_sequence[0].is_docstart: # We can ony receive DOCSTART in a sequence by itself, see _parse_file. @@ -254,12 +273,15 @@ def _decompose_sequence( tokens = tuple(tok.text for tok in source_sequence) labels = tuple(tok.label for tok in source_sequence) line_nums = tuple(tok.line_num for tok in source_sequence) - other_fields = tuple(tok.other_fields for tok in source_sequence) - return tokens, labels, line_nums, other_fields + orig_fields = tuple(tok.orig_fields for tok in source_sequence) + return tokens, labels, line_nums, orig_fields - @classmethod def _parse_file( - cls, input_file: TextIO, source_name: str, *, parse_comments: bool = False, ner_label_index: int = -1 + self, + input_file: TextIO, + source_name: str, + *, + parse_comments: bool = False, ) -> Iterable[tuple[tuple[_CoNLLToken, ...], Optional[str]]]: sequence: list = [] comment: Optional[str] = None @@ -284,14 +306,14 @@ def _parse_file( if not line.strip(): # Clear out sequence if there's anything in it if sequence: - cls._check_sequence(sequence) + self._check_sequence(sequence) yield tuple(sequence), comment sequence = [] comment = None # Always skip empty lines continue - token = _CoNLLToken.from_line(line, line_num, source_name, ner_label_index) + token = _CoNLLToken.from_line(line, line_num, source_name, self.line_spec) # Skip document starts, but ensure sequence is empty when we reach them if token.is_docstart: if sequence: @@ -301,7 +323,7 @@ def _parse_file( else: # Yield it by itself. Since the sequence variable is empty, leave it unchanged. tmp_sent = (token,) - cls._check_sequence(tmp_sent) + self._check_sequence(tmp_sent) # Don't return the comment yet, it will be returned with the sequence yield tmp_sent, None else: @@ -309,7 +331,7 @@ def _parse_file( # Finish the last sequence if needed if sequence: - cls._check_sequence(sequence) + self._check_sequence(sequence) yield tuple(sequence), comment @staticmethod @@ -327,6 +349,7 @@ def ingest_conll_file( input_path: PathType, mention_encoding_name: str, file_encoding: str, + line_spec: LineSpec, *, repair: Optional[str] = None, ignore_document_boundaries: bool, @@ -343,6 +366,7 @@ def ingest_conll_file( ingester = CoNLLIngester( mention_encoding, + line_spec, parse_comment_lines=parse_comment_lines, ignore_document_boundaries=ignore_document_boundaries, ) @@ -355,17 +379,17 @@ def validate_conll_file( input_path: str, mention_encoding_name: str, file_encoding: str, + line_spec: LineSpec, *, ignore_document_boundaries: bool, parse_comment_lines: bool, - ner_label_index: int, ) -> ValidationResult: encoding = get_encoding(mention_encoding_name) ingester = CoNLLIngester( encoding, + line_spec, parse_comment_lines=parse_comment_lines, ignore_document_boundaries=ignore_document_boundaries, - ner_label_index=ner_label_index, ) with open(input_path, encoding=file_encoding) as input_file: results = ingester.validate(input_file, input_path) @@ -388,6 +412,7 @@ def repair_conll_file( mention_encoding_name: str, repair: Optional[str], file_encoding: str, + line_spec: LineSpec, output_delim: str, *, ignore_document_boundaries: bool, @@ -398,6 +423,7 @@ def repair_conll_file( input_file, mention_encoding_name, file_encoding, + line_spec, repair=repair, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, @@ -429,6 +455,7 @@ def write_docs_using_encoding( mention_encoding_name: str, file_encoding: str, delim: str, + line_spec: LineSpec, output_path: PathType, ) -> None: mention_encoding = get_encoding(mention_encoding_name) @@ -437,7 +464,12 @@ def write_docs_using_encoding( with open(output_path, "w", encoding=file_encoding) as file: for doc in docs: write_doc_using_encoding( - doc, mention_encoding, delim, file, output_docstart=output_docstart + doc, + mention_encoding, + delim, + file, + line_spec, + output_docstart=output_docstart, ) @@ -446,32 +478,42 @@ def write_doc_using_encoding( encoding: Encoding, delim: str, file: TextIO, + line_spec: LineSpec, *, output_docstart: bool, ) -> None: if output_docstart: - # Get a single token to figure out how many other_fields entries it has - sequence_other_fields = doc[0].other_fields - fields = [DOCSTART] - if sequence_other_fields: - fields.extend([EMPTY_OTHER_FIELD for _ in sequence_other_fields[0]]) - fields.append(encoding.dialect.outside) - + # Get the fields of the first token of the first sentence + if doc[0].orig_fields: + # to figure out how many fields there are + sequence_orig_fields = doc[0].orig_fields[0] + # Create the write number of fields + fields = [EMPTY_OTHER_FIELD] * len(sequence_orig_fields) + # Fill in the token and label + fields[line_spec.token_index] = DOCSTART + fields[line_spec.ner_label_index] = encoding.dialect.outside + else: + fields = [DOCSTART, encoding.dialect.outside] + # Write output print(delim.join(fields), file=file) print(file=file) for sequence in doc: labels = encoding.encode_sequence(sequence) - # Lengths of labels and other_fields have previously been checked to match tokens - for (token, other_fields), label in zip( - sequence.tokens_with_other_fields(), labels + # Lengths of labels and orig_fields have previously been checked to match tokens + for (token, orig_fields), label in zip( + sequence.tokens_with_orig_fields(), labels ): - fields = [token] - if other_fields: - fields.extend(other_fields) - fields.append(label) + if orig_fields: + fields = list(orig_fields) + fields[line_spec.token_index] = token + fields[line_spec.ner_label_index] = label + else: + fields = [token, label] + # Write output print(delim.join(fields), file=file) + # Print an emtpy line after each sequence print(file=file) @@ -482,6 +524,7 @@ def score_conll_files( mention_encoding_name: str, repair: Optional[str], file_encoding: str, + line_spec: LineSpec, *, ignore_document_boundaries: bool, parse_comment_lines: bool, @@ -497,6 +540,7 @@ def score_conll_files( reference_file, mention_encoding_name, file_encoding, + line_spec, repair=repair, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, @@ -521,6 +565,7 @@ def score_conll_files( pred_file, mention_encoding_name, file_encoding, + line_spec, repair=repair, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, diff --git a/seqscore/model.py b/seqscore/model.py index 6df73b3..d554371 100644 --- a/seqscore/model.py +++ b/seqscore/model.py @@ -60,7 +60,7 @@ class LabeledSequence(Sequence[str]): tokens: tuple[str, ...] = attrib(converter=tuplify_strs) labels: tuple[str, ...] = attrib(converter=tuplify_strs) mentions: tuple[Mention, ...] = attrib(default=(), converter=_tuplify_mentions) - other_fields: Optional[tuple[tuple[str, ...], ...]] = attrib( + orig_fields: Optional[tuple[tuple[str, ...], ...]] = attrib( default=None, kw_only=True, converter=tuplify_optional_nested_strs ) provenance: Optional[SequenceProvenance] = attrib( @@ -79,9 +79,9 @@ def __attrs_post_init__(self) -> None: if not self.tokens: raise ValueError("Tokens and labels must be non-empty") - if self.other_fields and len(self.tokens) != len(self.other_fields): + if self.orig_fields and len(self.tokens) != len(self.orig_fields): raise ValueError( - f"Tokens ({len(self.tokens)}) and other_fields ({len(self.other_fields)}) " + f"Tokens ({len(self.tokens)}) and orig_fields ({len(self.orig_fields)}) " "must be of the same length" ) @@ -126,11 +126,11 @@ def __str__(self) -> str: def tokens_with_labels(self) -> tuple[tuple[str, str], ...]: return tuple(zip(self.tokens, self.labels)) - def tokens_with_other_fields( + def tokens_with_orig_fields( self, ) -> tuple[tuple[str, Optional[tuple[str, ...]]], ...]: - if self.other_fields: - return tuple(zip(self.tokens, self.other_fields)) + if self.orig_fields: + return tuple(zip(self.tokens, self.orig_fields)) else: return tuple(zip(self.tokens, repeat(None))) diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py index 20606f3..c2e9265 100644 --- a/seqscore/scripts/seqscore.py +++ b/seqscore/scripts/seqscore.py @@ -11,6 +11,7 @@ from seqscore.conll import ( FORMAT_DELIM, SUPPORTED_SCORE_FORMATS, + LineSpec, ingest_conll_file, repair_conll_file, score_conll_files, @@ -44,6 +45,20 @@ def _input_file_options() -> list[Callable]: click.option( "--ignore-document-boundaries/--use-document-boundaries", default=False ), + click.option( + "--token-index", + default=0, + show_default=True, + type=int, + help="Index of the input field to use for the token", + ), + click.option( + "--label-index", + default=-1, + show_default=True, + type=int, + help="Index of the input field to use for the label", + ), ] @@ -127,7 +142,6 @@ def _quiet_option() -> Callable: @cli.command(help="validate labels") @_multi_input_file_arguments @_labels_option() -@_ner_label_index_option() @_quiet_option() def validate( file: list[str], # Name is "file" to make sense on the command line, but it's a list @@ -136,18 +150,20 @@ def validate( *, ignore_document_boundaries: bool, parse_comment_lines: bool, - ner_label_index: int, + token_index: int, + label_index: int, quiet: bool, ) -> None: + line_spec = LineSpec(token_index, label_index) error = False for each_file in file: result = validate_conll_file( each_file, labels, file_encoding, + line_spec, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, - ner_label_index=ner_label_index, ) if result.errors: print( @@ -178,6 +194,8 @@ def repair( output_file: str, labels: str, file_encoding: str, + token_index: int, + label_index: int, repair_method: str, output_delim: str, *, @@ -188,6 +206,7 @@ def repair( output_delim = _normalize_tab(output_delim) if repair_method == REPAIR_NONE: raise ValueError(f"Cannot repair with repair strategy {repr(repair_method)}") + line_spec = LineSpec(token_index, label_index) repair_conll_file( file, @@ -195,6 +214,7 @@ def repair( labels, repair_method, file_encoding, + line_spec, output_delim, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, @@ -212,6 +232,8 @@ def convert( file: str, output_file: str, file_encoding: str, + token_index: int, + label_index: int, output_delim: str, input_labels: str, output_labels: str, @@ -220,19 +242,19 @@ def convert( parse_comment_lines: bool, ) -> None: output_delim = _normalize_tab(output_delim) - if input_labels == output_labels: - raise ValueError("Conversion requires different input and output labels") + line_spec = LineSpec(token_index, label_index) docs = ingest_conll_file( file, input_labels, file_encoding, + line_spec, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, ) write_docs_using_encoding( - docs, output_labels, file_encoding, output_delim, output_file + docs, output_labels, file_encoding, output_delim, line_spec, output_file ) @@ -260,6 +282,8 @@ def process( file: str, output_file: str, file_encoding: str, + token_index: int, + label_index: int, output_delim: str, labels: str, keep_types: str, @@ -270,6 +294,7 @@ def process( parse_comment_lines: bool, ) -> None: output_delim = _normalize_tab(output_delim) + line_spec = LineSpec(token_index, label_index) keep_types_set = _parse_type_list(keep_types) remove_types_set = _parse_type_list(remove_types) type_map_dict: dict[str, list[str]] = _load_type_map(type_map, file_encoding) @@ -286,13 +311,16 @@ def process( file, labels, file_encoding, + line_spec, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, ) mod_docs = modify_types(docs, keep_types_set, remove_types_set, type_map_dict) - write_docs_using_encoding(mod_docs, labels, file_encoding, output_delim, output_file) + write_docs_using_encoding( + mod_docs, labels, file_encoding, output_delim, line_spec, output_file + ) @cli.command(help="show counts for all the mentions contained in a file") @@ -309,6 +337,8 @@ def process( def count( file: list[str], # Name is "file" to make sense on the command line, but it's a list file_encoding: str, + token_index: int, + label_index: int, output_file: Optional[str], labels: str, *, @@ -318,6 +348,7 @@ def count( repair_method: str, quiet: bool, ) -> None: + line_spec = LineSpec(token_index, label_index) if repair_method == REPAIR_NONE: repair_method = None @@ -334,6 +365,7 @@ def count( each_file, labels, file_encoding, + line_spec, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, repair=repair_method, @@ -368,6 +400,8 @@ def count( def summarize( file: list[str], # Name is "file" to make sense on the command line, but it's a list file_encoding: str, + token_index: int, + label_index: int, labels: str, *, ignore_document_boundaries: bool, @@ -375,6 +409,7 @@ def summarize( repair_method: str, quiet: bool, ) -> None: + line_spec = LineSpec(token_index, label_index) if repair_method == REPAIR_NONE: repair_method = None @@ -386,6 +421,7 @@ def summarize( each_file, labels, file_encoding, + line_spec, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, repair=repair_method, @@ -453,11 +489,15 @@ def score( reference: str, score_format: str, delim: str, + token_index: int, + label_index: int, repair_method: str, error_counts: bool, full_precision: bool, quiet: bool, ) -> None: + line_spec = LineSpec(token_index, label_index) + if repair_method == REPAIR_NONE: repair_method = None @@ -475,6 +515,7 @@ def score( labels, repair_method, file_encoding, + line_spec, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, output_format=score_format, @@ -492,18 +533,22 @@ def score( def extract_text( file: list[str], # Name is "file" to make sense on the command line, but it's a list file_encoding: str, + token_index: int, + label_index: int, labels: str, output_file: str, *, ignore_document_boundaries: bool, parse_comment_lines: bool, ) -> None: + line_spec = LineSpec(token_index, label_index) all_docs = [] for each_file in file: docs = ingest_conll_file( each_file, labels, file_encoding, + line_spec, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, ) diff --git a/seqscore/util.py b/seqscore/util.py index 70677fd..666ead2 100644 --- a/seqscore/util.py +++ b/seqscore/util.py @@ -30,13 +30,15 @@ def tuplify_optional_nested_strs( def file_fields_match(path1: PathType, path2: PathType, *, debug: bool = False) -> bool: """Return whether the whitespace-delimited fields of two files are identical.""" with open(path1, encoding="utf8") as f1, open(path2, encoding="utf8") as f2: + line_count = 1 for l1, l2 in zip_longest(f1, f2): if l1 is None or l2 is None or l1.split() != l2.split(): if debug: # pragma: no cover - print("Non-matching lines:") + print(f"Failed to match at line {line_count}:") print(repr(l1)) print(repr(l2)) return False + line_count += 1 return True diff --git a/seqscore/validation.py b/seqscore/validation.py index 6d3c756..9593713 100644 --- a/seqscore/validation.py +++ b/seqscore/validation.py @@ -106,7 +106,8 @@ def validate_labels( raise InvalidLabelError( label, f"Could not parse label {repr(label)}{line_msg}{source_msg} during validation: " - + str(e), + + str(e) + + " Use the --label-index argument if the label is not the last field.", ) from e if not encoding.is_valid_state(state): diff --git a/tests/conll_annotation/diff_token_label_indices.bio b/tests/conll_annotation/diff_token_label_indices.bio new file mode 100644 index 0000000..ba68d3f --- /dev/null +++ b/tests/conll_annotation/diff_token_label_indices.bio @@ -0,0 +1,17 @@ +1 This O DET +2 is O VERB +3 a O DET +4 sentence O NOUN +5 . O PUNCT + +6 University B-ORG NOUN +7 of I-ORG ADP +8 Pennsylvania I-ORG NOUN +9 is O VERB +10 in O ADP +11 West B-LOC NOUN +12 Philadelphia I-LOC NOUN +13 , O PUNCT +14 Pennsylvania B-LOC NOUN +15 . O PUNCT + diff --git a/tests/conll_annotation/diff_token_label_indices.bioes b/tests/conll_annotation/diff_token_label_indices.bioes new file mode 100644 index 0000000..46a6398 --- /dev/null +++ b/tests/conll_annotation/diff_token_label_indices.bioes @@ -0,0 +1,17 @@ +1 This O DET +2 is O VERB +3 a O DET +4 sentence O NOUN +5 . O PUNCT + +6 University B-ORG NOUN +7 of I-ORG ADP +8 Pennsylvania E-ORG NOUN +9 is O VERB +10 in O ADP +11 West B-LOC NOUN +12 Philadelphia E-LOC NOUN +13 , O PUNCT +14 Pennsylvania S-LOC NOUN +15 . O PUNCT + diff --git a/tests/conll_annotation/labels_not_last_col.bio b/tests/conll_annotation/labels_not_last_col.bio index cc91cf4..67d4b7f 100644 --- a/tests/conll_annotation/labels_not_last_col.bio +++ b/tests/conll_annotation/labels_not_last_col.bio @@ -14,3 +14,4 @@ Philadelphia I-LOC NOUN , O PUNCT Pennsylvania B-LOC NOUN . O PUNCT + diff --git a/tests/conll_annotation/labels_not_last_col.bioes b/tests/conll_annotation/labels_not_last_col.bioes new file mode 100644 index 0000000..0e55cd2 --- /dev/null +++ b/tests/conll_annotation/labels_not_last_col.bioes @@ -0,0 +1,17 @@ +This O DET +is O VERB +a O DET +sentence O NOUN +. O PUNCT + +University B-ORG NOUN +of I-ORG ADP +Pennsylvania E-ORG NOUN +is O VERB +in O ADP +West B-LOC NOUN +Philadelphia E-LOC NOUN +, O PUNCT +Pennsylvania S-LOC NOUN +. O PUNCT + diff --git a/tests/test_conll_format.py b/tests/test_conll_format.py index bc4d196..f32506b 100644 --- a/tests/test_conll_format.py +++ b/tests/test_conll_format.py @@ -2,14 +2,15 @@ import pytest -from seqscore.conll import CoNLLFormatError, CoNLLIngester +from seqscore.conll import CoNLLFormatError, CoNLLIngester, LineSpec from seqscore.encoding import REPAIR_NONE, get_encoding from seqscore.validation import InvalidLabelError def test_parse_comments_true() -> None: mention_encoding = get_encoding("BIO") - ingester = CoNLLIngester(mention_encoding, parse_comment_lines=True) + line_spec = LineSpec(0, 1) + ingester = CoNLLIngester(mention_encoding, line_spec, parse_comment_lines=True) comments_path = Path("tests") / "test_files" / "minimal_comments.bio" with comments_path.open(encoding="utf8") as file: documents = list(ingester.ingest(file, "test", REPAIR_NONE)) @@ -32,7 +33,8 @@ def test_parse_comments_true() -> None: def test_parse_comments_false() -> None: mention_encoding = get_encoding("BIO") - ingester = CoNLLIngester(mention_encoding) + line_spec = LineSpec(0, 1) + ingester = CoNLLIngester(mention_encoding, line_spec) comments_path = Path("tests") / "test_files" / "minimal_comments_1.bio" with comments_path.open(encoding="utf8") as file: @@ -46,35 +48,24 @@ def test_parse_comments_false() -> None: comments_path = Path("tests") / "test_files" / "minimal_comments_2.bio" with comments_path.open(encoding="utf8") as file: - with pytest.raises(InvalidLabelError) as err: + with pytest.raises(InvalidLabelError): list(ingester.ingest(file, "test", REPAIR_NONE)) - assert ( - str(err.value) - == "Could not parse label 'Comment' on line 1 of test during validation: Label 'Comment' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'. The first token '#' of this sentence starts with '#'. If it's a comment, consider enabling --parse-comment-lines." - ) comments_path = Path("tests") / "test_files" / "minimal_comments_3.bio" with comments_path.open(encoding="utf8") as file: - with pytest.raises(InvalidLabelError) as err: + with pytest.raises(InvalidLabelError): list(ingester.ingest(file, "test", REPAIR_NONE)) - assert ( - str(err.value) - == "Could not parse label 'fields' on line 1 of test during validation: Label 'fields' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'. The first token '#' of this sentence starts with '#'. If it's a comment, consider enabling --parse-comment-lines." - ) comments_path = Path("tests") / "test_files" / "minimal_comments_4.bio" with comments_path.open(encoding="utf8") as file: - with pytest.raises(InvalidLabelError) as err: + with pytest.raises(InvalidLabelError): list(ingester.ingest(file, "test", REPAIR_NONE)) - assert ( - str(err.value) - == "Could not parse label 'fields' on line 1 of test during validation: Label 'fields' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'. The first token '#' of this sentence starts with '#'. If it's a comment, consider enabling --parse-comment-lines." - ) def test_invalid_token_leading_space() -> None: mention_encoding = get_encoding("BIO") - ingester = CoNLLIngester(mention_encoding) + line_spec = LineSpec(0, -1) + ingester = CoNLLIngester(mention_encoding, line_spec) path = Path("tests") / "test_files" / "minimal_bio_empty_token.txt" with path.open(encoding="utf8") as file: diff --git a/tests/test_conversion_click.py b/tests/test_conversion_click.py index 8fafcf6..b41a32b 100644 --- a/tests/test_conversion_click.py +++ b/tests/test_conversion_click.py @@ -174,6 +174,28 @@ def test_IOB_to_BIO_fields() -> None: ) +def test_IOB_to_BIO_fields_and_specified_indices() -> None: + runner = CliRunner() + result = runner.invoke( + convert, + [ + "--input-labels", + "BIO", + "--output-labels", + "BIOES", + "--label-index", + "1", + os.path.join("tests", "conll_annotation", "labels_not_last_col.bio"), + os.path.join(TMP_DIR.name, "labels_not_last_col.bioes"), + ], + ) + assert result.exit_code == 0 + assert file_fields_match( + os.path.join(TMP_DIR.name, "labels_not_last_col.bioes"), + os.path.join("tests", "conll_annotation", "labels_not_last_col.bioes"), + ) + + def test_IO_to_BIOES() -> None: runner = CliRunner() result = runner.invoke( @@ -215,7 +237,7 @@ def test_BIOES_to_IO() -> None: ) -def test_same_input_and_output_labels_raises_error() -> None: +def test_diff_token_label_indices() -> None: runner = CliRunner() result = runner.invoke( convert, @@ -223,9 +245,17 @@ def test_same_input_and_output_labels_raises_error() -> None: "--input-labels", "BIO", "--output-labels", - "BIO", - os.path.join("tests", "conll_annotation", "minimal.bio"), - os.path.join(TMP_DIR.name, "temp.txt"), + "BIOES", + "--token-index", + "1", + "--label-index", + "2", + os.path.join("tests", "conll_annotation", "diff_token_label_indices.bio"), + os.path.join(TMP_DIR.name, "diff_token_label_indices_BIOES.txt"), ], ) - assert result.exit_code != 0 + assert result.exit_code == 0 + assert file_fields_match( + os.path.join(TMP_DIR.name, "diff_token_label_indices_BIOES.txt"), + os.path.join("tests", "conll_annotation", "diff_token_label_indices.bioes"), + ) diff --git a/tests/test_model.py b/tests/test_model.py index 50dcee1..0853d16 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -73,4 +73,4 @@ def test_labeled_sentence() -> None: with pytest.raises(ValueError): # Mismatched length between tokens and other_fields - LabeledSequence(["a", "b"], ["B-PER", "I-PER"], other_fields=[["DT"]]) + LabeledSequence(["a", "b"], ["B-PER", "I-PER"], orig_fields=[["DT"]]) diff --git a/tests/test_validation.py b/tests/test_validation.py index c8f121e..9fbde19 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -340,9 +340,8 @@ def test_validation_bad_label() -> None: labels = ["O", "PER", "PER"] with pytest.raises(EncodingError) as err: validate_labels(labels, encoding, tokens=tokens, line_nums=line_nums) - assert ( - str(err.value) - == "Could not parse label 'PER' on line 8 during validation: Label 'PER' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'." + assert str(err.value).startswith( + "Could not parse label 'PER' on line 8 during validation: Label 'PER' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'." ) diff --git a/tests/test_validation_click.py b/tests/test_validation_click.py index 3cb4a98..5cf201c 100644 --- a/tests/test_validation_click.py +++ b/tests/test_validation_click.py @@ -210,9 +210,8 @@ def test_bad_label() -> None: ["--labels", "BIO", os.path.join("tests", "conll_annotation", "bad_label2.bio")], ) assert result.exit_code != 0 - assert ( - str(result.exception) - == "Could not parse label 'GPE' on line 4 during validation: Label 'GPE' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'." + assert str(result.exception).startswith( + "Could not parse label 'GPE' on line 4 during validation: Label 'GPE' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'." ) @@ -220,9 +219,18 @@ def test_ner_label_index_pos() -> None: runner = CliRunner() result = runner.invoke( validate, - ["--labels", "BIO", "--ner-label-index", "1", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")], + [ + "--labels", + "BIO", + "--label-index", + "1", + os.path.join("tests", "conll_annotation", "labels_not_last_col.bio"), + ], + ) + assert ( + result.output + == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n" ) - assert result.output == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n" assert result.exit_code == 0 @@ -230,9 +238,18 @@ def test_ner_label_index_neg() -> None: runner = CliRunner() result = runner.invoke( validate, - ["--labels", "BIO", "--ner-label-index", "-2", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")], + [ + "--labels", + "BIO", + "--label-index", + "-2", + os.path.join("tests", "conll_annotation", "labels_not_last_col.bio"), + ], + ) + assert ( + result.output + == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n" ) - assert result.output == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n" assert result.exit_code == 0 @@ -240,7 +257,15 @@ def test_ner_label_index_zero() -> None: runner = CliRunner() result = runner.invoke( validate, - ["--labels", "BIO", "--ner-label-index", "0", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")], + [ + "--labels", + "BIO", + "--label-index", + "0", + os.path.join("tests", "conll_annotation", "labels_not_last_col.bio"), + ], ) assert result.exit_code != 0 - assert "ner_label_index cannot be 0" in str(result.exception) + assert "Token index (0) and label index (0) cannot be the same" in str( + result.exception + ) From 0e2050f663f1842bc8581d761813a417b388d343 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Wed, 3 Jun 2026 12:13:10 -0400 Subject: [PATCH 15/33] Update contributors in README --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3f6c148..6fc280d 100644 --- a/README.md +++ b/README.md @@ -608,6 +608,7 @@ To install from a clone of this repository, use: # Contributors SeqScore was developed by the BLT Lab at Brandeis University under the -direction of PI and lead developer Constantine Lignos. Chester Palen-Michel -and Nolan Holley contributed to its development. Gordon Dou, Maya Kruse, and -Andrew Rueda gave feedback on its features and assisted in README writing. +direction of PI and lead developer Constantine Lignos. Chester +Palen-Michel, Nolan Holley, and Claire Wang contributed to its +development. Gordon Dou, Maya Kruse, and Andrew Rueda gave feedback +on its features and assisted in README writing. From 3e9b46bae506435b69748ec75d368b5838d49b17 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Wed, 3 Jun 2026 12:16:36 -0400 Subject: [PATCH 16/33] Drop Python 3.9 support in setup.py --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 90b2082..fbb7f4a 100755 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ def setup_package() -> None: packages=find_packages(include=("seqscore", "seqscore.*")), # Package type information package_data={"seqscore": ["py.typed"]}, - python_requires=">=3.9", + python_requires=">=3.10", license="MIT", description="SeqScore: Scoring for named entity recognition and other sequence labeling tasks", long_description=long_description, @@ -34,7 +34,6 @@ def setup_package() -> None: classifiers=[ "Development Status :: 4 - Beta", "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", From 208a933acb06db16b9f851b4d1d6a1887506934d Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Wed, 3 Jun 2026 12:19:43 -0400 Subject: [PATCH 17/33] Update year and contributors in license --- LICENSE | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 6e764b2..d0f5cdd 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ MIT License -Copyright (c) 2023 Constantine Lignos, Chester Palen-Michel, and Nolan Holley +Copyright (c) 2026 Constantine Lignos, Chester Palen-Michel, Nolan Holley, +and Claire Wang. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From f038db07f3b21b8977095b2626e1c8ca906a9cbf Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Wed, 3 Jun 2026 13:41:32 -0400 Subject: [PATCH 18/33] Fix aggregation of accuracy scores across files --- seqscore/conll.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqscore/conll.py b/seqscore/conll.py index 9b9fedd..24d863a 100644 --- a/seqscore/conll.py +++ b/seqscore/conll.py @@ -576,7 +576,7 @@ def score_conll_files( pred_docs, ref_docs, count_fp_fn_examples=error_counts ) all_class_scores.append(class_scores) - all_acc_scores.append(class_scores) + all_acc_scores.append(acc_scores) if error_counts: if multi_files: From 023e950b746ffff024bf536bea5786e80de48371 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Wed, 3 Jun 2026 13:42:58 -0400 Subject: [PATCH 19/33] Improve test coverage of conll.py and clean up some tests --- seqscore/conll.py | 24 ++-- tests/conll_annotation/bad_label1.bio | 5 - tests/test_conll_format.py | 125 +++++++++++++++--- .../bad_label1.bio} | 0 .../bad_label2.bio} | 2 +- tests/test_files/minimal_bad_docstart.bio | 19 +++ tests/test_files/minimal_docstart.bio | 19 +++ tests/test_files/minimal_no_delims.bio | 17 +++ tests/test_summarize_click.py | 25 ++++ tests/test_validation_click.py | 2 +- 10 files changed, 201 insertions(+), 37 deletions(-) delete mode 100644 tests/conll_annotation/bad_label1.bio rename tests/{conll_annotation/bad_label2.bio => test_files/bad_label1.bio} (100%) rename tests/{conll_annotation/bad_label3.bio => test_files/bad_label2.bio} (68%) create mode 100644 tests/test_files/minimal_bad_docstart.bio create mode 100644 tests/test_files/minimal_docstart.bio create mode 100644 tests/test_files/minimal_no_delims.bio diff --git a/seqscore/conll.py b/seqscore/conll.py index 24d863a..a6fb155 100644 --- a/seqscore/conll.py +++ b/seqscore/conll.py @@ -104,7 +104,7 @@ class CoNLLIngester: encoding: Encoding = attrib() line_spec: LineSpec = attrib() parse_comment_lines: bool = attrib(default=False, kw_only=True) - ignore_document_boundaries: bool = attrib(default=True, kw_only=True) + ignore_document_boundaries: bool = attrib(default=False, kw_only=True) def ingest( self, @@ -113,8 +113,8 @@ def ingest( repair: Optional[str], *, quiet: bool = False, - ) -> Iterable[list[LabeledSequence]]: - document_counter = 0 + ) -> list[list[LabeledSequence]]: + all_documents: list[list[LabeledSequence]] = [] document: list[LabeledSequence] = [] for source_sequence, comment in self._parse_file( @@ -128,8 +128,7 @@ def ingest( # We skip this if the builder is empty, which will happen for the very # first document in the corpus (as there is no previous document to end). if not self.ignore_document_boundaries and document: - document_counter += 1 - yield document + all_documents.append(document) document = [] continue @@ -219,10 +218,11 @@ def ingest( ) from e document.append(sequences) - # Yield final document if non-empty + # Add final document if non-empty if document: - document_counter += 1 - yield document + all_documents.append(document) + + return all_documents def validate( self, @@ -317,8 +317,8 @@ def _parse_file( # Skip document starts, but ensure sequence is empty when we reach them if token.is_docstart: if sequence: - raise ValueError( - f"Encountered DOCSTART at line {line_num} while still in sequence" + raise CoNLLFormatError( + f"Encountered {DOCSTART} at line {line_num} of {source_name} in the middle of a sequence" ) else: # Yield it by itself. Since the sequence variable is empty, leave it unchanged. @@ -341,7 +341,7 @@ def _check_sequence(sequence: Sequence[_CoNLLToken]) -> None: # get document boundaries as their own sequences. if sequence[0].is_docstart and len(sequence) > 1: raise ValueError( - f"Returned -DOCSTART- as part of a sequence at line {sequence[0].line_num}" + f"Returned {DOCSTART} as part of a sequence at line {sequence[0].line_num}" ) @@ -371,7 +371,7 @@ def ingest_conll_file( ignore_document_boundaries=ignore_document_boundaries, ) with open(input_path, encoding=file_encoding) as input_file: - docs = list(ingester.ingest(input_file, str(input_path), repair, quiet=quiet)) + docs = ingester.ingest(input_file, str(input_path), repair, quiet=quiet) return docs diff --git a/tests/conll_annotation/bad_label1.bio b/tests/conll_annotation/bad_label1.bio deleted file mode 100644 index dcda560..0000000 --- a/tests/conll_annotation/bad_label1.bio +++ /dev/null @@ -1,5 +0,0 @@ -This O -is -a O -sentence O -. O diff --git a/tests/test_conll_format.py b/tests/test_conll_format.py index f32506b..b5b3fa5 100644 --- a/tests/test_conll_format.py +++ b/tests/test_conll_format.py @@ -2,18 +2,26 @@ import pytest -from seqscore.conll import CoNLLFormatError, CoNLLIngester, LineSpec +from seqscore.conll import ( + DOCSTART, + CoNLLFormatError, + CoNLLIngester, + LineSpec, + _CoNLLToken, + ingest_conll_file, +) from seqscore.encoding import REPAIR_NONE, get_encoding from seqscore.validation import InvalidLabelError +BIO = get_encoding("BIO") +LINE_SPEC = LineSpec(0, -1) + def test_parse_comments_true() -> None: - mention_encoding = get_encoding("BIO") - line_spec = LineSpec(0, 1) - ingester = CoNLLIngester(mention_encoding, line_spec, parse_comment_lines=True) + ingester = CoNLLIngester(BIO, LINE_SPEC, parse_comment_lines=True) comments_path = Path("tests") / "test_files" / "minimal_comments.bio" with comments_path.open(encoding="utf8") as file: - documents = list(ingester.ingest(file, "test", REPAIR_NONE)) + documents = ingester.ingest(file, "test", REPAIR_NONE) assert len(documents) == 1 sequences = documents[0] @@ -32,15 +40,12 @@ def test_parse_comments_true() -> None: def test_parse_comments_false() -> None: - mention_encoding = get_encoding("BIO") - line_spec = LineSpec(0, 1) - ingester = CoNLLIngester(mention_encoding, line_spec) - + ingester = CoNLLIngester(BIO, LINE_SPEC) comments_path = Path("tests") / "test_files" / "minimal_comments_1.bio" with comments_path.open(encoding="utf8") as file: # err1 needs to not be reused below because the exception is a different type with pytest.raises(CoNLLFormatError) as err1: - list(ingester.ingest(file, "test", REPAIR_NONE)) + ingester.ingest(file, "test", REPAIR_NONE) assert ( str(err1.value) == "Line 1 of test does not appear to be delimited and begins with #. Perhaps you want to use the --parse-comment-lines flag? Line contents: '#'" @@ -49,27 +54,111 @@ def test_parse_comments_false() -> None: comments_path = Path("tests") / "test_files" / "minimal_comments_2.bio" with comments_path.open(encoding="utf8") as file: with pytest.raises(InvalidLabelError): - list(ingester.ingest(file, "test", REPAIR_NONE)) + ingester.ingest(file, "test", REPAIR_NONE) comments_path = Path("tests") / "test_files" / "minimal_comments_3.bio" with comments_path.open(encoding="utf8") as file: with pytest.raises(InvalidLabelError): - list(ingester.ingest(file, "test", REPAIR_NONE)) + ingester.ingest(file, "test", REPAIR_NONE) comments_path = Path("tests") / "test_files" / "minimal_comments_4.bio" with comments_path.open(encoding="utf8") as file: with pytest.raises(InvalidLabelError): - list(ingester.ingest(file, "test", REPAIR_NONE)) + ingester.ingest(file, "test", REPAIR_NONE) def test_invalid_token_leading_space() -> None: - mention_encoding = get_encoding("BIO") - line_spec = LineSpec(0, -1) - ingester = CoNLLIngester(mention_encoding, line_spec) - + ingester = CoNLLIngester(BIO, LINE_SPEC) path = Path("tests") / "test_files" / "minimal_bio_empty_token.txt" with path.open(encoding="utf8") as file: with pytest.raises(ValueError) as err: - list(ingester.ingest(file, "test", REPAIR_NONE)) + ingester.ingest(file, "test", REPAIR_NONE) assert str(err.value) == "Invalid token '' on line 9 of test" + + +def test_bad_docstart() -> None: + ingester = CoNLLIngester(BIO, LINE_SPEC) + path = Path("tests") / "test_files" / "minimal_bad_docstart.bio" + with path.open(encoding="utf8") as file: + with pytest.raises(CoNLLFormatError) as err: + ingester.ingest(file, str(path), REPAIR_NONE) + + assert ( + str(err.value) + == "Encountered -DOCSTART- at line 4 of tests/test_files/minimal_bad_docstart.bio in the middle of a sequence" + ) + + +def test_check_sequence() -> None: + tokens = [ + _CoNLLToken(DOCSTART, "O", True, 0, ()), + _CoNLLToken("Hello", "O", True, 0, ()), + ] + with pytest.raises(ValueError): + CoNLLIngester._check_sequence(tokens) + + +def test_no_delims() -> None: + ingester = CoNLLIngester(BIO, LINE_SPEC) + path = Path("tests") / "test_files" / "minimal_no_delims.bio" + with path.open(encoding="utf8") as file: + with pytest.raises(CoNLLFormatError) as err: + ingester.ingest(file, str(path), REPAIR_NONE) + + assert ( + str(err.value) + == "Line 1 of tests/test_files/minimal_no_delims.bio is not delimited by space or tab: 'ThisO'" + ) + + +def test_validate_with_docstart() -> None: + ingester = CoNLLIngester(BIO, LINE_SPEC, ignore_document_boundaries=False) + path = Path("tests") / "test_files" / "minimal_docstart.bio" + with path.open(encoding="utf8") as file: + ingester.validate( + file, + str(path), + ) + + +def test_repair_bad_name() -> None: + path = Path("tests") / "conll_annotation" / "minimal.bio" + with pytest.raises(ValueError) as err: + ingest_conll_file( + str(path), + "BIOES", + "UTF-8", + LINE_SPEC, + repair="conlleval", + ignore_document_boundaries=False, + parse_comment_lines=False, + ) + + assert str(err.value).startswith( + "Cannot repair mention encoding BIOES using method conlleval." + ) + + +def test_bad_label1() -> None: + ingester = CoNLLIngester(BIO, LINE_SPEC) + path = Path("tests") / "test_files" / "bad_label1.bio" + with path.open(encoding="utf8") as file: + with pytest.raises(InvalidLabelError) as err: + ingester.ingest(file, str(path), repair=REPAIR_NONE) + + assert str(err.value).startswith( + "Could not parse label 'GPE' on line 4 of tests/test_files/bad_label1.bio during validation" + ) + + +def test_bad_label2() -> None: + ingester = CoNLLIngester(BIO, LINE_SPEC) + path = Path("tests") / "test_files" / "bad_label2.bio" + with path.open(encoding="utf8") as file: + with pytest.raises(InvalidLabelError) as err: + ingester.ingest(file, str(path), repair=REPAIR_NONE) + + assert str(err.value).startswith( + "Could not parse label 'OUT' on line 1 of tests/test_files/bad_label2.bio during validation" + ) diff --git a/tests/conll_annotation/bad_label2.bio b/tests/test_files/bad_label1.bio similarity index 100% rename from tests/conll_annotation/bad_label2.bio rename to tests/test_files/bad_label1.bio diff --git a/tests/conll_annotation/bad_label3.bio b/tests/test_files/bad_label2.bio similarity index 68% rename from tests/conll_annotation/bad_label3.bio rename to tests/test_files/bad_label2.bio index 95d093e..e67150c 100644 --- a/tests/conll_annotation/bad_label3.bio +++ b/tests/test_files/bad_label2.bio @@ -1,5 +1,5 @@ This OUT is OUT a OUT -sentence OUT +sentence GPE . OUT diff --git a/tests/test_files/minimal_bad_docstart.bio b/tests/test_files/minimal_bad_docstart.bio new file mode 100644 index 0000000..3284f2c --- /dev/null +++ b/tests/test_files/minimal_bad_docstart.bio @@ -0,0 +1,19 @@ +-DOCSTART- O +This O +is O +-DOCSTART- O +a O +sentence O +. O + +University B-ORG +of I-ORG +Pennsylvania I-ORG +is O +in O +West B-LOC +Philadelphia I-LOC +, O +Pennsylvania B-LOC +. O + diff --git a/tests/test_files/minimal_docstart.bio b/tests/test_files/minimal_docstart.bio new file mode 100644 index 0000000..c46e330 --- /dev/null +++ b/tests/test_files/minimal_docstart.bio @@ -0,0 +1,19 @@ +-DOCSTART- O +This O +is O +a O +sentence O +. O + +-DOCSTART- O +University B-ORG +of I-ORG +Pennsylvania I-ORG +is O +in O +West B-LOC +Philadelphia I-LOC +, O +Pennsylvania B-LOC +. O + diff --git a/tests/test_files/minimal_no_delims.bio b/tests/test_files/minimal_no_delims.bio new file mode 100644 index 0000000..d86767e --- /dev/null +++ b/tests/test_files/minimal_no_delims.bio @@ -0,0 +1,17 @@ +ThisO +isO +aO +sentenceO +.O + +UniversityB-ORG +ofI-ORG +PennsylvaniaI-ORG +isO +inO +WestB-LOC +PhiladelphiaI-LOC +,O +PennsylvaniaB-LOC +.O + diff --git a/tests/test_summarize_click.py b/tests/test_summarize_click.py index 4088093..8a9dec8 100644 --- a/tests/test_summarize_click.py +++ b/tests/test_summarize_click.py @@ -77,6 +77,31 @@ def test_summarize_iob_twodoc() -> None: ) +def test_summarize_iob_twodoc_ignore_doc_boundaries() -> None: + runner = CliRunner() + result = runner.invoke( + summarize, + [ + "--labels", + "IOB", + "--ignore-document-boundaries", + os.path.join("tests", "conll_annotation", "minimal_fields.iob"), + ], + ) + assert result.exit_code == 0 + assert ( + result.output + == """File 'tests/conll_annotation/minimal_fields.iob' contains 1 document(s) and 2 sentences +Entity Type Count +------------- ------- +LOC 2 +ORG 1 +------------- ------- +TOTAL 3 +""" + ) + + def test_summarize_bio_twofiles() -> None: runner = CliRunner() result = runner.invoke( diff --git a/tests/test_validation_click.py b/tests/test_validation_click.py index 5cf201c..6f72a7d 100644 --- a/tests/test_validation_click.py +++ b/tests/test_validation_click.py @@ -207,7 +207,7 @@ def test_bad_label() -> None: runner = CliRunner() result = runner.invoke( validate, - ["--labels", "BIO", os.path.join("tests", "conll_annotation", "bad_label2.bio")], + ["--labels", "BIO", os.path.join("tests", "test_files", "bad_label1.bio")], ) assert result.exit_code != 0 assert str(result.exception).startswith( From 15b17a0d104fee54077175e9eeaf149c19cea26c Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Wed, 3 Jun 2026 14:02:53 -0400 Subject: [PATCH 20/33] Remove repair-specific deprecated file writing function --- seqscore/conll.py | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/seqscore/conll.py b/seqscore/conll.py index a6fb155..15f8edc 100644 --- a/seqscore/conll.py +++ b/seqscore/conll.py @@ -429,25 +429,9 @@ def repair_conll_file( parse_comment_lines=parse_comment_lines, quiet=quiet, ) - - output_docstart = len(docs) > 1 - - with open(output_file, "w", encoding=file_encoding) as file: - for doc in docs: - _write_doc_labels(doc, output_delim, file, output_docstart=output_docstart) - - -def _write_doc_labels( - doc: Sequence[LabeledSequence], delim: str, file: TextIO, *, output_docstart: bool -) -> None: - if output_docstart: - print(f"{DOCSTART}{delim}O", file=file) - print(file=file) - - for sequence in doc: - for token, label in sequence.tokens_with_labels(): - print(f"{token}{delim}{label}", file=file) - print(file=file) + write_docs_using_encoding( + docs, mention_encoding_name, file_encoding, output_delim, line_spec, output_file + ) def write_docs_using_encoding( From a888abcc01218348fcb72bd6f2a92b9ef8b9ed64 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Wed, 3 Jun 2026 14:41:50 -0400 Subject: [PATCH 21/33] Add LabeledSequence.from_tokens_and_labels utility method --- seqscore/model.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/seqscore/model.py b/seqscore/model.py index d554371..2921b50 100644 --- a/seqscore/model.py +++ b/seqscore/model.py @@ -1,6 +1,6 @@ from collections.abc import Iterable, Iterator, Sequence from itertools import repeat -from typing import Any, Optional, Union, overload +from typing import TYPE_CHECKING, Any, Optional, Union, overload from attr import Attribute, attrib, attrs @@ -10,6 +10,9 @@ validator_nonempty_str, ) +if TYPE_CHECKING: + from seqscore.encoding import Encoding # pragma: no cover + def _validator_nonnegative(_inst: Any, _attr: Attribute, value: Any) -> None: if value < 0: @@ -139,3 +142,14 @@ def span_tokens(self, span: Span) -> tuple[str, ...]: def mention_tokens(self, mention: Mention) -> tuple[str, ...]: return self.span_tokens(mention.span) + + @classmethod + def from_tokens_and_labels( + cls, + tokens: Sequence[str], + labels: Sequence[str], + encoding: "Encoding", + **kwargs: Any, + ) -> "LabeledSequence": + mentions = encoding.decode_labels(labels) + return cls(tokens, labels, mentions, **kwargs) From 2296b4c69811310e8aade3efa82aa1e130954131 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Wed, 3 Jun 2026 14:43:55 -0400 Subject: [PATCH 22/33] Improve test coverage of conll.py --- tests/test_conll_format.py | 48 ++++++++++++++--- tests/test_files/minimal_docstart1.bio | 21 ++++++++ ...mal_docstart.bio => minimal_docstart2.bio} | 0 tests/test_scoring_click.py | 54 +++++++++++++++++++ 4 files changed, 117 insertions(+), 6 deletions(-) create mode 100644 tests/test_files/minimal_docstart1.bio rename tests/test_files/{minimal_docstart.bio => minimal_docstart2.bio} (100%) diff --git a/tests/test_conll_format.py b/tests/test_conll_format.py index b5b3fa5..2b19a71 100644 --- a/tests/test_conll_format.py +++ b/tests/test_conll_format.py @@ -9,8 +9,11 @@ LineSpec, _CoNLLToken, ingest_conll_file, + write_docs_using_encoding, ) from seqscore.encoding import REPAIR_NONE, get_encoding +from seqscore.model import LabeledSequence +from seqscore.util import file_fields_match from seqscore.validation import InvalidLabelError BIO = get_encoding("BIO") @@ -114,12 +117,15 @@ def test_no_delims() -> None: def test_validate_with_docstart() -> None: ingester = CoNLLIngester(BIO, LINE_SPEC, ignore_document_boundaries=False) - path = Path("tests") / "test_files" / "minimal_docstart.bio" - with path.open(encoding="utf8") as file: - ingester.validate( - file, - str(path), - ) + # Check two variants, one with docstart in its own sentence and another with + # docstart at the start of the sentence + for filename in ("minimal_docstart1.bio", "minimal_docstart2.bio"): + path = Path("tests") / "test_files" / filename + with path.open(encoding="utf8") as file: + ingester.validate( + file, + str(path), + ) def test_repair_bad_name() -> None: @@ -152,6 +158,36 @@ def test_bad_label1() -> None: ) +def test_write_docs_no_orig_fields(tmp_path: Path) -> None: + sent1 = LabeledSequence( + tokens=("This", "is", "a", "sentence", "."), + labels=("O", "O", "O", "O", "O"), + mentions=(), + ) + sent2 = LabeledSequence.from_tokens_and_labels( + ( + "University", + "of", + "Pennsylvania", + "is", + "in", + "West", + "Philadelphia", + ",", + "Pennsylvania", + ".", + ), + ("B-ORG", "I-ORG", "I-ORG", "O", "O", "B-LOC", "I-LOC", "O", "B-LOC", "O"), + BIO, + ) + docs = [[sent1], [sent2]] + output_file = tmp_path / "out.bio" + write_docs_using_encoding(docs, "BIO", "utf-8", "\t", LINE_SPEC, output_file) + assert file_fields_match( + output_file, Path("tests") / "test_files" / "minimal_docstart1.bio", debug=True + ) + + def test_bad_label2() -> None: ingester = CoNLLIngester(BIO, LINE_SPEC) path = Path("tests") / "test_files" / "bad_label2.bio" diff --git a/tests/test_files/minimal_docstart1.bio b/tests/test_files/minimal_docstart1.bio new file mode 100644 index 0000000..cc0b7e0 --- /dev/null +++ b/tests/test_files/minimal_docstart1.bio @@ -0,0 +1,21 @@ +-DOCSTART- O + +This O +is O +a O +sentence O +. O + +-DOCSTART- O + +University B-ORG +of I-ORG +Pennsylvania I-ORG +is O +in O +West B-LOC +Philadelphia I-LOC +, O +Pennsylvania B-LOC +. O + diff --git a/tests/test_files/minimal_docstart.bio b/tests/test_files/minimal_docstart2.bio similarity index 100% rename from tests/test_files/minimal_docstart.bio rename to tests/test_files/minimal_docstart2.bio diff --git a/tests/test_scoring_click.py b/tests/test_scoring_click.py index 69b2573..c336f9b 100644 --- a/tests/test_scoring_click.py +++ b/tests/test_scoring_click.py @@ -27,6 +27,60 @@ def test_score_correct_labels() -> None: assert "ORG\t100.00\t100.00\t100.00\t1\t1\t1" in result.output +def test_score_incorrect_default_format() -> None: + runner = CliRunner() + result = runner.invoke( + score, + [ + "--labels", + "BIO", + "--reference", + os.path.join("tests", "conll_annotation", "minimal.bio"), + os.path.join("tests", "conll_predictions", "incorrect1.bio"), + ], + ) + assert result.exit_code == 0 + assert ( + "| ALL | 50.00 | 66.67 | 57.14 | 3 | 4 | 2 |" + in result.output + ) + assert ( + "| LOC | 33.33 | 50.00 | 40.00 | 2 | 3 | 1 |" + in result.output + ) + assert ( + "| ORG | 100.00 | 100.00 | 100.00 | 1 | 1 | 1 |" + in result.output + ) + + +def test_score_incorrect_conlleval_format() -> None: + runner = CliRunner() + result = runner.invoke( + score, + [ + "--labels", + "BIO", + "--reference", + os.path.join("tests", "conll_annotation", "minimal.bio"), + "--score-format", + "conlleval", + os.path.join("tests", "conll_predictions", "incorrect1.bio"), + ], + ) + assert result.exit_code == 0 + assert ( + "processed 15 tokens with 3 phrases; found: 4 phrases; correct: 2." + in result.output + ) + assert ( + "accuracy: 93.33%; precision: 50.00%; recall: 66.67%; FB1: 57.14" + in result.output + ) + assert "LOC: precision: 33.33%; recall: 50.00%; FB1: 40.00 3" in result.output + assert "ORG: precision: 100.00%; recall: 100.00%; FB1: 100.00 1" in result.output + + def test_score_invalid_sequence_conlleval() -> None: runner = CliRunner() result = runner.invoke( From ccf3172e0d6148f4e78cbf25abd4965c928e44a8 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Wed, 3 Jun 2026 15:02:44 -0400 Subject: [PATCH 23/33] Change multi-file scoring to compute std. error and allow pretty output --- seqscore/conll.py | 85 +++++++++++++------ seqscore/scripts/seqscore.py | 4 +- .../incorrect1_nopredictions.bio | 30 +++---- tests/test_scoring_click.py | 70 ++++++++++++++- 4 files changed, 144 insertions(+), 45 deletions(-) diff --git a/seqscore/conll.py b/seqscore/conll.py index 15f8edc..6c75ecc 100644 --- a/seqscore/conll.py +++ b/seqscore/conll.py @@ -2,6 +2,7 @@ from collections import Counter, defaultdict from collections.abc import Iterable, Sequence from itertools import chain +from math import sqrt from statistics import mean, stdev from typing import ( Any, @@ -38,6 +39,8 @@ FORMAT_DELIM = "delim" SUPPORTED_SCORE_FORMATS = (FORMAT_PRETTY, FORMAT_CONLLEVAL, FORMAT_DELIM) +ALL_TYPES = "ALL" + class CoNLLFormatError(Exception): pass @@ -611,7 +614,8 @@ def score_conll_files( elif output_format in (FORMAT_PRETTY, FORMAT_DELIM): header, rows = format_output_table(class_scores, full_precision) if output_format == FORMAT_PRETTY: - # TODO: Should we raise an error for pretty output with full precision specified? + if full_precision: + raise ValueError("Cannot use full_precision with pretty formatting") # We don't allow full_precision in this case so we can use the usual float format score_summaries.append( tabulate(rows, header, tablefmt="github", floatfmt="6.2f") @@ -640,31 +644,33 @@ def score_conll_files( else: raise ValueError(f"Unrecognized output format: {output_format}") + # Compute summary statistics across files when multiple files are scored + if multi_files: + type_scores: DefaultDict[str, list] = defaultdict(list) + for class_score in all_class_scores: + for entity_type, entity_score in class_score.type_scores.items(): + type_scores[entity_type].append(entity_score.f1) + + entity_type_means = { + entity_type: mean(scores) for entity_type, scores in type_scores.items() + } + entity_type_means[ALL_TYPES] = mean(score.f1 for score in all_class_scores) + + entity_type_stderrs = { + entity_type: stdev(scores) / sqrt(len(scores)) + for entity_type, scores in type_scores.items() + } + all_f1s = [score.f1 for score in all_class_scores] + entity_type_stderrs[ALL_TYPES] = stdev(all_f1s) / sqrt(len(all_f1s)) + # For delimited, just join all the rows if output_format == FORMAT_DELIM: if multi_files: - # Compute summary statistics - type_scores: DefaultDict[str, list] = defaultdict(list) - for class_score in all_class_scores: - for entity_type, entity_score in class_score.type_scores.items(): - type_scores[entity_type].append(entity_score.f1) - - entity_type_means = { - entity_type: mean(scores) for entity_type, scores in type_scores.items() - } - entity_type_means["ALL"] = mean(score.f1 for score in all_class_scores) - # TODO: This should be standard error of the mean, not standard deviation - entity_type_sds = { - entity_type: stdev(scores) for entity_type, scores in type_scores.items() - } - entity_type_sds["ALL"] = stdev(score.f1 for score in all_class_scores) - - for entity_type, num in entity_type_sds.items(): + for entity_type, num in entity_type_stderrs.items(): score_summaries.append( - # TODO: Change SD precision _join_delim( [ - "SD", + "SE", entity_type, "NA", "NA", @@ -676,7 +682,6 @@ def score_conll_files( delim, ) ) - # Add aggregates for entity_type, num in entity_type_means.items(): score_summaries.append( _join_delim( @@ -698,8 +703,7 @@ def score_conll_files( if not multi_files: print(score_summaries[0]) else: - # TODO: Sort out aggregates here? - # Index because we care about when we're at the last entry + # Use the index because we care whether we're at the last entry for idx, (filename, summary) in enumerate(zip(pred_files, score_summaries)): print(filename) print(summary) @@ -707,6 +711,39 @@ def score_conll_files( if idx != len(pred_files) - 1: print() + # Print mean ± SE summary table + ref_scores = all_class_scores[0] + summary_header = ["Type", "Mean F1", "SE", "Reference"] + summary_rows = [ + [ + ALL_TYPES, + entity_type_means[ALL_TYPES] * 100, + entity_type_stderrs[ALL_TYPES] * 100, + ref_scores.total_ref, + ] + ] + for entity_type in sorted(entity_type_means): + if entity_type == ALL_TYPES: + continue + summary_rows.append( + [ + entity_type, + entity_type_means[entity_type] * 100, + entity_type_stderrs[entity_type] * 100, + ref_scores.type_scores[entity_type].total_ref, + ] + ) + print() + print("Summary") + print( + tabulate( + summary_rows, + summary_header, + tablefmt="github", + floatfmt="6.2f", + ) + ) + def format_output_conlleval( class_scores: ClassificationScore, @@ -757,7 +794,7 @@ def format_output_table( ] rows = [ [ - "ALL", + ALL_TYPES, convert_score(class_scores.precision, full_precision), convert_score(class_scores.recall, full_precision), convert_score(class_scores.f1, full_precision), diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py index c2e9265..2c24334 100644 --- a/seqscore/scripts/seqscore.py +++ b/seqscore/scripts/seqscore.py @@ -476,7 +476,7 @@ def summarize( @click.option( "--full-precision", is_flag=True, - help="whether to output floating values at full precision instead of rounding half even at two decimal places", + help="whether to output floating values at full precision instead of multiplying by 100 and rounding half even at two decimal places", ) @_quiet_option() def score( @@ -505,7 +505,7 @@ def score( raise ValueError(f"Can only use full-precision with score-format {FORMAT_DELIM}") if error_counts and len(file) > 1: - raise ValueError("Cannot use error-counts with multiple files to be scored") + raise click.UsageError("Cannot use error-counts with multiple files to be scored") delim = _normalize_tab(delim) diff --git a/tests/conll_predictions/incorrect1_nopredictions.bio b/tests/conll_predictions/incorrect1_nopredictions.bio index 8146bb7..6a93472 100644 --- a/tests/conll_predictions/incorrect1_nopredictions.bio +++ b/tests/conll_predictions/incorrect1_nopredictions.bio @@ -1,16 +1,16 @@ -This O O -is O O -a O O -sentence O O -. O O +This O +is O +a O +sentence O +. O -University B-ORG O -of I-ORG O -Pennsylvania I-ORG O -is O O -in O O -West B-LOC O -Philadelphia I-LOC O -, O O -Pennsylvania B-LOC O -. O O +University O +of O +Pennsylvania O +is O +in O +West O +Philadelphia O +, O +Pennsylvania O +. O diff --git a/tests/test_scoring_click.py b/tests/test_scoring_click.py index c336f9b..5ee7c9a 100644 --- a/tests/test_scoring_click.py +++ b/tests/test_scoring_click.py @@ -1,4 +1,3 @@ -import glob import os from click.testing import CliRunner @@ -232,9 +231,72 @@ def test_score_multiple_files() -> None: os.path.join("tests", "conll_annotation", "minimal.bio"), "--score-format", "delim", - ] - + glob.glob(os.path.join("tests", "conll_predictions", "*1.bio")), + os.path.join("tests", "conll_predictions", "correct1.bio"), + os.path.join("tests", "conll_predictions", "incorrect1.bio"), + ], ) assert result.exit_code == 0 - assert "SD\tALL\tNA\tNA\t30.30\tNA\tNA\tNA" in result.output + assert "SE\tALL\tNA\tNA\t21.43\tNA\tNA\tNA" in result.output assert "Mean\tALL\tNA\tNA\t78.57\tNA\tNA\tNA" in result.output + + +def test_score_multiple_files_pretty() -> None: + runner = CliRunner() + result = runner.invoke( + score, + [ + "--labels", + "BIO", + "--reference", + os.path.join("tests", "conll_annotation", "minimal.bio"), + os.path.join("tests", "conll_predictions", "correct1.bio"), + os.path.join("tests", "conll_predictions", "incorrect1.bio"), + ], + ) + assert result.exit_code == 0 + assert os.path.join("tests", "conll_predictions", "correct1.bio") in result.output + assert os.path.join("tests", "conll_predictions", "incorrect1.bio") in result.output + assert "Summary" in result.output + assert "| ALL | 78.57 | 21.43 | 3 |" in result.output + assert "| LOC | 70.00 | 30.00 | 2 |" in result.output + assert "| ORG | 100.00 | 0.00 | 1 |" in result.output + + +def test_score_error_counts_multiple_files() -> None: + # Cannot use error-counts with multiple files + runner = CliRunner() + result = runner.invoke( + score, + [ + "--labels", + "BIO", + "--reference", + os.path.join("tests", "conll_annotation", "minimal.bio"), + os.path.join("tests", "conll_predictions", "correct1.bio"), + os.path.join("tests", "conll_predictions", "incorrect1.bio"), + "--error-counts", + ], + ) + assert result.exit_code != 0 + assert "Cannot use error-counts with multiple files to be scored" in result.output + + +def test_score_error_counts_conlleval_format() -> None: + # Cannot use error-counts with conlleval format + runner = CliRunner() + result = runner.invoke( + score, + [ + "--labels", + "BIO", + "--reference", + os.path.join("tests", "conll_annotation", "minimal.bio"), + "--score-format", + "conlleval", + os.path.join("tests", "conll_predictions", "correct1.bio"), + os.path.join("tests", "conll_predictions", "incorrect1.bio"), + "--error-counts", + ], + ) + assert result.exit_code != 0 + assert "Cannot use error-counts with multiple files to be scored" in result.output From 8dbd007326ccf61c0aaba2b4e1e70a7025e18ec1 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Wed, 3 Jun 2026 15:47:57 -0400 Subject: [PATCH 24/33] Improve test coverage of conll.py --- tests/test_conll_scoring.py | 69 +++++++++++++++++++++++++++++++++++++ tests/test_scoring_click.py | 42 ++++++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 tests/test_conll_scoring.py diff --git a/tests/test_conll_scoring.py b/tests/test_conll_scoring.py new file mode 100644 index 0000000..1aaabf1 --- /dev/null +++ b/tests/test_conll_scoring.py @@ -0,0 +1,69 @@ +import os + +import pytest + +from seqscore.conll import ( + FORMAT_CONLLEVAL, + FORMAT_DELIM, + FORMAT_PRETTY, + LineSpec, + score_conll_files, +) + +REFERENCE = os.path.join("tests", "conll_annotation", "minimal.bio") +CORRECT1 = os.path.join("tests", "conll_predictions", "correct1.bio") +INCORRECT1 = os.path.join("tests", "conll_predictions", "incorrect1.bio") + + +def _score( + pred_files: list[str], + output_format: str, + error_counts: bool = False, + full_precision: bool = False, +) -> None: + score_conll_files( + pred_files, + REFERENCE, + mention_encoding_name="BIO", + repair=None, + file_encoding="utf-8", + line_spec=LineSpec(0, -1), + ignore_document_boundaries=False, + parse_comment_lines=False, + delim="\t", + output_format=output_format, + error_counts=error_counts, + full_precision=full_precision, + ) + + +def test_score_error_counts_multiple_files() -> None: + with pytest.raises( + ValueError, + match="Outputting error counts is only available for a single prediction file", + ): + _score([CORRECT1, INCORRECT1], FORMAT_DELIM, error_counts=True) + + +def test_score_error_counts_conlleval_format() -> None: + with pytest.raises( + ValueError, + match=f"Format {repr(FORMAT_CONLLEVAL)} is not supported with error counts", + ): + _score([CORRECT1], FORMAT_CONLLEVAL, error_counts=True) + + +def test_score_full_precision_pretty_format() -> None: + with pytest.raises( + ValueError, + match="Cannot use full_precision with pretty formatting", + ): + _score([CORRECT1], FORMAT_PRETTY, full_precision=True) + + +def test_score_unrecognized_format() -> None: + with pytest.raises( + ValueError, + match="Unrecognized output format: bogus", + ): + _score([CORRECT1], "bogus") diff --git a/tests/test_scoring_click.py b/tests/test_scoring_click.py index 5ee7c9a..0ef943a 100644 --- a/tests/test_scoring_click.py +++ b/tests/test_scoring_click.py @@ -262,6 +262,48 @@ def test_score_multiple_files_pretty() -> None: assert "| ORG | 100.00 | 0.00 | 1 |" in result.output +def test_score_error_counts_single_file() -> None: + runner = CliRunner() + result = runner.invoke( + score, + [ + "--labels", + "BIO", + "--reference", + os.path.join("tests", "conll_annotation", "minimal.bio"), + "--error-counts", + os.path.join("tests", "conll_predictions", "incorrect1.bio"), + ], + ) + assert result.exit_code == 0 + assert "| Count | Error | Type | Tokens |" in result.output + assert "| 1 | FP | LOC | West |" in result.output + assert "| 1 | FP | LOC | Philadelphia |" in result.output + assert "| 1 | FN | LOC | West Philadelphia |" in result.output + + +def test_score_error_counts_delim_format() -> None: + runner = CliRunner() + result = runner.invoke( + score, + [ + "--labels", + "BIO", + "--reference", + os.path.join("tests", "conll_annotation", "minimal.bio"), + "--score-format", + "delim", + "--error-counts", + os.path.join("tests", "conll_predictions", "incorrect1.bio"), + ], + ) + assert result.exit_code == 0 + assert "Count\tError\tType\tTokens" in result.output + assert "1\tFP\tLOC\tWest" in result.output + assert "1\tFP\tLOC\tPhiladelphia" in result.output + assert "1\tFN\tLOC\tWest Philadelphia" in result.output + + def test_score_error_counts_multiple_files() -> None: # Cannot use error-counts with multiple files runner = CliRunner() From 105f83738046b99f7f3d7974a00f020d9c479e08 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Wed, 3 Jun 2026 16:33:31 -0400 Subject: [PATCH 25/33] Improve test coverage of seqscore.py --- seqscore/scripts/seqscore.py | 46 ++-- .../correct1_improper_sequence.bio | 16 -- .../correct1_improper_sequence_ref.txt | 16 -- tests/test_conversion_click.py | 127 ++++++++-- tests/test_files/map_empty_key.json | 3 + tests/test_files/map_empty_value.json | 3 + tests/test_files/map_invalid_json.json | 1 + tests/test_files/map_not_dict.json | 1 + tests/test_files/map_outside_key.json | 3 + tests/test_files/map_outside_value.json | 3 + ...pace_delim.txt => minimal_space_delim.txt} | 0 tests/test_process_click.py | 236 +++++++++++++++++- tests/test_scoring_click.py | 50 +++- tests/test_utils.py | 4 +- 14 files changed, 416 insertions(+), 93 deletions(-) delete mode 100644 tests/conll_predictions/correct1_improper_sequence.bio delete mode 100644 tests/conll_predictions/correct1_improper_sequence_ref.txt create mode 100644 tests/test_files/map_empty_key.json create mode 100644 tests/test_files/map_empty_value.json create mode 100644 tests/test_files/map_invalid_json.json create mode 100644 tests/test_files/map_not_dict.json create mode 100644 tests/test_files/map_outside_key.json create mode 100644 tests/test_files/map_outside_value.json rename tests/test_files/{space_delim.txt => minimal_space_delim.txt} (100%) diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py index 2c24334..053cab2 100644 --- a/seqscore/scripts/seqscore.py +++ b/seqscore/scripts/seqscore.py @@ -121,15 +121,6 @@ def _output_delim_option() -> Callable: ) -def _ner_label_index_option() -> Callable: - return click.option( - "--ner-label-index", - default=-1, - show_default=True, - type=int, - ) - - def _quiet_option() -> Callable: return click.option( "--quiet", @@ -205,7 +196,9 @@ def repair( ) -> None: output_delim = _normalize_tab(output_delim) if repair_method == REPAIR_NONE: - raise ValueError(f"Cannot repair with repair strategy {repr(repair_method)}") + raise click.UsageError( + f"Cannot repair with repair strategy {repr(repair_method)}" + ) line_spec = LineSpec(token_index, label_index) repair_conll_file( @@ -300,10 +293,10 @@ def process( type_map_dict: dict[str, list[str]] = _load_type_map(type_map, file_encoding) if keep_types_set and remove_types_set: - raise ValueError("Cannot specify both keep-types and remove-types") + raise click.UsageError("Cannot specify both keep-types and remove-types") if not keep_types_set and not remove_types_set and not type_map: - raise ValueError( + raise click.UsageError( "Must specify at least one of keep-types, remove-types, or type-map" ) @@ -316,7 +309,10 @@ def process( parse_comment_lines=parse_comment_lines, ) - mod_docs = modify_types(docs, keep_types_set, remove_types_set, type_map_dict) + try: + mod_docs = modify_types(docs, keep_types_set, remove_types_set, type_map_dict) + except ValueError as err: + raise click.UsageError(str(err)) from err write_docs_using_encoding( mod_docs, labels, file_encoding, output_delim, line_spec, output_file @@ -502,7 +498,9 @@ def score( repair_method = None if full_precision and score_format != FORMAT_DELIM: - raise ValueError(f"Can only use full-precision with score-format {FORMAT_DELIM}") + raise click.UsageError( + f"Can only use full-precision with score-format {FORMAT_DELIM}" + ) if error_counts and len(file) > 1: raise click.UsageError("Cannot use error-counts with multiple files to be scored") @@ -580,7 +578,7 @@ def _parse_type_list(types: str) -> set[str]: # Check for outside type for entity_type in split_types: if entity_type == DEFAULT_OUTSIDE: - raise ValueError( + raise click.UsageError( f"Cannot specify the outside type {DEFAULT_OUTSIDE} in keep/remove types" ) return set(split_types) @@ -596,40 +594,42 @@ def _load_type_map( with open(type_map_path, encoding=file_encoding) as file: type_map = json.load(file) except FileNotFoundError as err: - raise ValueError(f"Could not open type map file {repr(type_map_path)}") from err + raise click.UsageError( + f"Could not open type map file {repr(type_map_path)}" + ) from err except json.decoder.JSONDecodeError as err: - raise ValueError( + raise click.UsageError( f"Type map provided in file {repr(type_map_path)} is not valid JSON" ) from err # Validate types if not isinstance(type_map, dict): - raise ValueError( + raise click.UsageError( f"Type map provided in file {repr(type_map_path)} is not a dictionary" ) for from_type, to_types in type_map.items(): if not isinstance(from_type, str) or not from_type: - raise ValueError( + raise click.UsageError( f"Key {repr(from_type)} in type map {repr(type_map_path)} is not a non-empty string" ) if from_type == DEFAULT_OUTSIDE: - raise ValueError( + raise click.UsageError( f"Key {repr(from_type)} in type map {repr(type_map_path)} is the outside type {DEFAULT_OUTSIDE}" ) if not isinstance(to_types, list): - raise ValueError( + raise click.UsageError( f"Value {repr(to_types)} in type map {repr(type_map_path)} is not a list" ) for to_type in to_types: if not isinstance(to_type, str) or not to_type: - raise ValueError( + raise click.UsageError( f"Value {repr(to_type)} in type map {repr(type_map_path)} is not a non-empty string" ) if to_type == DEFAULT_OUTSIDE: - raise ValueError( + raise click.UsageError( f"Value {repr(to_type)} in type map {repr(type_map_path)} is the outside type {DEFAULT_OUTSIDE}" ) diff --git a/tests/conll_predictions/correct1_improper_sequence.bio b/tests/conll_predictions/correct1_improper_sequence.bio deleted file mode 100644 index ecc2f1a..0000000 --- a/tests/conll_predictions/correct1_improper_sequence.bio +++ /dev/null @@ -1,16 +0,0 @@ -This O O -is O O -a O O -sentence O O -. O O - -University B-ORG I-ORG -of I-ORG I-ORG -Pennsylvania I-ORG I-ORG -is O O -in O O -West B-LOC B-LOC -Philadelphia I-LOC I-LOC -, O O -Pennsylvania B-LOC B-LOC -. O O diff --git a/tests/conll_predictions/correct1_improper_sequence_ref.txt b/tests/conll_predictions/correct1_improper_sequence_ref.txt deleted file mode 100644 index 2ff8768..0000000 --- a/tests/conll_predictions/correct1_improper_sequence_ref.txt +++ /dev/null @@ -1,16 +0,0 @@ -This O -is O -a O -sentence O -. O - -University B-ORG -of I-ORG -Pennsylvania I-ORG -is O -in O -West B-LOC -Philadelphia I-LOC -, O -Pennsylvania B-LOC -. O diff --git a/tests/test_conversion_click.py b/tests/test_conversion_click.py index b41a32b..b827d0c 100644 --- a/tests/test_conversion_click.py +++ b/tests/test_conversion_click.py @@ -5,7 +5,7 @@ from click.testing import CliRunner from seqscore.scripts.seqscore import convert -from seqscore.util import file_fields_match +from seqscore.util import file_fields_match, file_lines_match TMP_DIR: Optional[tempfile.TemporaryDirectory] = None @@ -23,6 +23,7 @@ def teardown_module() -> None: def test_invalid_conversion_BIO() -> None: runner = CliRunner() + output_path = os.path.join(TMP_DIR.name, "temp.txt") result = runner.invoke( convert, [ @@ -31,7 +32,7 @@ def test_invalid_conversion_BIO() -> None: "--output-labels", "BIOES", os.path.join("tests", "conll_annotation", "invalid1.bio"), - os.path.join(TMP_DIR.name, "temp.txt"), + output_path, ], ) assert result.exit_code != 0 @@ -39,6 +40,7 @@ def test_invalid_conversion_BIO() -> None: def test_invalid_conversion_BIOES() -> None: runner = CliRunner() + output_path = os.path.join(TMP_DIR.name, "temp.txt") result = runner.invoke( convert, [ @@ -47,7 +49,7 @@ def test_invalid_conversion_BIOES() -> None: "--output-labels", "BIO", os.path.join("tests", "conll_annotation", "invalid1.bioes"), - os.path.join(TMP_DIR.name, "temp.txt"), + output_path, ], ) assert result.exit_code != 0 @@ -55,6 +57,7 @@ def test_invalid_conversion_BIOES() -> None: def test_BIO_to_BIOES() -> None: runner = CliRunner() + output_path = os.path.join(TMP_DIR.name, "BIOtoBIOES.txt") result = runner.invoke( convert, [ @@ -63,18 +66,19 @@ def test_BIO_to_BIOES() -> None: "--output-labels", "BIOES", os.path.join("tests", "conll_annotation", "minimal.bio"), - os.path.join(TMP_DIR.name, "BIOtoBIOES.txt"), + output_path, ], ) assert result.exit_code == 0 assert file_fields_match( - os.path.join(TMP_DIR.name, "BIOtoBIOES.txt"), + output_path, os.path.join("tests", "conll_annotation", "minimal.bioes"), ) def test_BIOES_to_BIO() -> None: runner = CliRunner() + output_path = os.path.join(TMP_DIR.name, "BIOEStoBIO.txt") result = runner.invoke( convert, [ @@ -83,18 +87,19 @@ def test_BIOES_to_BIO() -> None: "--output-labels", "BIO", os.path.join("tests", "conll_annotation", "minimal.bioes"), - os.path.join(TMP_DIR.name, "BIOEStoBIO.txt"), + output_path, ], ) assert result.exit_code == 0 assert file_fields_match( - os.path.join(TMP_DIR.name, "BIOEStoBIO.txt"), + output_path, os.path.join("tests", "conll_annotation", "minimal.bio"), ) def test_BIO_to_IO() -> None: runner = CliRunner() + output_path = os.path.join(TMP_DIR.name, "BIOtoIO.txt") result = runner.invoke( convert, [ @@ -103,18 +108,19 @@ def test_BIO_to_IO() -> None: "--output-labels", "IO", os.path.join("tests", "conll_annotation", "minimal.bio"), - os.path.join(TMP_DIR.name, "BIOtoIO.txt"), + output_path, ], ) assert result.exit_code == 0 assert file_fields_match( - os.path.join(TMP_DIR.name, "BIOtoIO.txt"), + output_path, os.path.join("tests", "conll_annotation", "minimal.io"), ) def test_IO_to_BIO() -> None: runner = CliRunner() + output_path = os.path.join(TMP_DIR.name, "IOtoBIO.txt") result = runner.invoke( convert, [ @@ -123,19 +129,20 @@ def test_IO_to_BIO() -> None: "--output-labels", "BIO", os.path.join("tests", "conll_annotation", "minimal.io"), - os.path.join(TMP_DIR.name, "IOtoBIO.txt"), + output_path, ], ) assert result.exit_code == 0 # conversion will not necessarily reproduce BIO correctly but does in this case assert file_fields_match( - os.path.join(TMP_DIR.name, "IOtoBIO.txt"), + output_path, os.path.join("tests", "conll_annotation", "minimal.bio"), ) def test_BIO_to_IOB_fields() -> None: runner = CliRunner() + output_path = os.path.join(TMP_DIR.name, "BIOtoIOB.txt") result = runner.invoke( convert, [ @@ -144,18 +151,19 @@ def test_BIO_to_IOB_fields() -> None: "--output-labels", "IOB", os.path.join("tests", "conll_annotation", "minimal_fields.bio"), - os.path.join(TMP_DIR.name, "BIOtoIOB.txt"), + output_path, ], ) assert result.exit_code == 0 assert file_fields_match( - os.path.join(TMP_DIR.name, "BIOtoIOB.txt"), + output_path, os.path.join("tests", "conll_annotation", "minimal_fields.iob"), ) def test_IOB_to_BIO_fields() -> None: runner = CliRunner() + output_path = os.path.join(TMP_DIR.name, "IOBtoBIO.txt") result = runner.invoke( convert, [ @@ -164,18 +172,19 @@ def test_IOB_to_BIO_fields() -> None: "--output-labels", "BIO", os.path.join("tests", "conll_annotation", "minimal_fields.iob"), - os.path.join(TMP_DIR.name, "IOBtoBIO.txt"), + output_path, ], ) assert result.exit_code == 0 assert file_fields_match( - os.path.join(TMP_DIR.name, "IOBtoBIO.txt"), + output_path, os.path.join("tests", "conll_annotation", "minimal_fields.bio"), ) def test_IOB_to_BIO_fields_and_specified_indices() -> None: runner = CliRunner() + output_path = os.path.join(TMP_DIR.name, "labels_not_last_col.bioes") result = runner.invoke( convert, [ @@ -186,18 +195,19 @@ def test_IOB_to_BIO_fields_and_specified_indices() -> None: "--label-index", "1", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio"), - os.path.join(TMP_DIR.name, "labels_not_last_col.bioes"), + output_path, ], ) assert result.exit_code == 0 assert file_fields_match( - os.path.join(TMP_DIR.name, "labels_not_last_col.bioes"), + output_path, os.path.join("tests", "conll_annotation", "labels_not_last_col.bioes"), ) def test_IO_to_BIOES() -> None: runner = CliRunner() + output_path = os.path.join(TMP_DIR.name, "IOtoBIOES.txt") result = runner.invoke( convert, [ @@ -206,19 +216,20 @@ def test_IO_to_BIOES() -> None: "--output-labels", "BIOES", os.path.join("tests", "conll_annotation", "minimal.io"), - os.path.join(TMP_DIR.name, "IOtoBIOES.txt"), + output_path, ], ) assert result.exit_code == 0 # conversion will not necessarily reproduce BIOES correctly but does in this case assert file_fields_match( - os.path.join(TMP_DIR.name, "IOtoBIOES.txt"), + output_path, os.path.join("tests", "conll_annotation", "minimal.bioes"), ) def test_BIOES_to_IO() -> None: runner = CliRunner() + output_path = os.path.join(TMP_DIR.name, "BIOEStoIO.txt") result = runner.invoke( convert, [ @@ -227,18 +238,88 @@ def test_BIOES_to_IO() -> None: "--output-labels", "IO", os.path.join("tests", "conll_annotation", "minimal.bioes"), - os.path.join(TMP_DIR.name, "BIOEStoIO.txt"), + output_path, ], ) assert result.exit_code == 0 assert file_fields_match( - os.path.join(TMP_DIR.name, "BIOEStoIO.txt"), + output_path, os.path.join("tests", "conll_annotation", "minimal.io"), ) +def test_BIO_to_BIO_space_delim() -> None: + runner = CliRunner() + output_path = os.path.join(TMP_DIR.name, "BIOtoBIO_space.txt") + result = runner.invoke( + convert, + [ + "--input-labels", + "BIO", + "--output-labels", + "BIO", + "--output-delim", + " ", + os.path.join("tests", "conll_annotation", "minimal.bio"), + output_path, + ], + ) + assert result.exit_code == 0 + assert file_lines_match( + output_path, + os.path.join("tests", "test_files", "minimal_space_delim.txt"), + ) + + +def test_BIO_to_BIO_tab_spelled_out() -> None: + runner = CliRunner() + output_path = os.path.join(TMP_DIR.name, "BIOtoBIO_tab_spelled_out.txt") + result = runner.invoke( + convert, + [ + "--input-labels", + "BIO", + "--output-labels", + "BIO", + "--output-delim", + "tab", + os.path.join("tests", "conll_annotation", "minimal.bio"), + output_path, + ], + ) + assert result.exit_code == 0 + assert file_lines_match( + output_path, + os.path.join("tests", "conll_annotation", "minimal.bio"), + ) + + +def test_BIO_to_BIO_tab_backslash_t() -> None: + runner = CliRunner() + output_path = os.path.join(TMP_DIR.name, "BIOtoBIO_tab_backslash_t.txt") + result = runner.invoke( + convert, + [ + "--input-labels", + "BIO", + "--output-labels", + "BIO", + "--output-delim", + "\\t", + os.path.join("tests", "conll_annotation", "minimal.bio"), + output_path, + ], + ) + assert result.exit_code == 0 + assert file_lines_match( + output_path, + os.path.join("tests", "conll_annotation", "minimal.bio"), + ) + + def test_diff_token_label_indices() -> None: runner = CliRunner() + output_path = os.path.join(TMP_DIR.name, "diff_token_label_indices_BIOES.txt") result = runner.invoke( convert, [ @@ -251,11 +332,11 @@ def test_diff_token_label_indices() -> None: "--label-index", "2", os.path.join("tests", "conll_annotation", "diff_token_label_indices.bio"), - os.path.join(TMP_DIR.name, "diff_token_label_indices_BIOES.txt"), + output_path, ], ) assert result.exit_code == 0 assert file_fields_match( - os.path.join(TMP_DIR.name, "diff_token_label_indices_BIOES.txt"), + output_path, os.path.join("tests", "conll_annotation", "diff_token_label_indices.bioes"), ) diff --git a/tests/test_files/map_empty_key.json b/tests/test_files/map_empty_key.json new file mode 100644 index 0000000..1eeffa0 --- /dev/null +++ b/tests/test_files/map_empty_key.json @@ -0,0 +1,3 @@ +{ + "": ["LOC"] +} diff --git a/tests/test_files/map_empty_value.json b/tests/test_files/map_empty_value.json new file mode 100644 index 0000000..fa87826 --- /dev/null +++ b/tests/test_files/map_empty_value.json @@ -0,0 +1,3 @@ +{ + "GPE": [""] +} diff --git a/tests/test_files/map_invalid_json.json b/tests/test_files/map_invalid_json.json new file mode 100644 index 0000000..828e21d --- /dev/null +++ b/tests/test_files/map_invalid_json.json @@ -0,0 +1 @@ +{ invalid json diff --git a/tests/test_files/map_not_dict.json b/tests/test_files/map_not_dict.json new file mode 100644 index 0000000..c941e94 --- /dev/null +++ b/tests/test_files/map_not_dict.json @@ -0,0 +1 @@ +["LOC", "GPE"] diff --git a/tests/test_files/map_outside_key.json b/tests/test_files/map_outside_key.json new file mode 100644 index 0000000..ed0263d --- /dev/null +++ b/tests/test_files/map_outside_key.json @@ -0,0 +1,3 @@ +{ + "O": ["LOC"] +} diff --git a/tests/test_files/map_outside_value.json b/tests/test_files/map_outside_value.json new file mode 100644 index 0000000..e3b6ef0 --- /dev/null +++ b/tests/test_files/map_outside_value.json @@ -0,0 +1,3 @@ +{ + "GPE": ["O"] +} diff --git a/tests/test_files/space_delim.txt b/tests/test_files/minimal_space_delim.txt similarity index 100% rename from tests/test_files/space_delim.txt rename to tests/test_files/minimal_space_delim.txt diff --git a/tests/test_process_click.py b/tests/test_process_click.py index 5fd251c..f59cbe5 100644 --- a/tests/test_process_click.py +++ b/tests/test_process_click.py @@ -232,6 +232,27 @@ def test_map_types_keep_types() -> None: assert file_fields_match(TEST_FILES_DIR / "minimal_no_names.bio", output_path) +def test_keep_and_remove_types() -> None: + runner = CliRunner() + input_path = str(ANNOTATION_DIR / "minimal.bio") + output_path = str(Path(TMP_DIR.name) / "out.bio") + result = runner.invoke( + process, + [ + "--keep-types", + "LOC,ORG", + "--remove-types", + "MISC", + "--labels", + "BIO", + input_path, + output_path, + ], + ) + assert result.exit_code == 2 + assert "Cannot specify both keep-types and remove-types" in result.output + + def test_map_types_invalid_map() -> None: runner = CliRunner() map_path = str(TEST_FILES_DIR / "map_bad_value.json") @@ -248,8 +269,11 @@ def test_map_types_invalid_map() -> None: output_path, ], ) - # Malformed map, dictionary value is a string and not a list - assert result.exit_code != 0 + assert result.exit_code == 2 + assert ( + "Value 'LOC' in type map 'tests/test_files/map_bad_value.json' is not a list" + in result.output + ) def test_map_types_duplicate_mapping() -> None: @@ -268,11 +292,31 @@ def test_map_types_duplicate_mapping() -> None: output_path, ], ) - # Malformed map, dictionary value is a string and not a list - assert result.exit_code != 0 + assert result.exit_code == 2 + assert "Multiple mappings specified for type 'LOC' in type map" in result.output -def test_keep_and_remove_types() -> None: +def test_no_operation() -> None: + runner = CliRunner() + input_path = str(ANNOTATION_DIR / "minimal.bio") + output_path = str(Path(TMP_DIR.name) / "out.bio") + result = runner.invoke( + process, + [ + "--labels", + "BIO", + input_path, + output_path, + ], + ) + assert result.exit_code == 2 + assert ( + "Must specify at least one of keep-types, remove-types, or type-map" + in result.output + ) + + +def test_keep_outside_type() -> None: runner = CliRunner() input_path = str(ANNOTATION_DIR / "minimal.bio") output_path = str(Path(TMP_DIR.name) / "out.bio") @@ -280,14 +324,188 @@ def test_keep_and_remove_types() -> None: process, [ "--keep-types", - "LOC,ORG", + "O", + "--labels", + "BIO", + input_path, + output_path, + ], + ) + assert result.exit_code == 2 + assert "Cannot specify the outside type O in keep/remove types" in result.output + + +def test_remove_outside_type() -> None: + runner = CliRunner() + input_path = str(ANNOTATION_DIR / "minimal.bio") + output_path = str(Path(TMP_DIR.name) / "out.bio") + result = runner.invoke( + process, + [ "--remove-types", - "MISC", + "O", + "--labels", + "BIO", + input_path, + output_path, + ], + ) + assert result.exit_code == 2 + assert "Cannot specify the outside type O in keep/remove types" in result.output + + +def test_type_map_missing_file() -> None: + runner = CliRunner() + input_path = str(ANNOTATION_DIR / "minimal.bio") + output_path = str(Path(TMP_DIR.name) / "out.bio") + result = runner.invoke( + process, + [ + "--type-map", + "nonexistent_map.json", "--labels", "BIO", input_path, output_path, ], ) - # Can't specify both keep and remove - assert result.exit_code != 0 + assert result.exit_code == 2 + assert "Could not open type map file 'nonexistent_map.json'" in result.output + + +def test_type_map_invalid_json() -> None: + runner = CliRunner() + map_path = str(TEST_FILES_DIR / "map_invalid_json.json") + input_path = str(ANNOTATION_DIR / "minimal.bio") + output_path = str(Path(TMP_DIR.name) / "out.bio") + result = runner.invoke( + process, + [ + "--type-map", + map_path, + "--labels", + "BIO", + input_path, + output_path, + ], + ) + assert result.exit_code == 2 + assert ( + "Type map provided in file 'tests/test_files/map_invalid_json.json' is not valid JSON" + in result.output + ) + + +def test_type_map_not_dict() -> None: + runner = CliRunner() + map_path = str(TEST_FILES_DIR / "map_not_dict.json") + input_path = str(ANNOTATION_DIR / "minimal.bio") + output_path = str(Path(TMP_DIR.name) / "out.bio") + result = runner.invoke( + process, + [ + "--type-map", + map_path, + "--labels", + "BIO", + input_path, + output_path, + ], + ) + assert result.exit_code == 2 + assert ( + "Type map provided in file 'tests/test_files/map_not_dict.json' is not a dictionary" + in result.output + ) + + +def test_type_map_empty_key() -> None: + runner = CliRunner() + map_path = str(TEST_FILES_DIR / "map_empty_key.json") + input_path = str(ANNOTATION_DIR / "minimal.bio") + output_path = str(Path(TMP_DIR.name) / "out.bio") + result = runner.invoke( + process, + [ + "--type-map", + map_path, + "--labels", + "BIO", + input_path, + output_path, + ], + ) + assert result.exit_code == 2 + assert ( + "Key '' in type map 'tests/test_files/map_empty_key.json' is not a non-empty string" + in result.output + ) + + +def test_type_map_outside_key() -> None: + runner = CliRunner() + map_path = str(TEST_FILES_DIR / "map_outside_key.json") + input_path = str(ANNOTATION_DIR / "minimal.bio") + output_path = str(Path(TMP_DIR.name) / "out.bio") + result = runner.invoke( + process, + [ + "--type-map", + map_path, + "--labels", + "BIO", + input_path, + output_path, + ], + ) + assert result.exit_code == 2 + assert ( + "Key 'O' in type map 'tests/test_files/map_outside_key.json' is the outside type O" + in result.output + ) + + +def test_type_map_empty_value() -> None: + runner = CliRunner() + map_path = str(TEST_FILES_DIR / "map_empty_value.json") + input_path = str(ANNOTATION_DIR / "minimal.bio") + output_path = str(Path(TMP_DIR.name) / "out.bio") + result = runner.invoke( + process, + [ + "--type-map", + map_path, + "--labels", + "BIO", + input_path, + output_path, + ], + ) + assert result.exit_code == 2 + assert ( + "Value '' in type map 'tests/test_files/map_empty_value.json' is not a non-empty string" + in result.output + ) + + +def test_type_map_outside_value() -> None: + runner = CliRunner() + map_path = str(TEST_FILES_DIR / "map_outside_value.json") + input_path = str(ANNOTATION_DIR / "minimal.bio") + output_path = str(Path(TMP_DIR.name) / "out.bio") + result = runner.invoke( + process, + [ + "--type-map", + map_path, + "--labels", + "BIO", + input_path, + output_path, + ], + ) + assert result.exit_code == 2 + assert ( + "Value 'O' in type map 'tests/test_files/map_outside_value.json' is the outside type O" + in result.output + ) diff --git a/tests/test_scoring_click.py b/tests/test_scoring_click.py index 0ef943a..9a97ab4 100644 --- a/tests/test_scoring_click.py +++ b/tests/test_scoring_click.py @@ -26,6 +26,27 @@ def test_score_correct_labels() -> None: assert "ORG\t100.00\t100.00\t100.00\t1\t1\t1" in result.output +def test_score_no_predictions() -> None: + runner = CliRunner() + result = runner.invoke( + score, + [ + "--labels", + "BIO", + "--reference", + os.path.join("tests", "conll_annotation", "minimal.bio"), + "--score-format", + "delim", + os.path.join("tests", "conll_predictions", "incorrect1_nopredictions.bio"), + ], + ) + assert result.exit_code == 0 + assert "Type\tPrecision\tRecall\tF1\tReference\tPredicted\tCorrect" in result.output + assert "ALL\t0.00\t0.00\t0.00\t3\t0\t0" in result.output + assert "LOC\t0.00\t0.00\t0.00\t2\t0\t0" in result.output + assert "ORG\t0.00\t0.00\t0.00\t1\t0\t0" in result.output + + def test_score_incorrect_default_format() -> None: runner = CliRunner() result = runner.invoke( @@ -150,7 +171,8 @@ def test_score_invalid_sequence_none() -> None: ), ], ) - assert result.exit_code != 0 + assert result.exit_code == 1 + assert "Invalid transition 'O' -> 'I-ORG'" in str(result.exception) def test_score_valid_incorrect_sequence() -> None: @@ -217,7 +239,10 @@ def test_score_invalid_labels() -> None: os.path.join("tests", "conll_predictions", "incorrect1.bio"), ], ) - assert result.exit_code != 0 + assert result.exit_code == 1 + assert "The above labels are not valid for the chunk encoding BIO." in str( + result.exception + ) def test_score_multiple_files() -> None: @@ -319,10 +344,27 @@ def test_score_error_counts_multiple_files() -> None: "--error-counts", ], ) - assert result.exit_code != 0 + assert result.exit_code == 2 assert "Cannot use error-counts with multiple files to be scored" in result.output +def test_score_full_precision_not_delim() -> None: + runner = CliRunner() + result = runner.invoke( + score, + [ + "--labels", + "BIO", + "--reference", + os.path.join("tests", "conll_annotation", "minimal.bio"), + "--full-precision", + os.path.join("tests", "conll_predictions", "correct1.bio"), + ], + ) + assert result.exit_code == 2 + assert "Can only use full-precision with score-format delim" in result.output + + def test_score_error_counts_conlleval_format() -> None: # Cannot use error-counts with conlleval format runner = CliRunner() @@ -340,5 +382,5 @@ def test_score_error_counts_conlleval_format() -> None: "--error-counts", ], ) - assert result.exit_code != 0 + assert result.exit_code == 2 assert "Cannot use error-counts with multiple files to be scored" in result.output diff --git a/tests/test_utils.py b/tests/test_utils.py index d1bbf7c..f5b6721 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -33,11 +33,11 @@ def test_empty_file() -> None: def test_differing_whitespace() -> None: assert file_fields_match( - os.path.join("tests", "test_files", "space_delim.txt"), + os.path.join("tests", "test_files", "minimal_space_delim.txt"), os.path.join("tests", "conll_annotation", "minimal.bio"), ) assert not file_lines_match( - os.path.join("tests", "test_files", "space_delim.txt"), + os.path.join("tests", "test_files", "minimal_space_delim.txt"), os.path.join("tests", "conll_annotation", "minimal.bio"), ) From fafc12fe0cc8d01024c3e76f532c3767afac5662 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Thu, 4 Jun 2026 10:41:01 -0400 Subject: [PATCH 26/33] Move all setup to pyproject.toml and add uv instructions --- README.md | 12 ++++++---- pyproject.toml | 55 ++++++++++++++++++++++++++++++++++++++++++++- requirements.txt | 19 ---------------- setup.py | 52 ------------------------------------------ tests/pre_commit.sh | 2 +- 5 files changed, 63 insertions(+), 77 deletions(-) delete mode 100644 requirements.txt delete mode 100755 setup.py diff --git a/README.md b/README.md index 6fc280d..43b68ad 100644 --- a/README.md +++ b/README.md @@ -595,15 +595,19 @@ The following instructions are for the project maintainers only. For development, check out the `dev` branch (latest, but less tested than `main`). -To install from a clone of this repository, use: -`pip install -e .` - ## Setting up an environment for development +### Using uv + +1. Create an environment: `uv venv --python 3.10 .venv` +2. Install seqscore and development dependencies: `uv pip install -e ".[dev]"` + +### Using conda + 1. Create an environment: `conda create -yn seqscore python=3.10` 2. Activate the environment: `conda activate seqscore` 3. Install seqscore: `pip install -e .` -4. Install development dependencies: `pip install -r requirements.txt` +4. Install development dependencies: `pip install -e ".[dev]"` # Contributors diff --git a/pyproject.toml b/pyproject.toml index 212b567..23866eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,57 @@ +[build-system] +requires = ["setuptools>=61"] +build-backend = "setuptools.build_meta" + +[project] +name = "seqscore" +dynamic = ["version"] +description = "SeqScore: Scoring for named entity recognition and other sequence labeling tasks" +readme = "README.md" +license = {text = "MIT"} +authors = [ + {name = "Constantine Lignos", email = "lignos@brandeis.edu"}, +] +requires-python = ">=3.10" +dependencies = [ + "attrs>=19.2.0", + "click", + "tabulate", +] +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] + +[project.urls] +Homepage = "https://github.com/bltlab/seqscore" + +[project.scripts] +seqscore = "seqscore.scripts.seqscore:cli" + +[project.optional-dependencies] +dev = [ + "types-tabulate", + "pytest==9.0.3", + "pytest-cov>=7.1.0", + "mypy==2.1.0", + "ruff==0.15.15", +] + +[tool.setuptools.dynamic] +version = {attr = "seqscore.__version__"} + +[tool.setuptools.packages.find] +include = ["seqscore", "seqscore.*"] + +[tool.setuptools.package-data] +seqscore = ["py.typed"] + [tool.mypy] python_version = "3.10" strict_optional = false @@ -6,7 +60,6 @@ disallow_untyped_calls = true [[tool.mypy.overrides]] module = [ - "setuptools", "click.*", ] ignore_missing_imports = true diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 8ac2e1b..0000000 --- a/requirements.txt +++ /dev/null @@ -1,19 +0,0 @@ -# This file only contains dependencies needed for development. -# setup.py contains the actual package dependencies, and the package -# should be installed before these requirements. - -# Type annotations for tabulate -types-tabulate - -# For testing -pytest==9.0.3 -pytest-cov>=7.1.0 - -# For development -mypy==2.1.0 -ruff==0.15.15 - -# Documentation build -# Disabled for now since we don't need them -# sphinx -# sphinx-rtd-theme diff --git a/setup.py b/setup.py deleted file mode 100755 index fbb7f4a..0000000 --- a/setup.py +++ /dev/null @@ -1,52 +0,0 @@ -#! /usr/bin/env python - -from os import path - -from setuptools import find_packages, setup - -from seqscore import __version__ - - -def setup_package() -> None: - root = path.abspath(path.dirname(__file__)) - with open(path.join(root, "README.md"), encoding="utf-8") as f: - long_description = f.read() - - setup( - name="seqscore", - version=__version__, - packages=find_packages(include=("seqscore", "seqscore.*")), - # Package type information - package_data={"seqscore": ["py.typed"]}, - python_requires=">=3.10", - license="MIT", - description="SeqScore: Scoring for named entity recognition and other sequence labeling tasks", - long_description=long_description, - install_requires=[ - "attrs>=19.2.0", - "click", - "tabulate", - ], - entry_points=""" - [console_scripts] - seqscore=seqscore.scripts.seqscore:cli - """, - classifiers=[ - "Development Status :: 4 - Beta", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "Programming Language :: Python :: 3.14", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - ], - url="https://github.com/bltlab/seqscore", - long_description_content_type="text/markdown", - author="Constantine Lignos", - author_email="lignos@brandeis.edu", - ) - - -if __name__ == "__main__": - setup_package() diff --git a/tests/pre_commit.sh b/tests/pre_commit.sh index 411b6f3..3b76d84 100755 --- a/tests/pre_commit.sh +++ b/tests/pre_commit.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -euxo pipefail -files=(seqscore/ tests/ *.py) +files=(seqscore/ tests/) ruff check --fix "${files[@]}" ruff check --select I --fix "${files[@]}" # Organize imports ruff format "${files[@]}" From 5d8456e6dc65f4ef60609a1e802d83808073e30a Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Thu, 4 Jun 2026 10:50:14 -0400 Subject: [PATCH 27/33] Add flowmark for markdown autoformatting --- pyproject.toml | 1 + tests/pre_commit.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 23866eb..a6256ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ dev = [ "pytest-cov>=7.1.0", "mypy==2.1.0", "ruff==0.15.15", + "flowmark", ] [tool.setuptools.dynamic] diff --git a/tests/pre_commit.sh b/tests/pre_commit.sh index 3b76d84..1967c0a 100755 --- a/tests/pre_commit.sh +++ b/tests/pre_commit.sh @@ -1,6 +1,7 @@ #!/usr/bin/env bash set -euxo pipefail +flowmark -i --nobackup *.md files=(seqscore/ tests/) ruff check --fix "${files[@]}" ruff check --select I --fix "${files[@]}" # Organize imports From 8c52701c52a2a3945e98323e39d05707d7f37ab2 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Thu, 4 Jun 2026 10:50:29 -0400 Subject: [PATCH 28/33] Autoformat README --- README.md | 212 ++++++++++++++++++++------------------------ tests/pre_commit.sh | 2 +- 2 files changed, 95 insertions(+), 119 deletions(-) diff --git a/README.md b/README.md index 43b68ad..afb832d 100644 --- a/README.md +++ b/README.md @@ -5,24 +5,22 @@ [![image](https://img.shields.io/pypi/l/seqscore.svg)](https://pypi.python.org/pypi/seqscore) [![image](https://img.shields.io/pypi/pyversions/seqscore.svg)](https://pypi.python.org/pypi/seqscore) -SeqScore provides scoring for named entity recognition and other -chunking tasks evaluated over sequence labels. +SeqScore provides scoring for named entity recognition and other chunking tasks +evaluated over sequence labels. -SeqScore is maintained by the BLT Lab at Brandeis University. Please -open an issue if you find incorrect behavior or features you would like -to see added. Due to the risk of introducing regressions or incorrect -scoring behavior, *we generally do not accept pull requests*. Please do not -open a pull request unless you are asked to do so by a maintainer in an -issue. +SeqScore is maintained by the BLT Lab at Brandeis University. Please open an issue if +you find incorrect behavior or features you would like to see added. Due to the risk of +introducing regressions or incorrect scoring behavior, *we generally do not accept pull +requests*. Please do not open a pull request unless you are asked to do so by a +maintainer in an issue. ## Installation -To install the latest official release of SeqScore, run: `pip install seqscore`. -This will install the package and add the command `seqscore` in your Python -environment. +To install the latest official release of SeqScore, run: `pip install seqscore`. This +will install the package and add the command `seqscore` in your Python environment. -SeqScore requires Python 3.10 or higher. It is tested on Python 3.10, 3.11, 3.12, -3.13, and 3.14. +SeqScore requires Python 3.10 or higher. It is tested on Python 3.10, 3.11, 3.12, 3.13, +and 3.14. ## License @@ -78,7 +76,6 @@ Other papers related to SeqScore include: * [Toward More Meaningful Resources for Lower-resourced Languages](https://aclanthology.org/2022.findings-acl.44/) * [CoNLL#: Fine-grained Error Analysis and a Corrected Test Set for CoNLL-03 English](https://aclanthology.org/2024.lrec-main.330/) - # Usage ## Overview @@ -108,10 +105,9 @@ Commands: ## Scoring -The most common application of SeqScore is scoring CoNLL-format NER -predictions. Let's assume you have two files, one containing the -correct labels (annotation) and the other containing the predictions -(system output). +The most common application of SeqScore is scoring CoNLL-format NER predictions. Let's +assume you have two files, one containing the correct labels (annotation) and the other +containing the predictions (system output). The correct labels are in the file [samples/reference.bio](samples/reference.bio): @@ -132,7 +128,6 @@ Philadelphia I-LOC , O Pennsylvania B-LOC . O - ``` The predictions are in the file [samples/predicted.bio](samples/predicted.bio): @@ -154,7 +149,6 @@ Philadelphia B-LOC , O Pennsylvania B-LOC . O - ``` To score the predictions, run: @@ -171,27 +165,23 @@ To score the predictions, run: A few things to note: * The reference file must be specified with the `--reference` flag. -* The chunk encoding (BIO, BIOES, etc.) must be specified using the - `--labels` flag. -* Both files need to use the same chunk encoding. If you have - files that use different chunk encodings, use the `convert` command. -* You can get output in different formats using the `--score-format` - flag. Using `--score-format delim` will produce tab-delimited - output. In the delimited format, you can specify the `--full-precision` - flag to output higher numerical precision. -* In the default (pretty) output format, numbers are rounded "half up" - at two decimal places. In other words, 57.124 will round to 57.12, - and 57.125 will round to 57.13. This is different than the "half even" - rounding used by `conlleval` and other libraries that rely on `printf` - behavior for rounding. Half up rounding is used as it is more likely to - match the rounding a user would perform if shown three decimal places. - If you request `conlleval` output format, the same rounding used by +* The chunk encoding (BIO, BIOES, etc.) must be specified using the `--labels` flag. +* Both files need to use the same chunk encoding. If you have files that use different + chunk encodings, use the `convert` command. +* You can get output in different formats using the `--score-format` flag. Using + `--score-format delim` will produce tab-delimited output. In the delimited format, you + can specify the `--full-precision` flag to output higher numerical precision. +* In the default (pretty) output format, numbers are rounded "half up" at two decimal + places. In other words, 57.124 will round to 57.12, and 57.125 will round to 57.13. + This is different than the "half even" rounding used by `conlleval` and other + libraries that rely on `printf` behavior for rounding. Half up rounding is used as it + is more likely to match the rounding a user would perform if shown three decimal + places. If you request `conlleval` output format, the same rounding used by `conlleval` will be used. -The above scoring command will work for files that do not have any -invalid transitions, that is, those that perfectly follow what the -encoding allows. However, consider this BIO-encoded file, -[samples/invalid.bio](samples/invalid.bio): +The above scoring command will work for files that do not have any invalid transitions, +that is, those that perfectly follow what the encoding allows. However, consider this +BIO-encoded file, [samples/invalid.bio](samples/invalid.bio): ``` This O @@ -210,11 +200,10 @@ Philadelphia I-LOC , O Pennsylvania B-LOC . O - ``` -Note that the token `University` has the label `I-ORG`, but there is -no preceding `B-ORG`. If we score it as before with +Note that the token `University` has the label `I-ORG`, but there is no preceding +`B-ORG`. If we score it as before with `seqscore score --labels BIO --reference samples/reference.bio samples/invalid.bio`, scoring will fail: @@ -223,10 +212,9 @@ seqscore.encoding.EncodingError: Stopping due to validation errors in invalid.bi Invalid transition 'O' -> 'I-ORG' for token 'University' on line 7 ``` -To score output with invalid transitions, we need to specify a repair -method which can correct them. We can tell SeqScore to use the same -approach that conlleval uses (which we refer to as "begin" repair in our -paper): +To score output with invalid transitions, we need to specify a repair method which can +correct them. We can tell SeqScore to use the same approach that conlleval uses (which +we refer to as "begin" repair in our paper): `seqscore score --labels BIO --repair-method conlleval --reference samples/reference.bio samples/invalid.bio`: ``` @@ -242,8 +230,8 @@ New: ('B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'O') | ORG | 100.00 | 100.00 | 100.00 | 1 | 1 | 1 | ``` -You can use the `-q` flag to suppress the logging of all of the repairs -applied. For example, running the command +You can use the `-q` flag to suppress the logging of all of the repairs applied. For +example, running the command `seqscore score -q --labels BIO --repair-method conlleval --reference samples/reference.bio samples/invalid.bio` will hide the repairs: @@ -255,13 +243,12 @@ will hide the repairs: | ORG | 100.00 | 100.00 | 100.00 | 1 | 1 | 1 | ``` -You may want to also explore the `discard` repair, which can -produce higher scores for output from models without a CRF/constrained -decoding as they are more likely to produce invalid transitions. +You may want to also explore the `discard` repair, which can produce higher scores for +output from models without a CRF/constrained decoding as they are more likely to produce +invalid transitions. -SeqScore can also display all errors (false positives and false negatives) -encountered in scoring using the `--error-counts` flag. For example, running the -command +SeqScore can also display all errors (false positives and false negatives) encountered +in scoring using the `--error-counts` flag. For example, running the command `seqscore score --labels BIO --error-counts --reference samples/reference.bio samples/predicted.bio` will produce the following output: @@ -273,10 +260,10 @@ will produce the following output: | 1 | FN | LOC | West Philadelphia | ``` -The output shows that the system produced two false positives and missed one -mention in the reference (false negative). The most frequent errors appear at -the top. The `--error-counts` flag can be combined with `--score-format delim` -to write a delimited table that can be read as a spreadsheet. +The output shows that the system produced two false positives and missed one mention in +the reference (false negative). The most frequent errors appear at the top. The +`--error-counts` flag can be combined with `--score-format delim` to write a delimited +table that can be read as a spreadsheet. ## Validation @@ -290,7 +277,7 @@ No errors found in 0 tokens, 2 sequences, and 1 documents in reference.bio For the example of the [samples/invalid.bio](samples/invalid.bio), we can run `seqscore validate --labels BIO samples/invalid.bio`: - ``` +``` Encountered 1 errors in 1 tokens, 2 sequences, and 1 documents in invalid.bio Invalid transition 'O' -> 'I-ORG' for token 'University' on line 7 ``` @@ -299,8 +286,8 @@ Invalid transition 'O' -> 'I-ORG' for token 'University' on line 7 We can convert a file from one chunk encoding to another. For example, `seqscore convert --input-labels BIO --output-labels BIOES samples/reference.bio samples/reference.bioes` -will read [samples/reference.bio](samples/reference.bio) in BIO -encoding and write the BIOES-converted file to [samples/reference.bioes](samples/reference.bioes): +will read [samples/reference.bio](samples/reference.bio) in BIO encoding and write the +BIOES-converted file to [samples/reference.bioes](samples/reference.bioes): ``` This O @@ -319,7 +306,6 @@ Philadelphia E-LOC , O Pennsylvania S-LOC . O - ``` We can get a list of available chunk encodings by running `seqscore convert --help`: @@ -341,12 +327,11 @@ Options: ## Repair -We can also apply repair methods to a file, creating an output file -with only valid transitions. For example, we can run +We can also apply repair methods to a file, creating an output file with only valid +transitions. For example, we can run `seqscore repair --labels BIO --repair-method conlleval samples/invalid.bio samples/invalid_repair_conlleval.bio`, which will apply the conlleval repair method to the -[samples/invalid.bio](samples/invalid.bio) and write the repaired -labels to +[samples/invalid.bio](samples/invalid.bio) and write the repaired labels to [samples/invalid_repair_conlleval.bio](samples/invalid_repair_conlleval.bio): ``` @@ -366,12 +351,12 @@ Philadelphia I-LOC , O Pennsylvania B-LOC . O - ``` If we want to apply the discard repair method, we can run `seqscore repair --labels BIO --repair-method discard samples/invalid.bio samples/invalid_repair_discard.bio` -and the output will be written to [samples/invalid_repair_discard.bio](samples/invalid_repair_discard.bio): +and the output will be written to +[samples/invalid_repair_discard.bio](samples/invalid_repair_discard.bio): ``` This O @@ -390,18 +375,16 @@ Philadelphia I-LOC , O Pennsylvania B-LOC . O - ``` -Repairing the file before performing other operations is available in the -`count` and `summarize` subcommands. +Repairing the file before performing other operations is available in the `count` and +`summarize` subcommands. ## Summarize -The `summarize` subcommand can produce counts of the types of chunks -in the input file. For example, if we run -`seqscore summarize --labels BIO samples/reference.bio` -we get the following output: +The `summarize` subcommand can produce counts of the types of chunks in the input file. +For example, if we run `seqscore summarize --labels BIO samples/reference.bio` we get +the following output: ``` File 'samples/reference.bio' contains 1 document(s) with the following mentions: @@ -411,14 +394,13 @@ File 'samples/reference.bio' contains 1 document(s) with the following mentions: | ORG | 1 | ``` -If the quiet (`-q`) flag is provided, the first line giving the filename -and document count is not printed. +If the quiet (`-q`) flag is provided, the first line giving the filename and document +count is not printed. ## Count -The `count` subcommand can produce the counts of chunks in the input -file. Unlike `summarize`, it counts chunk-type pairs, not just types. -For example, if we run +The `count` subcommand can produce the counts of chunks in the input file. Unlike +`summarize`, it counts chunk-type pairs, not just types. For example, if we run `seqscore count --labels BIO samples/reference.bio --output-file counts.csv`, tab-delimited counts would be written to `counts.csv` as follows: @@ -433,18 +415,18 @@ standard output. However, you may encounter Unicode issues if your terminal is n configured properly. You can use the `--output-delim` argument to change the delimiter used in the counts. -The default delimiter of tab is strongly recommended, as there is no escaping or -quoting of the names in the output. +The default delimiter of tab is strongly recommended, as there is no escaping or quoting +of the names in the output. ## Process -The `process` subcommand can remove entity types from a file or map them to -other types. Removing types can be performed by specifying one of `--keep-types` -or `--remove-types`. +The `process` subcommand can remove entity types from a file or map them to other types. +Removing types can be performed by specifying one of `--keep-types` or `--remove-types`. For example, if we wanted to keep only the ORG type, we could run: `seqscore process --labels BIO --keep-types ORG samples/reference.bio samples/keep_ORG.bio`, -and the following output will be written to [samples/keep_ORG.bio](samples/keep_ORG.bio): +and the following output will be written to +[samples/keep_ORG.bio](samples/keep_ORG.bio): ``` This O @@ -468,11 +450,12 @@ Pennsylvania O You can also keep multiple types by specifying a comma-separated list of types: `--keep-types LOC,ORG`. -Instead of specifying which types to keep, we can also specify which types to -remove using `--remove-types`. For example, if we wanted to remove only the -ORG type, we could run: +Instead of specifying which types to keep, we can also specify which types to remove +using `--remove-types`. For example, if we wanted to remove only the ORG type, we could +run: `seqscore process --labels BIO --remove-types ORG samples/reference.bio samples/remove_ORG.bio`, -and the following output will be written to [samples/remove_ORG.bio](samples/remove_ORG.bio): +and the following output will be written to +[samples/remove_ORG.bio](samples/remove_ORG.bio): ``` This O @@ -496,10 +479,9 @@ Pennsylvania B-LOC As with keep, you can specify multiple tags to remove, for example `--remove-types LOC,ORG`. -The `--type-map` argument allows you to specify a JSON file that specifies a -mapping between types and other types. Suppose you want to collapse several -types into a more generic NAME type. In that case, the type map would be -specified as follows: +The `--type-map` argument allows you to specify a JSON file that specifies a mapping +between types and other types. Suppose you want to collapse several types into a more +generic NAME type. In that case, the type map would be specified as follows: ``` { @@ -507,9 +489,9 @@ specified as follows: } ``` -The type map must be a JSON dictionary. The keys are the types to be mapped to, -while the value for each key is a list of types to be mapped from. Note that -the value must always be a list, even if it would only contain one element. +The type map must be a JSON dictionary. The keys are the types to be mapped to, while +the value for each key is a list of types to be mapped from. Note that the value must +always be a list, even if it would only contain one element. We can apply the above type map to a file using the following command: `seqscore process --labels BIO --type-map samples/type_map_NAME.json samples/reference.bio samples/all_NAME.bio`, @@ -534,9 +516,8 @@ Pennsylvania B-NAME . O ``` -When `--type-map` is specified at the same time as `--keep-types` or -`--remove-types`, the type mapping is applied **before** the keep/remove -filtering is applied. +When `--type-map` is specified at the same time as `--keep-types` or `--remove-types`, +the type mapping is applied **before** the keep/remove filtering is applied. ## Text extraction @@ -555,14 +536,12 @@ University of Pennsylvania is in West Philadelphia , Pennsylvania . Each sentence is written on one line with space-delimited tokens. - # FAQ ## Why can't I score output files that are in the format `conlleval` expects? -SeqScore intentionally does not support the "merged" -format used by `conlleval` where each line contains a token, correct -tag, and predicted tag: +SeqScore intentionally does not support the "merged" format used by `conlleval` where +each line contains a token, correct tag, and predicted tag: ``` University B-ORG B-ORG @@ -577,23 +556,21 @@ Pennsylvania B-LOC B-LOC . O O ``` -We do not support this format because we have found that creating -predictions in this format is a common source of errors in scoring -pipelines. +We do not support this format because we have found that creating predictions in this +format is a common source of errors in scoring pipelines. ## When do I need to specify the `--labels` argument? -The `--labels` argument must be specified for commands where knowing the label -encoding is essential to getting correct answers. These commands are `validate`, -`repair`, and `score`. For all other commands, `--labels BIO` is assumed by -default but can be overridden. +The `--labels` argument must be specified for commands where knowing the label encoding +is essential to getting correct answers. These commands are `validate`, `repair`, and +`score`. For all other commands, `--labels BIO` is assumed by default but can be +overridden. # Development The following instructions are for the project maintainers only. -For development, check out the `dev` branch (latest, but less tested -than `main`). +For development, check out the `dev` branch (latest, but less tested than `main`). ## Setting up an environment for development @@ -611,8 +588,7 @@ than `main`). # Contributors -SeqScore was developed by the BLT Lab at Brandeis University under the -direction of PI and lead developer Constantine Lignos. Chester -Palen-Michel, Nolan Holley, and Claire Wang contributed to its -development. Gordon Dou, Maya Kruse, and Andrew Rueda gave feedback -on its features and assisted in README writing. +SeqScore was developed by the BLT Lab at Brandeis University under the direction of PI +and lead developer Constantine Lignos. Chester Palen-Michel, Nolan Holley, and Claire +Wang contributed to its development. Gordon Dou, Maya Kruse, and Andrew Rueda gave +feedback on its features and assisted in README writing. diff --git a/tests/pre_commit.sh b/tests/pre_commit.sh index 1967c0a..abd3ec1 100755 --- a/tests/pre_commit.sh +++ b/tests/pre_commit.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -euxo pipefail -flowmark -i --nobackup *.md +flowmark -i --nobackup ./*.md files=(seqscore/ tests/) ruff check --fix "${files[@]}" ruff check --select I --fix "${files[@]}" # Organize imports From a63548be90ee414925821514c4da97f1316feabd Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Thu, 4 Jun 2026 10:58:06 -0400 Subject: [PATCH 29/33] Enable build on dev* branches --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ebf8a1e..d380885 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - dev + - dev* pull_request: branches: - main - - dev + - dev* jobs: build: From f79203ad2d1bd30ee222fc76dcfcf9c93fe6a389 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Thu, 4 Jun 2026 11:02:03 -0400 Subject: [PATCH 30/33] Update build to use pyproject.toml --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d380885..0bc5ba2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -43,7 +43,7 @@ jobs: - name: Install quality check dependencies run: | - pip install -r requirements.txt + pip install ".[dev]" - name: Run quality checks run: | From 5de23c2c6e2015247e85d43859bf796fb4b7ec45 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Thu, 4 Jun 2026 11:06:10 -0400 Subject: [PATCH 31/33] Pin version of pytest-cov --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a6256ca..94f89c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ seqscore = "seqscore.scripts.seqscore:cli" dev = [ "types-tabulate", "pytest==9.0.3", - "pytest-cov>=7.1.0", + "pytest-cov==7.1.0", "mypy==2.1.0", "ruff==0.15.15", "flowmark", From d540196008cb84a01e0e0bf527506d82634cf5e6 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Thu, 4 Jun 2026 11:08:44 -0400 Subject: [PATCH 32/33] Update check.sh for removal of setup.py --- tests/check.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/check.sh b/tests/check.sh index 1f05a83..7fb6ba5 100755 --- a/tests/check.sh +++ b/tests/check.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash set -euxo pipefail -files=(seqscore/ tests/ setup.py) +files=(seqscore/ tests/) ruff check "${files[@]}" mypy "${files[@]}" From 598f066594dbe0f551c1e98e3ea87dfac1625a9e Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Thu, 4 Jun 2026 11:22:49 -0400 Subject: [PATCH 33/33] Add release script --- README.md | 10 ++++++++ pyproject.toml | 2 ++ scripts/release.sh | 57 ++++++++++++++++++++++++++++++++++++++++++++++ tests/check.sh | 1 + 4 files changed, 70 insertions(+) create mode 100755 scripts/release.sh diff --git a/README.md b/README.md index afb832d..9b1320c 100644 --- a/README.md +++ b/README.md @@ -586,6 +586,16 @@ For development, check out the `dev` branch (latest, but less tested than `main` 3. Install seqscore: `pip install -e .` 4. Install development dependencies: `pip install -e ".[dev]"` +## Release + +The release script is located at `scripts/release.sh` and can only be used by project +maintainers. To make a release: + +1. Make sure `__version__` is up to date in `seqscore/__init__.py`. +2. Make sure you are on the main branch with no uncommitted changes. +3. Run `scripts/release.sh`. If anything goes wrong between tagging and releasing, you + will have to delete the tag on GitHub and try again. + # Contributors SeqScore was developed by the BLT Lab at Brandeis University under the direction of PI diff --git a/pyproject.toml b/pyproject.toml index 94f89c1..38cb8e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,8 @@ dev = [ "mypy==2.1.0", "ruff==0.15.15", "flowmark", + "build", + "twine", ] [tool.setuptools.dynamic] diff --git a/scripts/release.sh b/scripts/release.sh new file mode 100755 index 0000000..13fa608 --- /dev/null +++ b/scripts/release.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# Builds, uploads to PyPI, and tags the release. +# Should only be run by project maintainers. +set -euo pipefail + +VENV=".venv/bin" + +# Run pre-commit checks +bash tests/check.sh + +# Must be on main with a clean working tree +current_branch=$(git rev-parse --abbrev-ref HEAD) +if [[ "$current_branch" != "main" ]]; then + echo "Error: must be on main branch (currently on '$current_branch')" + exit 1 +fi + +if ! git diff --quiet || ! git diff --cached --quiet; then + echo "Error: working tree is not clean" + exit 1 +fi + +# Read version from package +version=$("$VENV/python" -c "import seqscore; print(seqscore.__version__)") +tag="v$version" + +# Abort if tag already exists +if git rev-parse "$tag" >/dev/null 2>&1; then + echo "Error: tag $tag already exists. Update __version__ in seqscore/__init__.py." + exit 1 +fi + +echo "Releasing $tag" + +# Build +rm -rf dist/ +"$VENV/python" -m build + +# Tag and push +git tag "$tag" +git push origin "$tag" + +# Prompt to verify tag before uploading +echo "" +echo "Tag $tag pushed. Check the release on GitHub before uploading to PyPI:" +echo " https://github.com/bltlab/seqscore/releases/tag/$tag" +echo "" +read -r -p "Upload to PyPI? [y/N] " confirm +if [[ "${confirm,,}" != "y" ]]; then + echo "Aborted. Re-run this script to retry the upload." + exit 1 +fi + +# Upload to PyPI +"$VENV/twine" upload dist/* + +echo "Done: $tag released and pushed" diff --git a/tests/check.sh b/tests/check.sh index 7fb6ba5..e544ad2 100755 --- a/tests/check.sh +++ b/tests/check.sh @@ -1,6 +1,7 @@ #!/usr/bin/env bash set -euxo pipefail +flowmark --check ./*.md files=(seqscore/ tests/) ruff check "${files[@]}" mypy "${files[@]}"