From 071faf1d591ddfa35b3fb6c9efb047652a8cef21 Mon Sep 17 00:00:00 2001 From: sunshower7 Date: Mon, 1 Jun 2026 13:19:02 -0400 Subject: [PATCH 1/5] Add --ner-label-index option to validate and disable Python 3.9 build --- .github/workflows/main.yml | 2 +- pyproject.toml | 2 +- seqscore/conll.py | 20 +++++++++---- seqscore/scripts/seqscore.py | 12 ++++++++ .../conll_annotation/labels_not_last_col.bio | 16 ++++++++++ tests/test_validation_click.py | 30 +++++++++++++++++++ 6 files changed, 74 insertions(+), 8 deletions(-) create mode 100644 tests/conll_annotation/labels_not_last_col.bio diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6cd5e32..ebf8a1e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] steps: - uses: actions/checkout@v6 diff --git a/pyproject.toml b/pyproject.toml index e3493a6..f8d810e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.mypy] -python_version = 3.9 +python_version = 3.10 strict_optional = false disallow_untyped_defs = true disallow_untyped_calls = true diff --git a/seqscore/conll.py b/seqscore/conll.py index 127617a..b429a7f 100644 --- a/seqscore/conll.py +++ b/seqscore/conll.py @@ -52,7 +52,7 @@ class _CoNLLToken: other_fields: tuple[str, ...] = attrib() @classmethod - def from_line(cls, line: str, line_num: int, source_name: str) -> "_CoNLLToken": + def from_line(cls, line: str, line_num: int, source_name: str, ner_label_index: int) -> "_CoNLLToken": # Note: The caller must strip the line of any trailing whitespace # TODO: Sense the file rather than the line so we get consistency across lines # Try tab first since it's safer, then space @@ -72,9 +72,14 @@ def from_line(cls, line: str, line_num: int, source_name: str) -> "_CoNLLToken": f"Line {line_num} of {source_name} is not delimited by space or tab: {repr(line)}" ) + if ner_label_index == 0: + raise ValueError("ner_label_index cannot be 0") + text = splits[0] - label = splits[-1] - other_fields = tuple(splits[1:-1]) + label = splits[ner_label_index] + other_fields = tuple(splits[1:ner_label_index]) + if ner_label_index != -1: + other_fields += tuple(splits[ner_label_index + 1:]) is_docstart = text == DOCSTART return cls(text, label, is_docstart, line_num, other_fields) @@ -84,6 +89,7 @@ class CoNLLIngester: encoding: Encoding = attrib() parse_comment_lines: bool = attrib(default=False, kw_only=True) ignore_document_boundaries: bool = attrib(default=True, kw_only=True) + ner_label_index: int = attrib(default=-1, kw_only=True) def ingest( self, @@ -210,7 +216,7 @@ def validate( document_results: list[SequenceValidationResult] = [] for source_sequence, _ in self._parse_file( - source, source_name, parse_comments=self.parse_comment_lines + source, source_name, parse_comments=self.parse_comment_lines, ner_label_index=self.ner_label_index ): if source_sequence[0].is_docstart: # We can ony receive DOCSTART in a sequence by itself, see _parse_file. @@ -253,7 +259,7 @@ def _decompose_sequence( @classmethod def _parse_file( - cls, input_file: TextIO, source_name: str, *, parse_comments: bool = False + cls, input_file: TextIO, source_name: str, *, parse_comments: bool = False, ner_label_index: int = -1 ) -> Iterable[tuple[tuple[_CoNLLToken, ...], Optional[str]]]: sequence: list = [] comment: Optional[str] = None @@ -285,7 +291,7 @@ def _parse_file( # Always skip empty lines continue - token = _CoNLLToken.from_line(line, line_num, source_name) + token = _CoNLLToken.from_line(line, line_num, source_name, ner_label_index) # Skip document starts, but ensure sequence is empty when we reach them if token.is_docstart: if sequence: @@ -352,12 +358,14 @@ def validate_conll_file( *, ignore_document_boundaries: bool, parse_comment_lines: bool, + ner_label_index: int, ) -> ValidationResult: encoding = get_encoding(mention_encoding_name) ingester = CoNLLIngester( encoding, parse_comment_lines=parse_comment_lines, ignore_document_boundaries=ignore_document_boundaries, + ner_label_index=ner_label_index, ) with open(input_path, encoding=file_encoding) as input_file: results = ingester.validate(input_file, input_path) diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py index cdcd8c0..20606f3 100644 --- a/seqscore/scripts/seqscore.py +++ b/seqscore/scripts/seqscore.py @@ -106,6 +106,15 @@ def _output_delim_option() -> Callable: ) +def _ner_label_index_option() -> Callable: + return click.option( + "--ner-label-index", + default=-1, + show_default=True, + type=int, + ) + + def _quiet_option() -> Callable: return click.option( "--quiet", @@ -118,6 +127,7 @@ def _quiet_option() -> Callable: @cli.command(help="validate labels") @_multi_input_file_arguments @_labels_option() +@_ner_label_index_option() @_quiet_option() def validate( file: list[str], # Name is "file" to make sense on the command line, but it's a list @@ -126,6 +136,7 @@ def validate( *, ignore_document_boundaries: bool, parse_comment_lines: bool, + ner_label_index: int, quiet: bool, ) -> None: error = False @@ -136,6 +147,7 @@ def validate( file_encoding, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, + ner_label_index=ner_label_index, ) if result.errors: print( diff --git a/tests/conll_annotation/labels_not_last_col.bio b/tests/conll_annotation/labels_not_last_col.bio new file mode 100644 index 0000000..cc91cf4 --- /dev/null +++ b/tests/conll_annotation/labels_not_last_col.bio @@ -0,0 +1,16 @@ +This O DET +is O VERB +a O DET +sentence O NOUN +. O PUNCT + +University B-ORG NOUN +of I-ORG ADP +Pennsylvania I-ORG NOUN +is O VERB +in O ADP +West B-LOC NOUN +Philadelphia I-LOC NOUN +, O PUNCT +Pennsylvania B-LOC NOUN +. O PUNCT diff --git a/tests/test_validation_click.py b/tests/test_validation_click.py index e688ecc..3cb4a98 100644 --- a/tests/test_validation_click.py +++ b/tests/test_validation_click.py @@ -214,3 +214,33 @@ def test_bad_label() -> None: str(result.exception) == "Could not parse label 'GPE' on line 4 during validation: Label 'GPE' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'." ) + + +def test_ner_label_index_pos() -> None: + runner = CliRunner() + result = runner.invoke( + validate, + ["--labels", "BIO", "--ner-label-index", "1", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")], + ) + assert result.output == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n" + assert result.exit_code == 0 + + +def test_ner_label_index_neg() -> None: + runner = CliRunner() + result = runner.invoke( + validate, + ["--labels", "BIO", "--ner-label-index", "-2", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")], + ) + assert result.output == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n" + assert result.exit_code == 0 + + +def test_ner_label_index_zero() -> None: + runner = CliRunner() + result = runner.invoke( + validate, + ["--labels", "BIO", "--ner-label-index", "0", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")], + ) + assert result.exit_code != 0 + assert "ner_label_index cannot be 0" in str(result.exception) From 42cae908004550db194ead172260929da736eaa6 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Tue, 2 Jun 2026 10:06:39 -0400 Subject: [PATCH 2/5] Remove Python 3.9 support --- README.md | 6 +++--- pyproject.toml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e894734..3f6c148 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,8 @@ To install the latest official release of SeqScore, run: `pip install seqscore`. This will install the package and add the command `seqscore` in your Python environment. -SeqScore requires Python 3.9 or higher. It is tested on Python 3.9, -3.10, 3.11, 3.12, 3.13, and 3.14. +SeqScore requires Python 3.10 or higher. It is tested on Python 3.10, 3.11, 3.12, +3.13, and 3.14. ## License @@ -600,7 +600,7 @@ To install from a clone of this repository, use: ## Setting up an environment for development -1. Create an environment: `conda create -yn seqscore python=3.9` +1. Create an environment: `conda create -yn seqscore python=3.10` 2. Activate the environment: `conda activate seqscore` 3. Install seqscore: `pip install -e .` 4. Install development dependencies: `pip install -r requirements.txt` diff --git a/pyproject.toml b/pyproject.toml index f8d810e..212b567 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.mypy] -python_version = 3.10 +python_version = "3.10" strict_optional = false disallow_untyped_defs = true disallow_untyped_calls = true @@ -13,4 +13,4 @@ ignore_missing_imports = true [tool.ruff] line-length = 90 -target-version = "py39" +target-version = "py310" From b1f06b07554cdadb2ef2cb8a84ad3da5f554684d Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Tue, 2 Jun 2026 10:42:05 -0400 Subject: [PATCH 3/5] Update development dependencies --- requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index e976d18..8ac2e1b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,12 +6,12 @@ types-tabulate # For testing -pytest==8.3.5 -pytest-cov==5.0.0 +pytest==9.0.3 +pytest-cov>=7.1.0 # For development -mypy==1.19.0 -ruff==0.14.9 +mypy==2.1.0 +ruff==0.15.15 # Documentation build # Disabled for now since we don't need them From b12291b1a8392326254c5c15418ae88192552ddb Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Tue, 2 Jun 2026 13:21:13 -0400 Subject: [PATCH 4/5] Support setting token and label index across more commands --- seqscore/conll.py | 127 ++++++++++++------ seqscore/model.py | 12 +- seqscore/scripts/seqscore.py | 59 +++++++- seqscore/util.py | 4 +- seqscore/validation.py | 3 +- .../diff_token_label_indices.bio | 17 +++ .../diff_token_label_indices.bioes | 17 +++ .../conll_annotation/labels_not_last_col.bio | 1 + .../labels_not_last_col.bioes | 17 +++ tests/test_conll_format.py | 29 ++-- tests/test_conversion_click.py | 40 +++++- tests/test_model.py | 2 +- tests/test_validation.py | 5 +- tests/test_validation_click.py | 43 ++++-- 14 files changed, 283 insertions(+), 93 deletions(-) create mode 100644 tests/conll_annotation/diff_token_label_indices.bio create mode 100644 tests/conll_annotation/diff_token_label_indices.bioes create mode 100644 tests/conll_annotation/labels_not_last_col.bioes diff --git a/seqscore/conll.py b/seqscore/conll.py index b429a7f..9b9fedd 100644 --- a/seqscore/conll.py +++ b/seqscore/conll.py @@ -43,16 +43,36 @@ class CoNLLFormatError(Exception): pass +@attrs(frozen=True) +class LineSpec: + """Defines the fields and delimiters for a CoNLL-format line""" + + token_index: int = attrib() + ner_label_index: int = attrib() + + def __attrs_post_init__(self) -> None: + # This will only catch cases where the indices are identical, not + # when they refer to the same position, such as 1 and -1 in a + # sequence of length two + if self.token_index == self.ner_label_index: + raise ValueError( + f"Token index ({self.token_index}) and " + f"label index ({self.ner_label_index}) cannot be the same" + ) + + @attrs(frozen=True) class _CoNLLToken: text: str = attrib() label: str = attrib() is_docstart: bool = attrib() line_num: int = attrib() - other_fields: tuple[str, ...] = attrib() + orig_fields: tuple[str, ...] = attrib() @classmethod - def from_line(cls, line: str, line_num: int, source_name: str, ner_label_index: int) -> "_CoNLLToken": + def from_line( + cls, line: str, line_num: int, source_name: str, line_spec: LineSpec + ) -> "_CoNLLToken": # Note: The caller must strip the line of any trailing whitespace # TODO: Sense the file rather than the line so we get consistency across lines # Try tab first since it's safer, then space @@ -72,24 +92,19 @@ def from_line(cls, line: str, line_num: int, source_name: str, ner_label_index: f"Line {line_num} of {source_name} is not delimited by space or tab: {repr(line)}" ) - if ner_label_index == 0: - raise ValueError("ner_label_index cannot be 0") - - text = splits[0] - label = splits[ner_label_index] - other_fields = tuple(splits[1:ner_label_index]) - if ner_label_index != -1: - other_fields += tuple(splits[ner_label_index + 1:]) + text = splits[line_spec.token_index] + label = splits[line_spec.ner_label_index] + orig_fields = tuple(splits) is_docstart = text == DOCSTART - return cls(text, label, is_docstart, line_num, other_fields) + return cls(text, label, is_docstart, line_num, orig_fields) @attrs(frozen=True) class CoNLLIngester: encoding: Encoding = attrib() + line_spec: LineSpec = attrib() parse_comment_lines: bool = attrib(default=False, kw_only=True) ignore_document_boundaries: bool = attrib(default=True, kw_only=True) - ner_label_index: int = attrib(default=-1, kw_only=True) def ingest( self, @@ -119,7 +134,7 @@ def ingest( continue # Create mentions from tokens in sequence - tokens, labels, line_nums, other_fields = self._decompose_sequence( + tokens, labels, line_nums, orig_fields = self._decompose_sequence( source_sequence ) @@ -193,7 +208,7 @@ def ingest( tokens, labels, mentions, - other_fields=other_fields, + orig_fields=orig_fields, provenance=SequenceProvenance(line_nums[0], source_name), comment=comment, ) @@ -210,13 +225,17 @@ def ingest( yield document def validate( - self, source: TextIO, source_name: str + self, + source: TextIO, + source_name: str, ) -> list[list[SequenceValidationResult]]: all_results: list[list[SequenceValidationResult]] = [] document_results: list[SequenceValidationResult] = [] for source_sequence, _ in self._parse_file( - source, source_name, parse_comments=self.parse_comment_lines, ner_label_index=self.ner_label_index + source, + source_name, + parse_comments=self.parse_comment_lines, ): if source_sequence[0].is_docstart: # We can ony receive DOCSTART in a sequence by itself, see _parse_file. @@ -254,12 +273,15 @@ def _decompose_sequence( tokens = tuple(tok.text for tok in source_sequence) labels = tuple(tok.label for tok in source_sequence) line_nums = tuple(tok.line_num for tok in source_sequence) - other_fields = tuple(tok.other_fields for tok in source_sequence) - return tokens, labels, line_nums, other_fields + orig_fields = tuple(tok.orig_fields for tok in source_sequence) + return tokens, labels, line_nums, orig_fields - @classmethod def _parse_file( - cls, input_file: TextIO, source_name: str, *, parse_comments: bool = False, ner_label_index: int = -1 + self, + input_file: TextIO, + source_name: str, + *, + parse_comments: bool = False, ) -> Iterable[tuple[tuple[_CoNLLToken, ...], Optional[str]]]: sequence: list = [] comment: Optional[str] = None @@ -284,14 +306,14 @@ def _parse_file( if not line.strip(): # Clear out sequence if there's anything in it if sequence: - cls._check_sequence(sequence) + self._check_sequence(sequence) yield tuple(sequence), comment sequence = [] comment = None # Always skip empty lines continue - token = _CoNLLToken.from_line(line, line_num, source_name, ner_label_index) + token = _CoNLLToken.from_line(line, line_num, source_name, self.line_spec) # Skip document starts, but ensure sequence is empty when we reach them if token.is_docstart: if sequence: @@ -301,7 +323,7 @@ def _parse_file( else: # Yield it by itself. Since the sequence variable is empty, leave it unchanged. tmp_sent = (token,) - cls._check_sequence(tmp_sent) + self._check_sequence(tmp_sent) # Don't return the comment yet, it will be returned with the sequence yield tmp_sent, None else: @@ -309,7 +331,7 @@ def _parse_file( # Finish the last sequence if needed if sequence: - cls._check_sequence(sequence) + self._check_sequence(sequence) yield tuple(sequence), comment @staticmethod @@ -327,6 +349,7 @@ def ingest_conll_file( input_path: PathType, mention_encoding_name: str, file_encoding: str, + line_spec: LineSpec, *, repair: Optional[str] = None, ignore_document_boundaries: bool, @@ -343,6 +366,7 @@ def ingest_conll_file( ingester = CoNLLIngester( mention_encoding, + line_spec, parse_comment_lines=parse_comment_lines, ignore_document_boundaries=ignore_document_boundaries, ) @@ -355,17 +379,17 @@ def validate_conll_file( input_path: str, mention_encoding_name: str, file_encoding: str, + line_spec: LineSpec, *, ignore_document_boundaries: bool, parse_comment_lines: bool, - ner_label_index: int, ) -> ValidationResult: encoding = get_encoding(mention_encoding_name) ingester = CoNLLIngester( encoding, + line_spec, parse_comment_lines=parse_comment_lines, ignore_document_boundaries=ignore_document_boundaries, - ner_label_index=ner_label_index, ) with open(input_path, encoding=file_encoding) as input_file: results = ingester.validate(input_file, input_path) @@ -388,6 +412,7 @@ def repair_conll_file( mention_encoding_name: str, repair: Optional[str], file_encoding: str, + line_spec: LineSpec, output_delim: str, *, ignore_document_boundaries: bool, @@ -398,6 +423,7 @@ def repair_conll_file( input_file, mention_encoding_name, file_encoding, + line_spec, repair=repair, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, @@ -429,6 +455,7 @@ def write_docs_using_encoding( mention_encoding_name: str, file_encoding: str, delim: str, + line_spec: LineSpec, output_path: PathType, ) -> None: mention_encoding = get_encoding(mention_encoding_name) @@ -437,7 +464,12 @@ def write_docs_using_encoding( with open(output_path, "w", encoding=file_encoding) as file: for doc in docs: write_doc_using_encoding( - doc, mention_encoding, delim, file, output_docstart=output_docstart + doc, + mention_encoding, + delim, + file, + line_spec, + output_docstart=output_docstart, ) @@ -446,32 +478,42 @@ def write_doc_using_encoding( encoding: Encoding, delim: str, file: TextIO, + line_spec: LineSpec, *, output_docstart: bool, ) -> None: if output_docstart: - # Get a single token to figure out how many other_fields entries it has - sequence_other_fields = doc[0].other_fields - fields = [DOCSTART] - if sequence_other_fields: - fields.extend([EMPTY_OTHER_FIELD for _ in sequence_other_fields[0]]) - fields.append(encoding.dialect.outside) - + # Get the fields of the first token of the first sentence + if doc[0].orig_fields: + # to figure out how many fields there are + sequence_orig_fields = doc[0].orig_fields[0] + # Create the write number of fields + fields = [EMPTY_OTHER_FIELD] * len(sequence_orig_fields) + # Fill in the token and label + fields[line_spec.token_index] = DOCSTART + fields[line_spec.ner_label_index] = encoding.dialect.outside + else: + fields = [DOCSTART, encoding.dialect.outside] + # Write output print(delim.join(fields), file=file) print(file=file) for sequence in doc: labels = encoding.encode_sequence(sequence) - # Lengths of labels and other_fields have previously been checked to match tokens - for (token, other_fields), label in zip( - sequence.tokens_with_other_fields(), labels + # Lengths of labels and orig_fields have previously been checked to match tokens + for (token, orig_fields), label in zip( + sequence.tokens_with_orig_fields(), labels ): - fields = [token] - if other_fields: - fields.extend(other_fields) - fields.append(label) + if orig_fields: + fields = list(orig_fields) + fields[line_spec.token_index] = token + fields[line_spec.ner_label_index] = label + else: + fields = [token, label] + # Write output print(delim.join(fields), file=file) + # Print an emtpy line after each sequence print(file=file) @@ -482,6 +524,7 @@ def score_conll_files( mention_encoding_name: str, repair: Optional[str], file_encoding: str, + line_spec: LineSpec, *, ignore_document_boundaries: bool, parse_comment_lines: bool, @@ -497,6 +540,7 @@ def score_conll_files( reference_file, mention_encoding_name, file_encoding, + line_spec, repair=repair, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, @@ -521,6 +565,7 @@ def score_conll_files( pred_file, mention_encoding_name, file_encoding, + line_spec, repair=repair, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, diff --git a/seqscore/model.py b/seqscore/model.py index 6df73b3..d554371 100644 --- a/seqscore/model.py +++ b/seqscore/model.py @@ -60,7 +60,7 @@ class LabeledSequence(Sequence[str]): tokens: tuple[str, ...] = attrib(converter=tuplify_strs) labels: tuple[str, ...] = attrib(converter=tuplify_strs) mentions: tuple[Mention, ...] = attrib(default=(), converter=_tuplify_mentions) - other_fields: Optional[tuple[tuple[str, ...], ...]] = attrib( + orig_fields: Optional[tuple[tuple[str, ...], ...]] = attrib( default=None, kw_only=True, converter=tuplify_optional_nested_strs ) provenance: Optional[SequenceProvenance] = attrib( @@ -79,9 +79,9 @@ def __attrs_post_init__(self) -> None: if not self.tokens: raise ValueError("Tokens and labels must be non-empty") - if self.other_fields and len(self.tokens) != len(self.other_fields): + if self.orig_fields and len(self.tokens) != len(self.orig_fields): raise ValueError( - f"Tokens ({len(self.tokens)}) and other_fields ({len(self.other_fields)}) " + f"Tokens ({len(self.tokens)}) and orig_fields ({len(self.orig_fields)}) " "must be of the same length" ) @@ -126,11 +126,11 @@ def __str__(self) -> str: def tokens_with_labels(self) -> tuple[tuple[str, str], ...]: return tuple(zip(self.tokens, self.labels)) - def tokens_with_other_fields( + def tokens_with_orig_fields( self, ) -> tuple[tuple[str, Optional[tuple[str, ...]]], ...]: - if self.other_fields: - return tuple(zip(self.tokens, self.other_fields)) + if self.orig_fields: + return tuple(zip(self.tokens, self.orig_fields)) else: return tuple(zip(self.tokens, repeat(None))) diff --git a/seqscore/scripts/seqscore.py b/seqscore/scripts/seqscore.py index 20606f3..c2e9265 100644 --- a/seqscore/scripts/seqscore.py +++ b/seqscore/scripts/seqscore.py @@ -11,6 +11,7 @@ from seqscore.conll import ( FORMAT_DELIM, SUPPORTED_SCORE_FORMATS, + LineSpec, ingest_conll_file, repair_conll_file, score_conll_files, @@ -44,6 +45,20 @@ def _input_file_options() -> list[Callable]: click.option( "--ignore-document-boundaries/--use-document-boundaries", default=False ), + click.option( + "--token-index", + default=0, + show_default=True, + type=int, + help="Index of the input field to use for the token", + ), + click.option( + "--label-index", + default=-1, + show_default=True, + type=int, + help="Index of the input field to use for the label", + ), ] @@ -127,7 +142,6 @@ def _quiet_option() -> Callable: @cli.command(help="validate labels") @_multi_input_file_arguments @_labels_option() -@_ner_label_index_option() @_quiet_option() def validate( file: list[str], # Name is "file" to make sense on the command line, but it's a list @@ -136,18 +150,20 @@ def validate( *, ignore_document_boundaries: bool, parse_comment_lines: bool, - ner_label_index: int, + token_index: int, + label_index: int, quiet: bool, ) -> None: + line_spec = LineSpec(token_index, label_index) error = False for each_file in file: result = validate_conll_file( each_file, labels, file_encoding, + line_spec, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, - ner_label_index=ner_label_index, ) if result.errors: print( @@ -178,6 +194,8 @@ def repair( output_file: str, labels: str, file_encoding: str, + token_index: int, + label_index: int, repair_method: str, output_delim: str, *, @@ -188,6 +206,7 @@ def repair( output_delim = _normalize_tab(output_delim) if repair_method == REPAIR_NONE: raise ValueError(f"Cannot repair with repair strategy {repr(repair_method)}") + line_spec = LineSpec(token_index, label_index) repair_conll_file( file, @@ -195,6 +214,7 @@ def repair( labels, repair_method, file_encoding, + line_spec, output_delim, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, @@ -212,6 +232,8 @@ def convert( file: str, output_file: str, file_encoding: str, + token_index: int, + label_index: int, output_delim: str, input_labels: str, output_labels: str, @@ -220,19 +242,19 @@ def convert( parse_comment_lines: bool, ) -> None: output_delim = _normalize_tab(output_delim) - if input_labels == output_labels: - raise ValueError("Conversion requires different input and output labels") + line_spec = LineSpec(token_index, label_index) docs = ingest_conll_file( file, input_labels, file_encoding, + line_spec, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, ) write_docs_using_encoding( - docs, output_labels, file_encoding, output_delim, output_file + docs, output_labels, file_encoding, output_delim, line_spec, output_file ) @@ -260,6 +282,8 @@ def process( file: str, output_file: str, file_encoding: str, + token_index: int, + label_index: int, output_delim: str, labels: str, keep_types: str, @@ -270,6 +294,7 @@ def process( parse_comment_lines: bool, ) -> None: output_delim = _normalize_tab(output_delim) + line_spec = LineSpec(token_index, label_index) keep_types_set = _parse_type_list(keep_types) remove_types_set = _parse_type_list(remove_types) type_map_dict: dict[str, list[str]] = _load_type_map(type_map, file_encoding) @@ -286,13 +311,16 @@ def process( file, labels, file_encoding, + line_spec, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, ) mod_docs = modify_types(docs, keep_types_set, remove_types_set, type_map_dict) - write_docs_using_encoding(mod_docs, labels, file_encoding, output_delim, output_file) + write_docs_using_encoding( + mod_docs, labels, file_encoding, output_delim, line_spec, output_file + ) @cli.command(help="show counts for all the mentions contained in a file") @@ -309,6 +337,8 @@ def process( def count( file: list[str], # Name is "file" to make sense on the command line, but it's a list file_encoding: str, + token_index: int, + label_index: int, output_file: Optional[str], labels: str, *, @@ -318,6 +348,7 @@ def count( repair_method: str, quiet: bool, ) -> None: + line_spec = LineSpec(token_index, label_index) if repair_method == REPAIR_NONE: repair_method = None @@ -334,6 +365,7 @@ def count( each_file, labels, file_encoding, + line_spec, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, repair=repair_method, @@ -368,6 +400,8 @@ def count( def summarize( file: list[str], # Name is "file" to make sense on the command line, but it's a list file_encoding: str, + token_index: int, + label_index: int, labels: str, *, ignore_document_boundaries: bool, @@ -375,6 +409,7 @@ def summarize( repair_method: str, quiet: bool, ) -> None: + line_spec = LineSpec(token_index, label_index) if repair_method == REPAIR_NONE: repair_method = None @@ -386,6 +421,7 @@ def summarize( each_file, labels, file_encoding, + line_spec, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, repair=repair_method, @@ -453,11 +489,15 @@ def score( reference: str, score_format: str, delim: str, + token_index: int, + label_index: int, repair_method: str, error_counts: bool, full_precision: bool, quiet: bool, ) -> None: + line_spec = LineSpec(token_index, label_index) + if repair_method == REPAIR_NONE: repair_method = None @@ -475,6 +515,7 @@ def score( labels, repair_method, file_encoding, + line_spec, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, output_format=score_format, @@ -492,18 +533,22 @@ def score( def extract_text( file: list[str], # Name is "file" to make sense on the command line, but it's a list file_encoding: str, + token_index: int, + label_index: int, labels: str, output_file: str, *, ignore_document_boundaries: bool, parse_comment_lines: bool, ) -> None: + line_spec = LineSpec(token_index, label_index) all_docs = [] for each_file in file: docs = ingest_conll_file( each_file, labels, file_encoding, + line_spec, ignore_document_boundaries=ignore_document_boundaries, parse_comment_lines=parse_comment_lines, ) diff --git a/seqscore/util.py b/seqscore/util.py index 70677fd..666ead2 100644 --- a/seqscore/util.py +++ b/seqscore/util.py @@ -30,13 +30,15 @@ def tuplify_optional_nested_strs( def file_fields_match(path1: PathType, path2: PathType, *, debug: bool = False) -> bool: """Return whether the whitespace-delimited fields of two files are identical.""" with open(path1, encoding="utf8") as f1, open(path2, encoding="utf8") as f2: + line_count = 1 for l1, l2 in zip_longest(f1, f2): if l1 is None or l2 is None or l1.split() != l2.split(): if debug: # pragma: no cover - print("Non-matching lines:") + print(f"Failed to match at line {line_count}:") print(repr(l1)) print(repr(l2)) return False + line_count += 1 return True diff --git a/seqscore/validation.py b/seqscore/validation.py index 6d3c756..9593713 100644 --- a/seqscore/validation.py +++ b/seqscore/validation.py @@ -106,7 +106,8 @@ def validate_labels( raise InvalidLabelError( label, f"Could not parse label {repr(label)}{line_msg}{source_msg} during validation: " - + str(e), + + str(e) + + " Use the --label-index argument if the label is not the last field.", ) from e if not encoding.is_valid_state(state): diff --git a/tests/conll_annotation/diff_token_label_indices.bio b/tests/conll_annotation/diff_token_label_indices.bio new file mode 100644 index 0000000..ba68d3f --- /dev/null +++ b/tests/conll_annotation/diff_token_label_indices.bio @@ -0,0 +1,17 @@ +1 This O DET +2 is O VERB +3 a O DET +4 sentence O NOUN +5 . O PUNCT + +6 University B-ORG NOUN +7 of I-ORG ADP +8 Pennsylvania I-ORG NOUN +9 is O VERB +10 in O ADP +11 West B-LOC NOUN +12 Philadelphia I-LOC NOUN +13 , O PUNCT +14 Pennsylvania B-LOC NOUN +15 . O PUNCT + diff --git a/tests/conll_annotation/diff_token_label_indices.bioes b/tests/conll_annotation/diff_token_label_indices.bioes new file mode 100644 index 0000000..46a6398 --- /dev/null +++ b/tests/conll_annotation/diff_token_label_indices.bioes @@ -0,0 +1,17 @@ +1 This O DET +2 is O VERB +3 a O DET +4 sentence O NOUN +5 . O PUNCT + +6 University B-ORG NOUN +7 of I-ORG ADP +8 Pennsylvania E-ORG NOUN +9 is O VERB +10 in O ADP +11 West B-LOC NOUN +12 Philadelphia E-LOC NOUN +13 , O PUNCT +14 Pennsylvania S-LOC NOUN +15 . O PUNCT + diff --git a/tests/conll_annotation/labels_not_last_col.bio b/tests/conll_annotation/labels_not_last_col.bio index cc91cf4..67d4b7f 100644 --- a/tests/conll_annotation/labels_not_last_col.bio +++ b/tests/conll_annotation/labels_not_last_col.bio @@ -14,3 +14,4 @@ Philadelphia I-LOC NOUN , O PUNCT Pennsylvania B-LOC NOUN . O PUNCT + diff --git a/tests/conll_annotation/labels_not_last_col.bioes b/tests/conll_annotation/labels_not_last_col.bioes new file mode 100644 index 0000000..0e55cd2 --- /dev/null +++ b/tests/conll_annotation/labels_not_last_col.bioes @@ -0,0 +1,17 @@ +This O DET +is O VERB +a O DET +sentence O NOUN +. O PUNCT + +University B-ORG NOUN +of I-ORG ADP +Pennsylvania E-ORG NOUN +is O VERB +in O ADP +West B-LOC NOUN +Philadelphia E-LOC NOUN +, O PUNCT +Pennsylvania S-LOC NOUN +. O PUNCT + diff --git a/tests/test_conll_format.py b/tests/test_conll_format.py index bc4d196..f32506b 100644 --- a/tests/test_conll_format.py +++ b/tests/test_conll_format.py @@ -2,14 +2,15 @@ import pytest -from seqscore.conll import CoNLLFormatError, CoNLLIngester +from seqscore.conll import CoNLLFormatError, CoNLLIngester, LineSpec from seqscore.encoding import REPAIR_NONE, get_encoding from seqscore.validation import InvalidLabelError def test_parse_comments_true() -> None: mention_encoding = get_encoding("BIO") - ingester = CoNLLIngester(mention_encoding, parse_comment_lines=True) + line_spec = LineSpec(0, 1) + ingester = CoNLLIngester(mention_encoding, line_spec, parse_comment_lines=True) comments_path = Path("tests") / "test_files" / "minimal_comments.bio" with comments_path.open(encoding="utf8") as file: documents = list(ingester.ingest(file, "test", REPAIR_NONE)) @@ -32,7 +33,8 @@ def test_parse_comments_true() -> None: def test_parse_comments_false() -> None: mention_encoding = get_encoding("BIO") - ingester = CoNLLIngester(mention_encoding) + line_spec = LineSpec(0, 1) + ingester = CoNLLIngester(mention_encoding, line_spec) comments_path = Path("tests") / "test_files" / "minimal_comments_1.bio" with comments_path.open(encoding="utf8") as file: @@ -46,35 +48,24 @@ def test_parse_comments_false() -> None: comments_path = Path("tests") / "test_files" / "minimal_comments_2.bio" with comments_path.open(encoding="utf8") as file: - with pytest.raises(InvalidLabelError) as err: + with pytest.raises(InvalidLabelError): list(ingester.ingest(file, "test", REPAIR_NONE)) - assert ( - str(err.value) - == "Could not parse label 'Comment' on line 1 of test during validation: Label 'Comment' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'. The first token '#' of this sentence starts with '#'. If it's a comment, consider enabling --parse-comment-lines." - ) comments_path = Path("tests") / "test_files" / "minimal_comments_3.bio" with comments_path.open(encoding="utf8") as file: - with pytest.raises(InvalidLabelError) as err: + with pytest.raises(InvalidLabelError): list(ingester.ingest(file, "test", REPAIR_NONE)) - assert ( - str(err.value) - == "Could not parse label 'fields' on line 1 of test during validation: Label 'fields' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'. The first token '#' of this sentence starts with '#'. If it's a comment, consider enabling --parse-comment-lines." - ) comments_path = Path("tests") / "test_files" / "minimal_comments_4.bio" with comments_path.open(encoding="utf8") as file: - with pytest.raises(InvalidLabelError) as err: + with pytest.raises(InvalidLabelError): list(ingester.ingest(file, "test", REPAIR_NONE)) - assert ( - str(err.value) - == "Could not parse label 'fields' on line 1 of test during validation: Label 'fields' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'. The first token '#' of this sentence starts with '#'. If it's a comment, consider enabling --parse-comment-lines." - ) def test_invalid_token_leading_space() -> None: mention_encoding = get_encoding("BIO") - ingester = CoNLLIngester(mention_encoding) + line_spec = LineSpec(0, -1) + ingester = CoNLLIngester(mention_encoding, line_spec) path = Path("tests") / "test_files" / "minimal_bio_empty_token.txt" with path.open(encoding="utf8") as file: diff --git a/tests/test_conversion_click.py b/tests/test_conversion_click.py index 8fafcf6..b41a32b 100644 --- a/tests/test_conversion_click.py +++ b/tests/test_conversion_click.py @@ -174,6 +174,28 @@ def test_IOB_to_BIO_fields() -> None: ) +def test_IOB_to_BIO_fields_and_specified_indices() -> None: + runner = CliRunner() + result = runner.invoke( + convert, + [ + "--input-labels", + "BIO", + "--output-labels", + "BIOES", + "--label-index", + "1", + os.path.join("tests", "conll_annotation", "labels_not_last_col.bio"), + os.path.join(TMP_DIR.name, "labels_not_last_col.bioes"), + ], + ) + assert result.exit_code == 0 + assert file_fields_match( + os.path.join(TMP_DIR.name, "labels_not_last_col.bioes"), + os.path.join("tests", "conll_annotation", "labels_not_last_col.bioes"), + ) + + def test_IO_to_BIOES() -> None: runner = CliRunner() result = runner.invoke( @@ -215,7 +237,7 @@ def test_BIOES_to_IO() -> None: ) -def test_same_input_and_output_labels_raises_error() -> None: +def test_diff_token_label_indices() -> None: runner = CliRunner() result = runner.invoke( convert, @@ -223,9 +245,17 @@ def test_same_input_and_output_labels_raises_error() -> None: "--input-labels", "BIO", "--output-labels", - "BIO", - os.path.join("tests", "conll_annotation", "minimal.bio"), - os.path.join(TMP_DIR.name, "temp.txt"), + "BIOES", + "--token-index", + "1", + "--label-index", + "2", + os.path.join("tests", "conll_annotation", "diff_token_label_indices.bio"), + os.path.join(TMP_DIR.name, "diff_token_label_indices_BIOES.txt"), ], ) - assert result.exit_code != 0 + assert result.exit_code == 0 + assert file_fields_match( + os.path.join(TMP_DIR.name, "diff_token_label_indices_BIOES.txt"), + os.path.join("tests", "conll_annotation", "diff_token_label_indices.bioes"), + ) diff --git a/tests/test_model.py b/tests/test_model.py index 50dcee1..0853d16 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -73,4 +73,4 @@ def test_labeled_sentence() -> None: with pytest.raises(ValueError): # Mismatched length between tokens and other_fields - LabeledSequence(["a", "b"], ["B-PER", "I-PER"], other_fields=[["DT"]]) + LabeledSequence(["a", "b"], ["B-PER", "I-PER"], orig_fields=[["DT"]]) diff --git a/tests/test_validation.py b/tests/test_validation.py index c8f121e..9fbde19 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -340,9 +340,8 @@ def test_validation_bad_label() -> None: labels = ["O", "PER", "PER"] with pytest.raises(EncodingError) as err: validate_labels(labels, encoding, tokens=tokens, line_nums=line_nums) - assert ( - str(err.value) - == "Could not parse label 'PER' on line 8 during validation: Label 'PER' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'." + assert str(err.value).startswith( + "Could not parse label 'PER' on line 8 during validation: Label 'PER' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'." ) diff --git a/tests/test_validation_click.py b/tests/test_validation_click.py index 3cb4a98..5cf201c 100644 --- a/tests/test_validation_click.py +++ b/tests/test_validation_click.py @@ -210,9 +210,8 @@ def test_bad_label() -> None: ["--labels", "BIO", os.path.join("tests", "conll_annotation", "bad_label2.bio")], ) assert result.exit_code != 0 - assert ( - str(result.exception) - == "Could not parse label 'GPE' on line 4 during validation: Label 'GPE' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'." + assert str(result.exception).startswith( + "Could not parse label 'GPE' on line 4 during validation: Label 'GPE' does not have a state and entity type but is not outside ('O'). Expected the label to be of a format like '-'." ) @@ -220,9 +219,18 @@ def test_ner_label_index_pos() -> None: runner = CliRunner() result = runner.invoke( validate, - ["--labels", "BIO", "--ner-label-index", "1", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")], + [ + "--labels", + "BIO", + "--label-index", + "1", + os.path.join("tests", "conll_annotation", "labels_not_last_col.bio"), + ], + ) + assert ( + result.output + == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n" ) - assert result.output == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n" assert result.exit_code == 0 @@ -230,9 +238,18 @@ def test_ner_label_index_neg() -> None: runner = CliRunner() result = runner.invoke( validate, - ["--labels", "BIO", "--ner-label-index", "-2", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")], + [ + "--labels", + "BIO", + "--label-index", + "-2", + os.path.join("tests", "conll_annotation", "labels_not_last_col.bio"), + ], + ) + assert ( + result.output + == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n" ) - assert result.output == "No errors found in 15 tokens, 2 sequences, and 1 document(s) in tests/conll_annotation/labels_not_last_col.bio\n" assert result.exit_code == 0 @@ -240,7 +257,15 @@ def test_ner_label_index_zero() -> None: runner = CliRunner() result = runner.invoke( validate, - ["--labels", "BIO", "--ner-label-index", "0", os.path.join("tests", "conll_annotation", "labels_not_last_col.bio")], + [ + "--labels", + "BIO", + "--label-index", + "0", + os.path.join("tests", "conll_annotation", "labels_not_last_col.bio"), + ], ) assert result.exit_code != 0 - assert "ner_label_index cannot be 0" in str(result.exception) + assert "Token index (0) and label index (0) cannot be the same" in str( + result.exception + ) From 318e01ca79b80922f548f208c5771efed93c9061 Mon Sep 17 00:00:00 2001 From: Constantine Lignos Date: Wed, 3 Jun 2026 12:13:10 -0400 Subject: [PATCH 5/5] Update contributors in README --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3f6c148..6fc280d 100644 --- a/README.md +++ b/README.md @@ -608,6 +608,7 @@ To install from a clone of this repository, use: # Contributors SeqScore was developed by the BLT Lab at Brandeis University under the -direction of PI and lead developer Constantine Lignos. Chester Palen-Michel -and Nolan Holley contributed to its development. Gordon Dou, Maya Kruse, and -Andrew Rueda gave feedback on its features and assisted in README writing. +direction of PI and lead developer Constantine Lignos. Chester +Palen-Michel, Nolan Holley, and Claire Wang contributed to its +development. Gordon Dou, Maya Kruse, and Andrew Rueda gave feedback +on its features and assisted in README writing.