From 7f27c5581d50d3aa47b99f29cf5245c9618e24ae Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 17 Mar 2026 09:48:50 +0000 Subject: [PATCH 1/2] Initial plan From c03b72c5edcf84285db6f82cb16c3e1a7a7921eb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 17 Mar 2026 09:59:34 +0000 Subject: [PATCH 2/2] Apply code review fixes: classmethod signatures, parameter names, docstrings, streaming IO, type annotations, warning message Co-authored-by: hollenstein <11833102+hollenstein@users.noreply.github.com> --- profasta/db.py | 7 ++++--- profasta/io.py | 24 +++++++++++++++--------- profasta/parser.py | 22 +++++++++++----------- tests/integration/test_integration.py | 2 +- 4 files changed, 31 insertions(+), 24 deletions(-) diff --git a/profasta/db.py b/profasta/db.py index 8295b0c..cbb8e9f 100644 --- a/profasta/db.py +++ b/profasta/db.py @@ -63,7 +63,7 @@ class ProteinDatabase: db: dict[str, AbstractDatabaseEntry] added_fasta_files: list[str] - skipped_fasta_entries: dict[str, list] + skipped_fasta_entries: dict[str, list[str]] def __init__(self): self.db = {} @@ -128,10 +128,11 @@ def add_fasta( if skipped_entry_headers: num_skipped = len(skipped_entry_headers) num_total = num_skipped + len(parsed_protein_entries) + skipped_list = "\n ".join(skipped_entry_headers) logger.warning( f"Skipped {num_skipped}/{num_total} entries while adding " f"'{fasta_name}' to a ProteinDatabase because their headers could not " - f"be parsed:" + f"be parsed:\n {skipped_list}" ) def add_entry(self, protein_entry: AbstractDatabaseEntry, overwrite: bool = False): @@ -152,7 +153,7 @@ def add_entry(self, protein_entry: AbstractDatabaseEntry, overwrite: bool = Fals def write_fasta( self, - path, + path: str, append: bool = False, header_writer: Optional[str] = None, line_width: int = 60, diff --git a/profasta/io.py b/profasta/io.py index 77f3ed5..2732103 100644 --- a/profasta/io.py +++ b/profasta/io.py @@ -7,7 +7,7 @@ FastaRecord: Representation of a protein record in a FASTA file. Functions: - parse_fasta: Parse a FASTA file object and return a list of FastaRecords. + parse_fasta: Parse a FASTA file object and yield FastaRecords. write_fasta: Write a list of FastaRecords to a file object. """ @@ -55,14 +55,20 @@ def parse_fasta(file_object: IO[str]) -> Generator[FastaRecord, None, None]: Yields: Yields FastaRecords. """ - fasta_text = file_object.read() - if not fasta_text.startswith("\n"): - fasta_text = "\n" + fasta_text - - for block in fasta_text.split("\n>")[1:]: - lines = block.split("\n") - header = lines[0].strip() - sequence = "".join(lines[1:]).replace(" ", "").rstrip("*").upper() + header = None + sequence_lines: list[str] = [] + for line in file_object: + line = line.rstrip("\n") + if line.startswith(">"): + if header is not None: + sequence = "".join(sequence_lines).replace(" ", "").rstrip("*").upper() + yield FastaRecord(header, sequence) + header = line[1:].strip() + sequence_lines = [] + else: + sequence_lines.append(line) + if header is not None: + sequence = "".join(sequence_lines).replace(" ", "").rstrip("*").upper() yield FastaRecord(header, sequence) diff --git a/profasta/parser.py b/profasta/parser.py index 3b75eb8..91e7529 100644 --- a/profasta/parser.py +++ b/profasta/parser.py @@ -16,8 +16,8 @@ UniprotParser: Parser for Uniprot FASTA headers. UniprotLikeParser: Parser for less strict Uniprot like FASTA headers. DefaultWriter: Default FASTA header writer. - UniprotWriter: Parser for Uniprot FASTA writer. - UniprotLikeWriter: Parser for less strict Uniprot like FASTA writer. + UniprotWriter: Writer for Uniprot FASTA headers. + UniprotLikeWriter: Writer for less strict Uniprot like FASTA headers. Functions: register_parser: Register a custom FASTA header parser by name. @@ -57,7 +57,7 @@ class AbstractHeaderParser(Protocol): """Abstract header parser.""" @classmethod - def parse(self, header: str) -> AbstractParsedHeader: + def parse(cls, header: str) -> AbstractParsedHeader: """Parse a FASTA header string into a ParsedHeader object. Raises: @@ -70,7 +70,7 @@ class AbstractHeaderWriter(Protocol): """Abstract header writer.""" @classmethod - def write(self, parsed_header: AbstractParsedHeader) -> str: + def write(cls, parsed_header: AbstractParsedHeader) -> str: """Write a FASTA header string from a ParsedHeader object.""" ... @@ -95,9 +95,9 @@ class DefaultParser: The `parse` method returns a ParsedHeader object with the identifier being the first whitespace-separated word of the header. The rest of the header is stored - in the "description" field of the `header_fields` dictionary, which might be an - empty string. This parser is guaranteed to work for any FASTA header string and - never fail. + in the "description" field of the `header_fields` dictionary. If there is no + description, the "description" key is absent from `header_fields`. This parser + is guaranteed to work for any FASTA header string and never fail. """ @classmethod @@ -315,14 +315,14 @@ def get_parser(parser_name: str) -> AbstractHeaderParser: return PARSER_REGISTRY[parser_name] -def register_writer(name: str, parser: AbstractHeaderWriter): +def register_writer(name: str, writer: AbstractHeaderWriter): """Register a custom writer by name.""" - WRITER_REGISTRY[name] = parser + WRITER_REGISTRY[name] = writer -def get_writer(parser_name: str) -> AbstractHeaderWriter: +def get_writer(writer_name: str) -> AbstractHeaderWriter: """Get a registered writer by name.""" - return WRITER_REGISTRY[parser_name] + return WRITER_REGISTRY[writer_name] PARSER_REGISTRY: dict[str, AbstractHeaderParser] = { diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py index 2c4099a..d4258de 100644 --- a/tests/integration/test_integration.py +++ b/tests/integration/test_integration.py @@ -15,7 +15,7 @@ def parse(cls, header): class CustomWriter: @classmethod - def write(self, parsed_header): + def write(cls, parsed_header): return parsed_header.header