-
Notifications
You must be signed in to change notification settings - Fork 8
feat(prepro): basic validation of raw read submissions #6773
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: raw-reads
Are you sure you want to change the base?
Changes from all commits
48870da
fd25010
91f4f2e
0c6d635
9c0baa0
a9b40a5
2ec13bf
f7be2e2
35c71b4
ee80c85
9406c29
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,10 +27,17 @@ | |
| ProcessingAnnotationAlignment: Final = "alignment" | ||
|
|
||
|
|
||
| @unique | ||
| class FileCategory(StrEnum): | ||
| RAW_READS = "raw_reads" | ||
| ANNOTATIONS = "annotations" | ||
|
|
||
|
|
||
| @unique | ||
| class AnnotationSourceType(StrEnum): | ||
| METADATA = "Metadata" | ||
| NUCLEOTIDE_SEQUENCE = "NucleotideSequence" | ||
| SUBMITTED_FILE = "SubmittedFile" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this name is a bit confusing as it implies this is a file submitted by the user but annotations are created by the prepro pipeline, I think just "File" is ok |
||
|
|
||
|
|
||
| @dataclass(frozen=True) | ||
|
|
@@ -74,6 +81,12 @@ def from_single(cls, name: str, type, message: str): | |
| return cls.from_fields([name], [name], type, message) | ||
|
|
||
|
|
||
| @dataclass | ||
| class FileIdAndName: | ||
| fileId: str # noqa: N815 | ||
| name: str | ||
|
|
||
|
|
||
| @dataclass | ||
| class UnprocessedData: | ||
| submitter: str | ||
|
|
@@ -82,6 +95,7 @@ class UnprocessedData: | |
| submissionId: str # noqa: N815 | ||
| metadata: InputMetadata | ||
| unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None] # noqa: N815 | ||
| files: dict[FileCategory, list[FileIdAndName]] | None | ||
|
|
||
|
|
||
| @dataclass | ||
|
|
@@ -97,6 +111,7 @@ class UnprocessedEntry: | |
| @dataclass | ||
| class UnprocessedAfterNextclade: | ||
| inputMetadata: InputMetadata # noqa: N815 | ||
| files: dict[FileCategory, list[FileIdAndName]] | None | ||
| # Derived metadata produced by Nextclade | ||
| nextcladeMetadata: dict[SequenceName, Any] | None # noqa: N815 | ||
| unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None] # noqa: N815 | ||
|
|
@@ -109,22 +124,16 @@ class UnprocessedAfterNextclade: | |
| warnings: list[ProcessingAnnotation] | ||
|
|
||
|
|
||
| @dataclass | ||
| class FileIdAndName: | ||
| fileId: str # noqa: N815 | ||
| name: str | ||
|
|
||
|
|
||
| @dataclass | ||
| class ProcessedData: | ||
| metadata: ProcessedMetadata | ||
| files: dict[FileCategory, list[FileIdAndName]] | None | ||
| unalignedNucleotideSequences: dict[SequenceName, Any] # noqa: N815 | ||
| alignedNucleotideSequences: dict[SequenceName, Any] # noqa: N815 | ||
| nucleotideInsertions: dict[SequenceName, Any] # noqa: N815 | ||
| alignedAminoAcidSequences: dict[GeneName, Any] # noqa: N815 | ||
| aminoAcidInsertions: dict[GeneName, Any] # noqa: N815 | ||
| sequenceNameToFastaId: dict[SequenceName, FastaId] # noqa: N815 | ||
| files: dict[str, list[FileIdAndName]] | None = None | ||
|
|
||
|
|
||
| @dataclass | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,7 @@ | |
| AminoAcidSequence, | ||
| AnnotationSource, | ||
| AnnotationSourceType, | ||
| FileCategory, | ||
| FileIdAndName, | ||
| GeneName, | ||
| InputData, | ||
|
|
@@ -60,6 +61,7 @@ | |
| process_mutations_from_clade_founder, | ||
| process_phenotype_values, | ||
| process_stop_codons, | ||
| validate_raw_reads_submission, | ||
| ) | ||
| from .sequence_checks import error_on_excess_sequences, errors_if_non_iupac | ||
|
|
||
|
|
@@ -296,6 +298,7 @@ def processed_entry_no_alignment( # noqa: PLR0913, PLR0917 | |
| version=version_from_str(accession_version), | ||
| data=ProcessedData( | ||
| metadata=output_metadata, | ||
| files=unprocessed.files, | ||
| unalignedNucleotideSequences=unprocessed.unalignedNucleotideSequences, | ||
| alignedNucleotideSequences=aligned_nucleotide_sequences, | ||
| nucleotideInsertions=nucleotide_insertions, | ||
|
|
@@ -435,6 +438,27 @@ def get_output_metadata( | |
| return output_metadata, errors, warnings | ||
|
|
||
|
|
||
| def process_submitted_files( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe worth moving the file related functions (this one and validate raw reads) to a new file to not grow this 2k line monstrosity even further |
||
| file_mapping: dict[FileCategory, list[FileIdAndName]], | ||
| ) -> tuple[list[ProcessingAnnotation], list[ProcessingAnnotation]]: | ||
| errors: list[ProcessingAnnotation] = [] | ||
| warnings: list[ProcessingAnnotation] = [] | ||
|
|
||
| for category, files in file_mapping.items(): | ||
| if not files: | ||
| # Backend always includes a key with empty list for enabled categories | ||
| continue | ||
| match category: | ||
| case FileCategory.RAW_READS: | ||
| rr_errors, rr_warnings = validate_raw_reads_submission(files) | ||
| errors.extend(rr_errors) | ||
| warnings.extend(rr_warnings) | ||
| case _: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes currently when we add a new input file category, this match statement will need to be updated to handle it. That is as intended (see discussion of alternative in the PR description)
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm wondering if this should emit a hard exception, as (if my understanding is correct) a file category could be added to the
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess a hard exception would prevent other categories being allowed for other users of loculus which might not be a good thing |
||
| logger.warning(f"Submitted file is of unexpected category {category}") | ||
|
|
||
| return errors, warnings | ||
|
|
||
|
|
||
| def alignment_errors_warnings( | ||
| unprocessed: UnprocessedAfterNextclade, | ||
| config: Config, | ||
|
|
@@ -529,11 +553,14 @@ def process_single( | |
| accession_version, unprocessed, config | ||
| ) | ||
|
|
||
| file_errors, file_warnings = process_submitted_files(unprocessed.files or {}) | ||
|
|
||
| processed_entry = ProcessedEntry( | ||
| accession=accession_from_str(accession_version), | ||
| version=version_from_str(accession_version), | ||
| data=ProcessedData( | ||
| metadata=output_metadata, | ||
| files=unprocessed.files, | ||
| unalignedNucleotideSequences=unprocessed.unalignedNucleotideSequences, | ||
| alignedNucleotideSequences=unprocessed.alignedNucleotideSequences, | ||
| nucleotideInsertions=unprocessed.nucleotideInsertions, | ||
|
|
@@ -548,9 +575,12 @@ def process_single( | |
| + max_seq_errors | ||
| + alignment_errors | ||
| + metadata_errors | ||
| + file_errors | ||
| ) | ||
| ), | ||
| warnings=list(set(unprocessed.warnings + alignment_warnings + metadata_warnings)), | ||
| warnings=list( | ||
| set(unprocessed.warnings + alignment_warnings + metadata_warnings + file_warnings) | ||
| ), | ||
| ) | ||
|
|
||
| return SubmissionData( | ||
|
|
@@ -578,12 +608,16 @@ def process_single_unaligned( | |
| accession_version, unprocessed, config | ||
| ) | ||
|
|
||
| file_errors, file_warnings = process_submitted_files(unprocessed.files or {}) | ||
|
|
||
| return processed_entry_no_alignment( | ||
| accession_version=accession_version, | ||
| unprocessed=unprocessed, | ||
| output_metadata=output_metadata, | ||
| errors=list(set(iupac_errors + metadata_errors + segment_assignment.alert.errors)), | ||
| warnings=list(set(metadata_warnings)), | ||
| errors=list( | ||
| set(iupac_errors + metadata_errors + segment_assignment.alert.errors + file_errors) | ||
| ), | ||
| warnings=list(set(metadata_warnings + file_warnings)), | ||
| sequenceNameToFastaId=segment_assignment.sequenceNameToFastaId, | ||
| ) | ||
|
|
||
|
|
@@ -595,6 +629,7 @@ def processed_entry_with_errors(id) -> SubmissionData: | |
| version=version_from_str(id), | ||
| data=ProcessedData( | ||
| metadata=dict[str, ProcessedMetadataValue](), | ||
| files=None, | ||
| unalignedNucleotideSequences=defaultdict(dict[str, Any]), | ||
| alignedNucleotideSequences=defaultdict(dict[str, Any]), | ||
| nucleotideInsertions=defaultdict(dict[str, Any]), | ||
|
|
@@ -660,9 +695,11 @@ def upload_flatfiles(processed: Sequence[SubmissionData], config: Config) -> Non | |
| file_id = upload_info.fileId | ||
| url = upload_info.url | ||
| upload_embl_file_to_presigned_url(file_content, url) | ||
| submission_data.processed_entry.data.files = { | ||
| "annotations": [FileIdAndName(fileId=file_id, name=file_name)] | ||
| } | ||
| processed_files = submission_data.processed_entry.data.files or {} | ||
| processed_files.setdefault(FileCategory.ANNOTATIONS, []).append( | ||
| FileIdAndName(fileId=file_id, name=file_name) | ||
| ) | ||
| submission_data.processed_entry.data.files = processed_files | ||
| except Exception as e: | ||
| logger.error("Error creating or uploading EMBL file: %s", e) | ||
| submission_data.processed_entry.errors.append( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,7 @@ | |
| from .datatypes import ( | ||
| AnnotationSource, | ||
| AnnotationSourceType, | ||
| FileIdAndName, | ||
| FunctionArgs, | ||
| InputData, | ||
| InputMetadata, | ||
|
|
@@ -1948,6 +1949,39 @@ def single_metadata_annotation( | |
| ] | ||
|
|
||
|
|
||
| def validate_raw_reads_submission( | ||
| files: list[FileIdAndName], | ||
| ) -> tuple[list[ProcessingAnnotation], list[ProcessingAnnotation]]: | ||
| errors: list[ProcessingAnnotation] = [] | ||
| warnings: list[ProcessingAnnotation] = [] | ||
|
|
||
| if len(files) > 2: # noqa: PLR2004 | ||
| message = f"Raw reads must be submitted as one or two files, got {len(files)}" | ||
|
maverbiest marked this conversation as resolved.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a message for submitters so I would actually state we want to have paired-end or single-end raw reads, and thus accept a max of 2 files. |
||
| errors.append( | ||
| ProcessingAnnotation.from_fields( | ||
| input_fields=[f.name for f in files], | ||
| output_fields=[f.name for f in files], | ||
| type=AnnotationSourceType.SUBMITTED_FILE, | ||
| message=message, | ||
| ) | ||
| ) | ||
|
|
||
| allowed_extensions = [".fastq", ".fastq.gz", ".fq", ".fq.gz"] | ||
| for file in files: | ||
| if not any(file.name.endswith(extension) for extension in allowed_extensions): | ||
|
maverbiest marked this conversation as resolved.
|
||
| message = ( | ||
| f"Raw reads file '{file.name}' has unrecognized extension." | ||
| f" Allowed extensions: {', '.join(allowed_extensions)}" | ||
| ) | ||
| errors.append( | ||
| ProcessingAnnotation.from_single( | ||
| name=file.name, type=AnnotationSourceType.SUBMITTED_FILE, message=message | ||
| ) | ||
| ) | ||
|
|
||
| return errors, warnings | ||
|
|
||
|
|
||
| def process_frameshifts(input: str | None) -> InputData: | ||
| """Converts frameshift string to InputData for processing""" | ||
| try: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This file caught some stray updates from running the formatter