diff --git a/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt b/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt index 4ee737b238..6340f660b9 100644 --- a/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt +++ b/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt @@ -284,6 +284,7 @@ data class PreprocessingAnnotationSource( enum class PreprocessingAnnotationSourceType { Metadata, NucleotideSequence, + SubmittedFile, } data class GetSequenceResponse( diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index ac55e61ca3..76f821fd83 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -80,6 +80,11 @@ defaultOrganismConfig: &defaultOrganismConfig submissionDataTypes: &defaultSubmissionDataTypes consensusSequences: true maxSequencesPerEntry: 1 + files: + enabled: true + categories: + - name: raw_reads + displayName: Raw reads loadSequencesAutomatically: true earliestReleaseDate: enabled: true @@ -89,6 +94,8 @@ defaultOrganismConfig: &defaultOrganismConfig files: - name: annotations displayName: Annotations + - name: raw_reads + displayName: Raw reads ### Field list ## General fields # name: Key used across app to refer to this field (required) diff --git a/preprocessing/dummy/main.py b/preprocessing/dummy/main.py index e3a2b9cc14..647c8b883a 100644 --- a/preprocessing/dummy/main.py +++ b/preprocessing/dummy/main.py @@ -129,7 +129,7 @@ def parse_ndjson(ndjson_data: str) -> list[Sequence]: def process(unprocessed: list[Sequence]) -> list[Sequence]: - with open("mock-sequences.json", "r") as f: + with open("mock-sequences.json") as f: mock_sequences = json.load(f) possible_lineages = ["A.1", "A.1.1", "A.2"] @@ -160,7 +160,7 @@ def process(unprocessed: list[Sequence]) -> list[Sequence]: "nucleotideInsertions": {}, "aminoAcidInsertions": {}, } - + if not disableConsensusSequences: data = {**data, **mock_sequences} data["sequenceNameToFastaId"] = {"main": submissionId} diff --git a/preprocessing/nextclade/src/loculus_preprocessing/backend.py b/preprocessing/nextclade/src/loculus_preprocessing/backend.py index 099ebc997d..9b362ec580 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/backend.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/backend.py @@ -17,6 +17,7 @@ from .config import Config from .datatypes import ( + FileIdAndName, FileUploadInfo, ProcessedEntry, UnprocessedData, @@ -93,6 +94,15 @@ def parse_ndjson(ndjson_data: str) -> Sequence[UnprocessedEntry]: key: trim_ns(value) if value else None for key, value in unaligned_nucleotide_sequences.items() } + submitted_files = json_object["data"].get("files") + file_mapping = ( + { + category: [FileIdAndName(fileId=f["fileId"], name=f["name"]) for f in files] + for category, files in submitted_files.items() + } + if submitted_files + else None + ) unprocessed_data = UnprocessedData( submitter=json_object["submitter"], group_id=json_object["groupId"], @@ -102,6 +112,7 @@ def parse_ndjson(ndjson_data: str) -> Sequence[UnprocessedEntry]: unalignedNucleotideSequences=trimmed_unaligned_nucleotide_sequences if unaligned_nucleotide_sequences else {}, + files=file_mapping, ) entry = UnprocessedEntry( accessionVersion=f"{json_object['accession']}.{json_object['version']}", diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py index a3f2129652..a59abb50c7 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py @@ -27,10 +27,17 @@ ProcessingAnnotationAlignment: Final = "alignment" +@unique +class FileCategory(StrEnum): + RAW_READS = "raw_reads" + ANNOTATIONS = "annotations" + + @unique class AnnotationSourceType(StrEnum): METADATA = "Metadata" NUCLEOTIDE_SEQUENCE = "NucleotideSequence" + SUBMITTED_FILE = "SubmittedFile" @dataclass(frozen=True) @@ -74,6 +81,12 @@ def from_single(cls, name: str, type, message: str): return cls.from_fields([name], [name], type, message) +@dataclass +class FileIdAndName: + fileId: str # noqa: N815 + name: str + + @dataclass class UnprocessedData: submitter: str @@ -82,6 +95,7 @@ class UnprocessedData: submissionId: str # noqa: N815 metadata: InputMetadata unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None] # noqa: N815 + files: dict[FileCategory, list[FileIdAndName]] | None @dataclass @@ -97,6 +111,7 @@ class UnprocessedEntry: @dataclass class UnprocessedAfterNextclade: inputMetadata: InputMetadata # noqa: N815 + files: dict[FileCategory, list[FileIdAndName]] | None # Derived metadata produced by Nextclade nextcladeMetadata: dict[SequenceName, Any] | None # noqa: N815 unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None] # noqa: N815 @@ -109,22 +124,16 @@ class UnprocessedAfterNextclade: warnings: list[ProcessingAnnotation] -@dataclass -class FileIdAndName: - fileId: str # noqa: N815 - name: str - - @dataclass class ProcessedData: metadata: ProcessedMetadata + files: dict[FileCategory, list[FileIdAndName]] | None unalignedNucleotideSequences: dict[SequenceName, Any] # noqa: N815 alignedNucleotideSequences: dict[SequenceName, Any] # noqa: N815 nucleotideInsertions: dict[SequenceName, Any] # noqa: N815 alignedAminoAcidSequences: dict[GeneName, Any] # noqa: N815 aminoAcidInsertions: dict[GeneName, Any] # noqa: N815 sequenceNameToFastaId: dict[SequenceName, FastaId] # noqa: N815 - files: dict[str, list[FileIdAndName]] | None = None @dataclass diff --git a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py index 7cca9f7835..8545472b63 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py @@ -26,6 +26,8 @@ AminoAcidSequence, AnnotationSourceType, FastaId, + FileCategory, + FileIdAndName, GeneName, GenericSequence, NucleotideInsertion, @@ -801,6 +803,9 @@ def enrich_with_nextclade( # noqa: PLR0914 } for entry in unprocessed } + input_files: dict[AccessionVersion, dict[FileCategory, list[FileIdAndName]] | None] = { + entry.accessionVersion: entry.data.files for entry in unprocessed + } batch = assign_segment_for_alignment(unprocessed, config=config, dataset_dir=dataset_dir) unaligned_nucleotide_sequences = batch.unalignedNucleotideSequences @@ -893,6 +898,7 @@ def enrich_with_nextclade( # noqa: PLR0914 return { id: UnprocessedAfterNextclade( inputMetadata=input_metadata[id], + files=input_files[id], nextcladeMetadata=nextclade_metadata[id], unalignedNucleotideSequences=unaligned_nucleotide_sequences[id], alignedNucleotideSequences=aligned_nucleotide_sequences[id], diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 8a752c9982..be70462c18 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -26,6 +26,7 @@ AminoAcidSequence, AnnotationSource, AnnotationSourceType, + FileCategory, FileIdAndName, GeneName, InputData, @@ -60,6 +61,7 @@ process_mutations_from_clade_founder, process_phenotype_values, process_stop_codons, + validate_raw_reads_submission, ) from .sequence_checks import error_on_excess_sequences, errors_if_non_iupac @@ -296,6 +298,7 @@ def processed_entry_no_alignment( # noqa: PLR0913, PLR0917 version=version_from_str(accession_version), data=ProcessedData( metadata=output_metadata, + files=unprocessed.files, unalignedNucleotideSequences=unprocessed.unalignedNucleotideSequences, alignedNucleotideSequences=aligned_nucleotide_sequences, nucleotideInsertions=nucleotide_insertions, @@ -435,6 +438,27 @@ def get_output_metadata( return output_metadata, errors, warnings +def process_submitted_files( + file_mapping: dict[FileCategory, list[FileIdAndName]], +) -> tuple[list[ProcessingAnnotation], list[ProcessingAnnotation]]: + errors: list[ProcessingAnnotation] = [] + warnings: list[ProcessingAnnotation] = [] + + for category, files in file_mapping.items(): + if not files: + # Backend always includes a key with empty list for enabled categories + continue + match category: + case FileCategory.RAW_READS: + rr_errors, rr_warnings = validate_raw_reads_submission(files) + errors.extend(rr_errors) + warnings.extend(rr_warnings) + case _: + logger.warning(f"Submitted file is of unexpected category {category}") + + return errors, warnings + + def alignment_errors_warnings( unprocessed: UnprocessedAfterNextclade, config: Config, @@ -529,11 +553,14 @@ def process_single( accession_version, unprocessed, config ) + file_errors, file_warnings = process_submitted_files(unprocessed.files or {}) + processed_entry = ProcessedEntry( accession=accession_from_str(accession_version), version=version_from_str(accession_version), data=ProcessedData( metadata=output_metadata, + files=unprocessed.files, unalignedNucleotideSequences=unprocessed.unalignedNucleotideSequences, alignedNucleotideSequences=unprocessed.alignedNucleotideSequences, nucleotideInsertions=unprocessed.nucleotideInsertions, @@ -548,9 +575,12 @@ def process_single( + max_seq_errors + alignment_errors + metadata_errors + + file_errors ) ), - warnings=list(set(unprocessed.warnings + alignment_warnings + metadata_warnings)), + warnings=list( + set(unprocessed.warnings + alignment_warnings + metadata_warnings + file_warnings) + ), ) return SubmissionData( @@ -578,12 +608,16 @@ def process_single_unaligned( accession_version, unprocessed, config ) + file_errors, file_warnings = process_submitted_files(unprocessed.files or {}) + return processed_entry_no_alignment( accession_version=accession_version, unprocessed=unprocessed, output_metadata=output_metadata, - errors=list(set(iupac_errors + metadata_errors + segment_assignment.alert.errors)), - warnings=list(set(metadata_warnings)), + errors=list( + set(iupac_errors + metadata_errors + segment_assignment.alert.errors + file_errors) + ), + warnings=list(set(metadata_warnings + file_warnings)), sequenceNameToFastaId=segment_assignment.sequenceNameToFastaId, ) @@ -595,6 +629,7 @@ def processed_entry_with_errors(id) -> SubmissionData: version=version_from_str(id), data=ProcessedData( metadata=dict[str, ProcessedMetadataValue](), + files=None, unalignedNucleotideSequences=defaultdict(dict[str, Any]), alignedNucleotideSequences=defaultdict(dict[str, Any]), nucleotideInsertions=defaultdict(dict[str, Any]), @@ -660,9 +695,11 @@ def upload_flatfiles(processed: Sequence[SubmissionData], config: Config) -> Non file_id = upload_info.fileId url = upload_info.url upload_embl_file_to_presigned_url(file_content, url) - submission_data.processed_entry.data.files = { - "annotations": [FileIdAndName(fileId=file_id, name=file_name)] - } + processed_files = submission_data.processed_entry.data.files or {} + processed_files.setdefault(FileCategory.ANNOTATIONS, []).append( + FileIdAndName(fileId=file_id, name=file_name) + ) + submission_data.processed_entry.data.files = processed_files except Exception as e: logger.error("Error creating or uploading EMBL file: %s", e) submission_data.processed_entry.errors.append( diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index e2428bd2b7..aa5dc32903 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -25,6 +25,7 @@ from .datatypes import ( AnnotationSource, AnnotationSourceType, + FileIdAndName, FunctionArgs, InputData, InputMetadata, @@ -1948,6 +1949,39 @@ def single_metadata_annotation( ] +def validate_raw_reads_submission( + files: list[FileIdAndName], +) -> tuple[list[ProcessingAnnotation], list[ProcessingAnnotation]]: + errors: list[ProcessingAnnotation] = [] + warnings: list[ProcessingAnnotation] = [] + + if len(files) > 2: # noqa: PLR2004 + message = f"Raw reads must be submitted as one or two files, got {len(files)}" + errors.append( + ProcessingAnnotation.from_fields( + input_fields=[f.name for f in files], + output_fields=[f.name for f in files], + type=AnnotationSourceType.SUBMITTED_FILE, + message=message, + ) + ) + + allowed_extensions = [".fastq", ".fastq.gz", ".fq", ".fq.gz"] + for file in files: + if not any(file.name.endswith(extension) for extension in allowed_extensions): + message = ( + f"Raw reads file '{file.name}' has unrecognized extension." + f" Allowed extensions: {', '.join(allowed_extensions)}" + ) + errors.append( + ProcessingAnnotation.from_single( + name=file.name, type=AnnotationSourceType.SUBMITTED_FILE, message=message + ) + ) + + return errors, warnings + + def process_frameshifts(input: str | None) -> InputData: """Converts frameshift string to InputData for processing""" try: diff --git a/preprocessing/nextclade/tests/factory_methods.py b/preprocessing/nextclade/tests/factory_methods.py index 41b26062ae..679601e744 100644 --- a/preprocessing/nextclade/tests/factory_methods.py +++ b/preprocessing/nextclade/tests/factory_methods.py @@ -9,6 +9,8 @@ from loculus_preprocessing.datatypes import ( AnnotationSource, AnnotationSourceType, + FileCategory, + FileIdAndName, NucleotideSequence, ProcessedData, ProcessedEntry, @@ -77,6 +79,7 @@ def create_unprocessed_entry( accession_id: str, sequences: dict[SegmentName, NucleotideSequence | None], group_id: int = 2, + files: dict[FileCategory, list[FileIdAndName]] | None = None, ) -> UnprocessedEntry: return UnprocessedEntry( accessionVersion=f"LOC_{accession_id}.1", @@ -89,6 +92,7 @@ def create_unprocessed_entry( group_id=group_id, metadata=metadata_dict, unalignedNucleotideSequences=sequences, + files=files, ), ) @@ -130,6 +134,7 @@ def create_processed_entry( errors: list[ProcessingAnnotation] | None = None, warnings: list[ProcessingAnnotation] | None = None, processed_alignment: ProcessedAlignment | None = None, + files: dict[FileCategory, list[FileIdAndName]] | None = None, ) -> ProcessedEntry: if errors is None: errors = [] @@ -147,6 +152,7 @@ def create_processed_entry( version=1, data=ProcessedData( metadata=base_metadata_dict, + files=files, unalignedNucleotideSequences=processed_alignment.unalignedNucleotideSequences, alignedNucleotideSequences=processed_alignment.alignedNucleotideSequences, nucleotideInsertions=processed_alignment.nucleotideInsertions, @@ -163,9 +169,11 @@ def create_processed_entry( class Case: name: str input_metadata: dict[str, str | None] = field(default_factory=dict) + input_files: dict[FileCategory, list[FileIdAndName]] | None = None input_sequence: dict[str, str | None] = field(default_factory=lambda: {"main": None}) accession_id: str = "000999" expected_metadata: dict[str, ProcessedMetadataValue] = field(default_factory=dict) + expected_files: dict[FileCategory, list[FileIdAndName]] | None = None expected_errors: list[ProcessingAnnotation] | None = None expected_warnings: list[ProcessingAnnotation] | None = None expected_processed_alignment: ProcessedAlignment | None = None @@ -179,6 +187,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT accession_id=self.accession_id, sequences=self.input_sequence, group_id=self.group_id, + files=self.input_files, ) expected_output = factory_custom.create_processed_entry( metadata_dict=self.expected_metadata, @@ -186,6 +195,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT errors=self.expected_errors or [], warnings=self.expected_warnings or [], processed_alignment=self.expected_processed_alignment, + files=self.expected_files, ) return ProcessingTestCase( name=self.name, input=unprocessed_entry, expected_output=expected_output @@ -265,3 +275,8 @@ def verify_processed_entry( f"{test_name}: sequence name to fasta header map '{actual.sequenceNameToFastaId}' do not " f"match expectation '{expected.sequenceNameToFastaId}'." ) + + # Check files + assert actual.files == expected.files, ( + f"{test_name}: files '{actual.files}' do not match expectation '{expected.files}'." + ) diff --git a/preprocessing/nextclade/tests/test_host_name_validation.py b/preprocessing/nextclade/tests/test_host_name_validation.py index d97cdbe97e..62cdf75768 100644 --- a/preprocessing/nextclade/tests/test_host_name_validation.py +++ b/preprocessing/nextclade/tests/test_host_name_validation.py @@ -34,6 +34,7 @@ def make_entry(metadata: dict, group_id: int) -> UnprocessedEntry: group_id=group_id, metadata=metadata, unalignedNucleotideSequences={}, + files=None, ), ) diff --git a/preprocessing/nextclade/tests/test_metadata_processing_functions.py b/preprocessing/nextclade/tests/test_metadata_processing_functions.py index d17ae42244..f3961c4beb 100644 --- a/preprocessing/nextclade/tests/test_metadata_processing_functions.py +++ b/preprocessing/nextclade/tests/test_metadata_processing_functions.py @@ -783,6 +783,7 @@ def test_preprocessing_without_consensus_sequences(config: Config) -> None: "name_required": sequence_name, }, unalignedNucleotideSequences={}, + files=None, ), ) diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index 05edf3a4cf..f1fc7fc641 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -26,6 +26,8 @@ ) from loculus_preprocessing.datatypes import ( AnnotationSourceType, + FileCategory, + FileIdAndName, SegmentClassificationMethod, SubmissionData, UnprocessedData, @@ -284,6 +286,141 @@ def invalid_sequence() -> str: sequenceNameToFastaId={"main": "fastaHeader"}, ), ), + Case( + name="with file", + input_metadata={}, + input_files={ + FileCategory.RAW_READS: [ + FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"), + FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"), + ] + }, + input_sequence={"fastaHeader": sequence_with_mutation("single")}, + accession_id="1", + expected_metadata={ + "completeness": 1.0, + "totalInsertedNucs": 0, + "totalSnps": 1, + "totalDeletedNucs": 0, + "length": len(consensus_sequence("single")), + "nonExistentField": "None", + "variant": True, + }, + expected_files={ + FileCategory.RAW_READS: [ + FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"), + FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"), + ] + }, + expected_errors=[], + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={"main": sequence_with_mutation("single")}, + alignedNucleotideSequences={"main": sequence_with_mutation("single")}, + nucleotideInsertions={}, + alignedAminoAcidSequences={ + "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"), + "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"), + }, + aminoAcidInsertions={}, + sequenceNameToFastaId={"main": "fastaHeader"}, + ), + ), + Case( + name="with too many raw read files", + input_metadata={}, + input_files={ + FileCategory.RAW_READS: [ + FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"), + FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"), + FileIdAndName(fileId="file-id-0003", name="reads_R3.fastq"), + ] + }, + input_sequence={"fastaHeader": sequence_with_mutation("single")}, + accession_id="1", + expected_metadata={ + "completeness": 1.0, + "totalInsertedNucs": 0, + "totalSnps": 1, + "totalDeletedNucs": 0, + "length": len(consensus_sequence("single")), + "nonExistentField": "None", + "variant": True, + }, + expected_files={ + FileCategory.RAW_READS: [ + FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"), + FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"), + FileIdAndName(fileId="file-id-0003", name="reads_R3.fastq"), + ] + }, + expected_errors=build_processing_annotations( + [ + ProcessingAnnotationHelper( + ["reads_R1.fastq", "reads_R2.fastq", "reads_R3.fastq"], + ["reads_R1.fastq", "reads_R2.fastq", "reads_R3.fastq"], + "Raw reads must be submitted as one or two files, got 3", + AnnotationSourceType.SUBMITTED_FILE, + ) + ] + ), + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={"main": sequence_with_mutation("single")}, + alignedNucleotideSequences={"main": sequence_with_mutation("single")}, + nucleotideInsertions={}, + alignedAminoAcidSequences={ + "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"), + "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"), + }, + aminoAcidInsertions={}, + sequenceNameToFastaId={"main": "fastaHeader"}, + ), + ), + Case( + name="with unrecognized raw read file extension", + input_metadata={}, + input_files={ + FileCategory.RAW_READS: [FileIdAndName(fileId="file-id-0001", name="reads.txt")] + }, + input_sequence={"fastaHeader": sequence_with_mutation("single")}, + accession_id="1", + expected_metadata={ + "completeness": 1.0, + "totalInsertedNucs": 0, + "totalSnps": 1, + "totalDeletedNucs": 0, + "length": len(consensus_sequence("single")), + "nonExistentField": "None", + "variant": True, + }, + expected_files={ + FileCategory.RAW_READS: [FileIdAndName(fileId="file-id-0001", name="reads.txt")] + }, + expected_errors=build_processing_annotations( + [ + ProcessingAnnotationHelper( + ["reads.txt"], + ["reads.txt"], + "Raw reads file 'reads.txt' has unrecognized extension. " + "Allowed extensions: .fastq, .fastq.gz, .fq, .fq.gz", + AnnotationSourceType.SUBMITTED_FILE, + ) + ] + ), + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={"main": sequence_with_mutation("single")}, + alignedNucleotideSequences={"main": sequence_with_mutation("single")}, + nucleotideInsertions={}, + alignedAminoAcidSequences={ + "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"), + "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"), + }, + aminoAcidInsertions={}, + sequenceNameToFastaId={"main": "fastaHeader"}, + ), + ), ] single_segment_failed_case_definitions = [ @@ -1134,6 +1271,51 @@ def invalid_sequence() -> str: sequenceNameToFastaId={"ebola-sudan": "ebola-sudan"}, ), ), + Case( + name="validate raw read files when sequences are not aligned", + input_metadata={}, + input_sequence={ + "prefix_ebola-sudan": sequence_with_mutation("ebola-sudan"), + "other_prefix_ebola-zaire": sequence_with_mutation("ebola-zaire"), + }, + accession_id="1", + input_files={ + FileCategory.RAW_READS: [FileIdAndName(fileId="file-id-0001", name="reads.txt")] + }, + expected_files={ + FileCategory.RAW_READS: [FileIdAndName(fileId="file-id-0001", name="reads.txt")] + }, + expected_metadata={ + "length_ebola-sudan": len(consensus_sequence("ebola-sudan")), + "length_ebola-zaire": len(consensus_sequence("ebola-zaire")), + }, + expected_errors=build_processing_annotations( + [ + ProcessingAnnotationHelper( + ["reads.txt"], + ["reads.txt"], + "Raw reads file 'reads.txt' has unrecognized extension. " + "Allowed extensions: .fastq, .fastq.gz, .fq, .fq.gz", + AnnotationSourceType.SUBMITTED_FILE, + ) + ] + ), + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={ + "ebola-sudan": sequence_with_mutation("ebola-sudan"), + "ebola-zaire": sequence_with_mutation("ebola-zaire"), + }, + alignedNucleotideSequences={}, + nucleotideInsertions={}, + alignedAminoAcidSequences={}, + aminoAcidInsertions={}, + sequenceNameToFastaId={ + "ebola-sudan": "prefix_ebola-sudan", + "ebola-zaire": "other_prefix_ebola-zaire", + }, + ), + ), ] @@ -1285,6 +1467,7 @@ def test_max_sequences_per_entry_batch_isolation() -> None: "ebola-sudan": sequence_with_mutation("ebola-sudan"), "ebola-zaire": sequence_with_mutation("ebola-zaire"), }, + files=None, ), ) @@ -1299,6 +1482,7 @@ def test_max_sequences_per_entry_batch_isolation() -> None: unalignedNucleotideSequences={ "ebola-sudan": sequence_with_mutation("ebola-sudan"), }, + files=None, ), ) @@ -1332,6 +1516,7 @@ def test_preprocessing_without_metadata() -> None: "ebola-sudan": sequence_with_mutation("ebola-sudan"), "ebola-zaire": sequence_with_mutation("ebola-zaire"), }, + files=None, ), ) @@ -1455,6 +1640,7 @@ def test_create_flatfile(): "authors": "Smith, Doe A;", }, unalignedNucleotideSequences={"main": sequence_with_mutation("single")}, + files=None, ), ) diff --git a/website/src/types/backend.ts b/website/src/types/backend.ts index 225da65ecc..da2d2c2f73 100644 --- a/website/src/types/backend.ts +++ b/website/src/types/backend.ts @@ -24,7 +24,11 @@ export const sequenceEntryProcessingResultNames = z.union([ ]); export type SequenceEntryProcessingResultNames = z.infer; -const processingAnnotationSourceType = z.union([z.literal('Metadata'), z.literal('NucleotideSequence')]); +const processingAnnotationSourceType = z.union([ + z.literal('Metadata'), + z.literal('NucleotideSequence'), + z.literal('SubmittedFile'), +]); export type ProcessingAnnotationSourceType = z.infer; const processingAnnotation = z.object({