From 48870daf108e130aef1058edd5d09fe69e9ae02c Mon Sep 17 00:00:00 2001 From: maverbiest Date: Fri, 12 Jun 2026 15:27:30 +0200 Subject: [PATCH 01/11] Enable raw reads sharing by default --- kubernetes/loculus/values.yaml | 34 +++++----------------------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index ac55e61ca3..02eafc0a58 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -80,6 +80,11 @@ defaultOrganismConfig: &defaultOrganismConfig submissionDataTypes: &defaultSubmissionDataTypes consensusSequences: true maxSequencesPerEntry: 1 + files: + enabled: true + categories: + - name: raw_reads + displayName: Raw reads loadSequencesAutomatically: true earliestReleaseDate: enabled: true @@ -1886,35 +1891,6 @@ defaultOrganisms: sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/sars-cov-2/ORF9b.fasta]]" - name: S sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/sars-cov-2/S.fasta]]" - dummy-organism-with-files: - schema: - image: "/images/organisms/hmpv.jpg" - organismName: "Test organism (with files)" - submissionDataTypes: - consensusSequences: false - files: - enabled: true - categories: - - name: raw_reads - displayName: Raw reads - files: - - name: raw_reads - displayName: Raw reads - metadata: *dummyMetadata - website: - tableColumns: - - country - - division - - date - defaultOrder: descending - defaultOrderBy: date - preprocessing: - - version: 1 - image: ghcr.io/loculus-project/preprocessing-dummy - args: - - "--watch" - - "--disableConsensusSequences" - referenceGenomes: [] not-aligned-organism: enabled: true schema: From fd2501014bc6babbec1ee66d25244be09ad1b4fd Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 15 Jun 2026 09:42:40 +0200 Subject: [PATCH 02/11] Add back dummy organism with files --- kubernetes/loculus/values.yaml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git 
a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 02eafc0a58..525ca21d3b 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -1891,6 +1891,35 @@ defaultOrganisms: sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/sars-cov-2/ORF9b.fasta]]" - name: S sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/sars-cov-2/S.fasta]]" + dummy-organism-with-files: + schema: + image: "/images/organisms/hmpv.jpg" + organismName: "Test organism (with files)" + submissionDataTypes: + consensusSequences: false + files: + enabled: true + categories: + - name: raw_reads + displayName: Raw reads + files: + - name: raw_reads + displayName: Raw reads + metadata: *dummyMetadata + website: + tableColumns: + - country + - division + - date + defaultOrder: descending + defaultOrderBy: date + preprocessing: + - version: 1 + image: ghcr.io/loculus-project/preprocessing-dummy + args: + - "--watch" + - "--disableConsensusSequences" + referenceGenomes: [] not-aligned-organism: enabled: true schema: From 91f4f2ea041feda2a423abcf96ac308bafdc6e1a Mon Sep 17 00:00:00 2001 From: maverbiest <59956362+maverbiest@users.noreply.github.com> Date: Thu, 25 Jun 2026 14:54:14 +0200 Subject: [PATCH 03/11] feat(prepro): Pass files through nextclade pipeline (#6731) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Partly resolves https://github.com/loculus-project/loculus/issues/6758 Currently, user-submitted data are not read in by the nextclade preprocessing pipeline, causing them to be dropped from the submission process. Since we're now working to implement file sharing on Loculus, we need to make it so user-submitted files make it through the preprocessing pipeline as well. This PR makes it so user-submitted files are forwarded through the nextclade pipeline without any processing or checking of file contents.
Future PRs will add functionality to, for example, check for host sequences in submitted raw reads. ## Implementation - `parse_ndjson` now also parses the file-related information sent to preprocessing by the backend - `UnprocessedData` and `UnprocessedAfterNextclade` both get a `files` attribute - test factories in `factory_methods.py` can now be given files to add to test objects, also added a test case in `test_nextclade_preprocessing.py` that carries file information ## Manual testing I created a preview to test whether files now make it through the nextclade pipeline. When I submit sequences to the preview with attached raw reads, this file now appears in the submission review page: grafik And also on the sequence details page after the sequence is released: grafik ## Open questions One thing I ran into when implementing this is that you need to add file categories two times in the config: one time under `submissionDataTypes` (file categories that users are allowed to submit) and then again under a top-level `files` field (files accepted as outputs of prepro pipelines): ``` defaultOrganismConfig: &defaultOrganismConfig schema: &schema submissionDataTypes: &defaultSubmissionDataTypes consensusSequences: true maxSequencesPerEntry: 1 files: enabled: true categories: - name: raw_reads displayName: Raw reads ... files: - name: annotations displayName: Annotations - name: raw_reads displayName: Raw reads ``` Would it be nicer to always allow file categories listed under `submissionDataTypes.files` to be output by prepro? Or will we ever have cases where users submit one thing, prepro processes it, and then outputs another filetype? Probably safest to keep as-is for now but just wanted to flag since not doing this properly got me into a weird state where submissions stay in 'processing' indefinitely but never error because the backend doesn't accept the preprocessing output (it only logs errors in the backend).
### PR Checklist ~- [ ] All necessary documentation has been adapted.~ - [x] The implemented feature is covered by appropriate, automated tests. - [x] Any manual testing that has been done is documented (i.e. what exactly was tested?) 🚀 Preview: https://pass-files-through.loculus.org --- kubernetes/loculus/values.yaml | 2 + .../src/loculus_preprocessing/backend.py | 11 +++++ .../src/loculus_preprocessing/datatypes.py | 17 ++++--- .../src/loculus_preprocessing/nextclade.py | 6 +++ .../src/loculus_preprocessing/prepro.py | 11 +++-- .../nextclade/tests/factory_methods.py | 15 +++++++ .../tests/test_host_name_validation.py | 1 + .../test_metadata_processing_functions.py | 1 + .../tests/test_nextclade_preprocessing.py | 45 +++++++++++++++++++ 9 files changed, 99 insertions(+), 10 deletions(-) diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 525ca21d3b..76f821fd83 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -94,6 +94,8 @@ defaultOrganismConfig: &defaultOrganismConfig files: - name: annotations displayName: Annotations + - name: raw_reads + displayName: Raw reads ### Field list ## General fields # name: Key used across app to refer to this field (required) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/backend.py b/preprocessing/nextclade/src/loculus_preprocessing/backend.py index 099ebc997d..9b362ec580 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/backend.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/backend.py @@ -17,6 +17,7 @@ from .config import Config from .datatypes import ( + FileIdAndName, FileUploadInfo, ProcessedEntry, UnprocessedData, @@ -93,6 +94,15 @@ def parse_ndjson(ndjson_data: str) -> Sequence[UnprocessedEntry]: key: trim_ns(value) if value else None for key, value in unaligned_nucleotide_sequences.items() } + submitted_files = json_object["data"].get("files") + file_mapping = ( + { + category: [FileIdAndName(fileId=f["fileId"], 
name=f["name"]) for f in files] + for category, files in submitted_files.items() + } + if submitted_files + else None + ) unprocessed_data = UnprocessedData( submitter=json_object["submitter"], group_id=json_object["groupId"], @@ -102,6 +112,7 @@ def parse_ndjson(ndjson_data: str) -> Sequence[UnprocessedEntry]: unalignedNucleotideSequences=trimmed_unaligned_nucleotide_sequences if unaligned_nucleotide_sequences else {}, + files=file_mapping, ) entry = UnprocessedEntry( accessionVersion=f"{json_object['accession']}.{json_object['version']}", diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py index a3f2129652..872c4bb059 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py @@ -23,6 +23,7 @@ InputMetadataValue = str | None InputMetadata = dict[str, InputMetadataValue] FastaId = str +FileCategory = str ProcessingAnnotationAlignment: Final = "alignment" @@ -74,6 +75,12 @@ def from_single(cls, name: str, type, message: str): return cls.from_fields([name], [name], type, message) +@dataclass +class FileIdAndName: + fileId: str # noqa: N815 + name: str + + @dataclass class UnprocessedData: submitter: str @@ -82,6 +89,7 @@ class UnprocessedData: submissionId: str # noqa: N815 metadata: InputMetadata unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None] # noqa: N815 + files: dict[FileCategory, list[FileIdAndName]] | None @dataclass @@ -97,6 +105,7 @@ class UnprocessedEntry: @dataclass class UnprocessedAfterNextclade: inputMetadata: InputMetadata # noqa: N815 + files: dict[FileCategory, list[FileIdAndName]] | None # Derived metadata produced by Nextclade nextcladeMetadata: dict[SequenceName, Any] | None # noqa: N815 unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None] # noqa: N815 @@ -109,22 +118,16 @@ class UnprocessedAfterNextclade: warnings: 
list[ProcessingAnnotation] -@dataclass -class FileIdAndName: - fileId: str # noqa: N815 - name: str - - @dataclass class ProcessedData: metadata: ProcessedMetadata + files: dict[FileCategory, list[FileIdAndName]] | None unalignedNucleotideSequences: dict[SequenceName, Any] # noqa: N815 alignedNucleotideSequences: dict[SequenceName, Any] # noqa: N815 nucleotideInsertions: dict[SequenceName, Any] # noqa: N815 alignedAminoAcidSequences: dict[GeneName, Any] # noqa: N815 aminoAcidInsertions: dict[GeneName, Any] # noqa: N815 sequenceNameToFastaId: dict[SequenceName, FastaId] # noqa: N815 - files: dict[str, list[FileIdAndName]] | None = None @dataclass diff --git a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py index 7cca9f7835..8545472b63 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py @@ -26,6 +26,8 @@ AminoAcidSequence, AnnotationSourceType, FastaId, + FileCategory, + FileIdAndName, GeneName, GenericSequence, NucleotideInsertion, @@ -801,6 +803,9 @@ def enrich_with_nextclade( # noqa: PLR0914 } for entry in unprocessed } + input_files: dict[AccessionVersion, dict[FileCategory, list[FileIdAndName]] | None] = { + entry.accessionVersion: entry.data.files for entry in unprocessed + } batch = assign_segment_for_alignment(unprocessed, config=config, dataset_dir=dataset_dir) unaligned_nucleotide_sequences = batch.unalignedNucleotideSequences @@ -893,6 +898,7 @@ def enrich_with_nextclade( # noqa: PLR0914 return { id: UnprocessedAfterNextclade( inputMetadata=input_metadata[id], + files=input_files[id], nextcladeMetadata=nextclade_metadata[id], unalignedNucleotideSequences=unaligned_nucleotide_sequences[id], alignedNucleotideSequences=aligned_nucleotide_sequences[id], diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py 
index 8a752c9982..e9d4300b8e 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -296,6 +296,7 @@ def processed_entry_no_alignment( # noqa: PLR0913, PLR0917 version=version_from_str(accession_version), data=ProcessedData( metadata=output_metadata, + files=unprocessed.files, unalignedNucleotideSequences=unprocessed.unalignedNucleotideSequences, alignedNucleotideSequences=aligned_nucleotide_sequences, nucleotideInsertions=nucleotide_insertions, @@ -534,6 +535,7 @@ def process_single( version=version_from_str(accession_version), data=ProcessedData( metadata=output_metadata, + files=unprocessed.files, unalignedNucleotideSequences=unprocessed.unalignedNucleotideSequences, alignedNucleotideSequences=unprocessed.alignedNucleotideSequences, nucleotideInsertions=unprocessed.nucleotideInsertions, @@ -595,6 +597,7 @@ def processed_entry_with_errors(id) -> SubmissionData: version=version_from_str(id), data=ProcessedData( metadata=dict[str, ProcessedMetadataValue](), + files=None, unalignedNucleotideSequences=defaultdict(dict[str, Any]), alignedNucleotideSequences=defaultdict(dict[str, Any]), nucleotideInsertions=defaultdict(dict[str, Any]), @@ -660,9 +663,11 @@ def upload_flatfiles(processed: Sequence[SubmissionData], config: Config) -> Non file_id = upload_info.fileId url = upload_info.url upload_embl_file_to_presigned_url(file_content, url) - submission_data.processed_entry.data.files = { - "annotations": [FileIdAndName(fileId=file_id, name=file_name)] - } + processed_files = submission_data.processed_entry.data.files or {} + processed_files.setdefault("annotations", []).append( + FileIdAndName(fileId=file_id, name=file_name) + ) + submission_data.processed_entry.data.files = processed_files except Exception as e: logger.error("Error creating or uploading EMBL file: %s", e) submission_data.processed_entry.errors.append( diff --git a/preprocessing/nextclade/tests/factory_methods.py 
b/preprocessing/nextclade/tests/factory_methods.py index 41b26062ae..679601e744 100644 --- a/preprocessing/nextclade/tests/factory_methods.py +++ b/preprocessing/nextclade/tests/factory_methods.py @@ -9,6 +9,8 @@ from loculus_preprocessing.datatypes import ( AnnotationSource, AnnotationSourceType, + FileCategory, + FileIdAndName, NucleotideSequence, ProcessedData, ProcessedEntry, @@ -77,6 +79,7 @@ def create_unprocessed_entry( accession_id: str, sequences: dict[SegmentName, NucleotideSequence | None], group_id: int = 2, + files: dict[FileCategory, list[FileIdAndName]] | None = None, ) -> UnprocessedEntry: return UnprocessedEntry( accessionVersion=f"LOC_{accession_id}.1", @@ -89,6 +92,7 @@ def create_unprocessed_entry( group_id=group_id, metadata=metadata_dict, unalignedNucleotideSequences=sequences, + files=files, ), ) @@ -130,6 +134,7 @@ def create_processed_entry( errors: list[ProcessingAnnotation] | None = None, warnings: list[ProcessingAnnotation] | None = None, processed_alignment: ProcessedAlignment | None = None, + files: dict[FileCategory, list[FileIdAndName]] | None = None, ) -> ProcessedEntry: if errors is None: errors = [] @@ -147,6 +152,7 @@ def create_processed_entry( version=1, data=ProcessedData( metadata=base_metadata_dict, + files=files, unalignedNucleotideSequences=processed_alignment.unalignedNucleotideSequences, alignedNucleotideSequences=processed_alignment.alignedNucleotideSequences, nucleotideInsertions=processed_alignment.nucleotideInsertions, @@ -163,9 +169,11 @@ def create_processed_entry( class Case: name: str input_metadata: dict[str, str | None] = field(default_factory=dict) + input_files: dict[FileCategory, list[FileIdAndName]] | None = None input_sequence: dict[str, str | None] = field(default_factory=lambda: {"main": None}) accession_id: str = "000999" expected_metadata: dict[str, ProcessedMetadataValue] = field(default_factory=dict) + expected_files: dict[FileCategory, list[FileIdAndName]] | None = None expected_errors: 
list[ProcessingAnnotation] | None = None expected_warnings: list[ProcessingAnnotation] | None = None expected_processed_alignment: ProcessedAlignment | None = None @@ -179,6 +187,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT accession_id=self.accession_id, sequences=self.input_sequence, group_id=self.group_id, + files=self.input_files, ) expected_output = factory_custom.create_processed_entry( metadata_dict=self.expected_metadata, @@ -186,6 +195,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT errors=self.expected_errors or [], warnings=self.expected_warnings or [], processed_alignment=self.expected_processed_alignment, + files=self.expected_files, ) return ProcessingTestCase( name=self.name, input=unprocessed_entry, expected_output=expected_output @@ -265,3 +275,8 @@ def verify_processed_entry( f"{test_name}: sequence name to fasta header map '{actual.sequenceNameToFastaId}' do not " f"match expectation '{expected.sequenceNameToFastaId}'." ) + + # Check files + assert actual.files == expected.files, ( + f"{test_name}: files '{actual.files}' do not match expectation '{expected.files}'." 
+ ) diff --git a/preprocessing/nextclade/tests/test_host_name_validation.py b/preprocessing/nextclade/tests/test_host_name_validation.py index d97cdbe97e..62cdf75768 100644 --- a/preprocessing/nextclade/tests/test_host_name_validation.py +++ b/preprocessing/nextclade/tests/test_host_name_validation.py @@ -34,6 +34,7 @@ def make_entry(metadata: dict, group_id: int) -> UnprocessedEntry: group_id=group_id, metadata=metadata, unalignedNucleotideSequences={}, + files=None, ), ) diff --git a/preprocessing/nextclade/tests/test_metadata_processing_functions.py b/preprocessing/nextclade/tests/test_metadata_processing_functions.py index d17ae42244..f3961c4beb 100644 --- a/preprocessing/nextclade/tests/test_metadata_processing_functions.py +++ b/preprocessing/nextclade/tests/test_metadata_processing_functions.py @@ -783,6 +783,7 @@ def test_preprocessing_without_consensus_sequences(config: Config) -> None: "name_required": sequence_name, }, unalignedNucleotideSequences={}, + files=None, ), ) diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index 05edf3a4cf..0b07af4c31 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -26,6 +26,7 @@ ) from loculus_preprocessing.datatypes import ( AnnotationSourceType, + FileIdAndName, SegmentClassificationMethod, SubmissionData, UnprocessedData, @@ -284,6 +285,46 @@ def invalid_sequence() -> str: sequenceNameToFastaId={"main": "fastaHeader"}, ), ), + Case( + name="with file", + input_metadata={}, + input_files={ + "raw_reads": [ + FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"), + FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"), + ] + }, + input_sequence={"fastaHeader": sequence_with_mutation("single")}, + accession_id="1", + expected_metadata={ + "completeness": 1.0, + "totalInsertedNucs": 0, + "totalSnps": 1, + "totalDeletedNucs": 0, + 
"length": len(consensus_sequence("single")), + "nonExistentField": "None", + "variant": True, + }, + expected_files={ + "raw_reads": [ + FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"), + FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"), + ] + }, + expected_errors=[], + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={"main": sequence_with_mutation("single")}, + alignedNucleotideSequences={"main": sequence_with_mutation("single")}, + nucleotideInsertions={}, + alignedAminoAcidSequences={ + "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"), + "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"), + }, + aminoAcidInsertions={}, + sequenceNameToFastaId={"main": "fastaHeader"}, + ), + ), ] single_segment_failed_case_definitions = [ @@ -1285,6 +1326,7 @@ def test_max_sequences_per_entry_batch_isolation() -> None: "ebola-sudan": sequence_with_mutation("ebola-sudan"), "ebola-zaire": sequence_with_mutation("ebola-zaire"), }, + files=None, ), ) @@ -1299,6 +1341,7 @@ def test_max_sequences_per_entry_batch_isolation() -> None: unalignedNucleotideSequences={ "ebola-sudan": sequence_with_mutation("ebola-sudan"), }, + files=None, ), ) @@ -1332,6 +1375,7 @@ def test_preprocessing_without_metadata() -> None: "ebola-sudan": sequence_with_mutation("ebola-sudan"), "ebola-zaire": sequence_with_mutation("ebola-zaire"), }, + files=None, ), ) @@ -1455,6 +1499,7 @@ def test_create_flatfile(): "authors": "Smith, Doe A;", }, unalignedNucleotideSequences={"main": sequence_with_mutation("single")}, + files=None, ), ) From 0c6d635864d4c650755a15174d42652b670a7014 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Fri, 26 Jun 2026 14:54:05 +0200 Subject: [PATCH 04/11] Very basic validation of raw read submissions --- .../src/loculus_preprocessing/datatypes.py | 13 +- .../src/loculus_preprocessing/nextclade.py | 8 +- .../src/loculus_preprocessing/prepro.py | 35 +++- 
.../processing_functions.py | 33 ++++ .../nextclade/tests/factory_methods.py | 10 +- .../tests/test_nextclade_preprocessing.py | 153 +++++++++++++++++- 6 files changed, 234 insertions(+), 18 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py index 872c4bb059..41fbe95c43 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py @@ -23,15 +23,20 @@ InputMetadataValue = str | None InputMetadata = dict[str, InputMetadataValue] FastaId = str -FileCategory = str ProcessingAnnotationAlignment: Final = "alignment" +@unique +class SubmissionFileCategory(StrEnum): + RAW_READS = "raw_reads" + + @unique class AnnotationSourceType(StrEnum): METADATA = "Metadata" NUCLEOTIDE_SEQUENCE = "NucleotideSequence" + SUBMITTED_FILE = "SubmittedFile" @dataclass(frozen=True) @@ -89,7 +94,7 @@ class UnprocessedData: submissionId: str # noqa: N815 metadata: InputMetadata unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None] # noqa: N815 - files: dict[FileCategory, list[FileIdAndName]] | None + files: dict[SubmissionFileCategory, list[FileIdAndName]] | None @dataclass @@ -105,7 +110,7 @@ class UnprocessedEntry: @dataclass class UnprocessedAfterNextclade: inputMetadata: InputMetadata # noqa: N815 - files: dict[FileCategory, list[FileIdAndName]] | None + files: dict[SubmissionFileCategory, list[FileIdAndName]] | None # Derived metadata produced by Nextclade nextcladeMetadata: dict[SequenceName, Any] | None # noqa: N815 unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None] # noqa: N815 @@ -121,7 +126,7 @@ class UnprocessedAfterNextclade: @dataclass class ProcessedData: metadata: ProcessedMetadata - files: dict[FileCategory, list[FileIdAndName]] | None + files: dict[SubmissionFileCategory, list[FileIdAndName]] | None unalignedNucleotideSequences: dict[SequenceName, 
Any] # noqa: N815 alignedNucleotideSequences: dict[SequenceName, Any] # noqa: N815 nucleotideInsertions: dict[SequenceName, Any] # noqa: N815 diff --git a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py index 8545472b63..87a9b65291 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py @@ -26,7 +26,7 @@ AminoAcidSequence, AnnotationSourceType, FastaId, - FileCategory, + SubmissionFileCategory, FileIdAndName, GeneName, GenericSequence, @@ -803,9 +803,9 @@ def enrich_with_nextclade( # noqa: PLR0914 } for entry in unprocessed } - input_files: dict[AccessionVersion, dict[FileCategory, list[FileIdAndName]] | None] = { - entry.accessionVersion: entry.data.files for entry in unprocessed - } + input_files: dict[ + AccessionVersion, dict[SubmissionFileCategory, list[FileIdAndName]] | None + ] = {entry.accessionVersion: entry.data.files for entry in unprocessed} batch = assign_segment_for_alignment(unprocessed, config=config, dataset_dir=dataset_dir) unaligned_nucleotide_sequences = batch.unalignedNucleotideSequences diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index e9d4300b8e..cb874d47c9 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -42,6 +42,7 @@ SegmentClassificationMethod, SegmentName, SubmissionData, + SubmissionFileCategory, UnprocessedAfterNextclade, UnprocessedData, UnprocessedEntry, @@ -60,6 +61,7 @@ process_mutations_from_clade_founder, process_phenotype_values, process_stop_codons, + validate_raw_reads_submission, ) from .sequence_checks import error_on_excess_sequences, errors_if_non_iupac @@ -436,6 +438,24 @@ def get_output_metadata( return output_metadata, errors, warnings +def process_submitted_files( + 
file_mapping: dict[SubmissionFileCategory, list[FileIdAndName]], +) -> tuple[list[ProcessingAnnotation], list[ProcessingAnnotation]]: + errors: list[ProcessingAnnotation] = [] + warnings: list[ProcessingAnnotation] = [] + + for category, files in file_mapping.items(): + match category: + case SubmissionFileCategory.RAW_READS: + rr_errors, rr_warnings = validate_raw_reads_submission(files) + errors.extend(rr_errors) + warnings.extend(rr_warnings) + case _: + logger.warning(f"Submitted file is of unrecognized category {category}") + + return errors, warnings + + def alignment_errors_warnings( unprocessed: UnprocessedAfterNextclade, config: Config, @@ -530,6 +550,8 @@ def process_single( accession_version, unprocessed, config ) + file_errors, file_warnings = process_submitted_files(unprocessed.files or {}) + processed_entry = ProcessedEntry( accession=accession_from_str(accession_version), version=version_from_str(accession_version), @@ -550,9 +572,12 @@ def process_single( + max_seq_errors + alignment_errors + metadata_errors + + file_errors ) ), - warnings=list(set(unprocessed.warnings + alignment_warnings + metadata_warnings)), + warnings=list( + set(unprocessed.warnings + alignment_warnings + metadata_warnings + file_warnings) + ), ) return SubmissionData( @@ -580,12 +605,16 @@ def process_single_unaligned( accession_version, unprocessed, config ) + file_errors, file_warnings = process_submitted_files(unprocessed.files or {}) + return processed_entry_no_alignment( accession_version=accession_version, unprocessed=unprocessed, output_metadata=output_metadata, - errors=list(set(iupac_errors + metadata_errors + segment_assignment.alert.errors)), - warnings=list(set(metadata_warnings)), + errors=list( + set(iupac_errors + metadata_errors + segment_assignment.alert.errors + file_errors) + ), + warnings=list(set(metadata_warnings + file_warnings)), sequenceNameToFastaId=segment_assignment.sequenceNameToFastaId, ) diff --git 
a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index e2428bd2b7..0288f17e3a 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -25,6 +25,7 @@ from .datatypes import ( AnnotationSource, AnnotationSourceType, + FileIdAndName, FunctionArgs, InputData, InputMetadata, @@ -1948,6 +1949,38 @@ def single_metadata_annotation( ] +def validate_raw_reads_submission( + files: list[FileIdAndName], +) -> tuple[list[ProcessingAnnotation], list[ProcessingAnnotation]]: + errors: list[ProcessingAnnotation] = [] + warnings: list[ProcessingAnnotation] = [] + + min_files, max_files = 1, 2 + if not (min_files <= len(files) <= max_files): + name = "; ".join(f.name for f in files) + message = f"Raw reads must be submitted as one or two files, got {len(files)}" + errors.append( + ProcessingAnnotation.from_single( + name=name, type=AnnotationSourceType.SUBMITTED_FILE, message=message + ) + ) + + allowed_extensions = ["fastq", "fastq.gz", "fq", "fq.gz"] + for file in files: + if not any(file.name.endswith(extension) for extension in allowed_extensions): + message = ( + f"Raw reads file '{file.name}' has unrecognized extension." 
+ f" Allowed extensions: {allowed_extensions}" + ) + errors.append( + ProcessingAnnotation.from_single( + name=file.name, type=AnnotationSourceType.SUBMITTED_FILE, message=message + ) + ) + + return errors, warnings + + def process_frameshifts(input: str | None) -> InputData: """Converts frameshift string to InputData for processing""" try: diff --git a/preprocessing/nextclade/tests/factory_methods.py b/preprocessing/nextclade/tests/factory_methods.py index 679601e744..8b25fe099f 100644 --- a/preprocessing/nextclade/tests/factory_methods.py +++ b/preprocessing/nextclade/tests/factory_methods.py @@ -9,7 +9,7 @@ from loculus_preprocessing.datatypes import ( AnnotationSource, AnnotationSourceType, - FileCategory, + SubmissionFileCategory, FileIdAndName, NucleotideSequence, ProcessedData, @@ -79,7 +79,7 @@ def create_unprocessed_entry( accession_id: str, sequences: dict[SegmentName, NucleotideSequence | None], group_id: int = 2, - files: dict[FileCategory, list[FileIdAndName]] | None = None, + files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None, ) -> UnprocessedEntry: return UnprocessedEntry( accessionVersion=f"LOC_{accession_id}.1", @@ -134,7 +134,7 @@ def create_processed_entry( errors: list[ProcessingAnnotation] | None = None, warnings: list[ProcessingAnnotation] | None = None, processed_alignment: ProcessedAlignment | None = None, - files: dict[FileCategory, list[FileIdAndName]] | None = None, + files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None, ) -> ProcessedEntry: if errors is None: errors = [] @@ -169,11 +169,11 @@ def create_processed_entry( class Case: name: str input_metadata: dict[str, str | None] = field(default_factory=dict) - input_files: dict[FileCategory, list[FileIdAndName]] | None = None + input_files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None input_sequence: dict[str, str | None] = field(default_factory=lambda: {"main": None}) accession_id: str = "000999" expected_metadata: dict[str, 
ProcessedMetadataValue] = field(default_factory=dict) - expected_files: dict[FileCategory, list[FileIdAndName]] | None = None + expected_files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None expected_errors: list[ProcessingAnnotation] | None = None expected_warnings: list[ProcessingAnnotation] | None = None expected_processed_alignment: ProcessedAlignment | None = None diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index 0b07af4c31..a1f6814bc9 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -29,6 +29,7 @@ FileIdAndName, SegmentClassificationMethod, SubmissionData, + SubmissionFileCategory, UnprocessedData, UnprocessedEntry, ) @@ -289,7 +290,7 @@ def invalid_sequence() -> str: name="with file", input_metadata={}, input_files={ - "raw_reads": [ + SubmissionFileCategory.RAW_READS: [ FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"), FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"), ] @@ -306,7 +307,7 @@ def invalid_sequence() -> str: "variant": True, }, expected_files={ - "raw_reads": [ + SubmissionFileCategory.RAW_READS: [ FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"), FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"), ] @@ -325,6 +326,105 @@ def invalid_sequence() -> str: sequenceNameToFastaId={"main": "fastaHeader"}, ), ), + Case( + name="with too many raw read files", + input_metadata={}, + input_files={ + SubmissionFileCategory.RAW_READS: [ + FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"), + FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"), + FileIdAndName(fileId="file-id-0003", name="reads_R3.fastq"), + ] + }, + input_sequence={"fastaHeader": sequence_with_mutation("single")}, + accession_id="1", + expected_metadata={ + "completeness": 1.0, + "totalInsertedNucs": 0, + "totalSnps": 1, + 
"totalDeletedNucs": 0, + "length": len(consensus_sequence("single")), + "nonExistentField": "None", + "variant": True, + }, + expected_files={ + SubmissionFileCategory.RAW_READS: [ + FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"), + FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"), + FileIdAndName(fileId="file-id-0003", name="reads_R3.fastq"), + ] + }, + expected_errors=build_processing_annotations( + [ + ProcessingAnnotationHelper( + ["reads_R1.fastq; reads_R2.fastq; reads_R3.fastq"], + ["reads_R1.fastq; reads_R2.fastq; reads_R3.fastq"], + "Raw reads must be submitted as one or two files, got 3", + AnnotationSourceType.SUBMITTED_FILE, + ) + ] + ), + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={"main": sequence_with_mutation("single")}, + alignedNucleotideSequences={"main": sequence_with_mutation("single")}, + nucleotideInsertions={}, + alignedAminoAcidSequences={ + "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"), + "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"), + }, + aminoAcidInsertions={}, + sequenceNameToFastaId={"main": "fastaHeader"}, + ), + ), + Case( + name="with unrecognized raw read file extension", + input_metadata={}, + input_files={ + SubmissionFileCategory.RAW_READS: [ + FileIdAndName(fileId="file-id-0001", name="reads.txt") + ] + }, + input_sequence={"fastaHeader": sequence_with_mutation("single")}, + accession_id="1", + expected_metadata={ + "completeness": 1.0, + "totalInsertedNucs": 0, + "totalSnps": 1, + "totalDeletedNucs": 0, + "length": len(consensus_sequence("single")), + "nonExistentField": "None", + "variant": True, + }, + expected_files={ + SubmissionFileCategory.RAW_READS: [ + FileIdAndName(fileId="file-id-0001", name="reads.txt") + ] + }, + expected_errors=build_processing_annotations( + [ + ProcessingAnnotationHelper( + ["reads.txt"], + ["reads.txt"], + "Raw reads file 'reads.txt' has unrecognized 
extension. " + "Allowed extensions: ['fastq', 'fastq.gz', 'fq', 'fq.gz']", + AnnotationSourceType.SUBMITTED_FILE, + ) + ] + ), + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={"main": sequence_with_mutation("single")}, + alignedNucleotideSequences={"main": sequence_with_mutation("single")}, + nucleotideInsertions={}, + alignedAminoAcidSequences={ + "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"), + "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"), + }, + aminoAcidInsertions={}, + sequenceNameToFastaId={"main": "fastaHeader"}, + ), + ), ] single_segment_failed_case_definitions = [ @@ -1175,6 +1275,55 @@ def invalid_sequence() -> str: sequenceNameToFastaId={"ebola-sudan": "ebola-sudan"}, ), ), + Case( + name="validate raw read files when sequences are not aligned", + input_metadata={}, + input_sequence={ + "prefix_ebola-sudan": sequence_with_mutation("ebola-sudan"), + "other_prefix_ebola-zaire": sequence_with_mutation("ebola-zaire"), + }, + accession_id="1", + input_files={ + SubmissionFileCategory.RAW_READS: [ + FileIdAndName(fileId="file-id-0001", name="reads.txt") + ] + }, + expected_files={ + SubmissionFileCategory.RAW_READS: [ + FileIdAndName(fileId="file-id-0001", name="reads.txt") + ] + }, + expected_metadata={ + "length_ebola-sudan": len(consensus_sequence("ebola-sudan")), + "length_ebola-zaire": len(consensus_sequence("ebola-zaire")), + }, + expected_errors=build_processing_annotations( + [ + ProcessingAnnotationHelper( + ["reads.txt"], + ["reads.txt"], + "Raw reads file 'reads.txt' has unrecognized extension. 
" + "Allowed extensions: ['fastq', 'fastq.gz', 'fq', 'fq.gz']", + AnnotationSourceType.SUBMITTED_FILE, + ) + ] + ), + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={ + "ebola-sudan": sequence_with_mutation("ebola-sudan"), + "ebola-zaire": sequence_with_mutation("ebola-zaire"), + }, + alignedNucleotideSequences={}, + nucleotideInsertions={}, + alignedAminoAcidSequences={}, + aminoAcidInsertions={}, + sequenceNameToFastaId={ + "ebola-sudan": "prefix_ebola-sudan", + "ebola-zaire": "other_prefix_ebola-zaire", + }, + ), + ), ] From 9c0baa0c0945bc3fa889d3170dcf5281c7bb192f Mon Sep 17 00:00:00 2001 From: maverbiest Date: Fri, 26 Jun 2026 15:08:39 +0200 Subject: [PATCH 05/11] ruff --- preprocessing/nextclade/src/loculus_preprocessing/nextclade.py | 2 +- preprocessing/nextclade/tests/factory_methods.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py index 87a9b65291..7642b3df17 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py @@ -26,7 +26,6 @@ AminoAcidSequence, AnnotationSourceType, FastaId, - SubmissionFileCategory, FileIdAndName, GeneName, GenericSequence, @@ -38,6 +37,7 @@ SegmentName, SequenceAssignment, SequenceAssignmentBatch, + SubmissionFileCategory, UnprocessedAfterNextclade, UnprocessedEntry, ) diff --git a/preprocessing/nextclade/tests/factory_methods.py b/preprocessing/nextclade/tests/factory_methods.py index 8b25fe099f..277fc7ae4a 100644 --- a/preprocessing/nextclade/tests/factory_methods.py +++ b/preprocessing/nextclade/tests/factory_methods.py @@ -9,7 +9,6 @@ from loculus_preprocessing.datatypes import ( AnnotationSource, AnnotationSourceType, - SubmissionFileCategory, FileIdAndName, NucleotideSequence, ProcessedData, @@ -18,6 +17,7 @@ ProcessingAnnotation, 
ProcessingAnnotationAlignment, SegmentName, + SubmissionFileCategory, UnprocessedData, UnprocessedEntry, ) From a9b40a5a6c9af6e49df71bfbba92afba9f51345e Mon Sep 17 00:00:00 2001 From: maverbiest Date: Fri, 26 Jun 2026 15:17:19 +0200 Subject: [PATCH 06/11] Update type --- preprocessing/dummy/main.py | 4 +-- .../src/loculus_preprocessing/datatypes.py | 9 ++++--- .../src/loculus_preprocessing/nextclade.py | 8 +++--- .../src/loculus_preprocessing/prepro.py | 10 +++---- .../nextclade/tests/factory_methods.py | 10 +++---- .../tests/test_nextclade_preprocessing.py | 26 +++++++------------ 6 files changed, 30 insertions(+), 37 deletions(-) diff --git a/preprocessing/dummy/main.py b/preprocessing/dummy/main.py index e3a2b9cc14..647c8b883a 100644 --- a/preprocessing/dummy/main.py +++ b/preprocessing/dummy/main.py @@ -129,7 +129,7 @@ def parse_ndjson(ndjson_data: str) -> list[Sequence]: def process(unprocessed: list[Sequence]) -> list[Sequence]: - with open("mock-sequences.json", "r") as f: + with open("mock-sequences.json") as f: mock_sequences = json.load(f) possible_lineages = ["A.1", "A.1.1", "A.2"] @@ -160,7 +160,7 @@ def process(unprocessed: list[Sequence]) -> list[Sequence]: "nucleotideInsertions": {}, "aminoAcidInsertions": {}, } - + if not disableConsensusSequences: data = {**data, **mock_sequences} data["sequenceNameToFastaId"] = {"main": submissionId} diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py index 41fbe95c43..a59abb50c7 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py @@ -28,8 +28,9 @@ @unique -class SubmissionFileCategory(StrEnum): +class FileCategory(StrEnum): RAW_READS = "raw_reads" + ANNOTATIONS = "annotations" @unique @@ -94,7 +95,7 @@ class UnprocessedData: submissionId: str # noqa: N815 metadata: InputMetadata unalignedNucleotideSequences: dict[SequenceName, 
NucleotideSequence | None] # noqa: N815 - files: dict[SubmissionFileCategory, list[FileIdAndName]] | None + files: dict[FileCategory, list[FileIdAndName]] | None @dataclass @@ -110,7 +111,7 @@ class UnprocessedEntry: @dataclass class UnprocessedAfterNextclade: inputMetadata: InputMetadata # noqa: N815 - files: dict[SubmissionFileCategory, list[FileIdAndName]] | None + files: dict[FileCategory, list[FileIdAndName]] | None # Derived metadata produced by Nextclade nextcladeMetadata: dict[SequenceName, Any] | None # noqa: N815 unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None] # noqa: N815 @@ -126,7 +127,7 @@ class UnprocessedAfterNextclade: @dataclass class ProcessedData: metadata: ProcessedMetadata - files: dict[SubmissionFileCategory, list[FileIdAndName]] | None + files: dict[FileCategory, list[FileIdAndName]] | None unalignedNucleotideSequences: dict[SequenceName, Any] # noqa: N815 alignedNucleotideSequences: dict[SequenceName, Any] # noqa: N815 nucleotideInsertions: dict[SequenceName, Any] # noqa: N815 diff --git a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py index 7642b3df17..8545472b63 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py @@ -26,6 +26,7 @@ AminoAcidSequence, AnnotationSourceType, FastaId, + FileCategory, FileIdAndName, GeneName, GenericSequence, @@ -37,7 +38,6 @@ SegmentName, SequenceAssignment, SequenceAssignmentBatch, - SubmissionFileCategory, UnprocessedAfterNextclade, UnprocessedEntry, ) @@ -803,9 +803,9 @@ def enrich_with_nextclade( # noqa: PLR0914 } for entry in unprocessed } - input_files: dict[ - AccessionVersion, dict[SubmissionFileCategory, list[FileIdAndName]] | None - ] = {entry.accessionVersion: entry.data.files for entry in unprocessed} + input_files: dict[AccessionVersion, dict[FileCategory, list[FileIdAndName]] | None] = { + 
entry.accessionVersion: entry.data.files for entry in unprocessed + } batch = assign_segment_for_alignment(unprocessed, config=config, dataset_dir=dataset_dir) unaligned_nucleotide_sequences = batch.unalignedNucleotideSequences diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index cb874d47c9..da7736e2ea 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -26,6 +26,7 @@ AminoAcidSequence, AnnotationSource, AnnotationSourceType, + FileCategory, FileIdAndName, GeneName, InputData, @@ -42,7 +43,6 @@ SegmentClassificationMethod, SegmentName, SubmissionData, - SubmissionFileCategory, UnprocessedAfterNextclade, UnprocessedData, UnprocessedEntry, @@ -439,19 +439,19 @@ def get_output_metadata( def process_submitted_files( - file_mapping: dict[SubmissionFileCategory, list[FileIdAndName]], + file_mapping: dict[FileCategory, list[FileIdAndName]], ) -> tuple[list[ProcessingAnnotation], list[ProcessingAnnotation]]: errors: list[ProcessingAnnotation] = [] warnings: list[ProcessingAnnotation] = [] for category, files in file_mapping.items(): match category: - case SubmissionFileCategory.RAW_READS: + case FileCategory.RAW_READS: rr_errors, rr_warnings = validate_raw_reads_submission(files) errors.extend(rr_errors) warnings.extend(rr_warnings) case _: - logger.warning(f"Submitted file is of unrecognized category {category}") + logger.warning(f"Submitted file is of unexpected category {category}") return errors, warnings @@ -693,7 +693,7 @@ def upload_flatfiles(processed: Sequence[SubmissionData], config: Config) -> Non url = upload_info.url upload_embl_file_to_presigned_url(file_content, url) processed_files = submission_data.processed_entry.data.files or {} - processed_files.setdefault("annotations", []).append( + processed_files.setdefault(FileCategory.ANNOTATIONS, []).append( FileIdAndName(fileId=file_id, 
name=file_name) ) submission_data.processed_entry.data.files = processed_files diff --git a/preprocessing/nextclade/tests/factory_methods.py b/preprocessing/nextclade/tests/factory_methods.py index 277fc7ae4a..679601e744 100644 --- a/preprocessing/nextclade/tests/factory_methods.py +++ b/preprocessing/nextclade/tests/factory_methods.py @@ -9,6 +9,7 @@ from loculus_preprocessing.datatypes import ( AnnotationSource, AnnotationSourceType, + FileCategory, FileIdAndName, NucleotideSequence, ProcessedData, @@ -17,7 +18,6 @@ ProcessingAnnotation, ProcessingAnnotationAlignment, SegmentName, - SubmissionFileCategory, UnprocessedData, UnprocessedEntry, ) @@ -79,7 +79,7 @@ def create_unprocessed_entry( accession_id: str, sequences: dict[SegmentName, NucleotideSequence | None], group_id: int = 2, - files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None, + files: dict[FileCategory, list[FileIdAndName]] | None = None, ) -> UnprocessedEntry: return UnprocessedEntry( accessionVersion=f"LOC_{accession_id}.1", @@ -134,7 +134,7 @@ def create_processed_entry( errors: list[ProcessingAnnotation] | None = None, warnings: list[ProcessingAnnotation] | None = None, processed_alignment: ProcessedAlignment | None = None, - files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None, + files: dict[FileCategory, list[FileIdAndName]] | None = None, ) -> ProcessedEntry: if errors is None: errors = [] @@ -169,11 +169,11 @@ def create_processed_entry( class Case: name: str input_metadata: dict[str, str | None] = field(default_factory=dict) - input_files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None + input_files: dict[FileCategory, list[FileIdAndName]] | None = None input_sequence: dict[str, str | None] = field(default_factory=lambda: {"main": None}) accession_id: str = "000999" expected_metadata: dict[str, ProcessedMetadataValue] = field(default_factory=dict) - expected_files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None + 
expected_files: dict[FileCategory, list[FileIdAndName]] | None = None expected_errors: list[ProcessingAnnotation] | None = None expected_warnings: list[ProcessingAnnotation] | None = None expected_processed_alignment: ProcessedAlignment | None = None diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index a1f6814bc9..a05efcba82 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -26,10 +26,10 @@ ) from loculus_preprocessing.datatypes import ( AnnotationSourceType, + FileCategory, FileIdAndName, SegmentClassificationMethod, SubmissionData, - SubmissionFileCategory, UnprocessedData, UnprocessedEntry, ) @@ -290,7 +290,7 @@ def invalid_sequence() -> str: name="with file", input_metadata={}, input_files={ - SubmissionFileCategory.RAW_READS: [ + FileCategory.RAW_READS: [ FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"), FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"), ] @@ -307,7 +307,7 @@ def invalid_sequence() -> str: "variant": True, }, expected_files={ - SubmissionFileCategory.RAW_READS: [ + FileCategory.RAW_READS: [ FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"), FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"), ] @@ -330,7 +330,7 @@ def invalid_sequence() -> str: name="with too many raw read files", input_metadata={}, input_files={ - SubmissionFileCategory.RAW_READS: [ + FileCategory.RAW_READS: [ FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"), FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"), FileIdAndName(fileId="file-id-0003", name="reads_R3.fastq"), @@ -348,7 +348,7 @@ def invalid_sequence() -> str: "variant": True, }, expected_files={ - SubmissionFileCategory.RAW_READS: [ + FileCategory.RAW_READS: [ FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"), FileIdAndName(fileId="file-id-0002", 
name="reads_R2.fastq"), FileIdAndName(fileId="file-id-0003", name="reads_R3.fastq"), @@ -381,9 +381,7 @@ def invalid_sequence() -> str: name="with unrecognized raw read file extension", input_metadata={}, input_files={ - SubmissionFileCategory.RAW_READS: [ - FileIdAndName(fileId="file-id-0001", name="reads.txt") - ] + FileCategory.RAW_READS: [FileIdAndName(fileId="file-id-0001", name="reads.txt")] }, input_sequence={"fastaHeader": sequence_with_mutation("single")}, accession_id="1", @@ -397,9 +395,7 @@ def invalid_sequence() -> str: "variant": True, }, expected_files={ - SubmissionFileCategory.RAW_READS: [ - FileIdAndName(fileId="file-id-0001", name="reads.txt") - ] + FileCategory.RAW_READS: [FileIdAndName(fileId="file-id-0001", name="reads.txt")] }, expected_errors=build_processing_annotations( [ @@ -1284,14 +1280,10 @@ def invalid_sequence() -> str: }, accession_id="1", input_files={ - SubmissionFileCategory.RAW_READS: [ - FileIdAndName(fileId="file-id-0001", name="reads.txt") - ] + FileCategory.RAW_READS: [FileIdAndName(fileId="file-id-0001", name="reads.txt")] }, expected_files={ - SubmissionFileCategory.RAW_READS: [ - FileIdAndName(fileId="file-id-0001", name="reads.txt") - ] + FileCategory.RAW_READS: [FileIdAndName(fileId="file-id-0001", name="reads.txt")] }, expected_metadata={ "length_ebola-sudan": len(consensus_sequence("ebola-sudan")), From 2ec13bf332319f260f31c22b234126700ee8b742 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Fri, 26 Jun 2026 16:08:00 +0200 Subject: [PATCH 07/11] Add SubmittedFile to backend --- .../src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt b/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt index 4ee737b238..6340f660b9 100644 --- a/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt +++ b/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt @@ -284,6 +284,7 
@@ data class PreprocessingAnnotationSource( enum class PreprocessingAnnotationSourceType { Metadata, NucleotideSequence, + SubmittedFile, } data class GetSequenceResponse( From f7be2e2f3eb4423c70fd3408704d3e376c1d2fc8 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Fri, 26 Jun 2026 16:30:56 +0200 Subject: [PATCH 08/11] Add SubmittedFile to website --- website/src/types/backend.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/website/src/types/backend.ts b/website/src/types/backend.ts index 225da65ecc..da2d2c2f73 100644 --- a/website/src/types/backend.ts +++ b/website/src/types/backend.ts @@ -24,7 +24,11 @@ export const sequenceEntryProcessingResultNames = z.union([ ]); export type SequenceEntryProcessingResultNames = z.infer; -const processingAnnotationSourceType = z.union([z.literal('Metadata'), z.literal('NucleotideSequence')]); +const processingAnnotationSourceType = z.union([ + z.literal('Metadata'), + z.literal('NucleotideSequence'), + z.literal('SubmittedFile'), +]); export type ProcessingAnnotationSourceType = z.infer; const processingAnnotation = z.object({ From 35c71b4b93a53b9fd08104de1d2aca4db81cfdd3 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Fri, 26 Jun 2026 17:58:13 +0200 Subject: [PATCH 09/11] Fix --- preprocessing/nextclade/src/loculus_preprocessing/prepro.py | 3 +++ .../src/loculus_preprocessing/processing_functions.py | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index da7736e2ea..be70462c18 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -445,6 +445,9 @@ def process_submitted_files( warnings: list[ProcessingAnnotation] = [] for category, files in file_mapping.items(): + if not files: + # Backend always includes a key with empty list for enabled categories + continue match category: 
case FileCategory.RAW_READS: rr_errors, rr_warnings = validate_raw_reads_submission(files) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 0288f17e3a..ec9a52f06b 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -1955,8 +1955,7 @@ def validate_raw_reads_submission( errors: list[ProcessingAnnotation] = [] warnings: list[ProcessingAnnotation] = [] - min_files, max_files = 1, 2 - if not (min_files <= len(files) <= max_files): + if len(files) > 2: # noqa: PLR2004 name = "; ".join(f.name for f in files) message = f"Raw reads must be submitted as one or two files, got {len(files)}" errors.append( From ee80c85cb1dd5ee77dc6fce39af956ec5c367c81 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 29 Jun 2026 14:33:11 +0200 Subject: [PATCH 10/11] Addressing comments --- .../loculus_preprocessing/processing_functions.py | 12 +++++++----- .../nextclade/tests/test_nextclade_preprocessing.py | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index ec9a52f06b..aa5dc32903 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -1956,20 +1956,22 @@ def validate_raw_reads_submission( warnings: list[ProcessingAnnotation] = [] if len(files) > 2: # noqa: PLR2004 - name = "; ".join(f.name for f in files) message = f"Raw reads must be submitted as one or two files, got {len(files)}" errors.append( - ProcessingAnnotation.from_single( - name=name, type=AnnotationSourceType.SUBMITTED_FILE, message=message + ProcessingAnnotation.from_fields( + input_fields=[f.name for f in 
files], + output_fields=[f.name for f in files], + type=AnnotationSourceType.SUBMITTED_FILE, + message=message, ) ) - allowed_extensions = ["fastq", "fastq.gz", "fq", "fq.gz"] + allowed_extensions = [".fastq", ".fastq.gz", ".fq", ".fq.gz"] for file in files: if not any(file.name.endswith(extension) for extension in allowed_extensions): message = ( f"Raw reads file '{file.name}' has unrecognized extension." - f" Allowed extensions: {allowed_extensions}" + f" Allowed extensions: {', '.join(allowed_extensions)}" ) errors.append( ProcessingAnnotation.from_single( diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index a05efcba82..8974b55915 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -403,7 +403,7 @@ def invalid_sequence() -> str: ["reads.txt"], ["reads.txt"], "Raw reads file 'reads.txt' has unrecognized extension. " - "Allowed extensions: ['fastq', 'fastq.gz', 'fq', 'fq.gz']", + "Allowed extensions: .fastq, .fastq.gz, .fq, .fq.gz", AnnotationSourceType.SUBMITTED_FILE, ) ] @@ -1295,7 +1295,7 @@ def invalid_sequence() -> str: ["reads.txt"], ["reads.txt"], "Raw reads file 'reads.txt' has unrecognized extension. 
" - "Allowed extensions: ['fastq', 'fastq.gz', 'fq', 'fq.gz']", + "Allowed extensions: .fastq, .fastq.gz, .fq, .fq.gz", AnnotationSourceType.SUBMITTED_FILE, ) ] From 9406c2975be8bcde6e063362f0f559108e403725 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 29 Jun 2026 14:39:25 +0200 Subject: [PATCH 11/11] Fix tests again --- preprocessing/nextclade/tests/test_nextclade_preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index 8974b55915..f1fc7fc641 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -357,8 +357,8 @@ def invalid_sequence() -> str: expected_errors=build_processing_annotations( [ ProcessingAnnotationHelper( - ["reads_R1.fastq; reads_R2.fastq; reads_R3.fastq"], - ["reads_R1.fastq; reads_R2.fastq; reads_R3.fastq"], + ["reads_R1.fastq", "reads_R2.fastq", "reads_R3.fastq"], + ["reads_R1.fastq", "reads_R2.fastq", "reads_R3.fastq"], "Raw reads must be submitted as one or two files, got 3", AnnotationSourceType.SUBMITTED_FILE, )