From 48870daf108e130aef1058edd5d09fe69e9ae02c Mon Sep 17 00:00:00 2001
From: maverbiest <maxaverbiest@gmail.com>
Date: Fri, 12 Jun 2026 15:27:30 +0200
Subject: [PATCH 01/11] Enable raw reads sharing by default

---
 kubernetes/loculus/values.yaml | 34 +++++-----------------------------
 1 file changed, 5 insertions(+), 29 deletions(-)

diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
index ac55e61ca3..02eafc0a58 100644
--- a/kubernetes/loculus/values.yaml
+++ b/kubernetes/loculus/values.yaml
@@ -80,6 +80,11 @@ defaultOrganismConfig: &defaultOrganismConfig
     submissionDataTypes: &defaultSubmissionDataTypes
       consensusSequences: true
       maxSequencesPerEntry: 1
+      files:
+        enabled: true
+        categories:
+          - name: raw_reads
+            displayName: Raw reads
     loadSequencesAutomatically: true
     earliestReleaseDate:
       enabled: true
@@ -1886,35 +1891,6 @@ defaultOrganisms:
                 sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/sars-cov-2/ORF9b.fasta]]"
               - name: S
                 sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/sars-cov-2/S.fasta]]"
-  dummy-organism-with-files:
-    schema:
-      image: "/images/organisms/hmpv.jpg"
-      organismName: "Test organism (with files)"
-      submissionDataTypes:
-        consensusSequences: false
-        files:
-          enabled: true
-          categories:
-            - name: raw_reads
-              displayName: Raw reads
-      files:
-        - name: raw_reads
-          displayName: Raw reads
-      metadata: *dummyMetadata
-      website:
-        tableColumns:
-          - country
-          - division
-          - date
-        defaultOrder: descending
-        defaultOrderBy: date
-    preprocessing:
-      - version: 1
-        image: ghcr.io/loculus-project/preprocessing-dummy
-        args:
-          - "--watch"
-          - "--disableConsensusSequences"
-    referenceGenomes: []
   not-aligned-organism:
     enabled: true
     schema:

From fd2501014bc6babbec1ee66d25244be09ad1b4fd Mon Sep 17 00:00:00 2001
From: maverbiest <maxaverbiest@gmail.com>
Date: Mon, 15 Jun 2026 09:42:40 +0200
Subject: [PATCH 02/11] Add back dummy organism with files

---
 kubernetes/loculus/values.yaml | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
index 02eafc0a58..525ca21d3b 100644
--- a/kubernetes/loculus/values.yaml
+++ b/kubernetes/loculus/values.yaml
@@ -1891,6 +1891,35 @@ defaultOrganisms:
                 sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/sars-cov-2/ORF9b.fasta]]"
               - name: S
                 sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/sars-cov-2/S.fasta]]"
+  dummy-organism-with-files:
+    schema:
+      image: "/images/organisms/hmpv.jpg"
+      organismName: "Test organism (with files)"
+      submissionDataTypes:
+        consensusSequences: false
+        files:
+          enabled: true
+          categories:
+            - name: raw_reads
+              displayName: Raw reads
+      files:
+        - name: raw_reads
+          displayName: Raw reads
+      metadata: *dummyMetadata
+      website:
+        tableColumns:
+          - country
+          - division
+          - date
+        defaultOrder: descending
+        defaultOrderBy: date
+    preprocessing:
+      - version: 1
+        image: ghcr.io/loculus-project/preprocessing-dummy
+        args:
+          - "--watch"
+          - "--disableConsensusSequences"
+    referenceGenomes: []
   not-aligned-organism:
     enabled: true
     schema:

From 91f4f2ea041feda2a423abcf96ac308bafdc6e1a Mon Sep 17 00:00:00 2001
From: maverbiest <59956362+maverbiest@users.noreply.github.com>
Date: Thu, 25 Jun 2026 14:54:14 +0200
Subject: [PATCH 03/11] feat(prepro): Pass files through nextclade pipeline
 (#6731)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Partly resolves https://github.com/loculus-project/loculus/issues/6758

Currently, user-submitted files are not read in by the nextclade
preprocessing pipeline, causing them to be dropped from the submission
process. Since we're now working to implement file sharing on Loculus,
we need to make it so user-submitted files make it through the
preprocessing pipeline as well.

This PR makes it so user-submitted files are forwarded through the
nextclade pipeline without any processing or checking of file contents.
Future PRs will add functionality to, for example, check for host
sequences in submitted raw reads.

## Implementation

- `parse_ndjson` now also parses the file-related information sent to
preprocessing by the backend
- `UnprocessedData` and `UnprocessedAfterNextclade` both get a `files`
attribute
- test factories in `factory_methods.py` can now be given files to add
to test objects, also added a test case in
`test_nextclade_preprocessing.py` that carries file information

## Manual testing

I created a preview to test whether files now make it through the
nextclade pipeline. When I submit sequences to the preview with attached
raw reads, this file now appears in the submission review page:

<img width="1594" height="581" alt="grafik"
src="https://github.com/user-attachments/assets/45a2c3a7-ef9a-4738-aec7-be8ef5717a1b"
/>

And also on the sequence details page after the sequence is released:

<img width="1124" height="595" alt="grafik"
src="https://github.com/user-attachments/assets/578e16c0-6f0e-4e9b-b5e2-2c544b826086"
/>

## Open questions

One thing I ran into when implementing this is that you need to add
file categories twice in the config: once under
`submissionDataTypes` (file categories that users are allowed to submit)
and then again under the schema-level `files` field (files accepted as
outputs of prepro pipelines):

```
defaultOrganismConfig: &defaultOrganismConfig
  schema: &schema
    submissionDataTypes: &defaultSubmissionDataTypes
      consensusSequences: true
      maxSequencesPerEntry: 1
      files:
        enabled: true
        categories:
          - name: raw_reads
            displayName: Raw reads
    ...
    files:
      - name: annotations
        displayName: Annotations
      - name: raw_reads
        displayName: Raw reads
```

Would it be nicer to always allow file categories listed under
`submissionDataTypes.files` to be output by prepro? Or will we ever
have cases where users submit one thing, prepro processes it, and then
outputs another file category?

Probably safest to keep as-is for now, but I wanted to flag it: not
doing this properly got me into a weird state where submissions stay in
'processing' indefinitely and never error, because the backend doesn't
accept the preprocessing output (it only logs errors in the backend).

### PR Checklist
~- [ ] All necessary documentation has been adapted.~
- [x] The implemented feature is covered by appropriate, automated
tests.
- [x] Any manual testing that has been done is documented (i.e. what
exactly was tested?)

🚀 Preview: https://pass-files-through.loculus.org
---
 kubernetes/loculus/values.yaml                |  2 +
 .../src/loculus_preprocessing/backend.py      | 11 +++++
 .../src/loculus_preprocessing/datatypes.py    | 17 ++++---
 .../src/loculus_preprocessing/nextclade.py    |  6 +++
 .../src/loculus_preprocessing/prepro.py       | 11 +++--
 .../nextclade/tests/factory_methods.py        | 15 +++++++
 .../tests/test_host_name_validation.py        |  1 +
 .../test_metadata_processing_functions.py     |  1 +
 .../tests/test_nextclade_preprocessing.py     | 45 +++++++++++++++++++
 9 files changed, 99 insertions(+), 10 deletions(-)

diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
index 525ca21d3b..76f821fd83 100644
--- a/kubernetes/loculus/values.yaml
+++ b/kubernetes/loculus/values.yaml
@@ -94,6 +94,8 @@ defaultOrganismConfig: &defaultOrganismConfig
     files:
       - name: annotations
         displayName: Annotations
+      - name: raw_reads
+        displayName: Raw reads
     ### Field list
     ## General fields
     # name: Key used across app to refer to this field (required)
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/backend.py b/preprocessing/nextclade/src/loculus_preprocessing/backend.py
index 099ebc997d..9b362ec580 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/backend.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/backend.py
@@ -17,6 +17,7 @@
 
 from .config import Config
 from .datatypes import (
+    FileIdAndName,
     FileUploadInfo,
     ProcessedEntry,
     UnprocessedData,
@@ -93,6 +94,15 @@ def parse_ndjson(ndjson_data: str) -> Sequence[UnprocessedEntry]:
             key: trim_ns(value) if value else None
             for key, value in unaligned_nucleotide_sequences.items()
         }
+        submitted_files = json_object["data"].get("files")
+        file_mapping = (
+            {
+                category: [FileIdAndName(fileId=f["fileId"], name=f["name"]) for f in files]
+                for category, files in submitted_files.items()
+            }
+            if submitted_files
+            else None
+        )
         unprocessed_data = UnprocessedData(
             submitter=json_object["submitter"],
             group_id=json_object["groupId"],
@@ -102,6 +112,7 @@ def parse_ndjson(ndjson_data: str) -> Sequence[UnprocessedEntry]:
             unalignedNucleotideSequences=trimmed_unaligned_nucleotide_sequences
             if unaligned_nucleotide_sequences
             else {},
+            files=file_mapping,
         )
         entry = UnprocessedEntry(
             accessionVersion=f"{json_object['accession']}.{json_object['version']}",
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
index a3f2129652..872c4bb059 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
@@ -23,6 +23,7 @@
 InputMetadataValue = str | None
 InputMetadata = dict[str, InputMetadataValue]
 FastaId = str
+FileCategory = str
 
 ProcessingAnnotationAlignment: Final = "alignment"
 
@@ -74,6 +75,12 @@ def from_single(cls, name: str, type, message: str):
         return cls.from_fields([name], [name], type, message)
 
 
+@dataclass
+class FileIdAndName:
+    fileId: str  # noqa: N815
+    name: str
+
+
 @dataclass
 class UnprocessedData:
     submitter: str
@@ -82,6 +89,7 @@ class UnprocessedData:
     submissionId: str  # noqa: N815
     metadata: InputMetadata
     unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None]  # noqa: N815
+    files: dict[FileCategory, list[FileIdAndName]] | None
 
 
 @dataclass
@@ -97,6 +105,7 @@ class UnprocessedEntry:
 @dataclass
 class UnprocessedAfterNextclade:
     inputMetadata: InputMetadata  # noqa: N815
+    files: dict[FileCategory, list[FileIdAndName]] | None
     # Derived metadata produced by Nextclade
     nextcladeMetadata: dict[SequenceName, Any] | None  # noqa: N815
     unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None]  # noqa: N815
@@ -109,22 +118,16 @@ class UnprocessedAfterNextclade:
     warnings: list[ProcessingAnnotation]
 
 
-@dataclass
-class FileIdAndName:
-    fileId: str  # noqa: N815
-    name: str
-
-
 @dataclass
 class ProcessedData:
     metadata: ProcessedMetadata
+    files: dict[FileCategory, list[FileIdAndName]] | None
     unalignedNucleotideSequences: dict[SequenceName, Any]  # noqa: N815
     alignedNucleotideSequences: dict[SequenceName, Any]  # noqa: N815
     nucleotideInsertions: dict[SequenceName, Any]  # noqa: N815
     alignedAminoAcidSequences: dict[GeneName, Any]  # noqa: N815
     aminoAcidInsertions: dict[GeneName, Any]  # noqa: N815
     sequenceNameToFastaId: dict[SequenceName, FastaId]  # noqa: N815
-    files: dict[str, list[FileIdAndName]] | None = None
 
 
 @dataclass
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py
index 7cca9f7835..8545472b63 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py
@@ -26,6 +26,8 @@
     AminoAcidSequence,
     AnnotationSourceType,
     FastaId,
+    FileCategory,
+    FileIdAndName,
     GeneName,
     GenericSequence,
     NucleotideInsertion,
@@ -801,6 +803,9 @@ def enrich_with_nextclade(  # noqa: PLR0914
         }
         for entry in unprocessed
     }
+    input_files: dict[AccessionVersion, dict[FileCategory, list[FileIdAndName]] | None] = {
+        entry.accessionVersion: entry.data.files for entry in unprocessed
+    }
 
     batch = assign_segment_for_alignment(unprocessed, config=config, dataset_dir=dataset_dir)
     unaligned_nucleotide_sequences = batch.unalignedNucleotideSequences
@@ -893,6 +898,7 @@ def enrich_with_nextclade(  # noqa: PLR0914
     return {
         id: UnprocessedAfterNextclade(
             inputMetadata=input_metadata[id],
+            files=input_files[id],
             nextcladeMetadata=nextclade_metadata[id],
             unalignedNucleotideSequences=unaligned_nucleotide_sequences[id],
             alignedNucleotideSequences=aligned_nucleotide_sequences[id],
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
index 8a752c9982..e9d4300b8e 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -296,6 +296,7 @@ def processed_entry_no_alignment(  # noqa: PLR0913, PLR0917
             version=version_from_str(accession_version),
             data=ProcessedData(
                 metadata=output_metadata,
+                files=unprocessed.files,
                 unalignedNucleotideSequences=unprocessed.unalignedNucleotideSequences,
                 alignedNucleotideSequences=aligned_nucleotide_sequences,
                 nucleotideInsertions=nucleotide_insertions,
@@ -534,6 +535,7 @@ def process_single(
         version=version_from_str(accession_version),
         data=ProcessedData(
             metadata=output_metadata,
+            files=unprocessed.files,
             unalignedNucleotideSequences=unprocessed.unalignedNucleotideSequences,
             alignedNucleotideSequences=unprocessed.alignedNucleotideSequences,
             nucleotideInsertions=unprocessed.nucleotideInsertions,
@@ -595,6 +597,7 @@ def processed_entry_with_errors(id) -> SubmissionData:
             version=version_from_str(id),
             data=ProcessedData(
                 metadata=dict[str, ProcessedMetadataValue](),
+                files=None,
                 unalignedNucleotideSequences=defaultdict(dict[str, Any]),
                 alignedNucleotideSequences=defaultdict(dict[str, Any]),
                 nucleotideInsertions=defaultdict(dict[str, Any]),
@@ -660,9 +663,11 @@ def upload_flatfiles(processed: Sequence[SubmissionData], config: Config) -> Non
             file_id = upload_info.fileId
             url = upload_info.url
             upload_embl_file_to_presigned_url(file_content, url)
-            submission_data.processed_entry.data.files = {
-                "annotations": [FileIdAndName(fileId=file_id, name=file_name)]
-            }
+            processed_files = submission_data.processed_entry.data.files or {}
+            processed_files.setdefault("annotations", []).append(
+                FileIdAndName(fileId=file_id, name=file_name)
+            )
+            submission_data.processed_entry.data.files = processed_files
         except Exception as e:
             logger.error("Error creating or uploading EMBL file: %s", e)
             submission_data.processed_entry.errors.append(
diff --git a/preprocessing/nextclade/tests/factory_methods.py b/preprocessing/nextclade/tests/factory_methods.py
index 41b26062ae..679601e744 100644
--- a/preprocessing/nextclade/tests/factory_methods.py
+++ b/preprocessing/nextclade/tests/factory_methods.py
@@ -9,6 +9,8 @@
 from loculus_preprocessing.datatypes import (
     AnnotationSource,
     AnnotationSourceType,
+    FileCategory,
+    FileIdAndName,
     NucleotideSequence,
     ProcessedData,
     ProcessedEntry,
@@ -77,6 +79,7 @@ def create_unprocessed_entry(
         accession_id: str,
         sequences: dict[SegmentName, NucleotideSequence | None],
         group_id: int = 2,
+        files: dict[FileCategory, list[FileIdAndName]] | None = None,
     ) -> UnprocessedEntry:
         return UnprocessedEntry(
             accessionVersion=f"LOC_{accession_id}.1",
@@ -89,6 +92,7 @@ def create_unprocessed_entry(
                 group_id=group_id,
                 metadata=metadata_dict,
                 unalignedNucleotideSequences=sequences,
+                files=files,
             ),
         )
 
@@ -130,6 +134,7 @@ def create_processed_entry(
         errors: list[ProcessingAnnotation] | None = None,
         warnings: list[ProcessingAnnotation] | None = None,
         processed_alignment: ProcessedAlignment | None = None,
+        files: dict[FileCategory, list[FileIdAndName]] | None = None,
     ) -> ProcessedEntry:
         if errors is None:
             errors = []
@@ -147,6 +152,7 @@ def create_processed_entry(
             version=1,
             data=ProcessedData(
                 metadata=base_metadata_dict,
+                files=files,
                 unalignedNucleotideSequences=processed_alignment.unalignedNucleotideSequences,
                 alignedNucleotideSequences=processed_alignment.alignedNucleotideSequences,
                 nucleotideInsertions=processed_alignment.nucleotideInsertions,
@@ -163,9 +169,11 @@ def create_processed_entry(
 class Case:
     name: str
     input_metadata: dict[str, str | None] = field(default_factory=dict)
+    input_files: dict[FileCategory, list[FileIdAndName]] | None = None
     input_sequence: dict[str, str | None] = field(default_factory=lambda: {"main": None})
     accession_id: str = "000999"
     expected_metadata: dict[str, ProcessedMetadataValue] = field(default_factory=dict)
+    expected_files: dict[FileCategory, list[FileIdAndName]] | None = None
     expected_errors: list[ProcessingAnnotation] | None = None
     expected_warnings: list[ProcessingAnnotation] | None = None
     expected_processed_alignment: ProcessedAlignment | None = None
@@ -179,6 +187,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT
             accession_id=self.accession_id,
             sequences=self.input_sequence,
             group_id=self.group_id,
+            files=self.input_files,
         )
         expected_output = factory_custom.create_processed_entry(
             metadata_dict=self.expected_metadata,
@@ -186,6 +195,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT
             errors=self.expected_errors or [],
             warnings=self.expected_warnings or [],
             processed_alignment=self.expected_processed_alignment,
+            files=self.expected_files,
         )
         return ProcessingTestCase(
             name=self.name, input=unprocessed_entry, expected_output=expected_output
@@ -265,3 +275,8 @@ def verify_processed_entry(
         f"{test_name}: sequence name to fasta header map '{actual.sequenceNameToFastaId}' do not "
         f"match expectation '{expected.sequenceNameToFastaId}'."
     )
+
+    # Check files
+    assert actual.files == expected.files, (
+        f"{test_name}: files '{actual.files}' do not match expectation '{expected.files}'."
+    )
diff --git a/preprocessing/nextclade/tests/test_host_name_validation.py b/preprocessing/nextclade/tests/test_host_name_validation.py
index d97cdbe97e..62cdf75768 100644
--- a/preprocessing/nextclade/tests/test_host_name_validation.py
+++ b/preprocessing/nextclade/tests/test_host_name_validation.py
@@ -34,6 +34,7 @@ def make_entry(metadata: dict, group_id: int) -> UnprocessedEntry:
             group_id=group_id,
             metadata=metadata,
             unalignedNucleotideSequences={},
+            files=None,
         ),
     )
 
diff --git a/preprocessing/nextclade/tests/test_metadata_processing_functions.py b/preprocessing/nextclade/tests/test_metadata_processing_functions.py
index d17ae42244..f3961c4beb 100644
--- a/preprocessing/nextclade/tests/test_metadata_processing_functions.py
+++ b/preprocessing/nextclade/tests/test_metadata_processing_functions.py
@@ -783,6 +783,7 @@ def test_preprocessing_without_consensus_sequences(config: Config) -> None:
                 "name_required": sequence_name,
             },
             unalignedNucleotideSequences={},
+            files=None,
         ),
     )
 
diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
index 05edf3a4cf..0b07af4c31 100644
--- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
+++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
@@ -26,6 +26,7 @@
 )
 from loculus_preprocessing.datatypes import (
     AnnotationSourceType,
+    FileIdAndName,
     SegmentClassificationMethod,
     SubmissionData,
     UnprocessedData,
@@ -284,6 +285,46 @@ def invalid_sequence() -> str:
             sequenceNameToFastaId={"main": "fastaHeader"},
         ),
     ),
+    Case(
+        name="with file",
+        input_metadata={},
+        input_files={
+            "raw_reads": [
+                FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"),
+                FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"),
+            ]
+        },
+        input_sequence={"fastaHeader": sequence_with_mutation("single")},
+        accession_id="1",
+        expected_metadata={
+            "completeness": 1.0,
+            "totalInsertedNucs": 0,
+            "totalSnps": 1,
+            "totalDeletedNucs": 0,
+            "length": len(consensus_sequence("single")),
+            "nonExistentField": "None",
+            "variant": True,
+        },
+        expected_files={
+            "raw_reads": [
+                FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"),
+                FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"),
+            ]
+        },
+        expected_errors=[],
+        expected_warnings=[],
+        expected_processed_alignment=ProcessedAlignment(
+            unalignedNucleotideSequences={"main": sequence_with_mutation("single")},
+            alignedNucleotideSequences={"main": sequence_with_mutation("single")},
+            nucleotideInsertions={},
+            alignedAminoAcidSequences={
+                "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"),
+                "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"),
+            },
+            aminoAcidInsertions={},
+            sequenceNameToFastaId={"main": "fastaHeader"},
+        ),
+    ),
 ]
 
 single_segment_failed_case_definitions = [
@@ -1285,6 +1326,7 @@ def test_max_sequences_per_entry_batch_isolation() -> None:
                 "ebola-sudan": sequence_with_mutation("ebola-sudan"),
                 "ebola-zaire": sequence_with_mutation("ebola-zaire"),
             },
+            files=None,
         ),
     )
 
@@ -1299,6 +1341,7 @@ def test_max_sequences_per_entry_batch_isolation() -> None:
             unalignedNucleotideSequences={
                 "ebola-sudan": sequence_with_mutation("ebola-sudan"),
             },
+            files=None,
         ),
     )
 
@@ -1332,6 +1375,7 @@ def test_preprocessing_without_metadata() -> None:
                 "ebola-sudan": sequence_with_mutation("ebola-sudan"),
                 "ebola-zaire": sequence_with_mutation("ebola-zaire"),
             },
+            files=None,
         ),
     )
 
@@ -1455,6 +1499,7 @@ def test_create_flatfile():
                 "authors": "Smith, Doe A;",
             },
             unalignedNucleotideSequences={"main": sequence_with_mutation("single")},
+            files=None,
         ),
     )
 

From 0c6d635864d4c650755a15174d42652b670a7014 Mon Sep 17 00:00:00 2001
From: maverbiest <maxaverbiest@gmail.com>
Date: Fri, 26 Jun 2026 14:54:05 +0200
Subject: [PATCH 04/11] Very basic validation of raw read submissions

---
 .../src/loculus_preprocessing/datatypes.py    |  13 +-
 .../src/loculus_preprocessing/nextclade.py    |   8 +-
 .../src/loculus_preprocessing/prepro.py       |  35 +++-
 .../processing_functions.py                   |  33 ++++
 .../nextclade/tests/factory_methods.py        |  10 +-
 .../tests/test_nextclade_preprocessing.py     | 153 +++++++++++++++++-
 6 files changed, 234 insertions(+), 18 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
index 872c4bb059..41fbe95c43 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
@@ -23,15 +23,20 @@
 InputMetadataValue = str | None
 InputMetadata = dict[str, InputMetadataValue]
 FastaId = str
-FileCategory = str
 
 ProcessingAnnotationAlignment: Final = "alignment"
 
 
+@unique
+class SubmissionFileCategory(StrEnum):
+    RAW_READS = "raw_reads"
+
+
 @unique
 class AnnotationSourceType(StrEnum):
     METADATA = "Metadata"
     NUCLEOTIDE_SEQUENCE = "NucleotideSequence"
+    SUBMITTED_FILE = "SubmittedFile"
 
 
 @dataclass(frozen=True)
@@ -89,7 +94,7 @@ class UnprocessedData:
     submissionId: str  # noqa: N815
     metadata: InputMetadata
     unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None]  # noqa: N815
-    files: dict[FileCategory, list[FileIdAndName]] | None
+    files: dict[SubmissionFileCategory, list[FileIdAndName]] | None
 
 
 @dataclass
@@ -105,7 +110,7 @@ class UnprocessedEntry:
 @dataclass
 class UnprocessedAfterNextclade:
     inputMetadata: InputMetadata  # noqa: N815
-    files: dict[FileCategory, list[FileIdAndName]] | None
+    files: dict[SubmissionFileCategory, list[FileIdAndName]] | None
     # Derived metadata produced by Nextclade
     nextcladeMetadata: dict[SequenceName, Any] | None  # noqa: N815
     unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None]  # noqa: N815
@@ -121,7 +126,7 @@ class UnprocessedAfterNextclade:
 @dataclass
 class ProcessedData:
     metadata: ProcessedMetadata
-    files: dict[FileCategory, list[FileIdAndName]] | None
+    files: dict[SubmissionFileCategory, list[FileIdAndName]] | None
     unalignedNucleotideSequences: dict[SequenceName, Any]  # noqa: N815
     alignedNucleotideSequences: dict[SequenceName, Any]  # noqa: N815
     nucleotideInsertions: dict[SequenceName, Any]  # noqa: N815
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py
index 8545472b63..87a9b65291 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py
@@ -26,7 +26,7 @@
     AminoAcidSequence,
     AnnotationSourceType,
     FastaId,
-    FileCategory,
+    SubmissionFileCategory,
     FileIdAndName,
     GeneName,
     GenericSequence,
@@ -803,9 +803,9 @@ def enrich_with_nextclade(  # noqa: PLR0914
         }
         for entry in unprocessed
     }
-    input_files: dict[AccessionVersion, dict[FileCategory, list[FileIdAndName]] | None] = {
-        entry.accessionVersion: entry.data.files for entry in unprocessed
-    }
+    input_files: dict[
+        AccessionVersion, dict[SubmissionFileCategory, list[FileIdAndName]] | None
+    ] = {entry.accessionVersion: entry.data.files for entry in unprocessed}
 
     batch = assign_segment_for_alignment(unprocessed, config=config, dataset_dir=dataset_dir)
     unaligned_nucleotide_sequences = batch.unalignedNucleotideSequences
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
index e9d4300b8e..cb874d47c9 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -42,6 +42,7 @@
     SegmentClassificationMethod,
     SegmentName,
     SubmissionData,
+    SubmissionFileCategory,
     UnprocessedAfterNextclade,
     UnprocessedData,
     UnprocessedEntry,
@@ -60,6 +61,7 @@
     process_mutations_from_clade_founder,
     process_phenotype_values,
     process_stop_codons,
+    validate_raw_reads_submission,
 )
 from .sequence_checks import error_on_excess_sequences, errors_if_non_iupac
 
@@ -436,6 +438,24 @@ def get_output_metadata(
     return output_metadata, errors, warnings
 
 
+def process_submitted_files(
+    file_mapping: dict[SubmissionFileCategory, list[FileIdAndName]],
+) -> tuple[list[ProcessingAnnotation], list[ProcessingAnnotation]]:
+    errors: list[ProcessingAnnotation] = []
+    warnings: list[ProcessingAnnotation] = []
+
+    for category, files in file_mapping.items():
+        match category:
+            case SubmissionFileCategory.RAW_READS:
+                rr_errors, rr_warnings = validate_raw_reads_submission(files)
+                errors.extend(rr_errors)
+                warnings.extend(rr_warnings)
+            case _:
+                logger.warning(f"Submitted file is of unrecognized category {category}")
+
+    return errors, warnings
+
+
 def alignment_errors_warnings(
     unprocessed: UnprocessedAfterNextclade,
     config: Config,
@@ -530,6 +550,8 @@ def process_single(
         accession_version, unprocessed, config
     )
 
+    file_errors, file_warnings = process_submitted_files(unprocessed.files or {})
+
     processed_entry = ProcessedEntry(
         accession=accession_from_str(accession_version),
         version=version_from_str(accession_version),
@@ -550,9 +572,12 @@ def process_single(
                 + max_seq_errors
                 + alignment_errors
                 + metadata_errors
+                + file_errors
             )
         ),
-        warnings=list(set(unprocessed.warnings + alignment_warnings + metadata_warnings)),
+        warnings=list(
+            set(unprocessed.warnings + alignment_warnings + metadata_warnings + file_warnings)
+        ),
     )
 
     return SubmissionData(
@@ -580,12 +605,16 @@ def process_single_unaligned(
         accession_version, unprocessed, config
     )
 
+    file_errors, file_warnings = process_submitted_files(unprocessed.files or {})
+
     return processed_entry_no_alignment(
         accession_version=accession_version,
         unprocessed=unprocessed,
         output_metadata=output_metadata,
-        errors=list(set(iupac_errors + metadata_errors + segment_assignment.alert.errors)),
-        warnings=list(set(metadata_warnings)),
+        errors=list(
+            set(iupac_errors + metadata_errors + segment_assignment.alert.errors + file_errors)
+        ),
+        warnings=list(set(metadata_warnings + file_warnings)),
         sequenceNameToFastaId=segment_assignment.sequenceNameToFastaId,
     )
 
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index e2428bd2b7..0288f17e3a 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -25,6 +25,7 @@
 from .datatypes import (
     AnnotationSource,
     AnnotationSourceType,
+    FileIdAndName,
     FunctionArgs,
     InputData,
     InputMetadata,
@@ -1948,6 +1949,38 @@ def single_metadata_annotation(
     ]
 
 
+def validate_raw_reads_submission(
+    files: list[FileIdAndName],
+) -> tuple[list[ProcessingAnnotation], list[ProcessingAnnotation]]:
+    errors: list[ProcessingAnnotation] = []
+    warnings: list[ProcessingAnnotation] = []
+
+    min_files, max_files = 1, 2
+    if not (min_files <= len(files) <= max_files):
+        name = "; ".join(f.name for f in files)
+        message = f"Raw reads must be submitted as one or two files, got {len(files)}"
+        errors.append(
+            ProcessingAnnotation.from_single(
+                name=name, type=AnnotationSourceType.SUBMITTED_FILE, message=message
+            )
+        )
+
+    allowed_extensions = ["fastq", "fastq.gz", "fq", "fq.gz"]
+    for file in files:
+        if not any(file.name.endswith(extension) for extension in allowed_extensions):
+            message = (
+                f"Raw reads file '{file.name}' has unrecognized extension."
+                f" Allowed extensions: {allowed_extensions}"
+            )
+            errors.append(
+                ProcessingAnnotation.from_single(
+                    name=file.name, type=AnnotationSourceType.SUBMITTED_FILE, message=message
+                )
+            )
+
+    return errors, warnings
+
+
 def process_frameshifts(input: str | None) -> InputData:
     """Converts frameshift string to InputData for processing"""
     try:
diff --git a/preprocessing/nextclade/tests/factory_methods.py b/preprocessing/nextclade/tests/factory_methods.py
index 679601e744..8b25fe099f 100644
--- a/preprocessing/nextclade/tests/factory_methods.py
+++ b/preprocessing/nextclade/tests/factory_methods.py
@@ -9,7 +9,7 @@
 from loculus_preprocessing.datatypes import (
     AnnotationSource,
     AnnotationSourceType,
-    FileCategory,
+    SubmissionFileCategory,
     FileIdAndName,
     NucleotideSequence,
     ProcessedData,
@@ -79,7 +79,7 @@ def create_unprocessed_entry(
         accession_id: str,
         sequences: dict[SegmentName, NucleotideSequence | None],
         group_id: int = 2,
-        files: dict[FileCategory, list[FileIdAndName]] | None = None,
+        files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None,
     ) -> UnprocessedEntry:
         return UnprocessedEntry(
             accessionVersion=f"LOC_{accession_id}.1",
@@ -134,7 +134,7 @@ def create_processed_entry(
         errors: list[ProcessingAnnotation] | None = None,
         warnings: list[ProcessingAnnotation] | None = None,
         processed_alignment: ProcessedAlignment | None = None,
-        files: dict[FileCategory, list[FileIdAndName]] | None = None,
+        files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None,
     ) -> ProcessedEntry:
         if errors is None:
             errors = []
@@ -169,11 +169,11 @@ def create_processed_entry(
 class Case:
     name: str
     input_metadata: dict[str, str | None] = field(default_factory=dict)
-    input_files: dict[FileCategory, list[FileIdAndName]] | None = None
+    input_files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None
     input_sequence: dict[str, str | None] = field(default_factory=lambda: {"main": None})
     accession_id: str = "000999"
     expected_metadata: dict[str, ProcessedMetadataValue] = field(default_factory=dict)
-    expected_files: dict[FileCategory, list[FileIdAndName]] | None = None
+    expected_files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None
     expected_errors: list[ProcessingAnnotation] | None = None
     expected_warnings: list[ProcessingAnnotation] | None = None
     expected_processed_alignment: ProcessedAlignment | None = None
diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
index 0b07af4c31..a1f6814bc9 100644
--- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
+++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
@@ -29,6 +29,7 @@
     FileIdAndName,
     SegmentClassificationMethod,
     SubmissionData,
+    SubmissionFileCategory,
     UnprocessedData,
     UnprocessedEntry,
 )
@@ -289,7 +290,7 @@ def invalid_sequence() -> str:
         name="with file",
         input_metadata={},
         input_files={
-            "raw_reads": [
+            SubmissionFileCategory.RAW_READS: [
                 FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"),
                 FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"),
             ]
@@ -306,7 +307,7 @@ def invalid_sequence() -> str:
             "variant": True,
         },
         expected_files={
-            "raw_reads": [
+            SubmissionFileCategory.RAW_READS: [
                 FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"),
                 FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"),
             ]
@@ -325,6 +326,105 @@ def invalid_sequence() -> str:
             sequenceNameToFastaId={"main": "fastaHeader"},
         ),
     ),
+    Case(
+        name="with too many raw read files",
+        input_metadata={},
+        input_files={
+            SubmissionFileCategory.RAW_READS: [
+                FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"),
+                FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"),
+                FileIdAndName(fileId="file-id-0003", name="reads_R3.fastq"),
+            ]
+        },
+        input_sequence={"fastaHeader": sequence_with_mutation("single")},
+        accession_id="1",
+        expected_metadata={
+            "completeness": 1.0,
+            "totalInsertedNucs": 0,
+            "totalSnps": 1,
+            "totalDeletedNucs": 0,
+            "length": len(consensus_sequence("single")),
+            "nonExistentField": "None",
+            "variant": True,
+        },
+        expected_files={
+            SubmissionFileCategory.RAW_READS: [
+                FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"),
+                FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"),
+                FileIdAndName(fileId="file-id-0003", name="reads_R3.fastq"),
+            ]
+        },
+        expected_errors=build_processing_annotations(
+            [
+                ProcessingAnnotationHelper(
+                    ["reads_R1.fastq; reads_R2.fastq; reads_R3.fastq"],
+                    ["reads_R1.fastq; reads_R2.fastq; reads_R3.fastq"],
+                    "Raw reads must be submitted as one or two files, got 3",
+                    AnnotationSourceType.SUBMITTED_FILE,
+                )
+            ]
+        ),
+        expected_warnings=[],
+        expected_processed_alignment=ProcessedAlignment(
+            unalignedNucleotideSequences={"main": sequence_with_mutation("single")},
+            alignedNucleotideSequences={"main": sequence_with_mutation("single")},
+            nucleotideInsertions={},
+            alignedAminoAcidSequences={
+                "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"),
+                "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"),
+            },
+            aminoAcidInsertions={},
+            sequenceNameToFastaId={"main": "fastaHeader"},
+        ),
+    ),
+    Case(
+        name="with unrecognized raw read file extension",
+        input_metadata={},
+        input_files={
+            SubmissionFileCategory.RAW_READS: [
+                FileIdAndName(fileId="file-id-0001", name="reads.txt")
+            ]
+        },
+        input_sequence={"fastaHeader": sequence_with_mutation("single")},
+        accession_id="1",
+        expected_metadata={
+            "completeness": 1.0,
+            "totalInsertedNucs": 0,
+            "totalSnps": 1,
+            "totalDeletedNucs": 0,
+            "length": len(consensus_sequence("single")),
+            "nonExistentField": "None",
+            "variant": True,
+        },
+        expected_files={
+            SubmissionFileCategory.RAW_READS: [
+                FileIdAndName(fileId="file-id-0001", name="reads.txt")
+            ]
+        },
+        expected_errors=build_processing_annotations(
+            [
+                ProcessingAnnotationHelper(
+                    ["reads.txt"],
+                    ["reads.txt"],
+                    "Raw reads file 'reads.txt' has unrecognized extension. "
+                    "Allowed extensions: ['fastq', 'fastq.gz', 'fq', 'fq.gz']",
+                    AnnotationSourceType.SUBMITTED_FILE,
+                )
+            ]
+        ),
+        expected_warnings=[],
+        expected_processed_alignment=ProcessedAlignment(
+            unalignedNucleotideSequences={"main": sequence_with_mutation("single")},
+            alignedNucleotideSequences={"main": sequence_with_mutation("single")},
+            nucleotideInsertions={},
+            alignedAminoAcidSequences={
+                "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"),
+                "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"),
+            },
+            aminoAcidInsertions={},
+            sequenceNameToFastaId={"main": "fastaHeader"},
+        ),
+    ),
 ]
 
 single_segment_failed_case_definitions = [
@@ -1175,6 +1275,55 @@ def invalid_sequence() -> str:
             sequenceNameToFastaId={"ebola-sudan": "ebola-sudan"},
         ),
     ),
+    Case(
+        name="validate raw read files when sequences are not aligned",
+        input_metadata={},
+        input_sequence={
+            "prefix_ebola-sudan": sequence_with_mutation("ebola-sudan"),
+            "other_prefix_ebola-zaire": sequence_with_mutation("ebola-zaire"),
+        },
+        accession_id="1",
+        input_files={
+            SubmissionFileCategory.RAW_READS: [
+                FileIdAndName(fileId="file-id-0001", name="reads.txt")
+            ]
+        },
+        expected_files={
+            SubmissionFileCategory.RAW_READS: [
+                FileIdAndName(fileId="file-id-0001", name="reads.txt")
+            ]
+        },
+        expected_metadata={
+            "length_ebola-sudan": len(consensus_sequence("ebola-sudan")),
+            "length_ebola-zaire": len(consensus_sequence("ebola-zaire")),
+        },
+        expected_errors=build_processing_annotations(
+            [
+                ProcessingAnnotationHelper(
+                    ["reads.txt"],
+                    ["reads.txt"],
+                    "Raw reads file 'reads.txt' has unrecognized extension. "
+                    "Allowed extensions: ['fastq', 'fastq.gz', 'fq', 'fq.gz']",
+                    AnnotationSourceType.SUBMITTED_FILE,
+                )
+            ]
+        ),
+        expected_warnings=[],
+        expected_processed_alignment=ProcessedAlignment(
+            unalignedNucleotideSequences={
+                "ebola-sudan": sequence_with_mutation("ebola-sudan"),
+                "ebola-zaire": sequence_with_mutation("ebola-zaire"),
+            },
+            alignedNucleotideSequences={},
+            nucleotideInsertions={},
+            alignedAminoAcidSequences={},
+            aminoAcidInsertions={},
+            sequenceNameToFastaId={
+                "ebola-sudan": "prefix_ebola-sudan",
+                "ebola-zaire": "other_prefix_ebola-zaire",
+            },
+        ),
+    ),
 ]
 
 

From 9c0baa0c0945bc3fa889d3170dcf5281c7bb192f Mon Sep 17 00:00:00 2001
From: maverbiest <maxaverbiest@gmail.com>
Date: Fri, 26 Jun 2026 15:08:39 +0200
Subject: [PATCH 05/11] Sort imports with ruff

---
 preprocessing/nextclade/src/loculus_preprocessing/nextclade.py | 2 +-
 preprocessing/nextclade/tests/factory_methods.py               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py
index 87a9b65291..7642b3df17 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py
@@ -26,7 +26,6 @@
     AminoAcidSequence,
     AnnotationSourceType,
     FastaId,
-    SubmissionFileCategory,
     FileIdAndName,
     GeneName,
     GenericSequence,
@@ -38,6 +37,7 @@
     SegmentName,
     SequenceAssignment,
     SequenceAssignmentBatch,
+    SubmissionFileCategory,
     UnprocessedAfterNextclade,
     UnprocessedEntry,
 )
diff --git a/preprocessing/nextclade/tests/factory_methods.py b/preprocessing/nextclade/tests/factory_methods.py
index 8b25fe099f..277fc7ae4a 100644
--- a/preprocessing/nextclade/tests/factory_methods.py
+++ b/preprocessing/nextclade/tests/factory_methods.py
@@ -9,7 +9,6 @@
 from loculus_preprocessing.datatypes import (
     AnnotationSource,
     AnnotationSourceType,
-    SubmissionFileCategory,
     FileIdAndName,
     NucleotideSequence,
     ProcessedData,
@@ -18,6 +17,7 @@
     ProcessingAnnotation,
     ProcessingAnnotationAlignment,
     SegmentName,
+    SubmissionFileCategory,
     UnprocessedData,
     UnprocessedEntry,
 )

From a9b40a5a6c9af6e49df71bfbba92afba9f51345e Mon Sep 17 00:00:00 2001
From: maverbiest <maxaverbiest@gmail.com>
Date: Fri, 26 Jun 2026 15:17:19 +0200
Subject: [PATCH 06/11] Rename SubmissionFileCategory to FileCategory and add ANNOTATIONS

---
 preprocessing/dummy/main.py                   |  4 +--
 .../src/loculus_preprocessing/datatypes.py    |  9 ++++---
 .../src/loculus_preprocessing/nextclade.py    |  8 +++---
 .../src/loculus_preprocessing/prepro.py       | 10 +++----
 .../nextclade/tests/factory_methods.py        | 10 +++----
 .../tests/test_nextclade_preprocessing.py     | 26 +++++++------------
 6 files changed, 30 insertions(+), 37 deletions(-)

diff --git a/preprocessing/dummy/main.py b/preprocessing/dummy/main.py
index e3a2b9cc14..647c8b883a 100644
--- a/preprocessing/dummy/main.py
+++ b/preprocessing/dummy/main.py
@@ -129,7 +129,7 @@ def parse_ndjson(ndjson_data: str) -> list[Sequence]:
 
 
 def process(unprocessed: list[Sequence]) -> list[Sequence]:
-    with open("mock-sequences.json", "r") as f:
+    with open("mock-sequences.json") as f:
         mock_sequences = json.load(f)
     possible_lineages = ["A.1", "A.1.1", "A.2"]
 
@@ -160,7 +160,7 @@ def process(unprocessed: list[Sequence]) -> list[Sequence]:
             "nucleotideInsertions": {},
             "aminoAcidInsertions": {},
         }
-        
+
         if not disableConsensusSequences:
             data = {**data, **mock_sequences}
             data["sequenceNameToFastaId"] = {"main": submissionId}
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
index 41fbe95c43..a59abb50c7 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
@@ -28,8 +28,9 @@
 
 
 @unique
-class SubmissionFileCategory(StrEnum):
+class FileCategory(StrEnum):
     RAW_READS = "raw_reads"
+    ANNOTATIONS = "annotations"
 
 
 @unique
@@ -94,7 +95,7 @@ class UnprocessedData:
     submissionId: str  # noqa: N815
     metadata: InputMetadata
     unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None]  # noqa: N815
-    files: dict[SubmissionFileCategory, list[FileIdAndName]] | None
+    files: dict[FileCategory, list[FileIdAndName]] | None
 
 
 @dataclass
@@ -110,7 +111,7 @@ class UnprocessedEntry:
 @dataclass
 class UnprocessedAfterNextclade:
     inputMetadata: InputMetadata  # noqa: N815
-    files: dict[SubmissionFileCategory, list[FileIdAndName]] | None
+    files: dict[FileCategory, list[FileIdAndName]] | None
     # Derived metadata produced by Nextclade
     nextcladeMetadata: dict[SequenceName, Any] | None  # noqa: N815
     unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None]  # noqa: N815
@@ -126,7 +127,7 @@ class UnprocessedAfterNextclade:
 @dataclass
 class ProcessedData:
     metadata: ProcessedMetadata
-    files: dict[SubmissionFileCategory, list[FileIdAndName]] | None
+    files: dict[FileCategory, list[FileIdAndName]] | None
     unalignedNucleotideSequences: dict[SequenceName, Any]  # noqa: N815
     alignedNucleotideSequences: dict[SequenceName, Any]  # noqa: N815
     nucleotideInsertions: dict[SequenceName, Any]  # noqa: N815
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py
index 7642b3df17..8545472b63 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py
@@ -26,6 +26,7 @@
     AminoAcidSequence,
     AnnotationSourceType,
     FastaId,
+    FileCategory,
     FileIdAndName,
     GeneName,
     GenericSequence,
@@ -37,7 +38,6 @@
     SegmentName,
     SequenceAssignment,
     SequenceAssignmentBatch,
-    SubmissionFileCategory,
     UnprocessedAfterNextclade,
     UnprocessedEntry,
 )
@@ -803,9 +803,9 @@ def enrich_with_nextclade(  # noqa: PLR0914
         }
         for entry in unprocessed
     }
-    input_files: dict[
-        AccessionVersion, dict[SubmissionFileCategory, list[FileIdAndName]] | None
-    ] = {entry.accessionVersion: entry.data.files for entry in unprocessed}
+    input_files: dict[AccessionVersion, dict[FileCategory, list[FileIdAndName]] | None] = {
+        entry.accessionVersion: entry.data.files for entry in unprocessed
+    }
 
     batch = assign_segment_for_alignment(unprocessed, config=config, dataset_dir=dataset_dir)
     unaligned_nucleotide_sequences = batch.unalignedNucleotideSequences
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
index cb874d47c9..da7736e2ea 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -26,6 +26,7 @@
     AminoAcidSequence,
     AnnotationSource,
     AnnotationSourceType,
+    FileCategory,
     FileIdAndName,
     GeneName,
     InputData,
@@ -42,7 +43,6 @@
     SegmentClassificationMethod,
     SegmentName,
     SubmissionData,
-    SubmissionFileCategory,
     UnprocessedAfterNextclade,
     UnprocessedData,
     UnprocessedEntry,
@@ -439,19 +439,19 @@ def get_output_metadata(
 
 
 def process_submitted_files(
-    file_mapping: dict[SubmissionFileCategory, list[FileIdAndName]],
+    file_mapping: dict[FileCategory, list[FileIdAndName]],
 ) -> tuple[list[ProcessingAnnotation], list[ProcessingAnnotation]]:
     errors: list[ProcessingAnnotation] = []
     warnings: list[ProcessingAnnotation] = []
 
     for category, files in file_mapping.items():
         match category:
-            case SubmissionFileCategory.RAW_READS:
+            case FileCategory.RAW_READS:
                 rr_errors, rr_warnings = validate_raw_reads_submission(files)
                 errors.extend(rr_errors)
                 warnings.extend(rr_warnings)
             case _:
-                logger.warning(f"Submitted file is of unrecognized category {category}")
+                logger.warning(f"Submitted file is of unexpected category {category}")
 
     return errors, warnings
 
@@ -693,7 +693,7 @@ def upload_flatfiles(processed: Sequence[SubmissionData], config: Config) -> Non
             url = upload_info.url
             upload_embl_file_to_presigned_url(file_content, url)
             processed_files = submission_data.processed_entry.data.files or {}
-            processed_files.setdefault("annotations", []).append(
+            processed_files.setdefault(FileCategory.ANNOTATIONS, []).append(
                 FileIdAndName(fileId=file_id, name=file_name)
             )
             submission_data.processed_entry.data.files = processed_files
diff --git a/preprocessing/nextclade/tests/factory_methods.py b/preprocessing/nextclade/tests/factory_methods.py
index 277fc7ae4a..679601e744 100644
--- a/preprocessing/nextclade/tests/factory_methods.py
+++ b/preprocessing/nextclade/tests/factory_methods.py
@@ -9,6 +9,7 @@
 from loculus_preprocessing.datatypes import (
     AnnotationSource,
     AnnotationSourceType,
+    FileCategory,
     FileIdAndName,
     NucleotideSequence,
     ProcessedData,
@@ -17,7 +18,6 @@
     ProcessingAnnotation,
     ProcessingAnnotationAlignment,
     SegmentName,
-    SubmissionFileCategory,
     UnprocessedData,
     UnprocessedEntry,
 )
@@ -79,7 +79,7 @@ def create_unprocessed_entry(
         accession_id: str,
         sequences: dict[SegmentName, NucleotideSequence | None],
         group_id: int = 2,
-        files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None,
+        files: dict[FileCategory, list[FileIdAndName]] | None = None,
     ) -> UnprocessedEntry:
         return UnprocessedEntry(
             accessionVersion=f"LOC_{accession_id}.1",
@@ -134,7 +134,7 @@ def create_processed_entry(
         errors: list[ProcessingAnnotation] | None = None,
         warnings: list[ProcessingAnnotation] | None = None,
         processed_alignment: ProcessedAlignment | None = None,
-        files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None,
+        files: dict[FileCategory, list[FileIdAndName]] | None = None,
     ) -> ProcessedEntry:
         if errors is None:
             errors = []
@@ -169,11 +169,11 @@ def create_processed_entry(
 class Case:
     name: str
     input_metadata: dict[str, str | None] = field(default_factory=dict)
-    input_files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None
+    input_files: dict[FileCategory, list[FileIdAndName]] | None = None
     input_sequence: dict[str, str | None] = field(default_factory=lambda: {"main": None})
     accession_id: str = "000999"
     expected_metadata: dict[str, ProcessedMetadataValue] = field(default_factory=dict)
-    expected_files: dict[SubmissionFileCategory, list[FileIdAndName]] | None = None
+    expected_files: dict[FileCategory, list[FileIdAndName]] | None = None
     expected_errors: list[ProcessingAnnotation] | None = None
     expected_warnings: list[ProcessingAnnotation] | None = None
     expected_processed_alignment: ProcessedAlignment | None = None
diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
index a1f6814bc9..a05efcba82 100644
--- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
+++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
@@ -26,10 +26,10 @@
 )
 from loculus_preprocessing.datatypes import (
     AnnotationSourceType,
+    FileCategory,
     FileIdAndName,
     SegmentClassificationMethod,
     SubmissionData,
-    SubmissionFileCategory,
     UnprocessedData,
     UnprocessedEntry,
 )
@@ -290,7 +290,7 @@ def invalid_sequence() -> str:
         name="with file",
         input_metadata={},
         input_files={
-            SubmissionFileCategory.RAW_READS: [
+            FileCategory.RAW_READS: [
                 FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"),
                 FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"),
             ]
@@ -307,7 +307,7 @@ def invalid_sequence() -> str:
             "variant": True,
         },
         expected_files={
-            SubmissionFileCategory.RAW_READS: [
+            FileCategory.RAW_READS: [
                 FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"),
                 FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"),
             ]
@@ -330,7 +330,7 @@ def invalid_sequence() -> str:
         name="with too many raw read files",
         input_metadata={},
         input_files={
-            SubmissionFileCategory.RAW_READS: [
+            FileCategory.RAW_READS: [
                 FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"),
                 FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"),
                 FileIdAndName(fileId="file-id-0003", name="reads_R3.fastq"),
@@ -348,7 +348,7 @@ def invalid_sequence() -> str:
             "variant": True,
         },
         expected_files={
-            SubmissionFileCategory.RAW_READS: [
+            FileCategory.RAW_READS: [
                 FileIdAndName(fileId="file-id-0001", name="reads_R1.fastq"),
                 FileIdAndName(fileId="file-id-0002", name="reads_R2.fastq"),
                 FileIdAndName(fileId="file-id-0003", name="reads_R3.fastq"),
@@ -381,9 +381,7 @@ def invalid_sequence() -> str:
         name="with unrecognized raw read file extension",
         input_metadata={},
         input_files={
-            SubmissionFileCategory.RAW_READS: [
-                FileIdAndName(fileId="file-id-0001", name="reads.txt")
-            ]
+            FileCategory.RAW_READS: [FileIdAndName(fileId="file-id-0001", name="reads.txt")]
         },
         input_sequence={"fastaHeader": sequence_with_mutation("single")},
         accession_id="1",
@@ -397,9 +395,7 @@ def invalid_sequence() -> str:
             "variant": True,
         },
         expected_files={
-            SubmissionFileCategory.RAW_READS: [
-                FileIdAndName(fileId="file-id-0001", name="reads.txt")
-            ]
+            FileCategory.RAW_READS: [FileIdAndName(fileId="file-id-0001", name="reads.txt")]
         },
         expected_errors=build_processing_annotations(
             [
@@ -1284,14 +1280,10 @@ def invalid_sequence() -> str:
         },
         accession_id="1",
         input_files={
-            SubmissionFileCategory.RAW_READS: [
-                FileIdAndName(fileId="file-id-0001", name="reads.txt")
-            ]
+            FileCategory.RAW_READS: [FileIdAndName(fileId="file-id-0001", name="reads.txt")]
         },
         expected_files={
-            SubmissionFileCategory.RAW_READS: [
-                FileIdAndName(fileId="file-id-0001", name="reads.txt")
-            ]
+            FileCategory.RAW_READS: [FileIdAndName(fileId="file-id-0001", name="reads.txt")]
         },
         expected_metadata={
             "length_ebola-sudan": len(consensus_sequence("ebola-sudan")),

From 2ec13bf332319f260f31c22b234126700ee8b742 Mon Sep 17 00:00:00 2001
From: maverbiest <maxaverbiest@gmail.com>
Date: Fri, 26 Jun 2026 16:08:00 +0200
Subject: [PATCH 07/11] Add SubmittedFile to backend

---
 .../src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt   | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt b/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt
index 4ee737b238..6340f660b9 100644
--- a/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt
+++ b/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt
@@ -284,6 +284,7 @@ data class PreprocessingAnnotationSource(
 enum class PreprocessingAnnotationSourceType {
     Metadata,
     NucleotideSequence,
+    SubmittedFile,
 }
 
 data class GetSequenceResponse(

From f7be2e2f3eb4423c70fd3408704d3e376c1d2fc8 Mon Sep 17 00:00:00 2001
From: maverbiest <maxaverbiest@gmail.com>
Date: Fri, 26 Jun 2026 16:30:56 +0200
Subject: [PATCH 08/11] Add SubmittedFile to website

---
 website/src/types/backend.ts | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/website/src/types/backend.ts b/website/src/types/backend.ts
index 225da65ecc..da2d2c2f73 100644
--- a/website/src/types/backend.ts
+++ b/website/src/types/backend.ts
@@ -24,7 +24,11 @@ export const sequenceEntryProcessingResultNames = z.union([
 ]);
 export type SequenceEntryProcessingResultNames = z.infer<typeof sequenceEntryProcessingResultNames>;
 
-const processingAnnotationSourceType = z.union([z.literal('Metadata'), z.literal('NucleotideSequence')]);
+const processingAnnotationSourceType = z.union([
+    z.literal('Metadata'),
+    z.literal('NucleotideSequence'),
+    z.literal('SubmittedFile'),
+]);
 export type ProcessingAnnotationSourceType = z.infer<typeof processingAnnotationSourceType>;
 
 const processingAnnotation = z.object({

From 35c71b4b93a53b9fd08104de1d2aca4db81cfdd3 Mon Sep 17 00:00:00 2001
From: maverbiest <maxaverbiest@gmail.com>
Date: Fri, 26 Jun 2026 17:58:13 +0200
Subject: [PATCH 09/11] Skip validation of empty file categories

---
 preprocessing/nextclade/src/loculus_preprocessing/prepro.py    | 3 +++
 .../src/loculus_preprocessing/processing_functions.py          | 3 +--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
index da7736e2ea..be70462c18 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -445,6 +445,9 @@ def process_submitted_files(
     warnings: list[ProcessingAnnotation] = []
 
     for category, files in file_mapping.items():
+        if not files:
+            # Backend always includes a key with an empty list for enabled categories
+            continue
         match category:
             case FileCategory.RAW_READS:
                 rr_errors, rr_warnings = validate_raw_reads_submission(files)
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index 0288f17e3a..ec9a52f06b 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -1955,8 +1955,7 @@ def validate_raw_reads_submission(
     errors: list[ProcessingAnnotation] = []
     warnings: list[ProcessingAnnotation] = []
 
-    min_files, max_files = 1, 2
-    if not (min_files <= len(files) <= max_files):
+    if len(files) > 2:  # noqa: PLR2004
         name = "; ".join(f.name for f in files)
         message = f"Raw reads must be submitted as one or two files, got {len(files)}"
         errors.append(

From ee80c85cb1dd5ee77dc6fce39af956ec5c367c81 Mon Sep 17 00:00:00 2001
From: maverbiest <maxaverbiest@gmail.com>
Date: Mon, 29 Jun 2026 14:33:11 +0200
Subject: [PATCH 10/11] Addressing comments

---
 .../loculus_preprocessing/processing_functions.py    | 12 +++++++-----
 .../nextclade/tests/test_nextclade_preprocessing.py  |  4 ++--
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index ec9a52f06b..aa5dc32903 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -1956,20 +1956,22 @@ def validate_raw_reads_submission(
     warnings: list[ProcessingAnnotation] = []
 
     if len(files) > 2:  # noqa: PLR2004
-        name = "; ".join(f.name for f in files)
         message = f"Raw reads must be submitted as one or two files, got {len(files)}"
         errors.append(
-            ProcessingAnnotation.from_single(
-                name=name, type=AnnotationSourceType.SUBMITTED_FILE, message=message
+            ProcessingAnnotation.from_fields(
+                input_fields=[f.name for f in files],
+                output_fields=[f.name for f in files],
+                type=AnnotationSourceType.SUBMITTED_FILE,
+                message=message,
             )
         )
 
-    allowed_extensions = ["fastq", "fastq.gz", "fq", "fq.gz"]
+    allowed_extensions = [".fastq", ".fastq.gz", ".fq", ".fq.gz"]
     for file in files:
         if not any(file.name.endswith(extension) for extension in allowed_extensions):
             message = (
                 f"Raw reads file '{file.name}' has unrecognized extension."
-                f" Allowed extensions: {allowed_extensions}"
+                f" Allowed extensions: {', '.join(allowed_extensions)}"
             )
             errors.append(
                 ProcessingAnnotation.from_single(
diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
index a05efcba82..8974b55915 100644
--- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
+++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
@@ -403,7 +403,7 @@ def invalid_sequence() -> str:
                     ["reads.txt"],
                     ["reads.txt"],
                     "Raw reads file 'reads.txt' has unrecognized extension. "
-                    "Allowed extensions: ['fastq', 'fastq.gz', 'fq', 'fq.gz']",
+                    "Allowed extensions: .fastq, .fastq.gz, .fq, .fq.gz",
                     AnnotationSourceType.SUBMITTED_FILE,
                 )
             ]
@@ -1295,7 +1295,7 @@ def invalid_sequence() -> str:
                     ["reads.txt"],
                     ["reads.txt"],
                     "Raw reads file 'reads.txt' has unrecognized extension. "
-                    "Allowed extensions: ['fastq', 'fastq.gz', 'fq', 'fq.gz']",
+                    "Allowed extensions: .fastq, .fastq.gz, .fq, .fq.gz",
                     AnnotationSourceType.SUBMITTED_FILE,
                 )
             ]

From 9406c2975be8bcde6e063362f0f559108e403725 Mon Sep 17 00:00:00 2001
From: maverbiest <maxaverbiest@gmail.com>
Date: Mon, 29 Jun 2026 14:39:25 +0200
Subject: [PATCH 11/11] Fix tests: expect one annotation field per raw reads file

---
 preprocessing/nextclade/tests/test_nextclade_preprocessing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
index 8974b55915..f1fc7fc641 100644
--- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
+++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
@@ -357,8 +357,8 @@ def invalid_sequence() -> str:
         expected_errors=build_processing_annotations(
             [
                 ProcessingAnnotationHelper(
-                    ["reads_R1.fastq; reads_R2.fastq; reads_R3.fastq"],
-                    ["reads_R1.fastq; reads_R2.fastq; reads_R3.fastq"],
+                    ["reads_R1.fastq", "reads_R2.fastq", "reads_R3.fastq"],
+                    ["reads_R1.fastq", "reads_R2.fastq", "reads_R3.fastq"],
                     "Raw reads must be submitted as one or two files, got 3",
                     AnnotationSourceType.SUBMITTED_FILE,
                 )