From 8d7830cccf6db1d7621204e2d3fb25f52bdb8504 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 29 May 2026 10:18:21 +0000
Subject: [PATCH 01/18] Initial plan


From 030c4fc10213efc906a46bc89e4dce6f2febac79 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 29 May 2026 10:31:45 +0000
Subject: [PATCH 02/18] Refactor processing functions to use
 RawProcessingResult for less verbose error/warning handling

- Add RawProcessingResult dataclass with plain string warnings/errors
- Add processing_error() and processing_warning() helper functions
- Modify call_function to handle both ProcessingResult and RawProcessingResult
- Convert all processing functions to return RawProcessingResult
- Simplify check_latin_characters and taxonomy helper functions
- Remove unused invalid_value_annotation function
- Update tests to match new return types (errors/warnings are now strings)

This reduces the file from 2208 to 1719 lines by eliminating repetitive
ProcessingAnnotation.from_fields(...) boilerplate in every error/warning.
---
 .../src/loculus_preprocessing/datatypes.py    |  24 +
 .../processing_functions.py                   | 903 ++++--------------
 .../tests/test_assign_custom_lineage.py       |   2 +-
 .../test_metadata_processing_functions.py     |  10 +-
 4 files changed, 237 insertions(+), 702 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
index f56dd0cb3a..7004f81354 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
@@ -176,6 +176,30 @@ class ProcessingResult:
     errors: list[ProcessingAnnotation] = field(default_factory=list)
 
 
+@dataclass
+class RawProcessingResult:
+    """A simplified ProcessingResult where warnings/errors are plain strings.
+
+    Processing functions return this type; the caller (call_function) converts
+    it into a full ProcessingResult by attaching input_fields, output_field,
+    and annotation source type information.
+    """
+
+    datum: ProcessedMetadataValue = None
+    warnings: list[str] = field(default_factory=list)
+    errors: list[str] = field(default_factory=list)
+
+
+def processing_error(message: str) -> "RawProcessingResult":
+    """Helper to create a RawProcessingResult with a single error and no datum."""
+    return RawProcessingResult(datum=None, errors=[message])
+
+
+def processing_warning(message: str, datum: ProcessedMetadataValue = None) -> "RawProcessingResult":
+    """Helper to create a RawProcessingResult with a single warning."""
+    return RawProcessingResult(datum=datum, warnings=[message])
+
+
 @unique
 class SegmentClassificationMethod(StrEnum):
     """
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index e2428bd2b7..3adb56c556 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -31,6 +31,8 @@
     ProcessedMetadataValue,
     ProcessingAnnotation,
     ProcessingResult,
+    RawProcessingResult,
+    processing_error,
 )
 
 logger = logging.getLogger(__name__)
@@ -102,19 +104,6 @@ def standardize_option(option):
     return " ".join(option.lower().split())
 
 
-def invalid_value_annotation(
-    input_datum, output_field, input_fields, value_type
-) -> ProcessingAnnotation:
-    return ProcessingAnnotation(
-        processedFields=[AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)],
-        unprocessedFields=[
-            AnnotationSource(name=field, type=AnnotationSourceType.METADATA)
-            for field in input_fields
-        ],
-        message=f"Invalid {value_type} value: {input_datum} for field {output_field}.",
-    )
-
-
 def valid_name() -> str:
     chars = (
         r"\u0041-\u005A"  # A-Z
@@ -163,9 +152,9 @@ def reformat_authors_from_latin_to_ascii(authors: str) -> str:
 
 def check_latin_characters(
     authors: str, input_fields: list[str], output_field: str
-) -> tuple[list[ProcessingAnnotation], list[ProcessingAnnotation]]:
-    warnings: list[ProcessingAnnotation] = []
-    errors: list[ProcessingAnnotation] = []
+) -> tuple[list[str], list[str]]:
+    warnings: list[str] = []
+    errors: list[str] = []
     # Check if all characters in the authors string are Latin letters or spaces
     # (transformable to ASCII)
     for char in authors:
@@ -174,18 +163,7 @@ def check_latin_characters(
             continue
         if char.isalpha() and not (0x0000 <= ord(char) <= 0x024F):
             errors = [
-                ProcessingAnnotation(
-                    processedFields=[
-                        AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
-                    ],
-                    unprocessedFields=[
-                        AnnotationSource(name=field, type=AnnotationSourceType.METADATA)
-                        for field in input_fields
-                    ],
-                    message=(
-                        f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X})."
-                    ),
-                )
+                f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X})."
             ]
     return (errors, warnings)
 
@@ -236,18 +214,9 @@ def regex_error(
     )
 
 
-def missing_taxonomy_service_error(input_fields: list[str], output_field: str) -> ProcessingResult:
-    return ProcessingResult(
-        datum=None,
-        warnings=[],
-        errors=[
-            ProcessingAnnotation.from_fields(
-                input_fields,
-                [output_field],
-                AnnotationSourceType.METADATA,
-                message="Configuration error: taxonomy_service_url was None. Please contact the administrator.",
-            )
-        ],
+def missing_taxonomy_service_error(input_fields: list[str], output_field: str) -> RawProcessingResult:
+    return processing_error(
+        "Configuration error: taxonomy_service_url was None. Please contact the administrator."
     )
 
 
@@ -257,18 +226,9 @@ def taxonomy_network_error(
     e: Exception,
     input_fields: list[str],
     output_field: str,
-) -> ProcessingResult:
-    return ProcessingResult(
-        datum=None,
-        warnings=[],
-        errors=[
-            ProcessingAnnotation.from_fields(
-                input_fields,
-                [output_field],
-                AnnotationSourceType.METADATA,
-                message=f"Internal error: network error while {action} '{subject}': {e}. Please contact the administrator.",
-            )
-        ],
+) -> RawProcessingResult:
+    return processing_error(
+        f"Internal error: network error while {action} '{subject}': {e}. Please contact the administrator."
     )
 
 
@@ -406,6 +366,28 @@ def call_function(
                     )
                 ],
             )
+        if isinstance(result, RawProcessingResult):
+            return ProcessingResult(
+                datum=result.datum,
+                warnings=[
+                    ProcessingAnnotation.from_fields(
+                        input_fields,
+                        [output_field],
+                        AnnotationSourceType.METADATA,
+                        message=msg,
+                    )
+                    for msg in result.warnings
+                ],
+                errors=[
+                    ProcessingAnnotation.from_fields(
+                        input_fields,
+                        [output_field],
+                        AnnotationSourceType.METADATA,
+                        message=msg,
+                    )
+                    for msg in result.errors
+                ],
+            )
         if not isinstance(result, ProcessingResult):
             logger.error(
                 f"ERROR: Function {function_name} did not return ProcessingResult "
@@ -436,7 +418,7 @@ def check_date(
         output_field: str,
         input_fields: list[str],
         args: FunctionArgs,  # args is essential - even if Pylance says it's not used
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         """Check that date is complete YYYY-MM-DD
         If not according to format return error
         If in future, return warning
@@ -445,40 +427,18 @@ def check_date(
         date = input_data["date"]
 
         if not date:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[],
-            )
+            return RawProcessingResult()
 
-        warnings: list[ProcessingAnnotation] = []
-        errors: list[ProcessingAnnotation] = []
+        warnings: list[str] = []
         try:
             parsed_date = datetime.strptime(date, "%Y-%m-%d").astimezone(pytz.utc)
             if parsed_date > datetime.now(tz=pytz.utc):
-                warnings.append(
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message="Date is in the future.",
-                    )
-                )
-            return ProcessingResult(datum=date, warnings=warnings, errors=errors)
+                warnings.append("Date is in the future.")
+            return RawProcessingResult(datum=date, warnings=warnings)
         except ValueError as e:
-            error_message = (
-                f"Date is {date} which is not in the required format YYYY-MM-DD. Parsing error: {e}"
-            )
-            return ProcessingResult(
-                datum=None,
-                warnings=warnings,
+            return RawProcessingResult(
                 errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=error_message,
-                    )
+                    f"Date is {date} which is not in the required format YYYY-MM-DD. Parsing error: {e}"
                 ],
             )
 
@@ -488,7 +448,7 @@ def parse_date_into_range(  # noqa: C901, PLR0912, PLR0915
         output_field: str,
         input_fields: list[str],
         args: FunctionArgs,  # args is essential - even if Pylance says it's not used
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         """
         Parse date string (`input.date`) with input formats:
         - YYYY
@@ -516,36 +476,23 @@ def parse_date_into_range(  # noqa: C901, PLR0912, PLR0915
         try:
             submitted_at = datetime.fromtimestamp(float(str(args["submittedAt"])), tz=pytz.utc)
         except Exception:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=(
-                            f"Internal Error: Function parse_into_ranges did not receive valid "
-                            f"submittedAt date, with input {input_data} and args {args}, "
-                            "please contact the administrator."
-                        ),
-                    )
-                ],
+            return processing_error(
+                f"Internal Error: Function parse_into_ranges did not receive valid "
+                f"submittedAt date, with input {input_data} and args {args}, "
+                "please contact the administrator."
             )
 
         max_upper_limit = min(submitted_at, release_date) if release_date else submitted_at
 
         if not input_date_str:
-            return ProcessingResult(
+            return RawProcessingResult(
                 datum=max_upper_limit.strftime("%Y-%m-%d")
                 if args["fieldType"] == "dateRangeUpper"
                 else None,
-                warnings=[],
-                errors=[],
             )
 
-        warnings: list[ProcessingAnnotation] = []
-        errors: list[ProcessingAnnotation] = []
+        warnings: list[str] = []
+        errors: list[str] = []
 
         datum = convert_to_date_range(input_date_str)
 
@@ -566,18 +513,9 @@ def parse_date_into_range(  # noqa: C901, PLR0912, PLR0915
                 upper_date = convert_to_date_range(match.group(2))
 
                 if lower_date is None or upper_date is None:
-                    return ProcessingResult(
-                        datum=None,
-                        warnings=[],
-                        errors=[
-                            ProcessingAnnotation.from_fields(
-                                input_fields,
-                                [output_field],
-                                AnnotationSourceType.METADATA,
-                                message=f"Metadata field {output_field}: "
-                                f"Detected date range but could not parse date: {input_date_str}.",
-                            )
-                        ],
+                    return processing_error(
+                        f"Metadata field {output_field}: "
+                        f"Detected date range but could not parse date: {input_date_str}."
                     )
                 msg = None
                 if lower_date.message or upper_date.message:
@@ -601,27 +539,13 @@ def parse_date_into_range(  # noqa: C901, PLR0912, PLR0915
 
         if datum and datum.message:
             warnings.append(
-                ProcessingAnnotation.from_fields(
-                    input_fields,
-                    [output_field],
-                    AnnotationSourceType.METADATA,
-                    message=f"Metadata field {output_field}:'{input_date_str}' - " + datum.message,
-                )
+                f"Metadata field {output_field}:'{input_date_str}' - " + datum.message
             )
 
         if datum is None:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=f"Metadata field {output_field}: "
-                        f"Date {input_date_str} could not be parsed.",
-                    )
-                ],
+            return processing_error(
+                f"Metadata field {output_field}: "
+                f"Date {input_date_str} could not be parsed."
             )
 
         logger.debug(f"parsed_date: {datum}")
@@ -631,16 +555,9 @@ def parse_date_into_range(  # noqa: C901, PLR0912, PLR0915
                 f"Lower range of date: {datum.date_range_lower} > upper: {datum.date_range_upper}"
             )
             errors.append(
-                ProcessingAnnotation.from_fields(
-                    input_fields,
-                    [output_field],
-                    AnnotationSourceType.METADATA,
-                    message=(
-                        f"Metadata field {output_field}:'{input_date_str}' is an invalid date "
-                        f"range. Lower bound: {datum.date_range_lower} is after upper "
-                        f"bound: {datum.date_range_upper}."
-                    ),
-                )
+                f"Metadata field {output_field}:'{input_date_str}' is an invalid date "
+                f"range. Lower bound: {datum.date_range_lower} is after upper "
+                f"bound: {datum.date_range_upper}."
             )
 
         if datum.date_range_upper > max_upper_limit:
@@ -656,12 +573,7 @@ def parse_date_into_range(  # noqa: C901, PLR0912, PLR0915
                 f"Lower range of date: {datum.date_range_lower} > {datetime.now(tz=pytz.utc)}"
             )
             errors.append(
-                ProcessingAnnotation.from_fields(
-                    input_fields,
-                    [output_field],
-                    AnnotationSourceType.METADATA,
-                    message=(f"Metadata field {output_field}:'{input_date_str}' is in the future."),
-                )
+                f"Metadata field {output_field}:'{input_date_str}' is in the future."
             )
 
         if release_date and datum.date_range_lower and (datum.date_range_lower > release_date):
@@ -669,14 +581,7 @@ def parse_date_into_range(  # noqa: C901, PLR0912, PLR0915
                 f"Lower range of date: {datum.date_range_lower} > release_date: {release_date}"
             )
             errors.append(
-                ProcessingAnnotation.from_fields(
-                    input_fields,
-                    [output_field],
-                    AnnotationSourceType.METADATA,
-                    message=(
-                        f"Metadata field {output_field}:'{input_date_str}' is after release date."
-                    ),
-                )
+                f"Metadata field {output_field}:'{input_date_str}' is after release date."
             )
 
         match args["fieldType"]:
@@ -697,7 +602,7 @@ def parse_date_into_range(  # noqa: C901, PLR0912, PLR0915
                 msg = f"Config error: Unknown fieldType: {args['fieldType']}"
                 raise ValueError(msg)
 
-        return ProcessingResult(datum=return_value, warnings=warnings, errors=errors)
+        return RawProcessingResult(datum=return_value, warnings=warnings, errors=errors)
 
     @staticmethod
     def parse_and_assert_past_date(  # noqa: C901
@@ -705,7 +610,7 @@ def parse_and_assert_past_date(  # noqa: C901
         output_field,
         input_fields: list[str],
         args: FunctionArgs,  # args is essential - even if Pylance says it's not used
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         """Parse date string. If it's incomplete, add 01-01, if no year, return null and error
         input_data:
             date: str, date string to parse
@@ -714,11 +619,7 @@ def parse_and_assert_past_date(  # noqa: C901
         date_str = input_data["date"]
 
         if not date_str:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[],
-            )
+            return RawProcessingResult()
         release_date_str = input_data.get("release_date", "") or ""
         try:
             release_date = dateutil.parse(release_date_str)
@@ -731,8 +632,8 @@ def parse_and_assert_past_date(  # noqa: C901
             "%Y": "Month and day are missing. Assuming January 1st.",
         }
 
-        warnings = []
-        errors = []
+        warnings: list[str] = []
+        errors: list[str] = []
 
         for format, message in formats_to_messages.items():
             try:
@@ -749,54 +650,28 @@ def parse_and_assert_past_date(  # noqa: C901
 
                 if message:
                     warnings.append(
-                        ProcessingAnnotation.from_fields(
-                            input_fields,
-                            [output_field],
-                            AnnotationSourceType.METADATA,
-                            message=f"Metadata field {output_field}:'{date_str}' - " + message,
-                        )
+                        f"Metadata field {output_field}:'{date_str}' - " + message
                     )
 
                 if parsed_date > datetime.now(tz=pytz.utc):
                     logger.debug(f"parsed_date: {parsed_date} > {datetime.now(tz=pytz.utc)}")
                     errors.append(
-                        ProcessingAnnotation.from_fields(
-                            input_fields,
-                            [output_field],
-                            AnnotationSourceType.METADATA,
-                            message=f"Metadata field {output_field}:'{date_str}' is in the future.",
-                        )
+                        f"Metadata field {output_field}:'{date_str}' is in the future."
                     )
 
                 if release_date and parsed_date > release_date:
                     logger.debug(f"parsed_date: {parsed_date} > release_date: {release_date}")
                     errors.append(
-                        ProcessingAnnotation.from_fields(
-                            input_fields,
-                            [output_field],
-                            AnnotationSourceType.METADATA,
-                            message=(
-                                f"Metadata field {output_field}:'{date_str}'is after release date."
-                            ),
-                        )
+                        f"Metadata field {output_field}:'{date_str}'is after release date."
                     )
 
-                return ProcessingResult(datum=datum, warnings=warnings, errors=errors)
+                return RawProcessingResult(datum=datum, warnings=warnings, errors=errors)
             except ValueError:
                 continue
 
         # If all parsing attempts fail, it's an unrecognized format
-        return ProcessingResult(
-            datum=None,
-            warnings=[],
-            errors=[
-                ProcessingAnnotation.from_fields(
-                    input_fields,
-                    [output_field],
-                    AnnotationSourceType.METADATA,
-                    message=f"Metadata field {output_field}: Date format is not recognized.",
-                )
-            ],
+        return processing_error(
+            f"Metadata field {output_field}: Date format is not recognized."
         )
 
     @staticmethod
@@ -805,43 +680,20 @@ def parse_timestamp(
         output_field: str,
         input_fields: list[str],
         args: FunctionArgs,  # args is essential - even if Pylance says it's not used
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         """Parse a timestamp string, e.g. 2022-11-01T00:00:00Z and return a YYYY-MM-DD string"""
         timestamp = input_data["timestamp"]
 
         if not timestamp:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[],
-            )
+            return RawProcessingResult()
 
-        # Parse timestamp
-        warnings: list[ProcessingAnnotation] = []
-        errors: list[ProcessingAnnotation] = []
         try:
             parsed_timestamp = dateutil.parse(timestamp)
-            return ProcessingResult(
-                datum=parsed_timestamp.strftime("%Y-%m-%d"),
-                warnings=warnings,
-                errors=errors,
-            )
+            return RawProcessingResult(datum=parsed_timestamp.strftime("%Y-%m-%d"))
         except ValueError as e:
-            error_message = (
+            return processing_error(
                 f"Timestamp is {timestamp} which is not in parseable YYYY-MM-DD. Parsing error: {e}"
             )
-            return ProcessingResult(
-                datum=None,
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=error_message,
-                    )
-                ],
-                warnings=warnings,
-            )
 
     @staticmethod
     def concatenate(
@@ -849,31 +701,20 @@ def concatenate(
         output_field: str,
         input_fields: list[str],
         args: FunctionArgs,
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         """Concatenates input fields using the "/" separator in the order
         specified by the order argument. Optionally, a 'fallback_value' argument can be provided.
         This should be a string to use in place of metadata that is not available. If fallback_value
         is not provided, the empty string will be used in place of missing metadata.
         """
-        warnings: list[ProcessingAnnotation] = []
-        errors: list[ProcessingAnnotation] = []
+        warnings: list[str] = []
+        errors: list[str] = []
 
         if not isinstance(args["ACCESSION_VERSION"], str):
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=(
-                            "Internal Error: Function concatenate did not receive "
-                            f"accession_version ProcessingResult with input {input_data} "
-                            f"and args {args}, please contact the administrator."
-                        ),
-                    )
-                ],
+            return processing_error(
+                "Internal Error: Function concatenate did not receive "
+                f"accession_version ProcessingResult with input {input_data} "
+                f"and args {args}, please contact the administrator."
             )
 
         accession_version: str = args["ACCESSION_VERSION"]
@@ -883,28 +724,18 @@ def concatenate(
             str(args["fallback_value"]).strip() if args.get("fallback_value") is not None else ""
         )
 
-        def add_errors():
-            errors.append(
-                ProcessingAnnotation.from_fields(
-                    input_fields,
-                    [output_field],
-                    AnnotationSourceType.METADATA,
-                    message="Concatenation failed."
-                    "This may be a configuration error, please contact the administrator.",
-                )
-            )
+        error_msg = (
+            "Concatenation failed."
+            "This may be a configuration error, please contact the administrator."
+        )
 
         if not isinstance(order, list):
             logger.error(
                 f"Concatenate: Expected order field to be a list. "
                 f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})"
             )
-            add_errors()
-            return ProcessingResult(
-                datum=None,
-                warnings=warnings,
-                errors=errors,
-            )
+            errors.append(error_msg)
+            return RawProcessingResult(errors=errors)
 
         n_inputs = len(input_data.keys())
         # exclude ACCESSION_VERSION as it's provided by _call_preprocessing_function() and should not be an input_metadata field
@@ -916,23 +747,15 @@ def add_errors():
                 f"Concatenate: Expected {n_expected} fields, got {n_inputs}. "
                 f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})"
             )
-            add_errors()
-            return ProcessingResult(
-                datum=None,
-                warnings=warnings,
-                errors=errors,
-            )
+            errors.append(error_msg)
+            return RawProcessingResult(errors=errors)
         if not isinstance(field_types, list):
             logger.error(
                 f"Concatenate: Expected type field to be a list. "
                 f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})"
             )
-            add_errors()
-            return ProcessingResult(
-                datum=None,
-                warnings=warnings,
-                errors=errors,
-            )
+            errors.append(error_msg)
+            return RawProcessingResult(errors=errors)
 
         formatted_input_data: list[str] = []
         try:
@@ -955,7 +778,7 @@ def add_errors():
                     raw_value = str(raw).strip()
                     if raw_value.count("/") > 1:
                         date_string = None
-                        add_errors()
+                        errors.append(error_msg)
                     else:
                         date_string = raw_value.replace("/", " TO ")
                     formatted_input_data.append(
@@ -978,12 +801,8 @@ def add_errors():
                             f"Concatenate: Missing argument {field_types[i][4:]} in args. "
                             f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})"
                         )
-                        add_errors()
-                        return ProcessingResult(
-                            datum=None,
-                            warnings=warnings,
-                            errors=errors,
-                        )
+                        errors.append(error_msg)
+                        return RawProcessingResult(errors=errors, warnings=warnings)
                     formatted_input_data.append(str(args[field_types[i][4:]]))
                 elif order[i] in input_data:
                     formatted_input_data.append(
@@ -996,37 +815,22 @@ def add_errors():
                         f"Concatenate: cannot find field {order[i]} of {field_types[i]} in input_data"
                         f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})"
                     )
-                    add_errors()
-                    return ProcessingResult(
-                        datum=None,
-                        warnings=warnings,
-                        errors=errors,
-                    )
+                    errors.append(error_msg)
+                    return RawProcessingResult(errors=errors, warnings=warnings)
 
             result = "/".join(formatted_input_data)
             # To avoid downstream issues do not let the result start or end in a "/"
             # Also replace white space with '_'
             result = result.strip("/").replace(" ", "_")
 
-            return ProcessingResult(datum=result, warnings=warnings, errors=errors)
+            return RawProcessingResult(datum=result, warnings=warnings, errors=errors)
         except ValueError as e:
             logger.error(f"Concatenate failed with {e} (ACCESSION_VERSION: {accession_version})")
             errors.append(
-                ProcessingAnnotation.from_fields(
-                    input_fields,
-                    [output_field],
-                    AnnotationSourceType.METADATA,
-                    message=(
-                        f"Concatenation failed for {output_field}. This is a technical error, "
-                        "please contact the administrator."
-                    ),
-                )
-            )
-            return ProcessingResult(
-                datum=None,
-                errors=errors,
-                warnings=warnings,
+                f"Concatenation failed for {output_field}. This is a technical error, "
+                "please contact the administrator."
             )
+            return RawProcessingResult(errors=errors, warnings=warnings)
 
     @staticmethod
     def check_authors(
@@ -1034,7 +838,7 @@ def check_authors(
         output_field: str,
         input_fields: list[str],
         args: FunctionArgs,
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         authors = input_data["authors"]
 
         author_format_description = (
@@ -1045,57 +849,29 @@ def check_authors(
             "are allowed. For example: 'Smith, Anna; Perez, Tom J.; Xu, X.L.;' "
             "or 'Xu,;' if the first name is unknown."
         )
-        warnings: list[ProcessingAnnotation] = []
-        errors: list[ProcessingAnnotation] = []
+        warnings: list[str] = []
+        errors: list[str] = []
 
         if not authors:
-            return ProcessingResult(
-                datum=None,
-                warnings=warnings,
-                errors=errors,
-            )
+            return RawProcessingResult()
         errors, warnings = check_latin_characters(authors, input_fields, output_field)
         if errors or warnings:
-            return ProcessingResult(
-                datum=None,
-                warnings=warnings,
-                errors=errors,
-            )
+            return RawProcessingResult(warnings=warnings, errors=errors)
 
         if valid_authors(authors):
             formatted_authors = format_authors(authors)
             if warn_potentially_invalid_authors(authors):
-                warning_message = (
+                warnings.append(
                     "The authors list might not be using the Loculus format. "
                     + author_format_description
                 )
-                warnings.append(
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=warning_message,
-                    )
-                )
             if " and " in authors:
-                warning_message = (
+                warnings.append(
                     "Authors list contains 'and'. This may indicate a misformatted "
                     "authors list. Authors should always be separated by semi-colons only e.g. "
                     "`Smith, Anna; Perez, Tom J.; Xu, X.L.`."
                 )
-                warnings.append(
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=warning_message,
-                    )
-                )
-            return ProcessingResult(
-                datum=formatted_authors,
-                warnings=warnings,
-                errors=errors,
-            )
+            return RawProcessingResult(datum=formatted_authors, warnings=warnings)
         invalid_names = get_invalid_author_names(authors)
         if invalid_names:
             names_to_show = "; ".join(f"'{name}'" for name in invalid_names[:3])
@@ -1106,18 +882,7 @@ def check_authors(
             error_message = (
                 "The authors list is not in a recognized format. " + author_format_description
             )
-        return ProcessingResult(
-            datum=None,
-            errors=[
-                ProcessingAnnotation.from_fields(
-                    input_fields,
-                    [output_field],
-                    AnnotationSourceType.METADATA,
-                    message=error_message,
-                )
-            ],
-            warnings=warnings,
-        )
+        return RawProcessingResult(errors=[error_message], warnings=warnings)
 
     @staticmethod
     def extract_regex(
@@ -1125,7 +890,7 @@ def extract_regex(
         output_field: str,
         input_fields: list[str],
         args: FunctionArgs,
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         """
         Extracts a substring from the `regex_field` using the provided regex `pattern`
         with a `capture_group`, if `uppercase` is set to true the extracted value is capitalized.
@@ -1134,68 +899,41 @@ def extract_regex(
         """
         regex_field = input_data["regex_field"]
 
-        warnings: list[ProcessingAnnotation] = []
-        errors: list[ProcessingAnnotation] = []
+        errors: list[str] = []
 
         pattern = args.get("pattern")
         capture_group = args.get("capture_group")
         uppercase = args.get("uppercase", False)
 
         if not regex_field:
-            return ProcessingResult(datum=None, warnings=warnings, errors=errors)
+            return RawProcessingResult()
         if not isinstance(pattern, str):
-            errors.append(
-                ProcessingAnnotation.from_fields(
-                    input_fields,
-                    [output_field],
-                    AnnotationSourceType.METADATA,
-                    message=regex_error("extract_regex", "pattern", input_data, args),
-                )
+            return processing_error(
+                regex_error("extract_regex", "pattern", input_data, args)
             )
-            return ProcessingResult(datum=None, warnings=warnings, errors=errors)
         if not isinstance(capture_group, str):
-            errors.append(
-                ProcessingAnnotation.from_fields(
-                    input_fields,
-                    [output_field],
-                    AnnotationSourceType.METADATA,
-                    message=regex_error("extract_regex", "capture_group", input_data, args),
-                )
+            return processing_error(
+                regex_error("extract_regex", "capture_group", input_data, args)
             )
-            return ProcessingResult(datum=None, warnings=warnings, errors=errors)
         match = re.match(pattern, regex_field.strip())
         if match:
             try:
                 result = match.group(capture_group)
                 if result is not None and uppercase:
                     result = result.upper()
-                return ProcessingResult(datum=result, warnings=warnings, errors=errors)
+                return RawProcessingResult(datum=result)
             except IndexError:
                 errors.append(
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=(
-                            f"The pattern '{pattern}' does not contain a capture group: "
-                            f"'{capture_group}'- this is an internal error,"
-                            " please contact your local administrator."
-                        ),
-                    )
+                    f"The pattern '{pattern}' does not contain a capture group: "
+                    f"'{capture_group}'- this is an internal error,"
+                    " please contact your local administrator."
                 )
         else:
             errors.append(
-                ProcessingAnnotation.from_fields(
-                    input_fields,
-                    [output_field],
-                    AnnotationSourceType.METADATA,
-                    message=(
-                        f"The value '{regex_field}' does not match the expected regex "
-                        f"pattern: '{pattern}'."
-                    ),
-                )
+                f"The value '{regex_field}' does not match the expected regex "
+                f"pattern: '{pattern}'."
             )
-        return ProcessingResult(datum=None, warnings=warnings, errors=errors)
+        return RawProcessingResult(errors=errors)
 
     @staticmethod
     def check_regex(
@@ -1203,69 +941,41 @@ def check_regex(
         output_field: str,
         input_fields: list[str],
         args: FunctionArgs,
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         """
         Validates that the field regex_field matches the regex expression.
         If not return error
         """
         regex_field = input_data["regex_field"]
 
-        warnings: list[ProcessingAnnotation] = []
-        errors: list[ProcessingAnnotation] = []
-
         pattern = args["pattern"]
 
         if not regex_field:
-            return ProcessingResult(datum=None, warnings=warnings, errors=errors)
+            return RawProcessingResult()
         if not isinstance(pattern, str):
-            errors.append(
-                ProcessingAnnotation.from_fields(
-                    input_fields,
-                    [output_field],
-                    AnnotationSourceType.METADATA,
-                    message=regex_error("check_regex", "pattern", input_data, args),
-                )
+            return processing_error(
+                regex_error("check_regex", "pattern", input_data, args)
             )
-            return ProcessingResult(datum=None, warnings=warnings, errors=errors)
         regex_field = regex_field.strip()
         if re.match(pattern, regex_field):
-            return ProcessingResult(datum=regex_field, warnings=warnings, errors=errors)
-        errors.append(
-            ProcessingAnnotation.from_fields(
-                input_fields,
-                [output_field],
-                AnnotationSourceType.METADATA,
-                message=(
-                    f"The value '{regex_field}' does not match the expected regex "
-                    f"pattern: '{pattern}'."
-                ),
-            )
+            return RawProcessingResult(datum=regex_field)
+        return processing_error(
+            f"The value '{regex_field}' does not match the expected regex "
+            f"pattern: '{pattern}'."
         )
-        return ProcessingResult(datum=None, warnings=warnings, errors=errors)
 
     @staticmethod
     def identity(  # noqa: C901, PLR0912
         input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         """Identity function, takes input_data["input"] and returns it as output"""
         if "input" not in input_data:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=f"No data found for output field: {output_field}",
-                    )
-                ],
-            )
+            return processing_error(f"No data found for output field: {output_field}")
         input_datum = input_data["input"]
         if not input_datum:
-            return ProcessingResult(datum=None, warnings=[], errors=[])
+            return RawProcessingResult()
 
-        errors: list[ProcessingAnnotation] = []
+        errors: list[str] = []
         output_datum: ProcessedMetadataValue
         if args and "type" in args:
             match args["type"]:
@@ -1275,7 +985,7 @@ def identity(  # noqa: C901, PLR0912
                     except ValueError:
                         output_datum = None
                         errors.append(
-                            invalid_value_annotation(input_datum, output_field, input_fields, "int")
+                            f"Invalid int value: {input_datum} for field {output_field}."
                         )
                 case "float":
                     try:
@@ -1287,9 +997,7 @@ def identity(  # noqa: C901, PLR0912
                     except ValueError:
                         output_datum = None
                         errors.append(
-                            invalid_value_annotation(
-                                input_datum, output_field, input_fields, "float"
-                            )
+                            f"Invalid float value: {input_datum} for field {output_field}."
                         )
                 case "boolean":
                     if input_datum.lower() == "true":
@@ -1299,9 +1007,7 @@ def identity(  # noqa: C901, PLR0912
                     else:
                         output_datum = None
                         errors.append(
-                            invalid_value_annotation(
-                                input_datum, output_field, input_fields, "boolean"
-                            )
+                            f"Invalid boolean value: {input_datum} for field {output_field}."
                         )
                 case _:
                     if isinstance(input_datum, str):
@@ -1310,32 +1016,21 @@ def identity(  # noqa: C901, PLR0912
                         output_datum = input_datum
         else:
             output_datum = input_datum
-        return ProcessingResult(datum=output_datum, warnings=[], errors=errors)
+        return RawProcessingResult(datum=output_datum, errors=errors)
 
     @staticmethod
     def process_options(
         input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         """Checks that option is in options"""
         if "options" not in args or not isinstance(args["options"], list):
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=(
-                            "Website configuration error: no options list specified for field "
-                            f"{output_field}, please contact an administrator."
-                        ),
-                    )
-                ],
+            return processing_error(
+                "Website configuration error: no options list specified for field "
+                f"{output_field}, please contact an administrator."
             )
         input_datum = input_data["input"]
         if not input_datum:
-            return ProcessingResult(datum=None, warnings=[], errors=[])
+            return RawProcessingResult()
 
         output_datum: ProcessedMetadataValue
         standardized_input_datum = standardize_option(input_datum)
@@ -1350,102 +1045,47 @@ def process_options(
             output_datum = options[standardized_input_datum]
         # Allow ingested data to include fields not in options
         elif args["is_insdc_ingest_group"]:
-            return ProcessingResult(
-                datum=input_datum,
-                warnings=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=error_msg,
-                    )
-                ],
-                errors=[],
-            )
+            return RawProcessingResult(datum=input_datum, warnings=[error_msg])
         else:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=error_msg,
-                    )
-                ],
-            )
-        return ProcessingResult(datum=output_datum, warnings=[], errors=[])
+            return processing_error(error_msg)
+        return RawProcessingResult(datum=output_datum)
 
     @staticmethod
     def is_above_threshold(
         input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         """Flag if input value is above a threshold specified in args"""
         if "threshold" not in args:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=(
-                            f"Field {output_field} is missing threshold argument."
-                            " Please report this error to the administrator."
-                        ),
-                    )
-                ],
+            return processing_error(
+                f"Field {output_field} is missing threshold argument."
+                " Please report this error to the administrator."
             )
         input_datum = input_data["input"]
         if input_datum is None or (isinstance(input_datum, str) and not input_datum.strip()):
-            return ProcessingResult(datum=None, warnings=[], errors=[])
+            return RawProcessingResult()
         try:
             threshold = float(args["threshold"])  # type: ignore
             input = float(input_datum)
         except (ValueError, TypeError):
             msg = f"Field {output_field} has non-numeric threshold value."
             logger.error(msg)
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=(msg),
-                    )
-                ],
-            )
-        return ProcessingResult(datum=(input > threshold), warnings=[], errors=[])
+            return processing_error(msg)
+        return RawProcessingResult(datum=(input > threshold))
 
     @staticmethod
     def is_variant(
         input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         """Flag if number of mutations is above mutation rate (specified in args) times length"""
         if "mu" not in args:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=(
-                            f"Field {output_field} is missing mu argument."
-                            " Please report this error to the administrator."
-                        ),
-                    )
-                ],
+            return processing_error(
+                f"Field {output_field} is missing mu argument."
+                " Please report this error to the administrator."
             )
         length_datum = input_data.get("length")
         num_mutations_datum = input_data.get("numMutations")
         if length_datum is None or num_mutations_datum is None:
-            return ProcessingResult(datum=None, warnings=[], errors=[])
+            return RawProcessingResult()
         try:
             mu = float(args["mu"])  # type: ignore
             length = float(length_datum)
@@ -1457,21 +1097,10 @@ def is_variant(
                 args={"threshold": threshold},
             )
         except (ValueError, TypeError):
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=(
-                            f"Field {output_field} has non-numeric length or numMutations value."
-                        ),
-                    )
-                ],
+            return processing_error(
+                f"Field {output_field} has non-numeric length or numMutations value."
             )
-        return ProcessingResult(
+        return RawProcessingResult(
             datum=is_above_threshold_result.datum,
             warnings=is_above_threshold_result.warnings,
             errors=is_above_threshold_result.errors,
@@ -1480,7 +1109,7 @@ def is_variant(
     @staticmethod
     def assign_custom_lineage(  # noqa: C901
         input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         """
         Assign flu lineage based on seg4 and seg6.
         Add reassortant flag if different lineages are detected for internal segments,
@@ -1501,7 +1130,7 @@ def assign_custom_lineage(  # noqa: C901
             f"Starting custom lineage assignment with input_data: {input_data} and args: {args}"
         )
         if not input_data:
-            return ProcessingResult(datum=None, warnings=[], errors=[])
+            return RawProcessingResult()
         ha_subtype = input_data.get("subtype_seg4")
         na_subtype = input_data.get("subtype_seg6")
         references: dict[str, str | None] = {}
@@ -1532,7 +1161,7 @@ def assign_custom_lineage(  # noqa: C901
                 ).datum
             logger.debug(f"Extracted lineages: {extracted_lineages} from references: {references}")
             if not ha_subtype or not na_subtype:
-                return ProcessingResult(datum=None, warnings=[], errors=[])
+                return RawProcessingResult()
             lineage = f"{ha_subtype}{na_subtype}"
             if (
                 extracted_lineages.get("seg4") == "H1N1PDM"
@@ -1551,23 +1180,12 @@ def assign_custom_lineage(  # noqa: C901
                     lineage += " reassortant"
                 if any(v for v in variant.values() if v):
                     lineage += " (variant)"
-                return ProcessingResult(datum=lineage, warnings=[], errors=[])
+                return RawProcessingResult(datum=lineage)
         except (ValueError, TypeError):
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=(
-                            f"Internal error processing custom lineage for field {output_field}."
-                        ),
-                    )
-                ],
+            return processing_error(
+                f"Internal error processing custom lineage for field {output_field}."
             )
-        return ProcessingResult(datum=None, warnings=[], errors=[])
+        return RawProcessingResult()
 
     @staticmethod
     def build_display_name(  # noqa: C901
@@ -1575,7 +1193,7 @@ def build_display_name(  # noqa: C901
         output_field: str,
         input_fields: list[str],
         args: FunctionArgs,
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         """Builds a displayName from input_fields. The identifier field in the displayName is based on
         specimenCollectorSampleId or - if it is not set - submissionId.
 
@@ -1592,21 +1210,10 @@ def build_display_name(  # noqa: C901
         """
         collector_id = input_data.get("specimenCollectorSampleId", None)
         submission_id = input_data.get("submissionId", None)
-        warnings: list[ProcessingAnnotation] = []
+        warnings: list[str] = []
         if submission_id is None:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=(
-                            "Internal Error: 'submissionId' must not be None for build_display_name(). Please contact the administrator."
-                        ),
-                    )
-                ],
+            return processing_error(
+                "Internal Error: 'submissionId' must not be None for build_display_name(). Please contact the administrator."
             )
 
         order = args.get("order")
@@ -1617,19 +1224,8 @@ def build_display_name(  # noqa: C901
             or len(order) != len(field_types)
             or "IDENTIFIER" not in order
         ):
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=(
-                            "Internal Error: 'order' and 'type' must be lists of equal length, and 'order' must contain IDENTIFIER - this is required for build_display_name to function. Please contact the administrator."
-                        ),
-                    )
-                ],
+            return processing_error(
+                "Internal Error: 'order' and 'type' must be lists of equal length, and 'order' must contain IDENTIFIER - this is required for build_display_name to function. Please contact the administrator."
             )
 
         regex_pattern = args.get("regex_pattern")
@@ -1637,19 +1233,8 @@ def build_display_name(  # noqa: C901
             regex_pattern is not None
             and "identifier" not in re.compile(str(regex_pattern)).groupindex
         ):
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=(
-                            "Internal Error: if provided, 'regex_pattern' must contain a named capture group called 'identifier'"
-                        ),
-                    )
-                ],
+            return processing_error(
+                "Internal Error: if provided, 'regex_pattern' must contain a named capture group called 'identifier'"
             )
 
         concatenate_order = order.copy()
@@ -1680,14 +1265,7 @@ def replace_identifier(values, replacement):
                 if extract_result.datum is None:
                     # regex extraction of ID field failed, fall back to ACCESSION_VERSION
                     warnings.append(
-                        ProcessingAnnotation.from_fields(
-                            input_fields,
-                            [output_field],
-                            AnnotationSourceType.METADATA,
-                            message=(
-                                f"identifier string '{identifier}' could not be parsed, using ACCESSION_VERSION in displayName instead"
-                            ),
-                        )
+                        f"identifier string '{identifier}' could not be parsed, using ACCESSION_VERSION in displayName instead"
                     )
                 identifier = extract_result.datum
 
@@ -1717,7 +1295,7 @@ def replace_identifier(values, replacement):
             new_args,
         )
 
-        return ProcessingResult(
+        return RawProcessingResult(
             datum=concat_result.datum,
             warnings=warnings + concat_result.warnings,
             errors=concat_result.errors,
@@ -1729,7 +1307,7 @@ def resolve_host_taxon_id(
         output_field: str,
         input_fields: list[str],
         args: FunctionArgs,
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         """Validates that the host exists
         in NCBI's taxonomy. Checks either the hostTaxonId or the
         hostNameScientific, depending on the input.
@@ -1746,11 +1324,7 @@ def resolve_host_taxon_id(
 
         unvalidated_host = input_data.get("host")
         if not unvalidated_host:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[],
-            )
+            return RawProcessingResult()
 
         if unvalidated_host.isdigit():
             url = f"{tax_service}/taxa/{unvalidated_host}"
@@ -1768,13 +1342,8 @@ def resolve_host_taxon_id(
 
         if response.status_code != requests.codes.ok:
             # an invalid host organism is a warning for INSDC ingested sequences, but an error for everyone else
-            message = ProcessingAnnotation.from_fields(
-                input_fields,
-                [output_field],
-                AnnotationSourceType.METADATA,
-                message=f"Host validation for '{unvalidated_host}' failed with code {response.status_code}: {body.get('detail', '')}",
-            )
-            return ProcessingResult(
+            message = f"Host validation for '{unvalidated_host}' failed with code {response.status_code}: {body.get('detail', '')}"
+            return RawProcessingResult(
                 datum=None,
                 warnings=[message] if args["is_insdc_ingest_group"] else [],
                 errors=[message] if not args["is_insdc_ingest_group"] else [],
@@ -1788,26 +1357,13 @@ def resolve_host_taxon_id(
 
         tax_id = taxon.get("tax_id")
         if tax_id is None:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=f"Internal error: host validation for '{unvalidated_host}' "
-                        f"was successful but response json 'tax_id' was missing. "
-                        f"Please contact the administrator",
-                    )
-                ],
+            return processing_error(
+                f"Internal error: host validation for '{unvalidated_host}' "
+                f"was successful but response json 'tax_id' was missing. "
+                f"Please contact the administrator"
             )
 
-        return ProcessingResult(
-            datum=str(tax_id),
-            warnings=[],
-            errors=[],
-        )
+        return RawProcessingResult(datum=str(tax_id))
 
     @staticmethod
     def scientific_name_from_id(
@@ -1815,17 +1371,17 @@ def scientific_name_from_id(
         output_field: str,
         input_fields: list[str],
         args: FunctionArgs,
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         tax_service = args.get("taxonomy_service_url")
         if not tax_service:
             return missing_taxonomy_service_error(input_fields, output_field)
 
         tax_id: str | None = input_data.get("hostTaxonId")
         if not tax_id:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[],
+            return RawProcessingResult(
+                datum=input_data.get("hostNameScientific")
+                if args["is_insdc_ingest_group"]
+                else None,
             )
 
         url = f"{tax_service}/taxa/{tax_id}"
@@ -1838,38 +1394,19 @@ def scientific_name_from_id(
         if response.status_code != requests.codes.ok:
             message = f"Could not map '{tax_id}' to scientific name. Code {response.status_code}: {body.get('detail', '')}"
             logger.warning(message)
-            processing_annotation = ProcessingAnnotation.from_fields(
-                input_fields,
-                [output_field],
-                AnnotationSourceType.METADATA,
-                message=message,
-            )
-            return ProcessingResult(
+            return RawProcessingResult(
                 datum=None,
-                warnings=[processing_annotation] if args["is_insdc_ingest_group"] else [],
-                errors=[processing_annotation] if not args["is_insdc_ingest_group"] else [],
+                warnings=[message] if args["is_insdc_ingest_group"] else [],
+                errors=[message] if not args["is_insdc_ingest_group"] else [],
             )
 
         scientific_name = body.get("scientific_name")
         if scientific_name is None:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=f"Internal error: '{tax_id}' is a valid taxon ID but response json had no 'scientific_name'. Please contact the administrator",
-                    )
-                ],
+            return processing_error(
+                f"Internal error: '{tax_id}' is a valid taxon ID but response json had no 'scientific_name'. Please contact the administrator"
             )
 
-        return ProcessingResult(
-            datum=scientific_name,
-            warnings=[],
-            errors=[],
-        )
+        return RawProcessingResult(datum=scientific_name)
 
     @staticmethod
     def common_name_from_id(
@@ -1877,18 +1414,14 @@ def common_name_from_id(
         output_field: str,
         input_fields: list[str],
         args: FunctionArgs,
-    ) -> ProcessingResult:
+    ) -> RawProcessingResult:
         tax_service = args.get("taxonomy_service_url")
         if not tax_service:
             return missing_taxonomy_service_error(input_fields, output_field)
 
         tax_id: str | None = input_data.get("hostTaxonId")
         if not tax_id:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[],
-            )
+            return RawProcessingResult()
 
         url = f"{tax_service}/taxa/{tax_id}?find_common_name=true"
         try:
@@ -1900,39 +1433,19 @@ def common_name_from_id(
             )
 
         if response.status_code != requests.codes.ok:
-            return ProcessingResult(
-                datum=None,
+            return RawProcessingResult(
                 warnings=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=f"Could not map '{tax_id}' to common name. Code {response.status_code}: {body.get('detail', '')}",
-                    )
+                    f"Could not map '{tax_id}' to common name. Code {response.status_code}: {body.get('detail', '')}"
                 ],
-                errors=[],
             )
 
         common_name = body.get("common_name")
         if common_name is None:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=f"Internal error: taxonomy service indicated common name was found for hostTaxonId '{tax_id}', but failed to return it. Please contact the administrator.",
-                    )
-                ],
+            return processing_error(
+                f"Internal error: taxonomy service indicated common name was found for hostTaxonId '{tax_id}', but failed to return it. Please contact the administrator."
             )
 
-        return ProcessingResult(
-            datum=common_name,
-            warnings=[],
-            errors=[],
-        )
+        return RawProcessingResult(datum=common_name)
 
 
 def single_metadata_annotation(
diff --git a/preprocessing/nextclade/tests/test_assign_custom_lineage.py b/preprocessing/nextclade/tests/test_assign_custom_lineage.py
index 8ffec69a15..36974815b3 100644
--- a/preprocessing/nextclade/tests/test_assign_custom_lineage.py
+++ b/preprocessing/nextclade/tests/test_assign_custom_lineage.py
@@ -234,7 +234,7 @@ def test_missing_mu_arg_returns_error():
         )
         assert result.datum is None
         assert len(result.errors) == 1
-        assert "missing mu argument" in result.errors[0].message
+        assert "missing mu argument" in result.errors[0]
 
     @staticmethod
     def test_non_numeric_inputs_return_error():
diff --git a/preprocessing/nextclade/tests/test_metadata_processing_functions.py b/preprocessing/nextclade/tests/test_metadata_processing_functions.py
index d17ae42244..a03c3b2826 100644
--- a/preprocessing/nextclade/tests/test_metadata_processing_functions.py
+++ b/preprocessing/nextclade/tests/test_metadata_processing_functions.py
@@ -1109,7 +1109,6 @@ def test_parse_date_into_range() -> None:
             },
         )
         .errors[0]
-        .message
         == "Metadata field field_name: Detected date range but could not parse date: 20-01-2020/2021-06-30."
     ), "Invalid date range format errors."
     assert (
@@ -1123,7 +1122,6 @@ def test_parse_date_into_range() -> None:
             },
         )
         .errors[0]
-        .message
         == "Metadata field field_name:'2022-01-01/2021-06-30' is an invalid date range. Lower bound: 2022-01-01 00:00:00+00:00 is after upper bound: 2021-06-30 00:00:00+00:00."
     ), "Invalid date range format errors."
     assert (
@@ -1362,7 +1360,7 @@ def args_insdc():
     assert res.datum == "DENV-1/unknown/version.1/2025"
     assert len(res.warnings) == 1
     assert (
-        res.warnings[0].message
+        res.warnings[0]
         == "identifier string 'hDENV1/myExtractedSample/2025' could not be parsed, using ACCESSION_VERSION in displayName instead"
     )
     assert res_insdc.datum == "DENV-1/unknown/version.1/2025"
@@ -1370,7 +1368,7 @@ def args_insdc():
     assert res_prefix.datum == "hYF/unknown/version.1/2025"
     assert len(res_prefix.warnings) == 1
     assert (
-        res_prefix.warnings[0].message
+        res_prefix.warnings[0]
         == "identifier string 'hDENV1/myExtractedSample/2025' could not be parsed, using ACCESSION_VERSION in displayName instead"
     )
 
@@ -1396,7 +1394,7 @@ def args_insdc():
     assert res.datum == "DENV-1/another_fallback/version.1/2025"
     assert len(res.warnings) == 1
     assert (
-        res.warnings[0].message
+        res.warnings[0]
         == "identifier string 'hDENV1/myExtractedSample/2025' could not be parsed, using ACCESSION_VERSION in displayName instead"
     )
     assert res_insdc.datum == "DENV-1/another_fallback/version.1/2025"
@@ -1404,7 +1402,7 @@ def args_insdc():
     assert res_prefix.datum == "hYF/another_fallback/version.1/2025"
     assert len(res_prefix.warnings) == 1
     assert (
-        res_prefix.warnings[0].message
+        res_prefix.warnings[0]
         == "identifier string 'hDENV1/myExtractedSample/2025' could not be parsed, using ACCESSION_VERSION in displayName instead"
     )
 

From bcbf40617c2446f977d414363ce2347441fb961c Mon Sep 17 00:00:00 2001
From: anna-parker <50943381+anna-parker@users.noreply.github.com>
Date: Tue, 30 Jun 2026 09:34:15 +0200
Subject: [PATCH 03/18] format

---
 .../processing_functions.py                   | 53 ++++++-------------
 .../test_metadata_processing_functions.py     |  6 +--
 2 files changed, 18 insertions(+), 41 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index 3adb56c556..78b7f34511 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -162,9 +162,7 @@ def check_latin_characters(
         if ord(char) < 128:
             continue
         if char.isalpha() and not (0x0000 <= ord(char) <= 0x024F):
-            errors = [
-                f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X})."
-            ]
+            errors = [f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X})."]
     return (errors, warnings)
 
 
@@ -214,7 +212,9 @@ def regex_error(
     )
 
 
-def missing_taxonomy_service_error(input_fields: list[str], output_field: str) -> RawProcessingResult:
+def missing_taxonomy_service_error(
+    input_fields: list[str], output_field: str
+) -> RawProcessingResult:
     return processing_error(
         "Configuration error: taxonomy_service_url was None. Please contact the administrator."
     )
@@ -538,14 +538,11 @@ def parse_date_into_range(  # noqa: C901, PLR0912, PLR0915
                 )
 
         if datum and datum.message:
-            warnings.append(
-                f"Metadata field {output_field}:'{input_date_str}' - " + datum.message
-            )
+            warnings.append(f"Metadata field {output_field}:'{input_date_str}' - " + datum.message)
 
         if datum is None:
             return processing_error(
-                f"Metadata field {output_field}: "
-                f"Date {input_date_str} could not be parsed."
+                f"Metadata field {output_field}: Date {input_date_str} could not be parsed."
             )
 
         logger.debug(f"parsed_date: {datum}")
@@ -572,9 +569,7 @@ def parse_date_into_range(  # noqa: C901, PLR0912, PLR0915
             logger.debug(
                 f"Lower range of date: {datum.date_range_lower} > {datetime.now(tz=pytz.utc)}"
             )
-            errors.append(
-                f"Metadata field {output_field}:'{input_date_str}' is in the future."
-            )
+            errors.append(f"Metadata field {output_field}:'{input_date_str}' is in the future.")
 
         if release_date and datum.date_range_lower and (datum.date_range_lower > release_date):
             logger.debug(
@@ -649,15 +644,11 @@ def parse_and_assert_past_date(  # noqa: C901
                 logger.debug(f"parsed_date: {parsed_date}")
 
                 if message:
-                    warnings.append(
-                        f"Metadata field {output_field}:'{date_str}' - " + message
-                    )
+                    warnings.append(f"Metadata field {output_field}:'{date_str}' - " + message)
 
                 if parsed_date > datetime.now(tz=pytz.utc):
                     logger.debug(f"parsed_date: {parsed_date} > {datetime.now(tz=pytz.utc)}")
-                    errors.append(
-                        f"Metadata field {output_field}:'{date_str}' is in the future."
-                    )
+                    errors.append(f"Metadata field {output_field}:'{date_str}' is in the future.")
 
                 if release_date and parsed_date > release_date:
                     logger.debug(f"parsed_date: {parsed_date} > release_date: {release_date}")
@@ -670,9 +661,7 @@ def parse_and_assert_past_date(  # noqa: C901
                 continue
 
         # If all parsing attempts fail, it's an unrecognized format
-        return processing_error(
-            f"Metadata field {output_field}: Date format is not recognized."
-        )
+        return processing_error(f"Metadata field {output_field}: Date format is not recognized.")
 
     @staticmethod
     def parse_timestamp(
@@ -908,13 +897,9 @@ def extract_regex(
         if not regex_field:
             return RawProcessingResult()
         if not isinstance(pattern, str):
-            return processing_error(
-                regex_error("extract_regex", "pattern", input_data, args)
-            )
+            return processing_error(regex_error("extract_regex", "pattern", input_data, args))
         if not isinstance(capture_group, str):
-            return processing_error(
-                regex_error("extract_regex", "capture_group", input_data, args)
-            )
+            return processing_error(regex_error("extract_regex", "capture_group", input_data, args))
         match = re.match(pattern, regex_field.strip())
         if match:
             try:
@@ -930,8 +915,7 @@ def extract_regex(
                 )
         else:
             errors.append(
-                f"The value '{regex_field}' does not match the expected regex "
-                f"pattern: '{pattern}'."
+                f"The value '{regex_field}' does not match the expected regex pattern: '{pattern}'."
             )
         return RawProcessingResult(errors=errors)
 
@@ -953,15 +937,12 @@ def check_regex(
         if not regex_field:
             return RawProcessingResult()
         if not isinstance(pattern, str):
-            return processing_error(
-                regex_error("check_regex", "pattern", input_data, args)
-            )
+            return processing_error(regex_error("check_regex", "pattern", input_data, args))
         regex_field = regex_field.strip()
         if re.match(pattern, regex_field):
             return RawProcessingResult(datum=regex_field)
         return processing_error(
-            f"The value '{regex_field}' does not match the expected regex "
-            f"pattern: '{pattern}'."
+            f"The value '{regex_field}' does not match the expected regex pattern: '{pattern}'."
         )
 
     @staticmethod
@@ -984,9 +965,7 @@ def identity(  # noqa: C901, PLR0912
                         output_datum = int(input_datum)
                     except ValueError:
                         output_datum = None
-                        errors.append(
-                            f"Invalid int value: {input_datum} for field {output_field}."
-                        )
+                        errors.append(f"Invalid int value: {input_datum} for field {output_field}.")
                 case "float":
                     try:
                         output_datum = float(input_datum)
diff --git a/preprocessing/nextclade/tests/test_metadata_processing_functions.py b/preprocessing/nextclade/tests/test_metadata_processing_functions.py
index a03c3b2826..9e6383e63d 100644
--- a/preprocessing/nextclade/tests/test_metadata_processing_functions.py
+++ b/preprocessing/nextclade/tests/test_metadata_processing_functions.py
@@ -1107,8 +1107,7 @@ def test_parse_date_into_range() -> None:
                 "fieldType": "dateRangeString",
                 "submittedAt": ts_from_ymd(2022, 1, 1),
             },
-        )
-        .errors[0]
+        ).errors[0]
         == "Metadata field field_name: Detected date range but could not parse date: 20-01-2020/2021-06-30."
     ), "Invalid date range format errors."
     assert (
@@ -1120,8 +1119,7 @@ def test_parse_date_into_range() -> None:
                 "fieldType": "dateRangeString",
                 "submittedAt": ts_from_ymd(2022, 1, 1),
             },
-        )
-        .errors[0]
+        ).errors[0]
         == "Metadata field field_name:'2022-01-01/2021-06-30' is an invalid date range. Lower bound: 2022-01-01 00:00:00+00:00 is after upper bound: 2021-06-30 00:00:00+00:00."
     ), "Invalid date range format errors."
     assert (

From 9c7eb5dbedec48951888a3548b73054f1e88e1fb Mon Sep 17 00:00:00 2001
From: anna-parker <50943381+anna-parker@users.noreply.github.com>
Date: Tue, 30 Jun 2026 09:39:06 +0200
Subject: [PATCH 04/18] merge conflict

---
 .../src/loculus_preprocessing/processing_functions.py         | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index 78b7f34511..9153bd3e38 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -1358,9 +1358,7 @@ def scientific_name_from_id(
         tax_id: str | None = input_data.get("hostTaxonId")
         if not tax_id:
             return RawProcessingResult(
-                datum=input_data.get("hostNameScientific")
-                if args["is_insdc_ingest_group"]
-                else None,
+                datum=None,
             )
 
         url = f"{tax_service}/taxa/{tax_id}"

From 10eb6e493a0137eec13cc4bbade120af5f92cac4 Mon Sep 17 00:00:00 2001
From: anna-parker <50943381+anna-parker@users.noreply.github.com>
Date: Tue, 30 Jun 2026 09:44:25 +0200
Subject: [PATCH 05/18] fixup

---
 .../nextclade/src/loculus_preprocessing/processing_functions.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index 9153bd3e38..80765179f8 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -838,8 +838,6 @@ def check_authors(
             "are allowed. For example: 'Smith, Anna; Perez, Tom J.; Xu, X.L.;' "
             "or 'Xu,;' if the first name is unknown."
         )
-        warnings: list[str] = []
-        errors: list[str] = []
 
         if not authors:
             return RawProcessingResult()

From 8ecdff0f72f2518a62a16e7787fa12f88705189d Mon Sep 17 00:00:00 2001
From: anna-parker <50943381+anna-parker@users.noreply.github.com>
Date: Tue, 30 Jun 2026 09:49:18 +0200
Subject: [PATCH 06/18] clean up more

---
 .../processing_functions.py                   | 28 ++++++-------------
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index 80765179f8..d821f9ecc1 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -150,9 +150,7 @@ def reformat_authors_from_latin_to_ascii(authors: str) -> str:
     return unicodedata.normalize("NFKD", authors).encode("ascii", "ignore").decode("ascii")
 
 
-def check_latin_characters(
-    authors: str, input_fields: list[str], output_field: str
-) -> tuple[list[str], list[str]]:
+def check_latin_characters(authors: str) -> tuple[list[str], list[str]]:
     warnings: list[str] = []
     errors: list[str] = []
     # Check if all characters in the authors string are Latin letters or spaces
@@ -212,9 +210,7 @@ def regex_error(
     )
 
 
-def missing_taxonomy_service_error(
-    input_fields: list[str], output_field: str
-) -> RawProcessingResult:
+def missing_taxonomy_service_error() -> RawProcessingResult:
     return processing_error(
         "Configuration error: taxonomy_service_url was None. Please contact the administrator."
     )
@@ -224,8 +220,6 @@ def taxonomy_network_error(
     subject: str,
     action: str,
     e: Exception,
-    input_fields: list[str],
-    output_field: str,
 ) -> RawProcessingResult:
     return processing_error(
         f"Internal error: network error while {action} '{subject}': {e}. Please contact the administrator."
@@ -841,7 +835,7 @@ def check_authors(
 
         if not authors:
             return RawProcessingResult()
-        errors, warnings = check_latin_characters(authors, input_fields, output_field)
+        errors, warnings = check_latin_characters(authors)
         if errors or warnings:
             return RawProcessingResult(warnings=warnings, errors=errors)
 
@@ -1297,7 +1291,7 @@ def resolve_host_taxon_id(
         """
         tax_service = args.get("taxonomy_service_url")
         if not tax_service:
-            return missing_taxonomy_service_error(input_fields, output_field)
+            return missing_taxonomy_service_error()
 
         unvalidated_host = input_data.get("host")
         if not unvalidated_host:
@@ -1313,9 +1307,7 @@ def resolve_host_taxon_id(
             response = taxonomy_cache.get_or_fetch(url)
             body = response.json()
         except requests.exceptions.RequestException as e:
-            return taxonomy_network_error(
-                unvalidated_host, "validating", e, input_fields, output_field
-            )
+            return taxonomy_network_error(unvalidated_host, "validating", e)
 
         if response.status_code != requests.codes.ok:
             # an invalid host organism is a warning for INSDC ingested sequences, but an error for everyone else
@@ -1351,7 +1343,7 @@ def scientific_name_from_id(
     ) -> RawProcessingResult:
         tax_service = args.get("taxonomy_service_url")
         if not tax_service:
-            return missing_taxonomy_service_error(input_fields, output_field)
+            return missing_taxonomy_service_error()
 
         tax_id: str | None = input_data.get("hostTaxonId")
         if not tax_id:
@@ -1364,7 +1356,7 @@ def scientific_name_from_id(
             response = taxonomy_cache.get_or_fetch(url)
             body = response.json()
         except requests.exceptions.RequestException as e:
-            return taxonomy_network_error(tax_id, "validating", e, input_fields, output_field)
+            return taxonomy_network_error(tax_id, "validating", e)
 
         if response.status_code != requests.codes.ok:
             message = f"Could not map '{tax_id}' to scientific name. Code {response.status_code}: {body.get('detail', '')}"
@@ -1392,7 +1384,7 @@ def common_name_from_id(
     ) -> RawProcessingResult:
         tax_service = args.get("taxonomy_service_url")
         if not tax_service:
-            return missing_taxonomy_service_error(input_fields, output_field)
+            return missing_taxonomy_service_error()
 
         tax_id: str | None = input_data.get("hostTaxonId")
         if not tax_id:
@@ -1403,9 +1395,7 @@ def common_name_from_id(
             response = taxonomy_cache.get_or_fetch(url)
             body = response.json()
         except requests.exceptions.RequestException as e:
-            return taxonomy_network_error(
-                tax_id, "getting common name for", e, input_fields, output_field
-            )
+            return taxonomy_network_error(tax_id, "getting common name for", e)
 
         if response.status_code != requests.codes.ok:
             return RawProcessingResult(

From 0d0023f392f5040fc439b487b4a832e6da6558b6 Mon Sep 17 00:00:00 2001
From: anna-parker <50943381+anna-parker@users.noreply.github.com>
Date: Tue, 30 Jun 2026 09:54:32 +0200
Subject: [PATCH 07/18] feat: add test for conversion

Co-authored-by: Cornelius Roemer <cornelius.roemer@gmail.com>
---
 .../tests/test_host_name_validation.py        | 40 ++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/preprocessing/nextclade/tests/test_host_name_validation.py b/preprocessing/nextclade/tests/test_host_name_validation.py
index d97cdbe97e..9f8fd81588 100644
--- a/preprocessing/nextclade/tests/test_host_name_validation.py
+++ b/preprocessing/nextclade/tests/test_host_name_validation.py
@@ -6,8 +6,14 @@
 
 from loculus_preprocessing import processing_functions
 from loculus_preprocessing.config import get_config
-from loculus_preprocessing.datatypes import UnprocessedData, UnprocessedEntry
+from loculus_preprocessing.datatypes import (
+    AnnotationSource,
+    AnnotationSourceType,
+    UnprocessedData,
+    UnprocessedEntry,
+)
 from loculus_preprocessing.prepro import process_all
+from loculus_preprocessing.processing_functions import ProcessingFunctions
 
 HOST_PROCESSING_CONFIG = "tests/host_processing_config.yaml"
 
@@ -188,3 +194,35 @@ def test_host_processing_invalid_host_direct(mock_session: MagicMock) -> None:
     assert result[0].processed_entry.warnings == []
     assert len(result[0].processed_entry.errors) == 1
     assert "Host validation for" in result[0].processed_entry.errors[0].message
+
+
+@patch.object(processing_functions.taxonomy_cache, "session")
+def test_call_function_converts_raw_errors_to_annotations(mock_session: MagicMock) -> None:
+    """call_function must convert RawProcessingResult string errors into ProcessingAnnotations
+    with correct message and field linkage."""
+    mock_session.get.return_value = make_response(404, {"detail": "not found"})
+
+    input_fields = ["host"]
+    output_field = "hostTaxonId"
+    args = {"taxonomy_service_url": "http://localhost:5000", "is_insdc_ingest_group": False}
+
+    result = ProcessingFunctions.call_function(
+        "resolve_host_taxon_id",
+        args=args,
+        input_data={"host": "not a real species"},
+        output_field=output_field,
+        input_fields=input_fields,
+    )
+
+    assert result.datum is None
+    assert result.warnings == []
+    assert len(result.errors) == 1
+
+    annotation = result.errors[0]
+    assert "Host validation for" in annotation.message
+    assert annotation.processedFields == (
+        AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA),
+    )
+    assert annotation.unprocessedFields == (
+        AnnotationSource(name="host", type=AnnotationSourceType.METADATA),
+    )

From b10a363fac6e423349c35704aae6430c5928abb4 Mon Sep 17 00:00:00 2001
From: anna-parker <50943381+anna-parker@users.noreply.github.com>
Date: Tue, 30 Jun 2026 10:42:16 +0200
Subject: [PATCH 08/18] fixup

---
 preprocessing/nextclade/tests/test_host_name_validation.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/preprocessing/nextclade/tests/test_host_name_validation.py b/preprocessing/nextclade/tests/test_host_name_validation.py
index 9f8fd81588..d84b53f662 100644
--- a/preprocessing/nextclade/tests/test_host_name_validation.py
+++ b/preprocessing/nextclade/tests/test_host_name_validation.py
@@ -9,6 +9,7 @@
 from loculus_preprocessing.datatypes import (
     AnnotationSource,
     AnnotationSourceType,
+    FunctionArgs,
     UnprocessedData,
     UnprocessedEntry,
 )
@@ -204,7 +205,10 @@ def test_call_function_converts_raw_errors_to_annotations(mock_session: MagicMoc
 
     input_fields = ["host"]
     output_field = "hostTaxonId"
-    args = {"taxonomy_service_url": "http://localhost:5000", "is_insdc_ingest_group": False}
+    args: FunctionArgs = {
+        "taxonomy_service_url": "http://localhost:5000",
+        "is_insdc_ingest_group": False,
+    }
 
     result = ProcessingFunctions.call_function(
         "resolve_host_taxon_id",

From 62cf0a34da012328de0f5cb226e866514da6dd05 Mon Sep 17 00:00:00 2001
From: anna-parker <50943381+anna-parker@users.noreply.github.com>
Date: Tue, 30 Jun 2026 11:13:29 +0200
Subject: [PATCH 09/18] append

---
 .../nextclade/src/loculus_preprocessing/processing_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index d821f9ecc1..f8b1c912ff 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -160,7 +160,7 @@ def check_latin_characters(authors: str) -> tuple[list[str], list[str]]:
         if ord(char) < 128:
             continue
         if char.isalpha() and not (0x0000 <= ord(char) <= 0x024F):
-            errors = [f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X})."]
+            errors.append(f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X}).")
     return (errors, warnings)
 
 
From 0f040ceb3021cbf1c7ef5eab82717e4d34fda67a Mon Sep 17 00:00:00 2001
From: anna-parker <50943381+anna-parker@users.noreply.github.com>
Date: Tue, 30 Jun 2026 11:15:24 +0200
Subject: [PATCH 10/18] format

---
 .../src/loculus_preprocessing/processing_functions.py         | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index f8b1c912ff..1fb74ced7c 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -160,7 +160,9 @@ def check_latin_characters(authors: str) -> tuple[list[str], list[str]]:
         if ord(char) < 128:
             continue
         if char.isalpha() and not (0x0000 <= ord(char) <= 0x024F):
-            errors.append(f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X}).")
+            errors.append(
+                f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X})."
+            )
     return (errors, warnings)
 
 
From ee958cd43d9dc2c61d3e4022c48a95725a519659 Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Tue, 30 Jun 2026 11:59:11 +0200
Subject: [PATCH 11/18] Update
 preprocessing/nextclade/src/loculus_preprocessing/datatypes.py

Co-authored-by: maverbiest <59956362+maverbiest@users.noreply.github.com>
---
 .../nextclade/src/loculus_preprocessing/datatypes.py         | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
index 7004f81354..ffc183afca 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
@@ -195,11 +195,6 @@ def processing_error(message: str) -> "RawProcessingResult":
     return RawProcessingResult(datum=None, errors=[message])
 
 
-def processing_warning(message: str, datum: ProcessedMetadataValue = None) -> "RawProcessingResult":
-    """Helper to create a RawProcessingResult with a single warning."""
-    return RawProcessingResult(datum=datum, warnings=[message])
-
-
 @unique
 class SegmentClassificationMethod(StrEnum):
     """

From 92b72b360254e5baaf93151d654da660fd41f9e1 Mon Sep 17 00:00:00 2001
From: anna-parker <50943381+anna-parker@users.noreply.github.com>
Date: Tue, 30 Jun 2026 12:06:07 +0200
Subject: [PATCH 12/18] fix up

---
 .../src/loculus_preprocessing/processing_functions.py       | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index 1fb74ced7c..48ee20ad94 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -432,10 +432,8 @@ def check_date(
                 warnings.append("Date is in the future.")
             return RawProcessingResult(datum=date, warnings=warnings)
         except ValueError as e:
-            return RawProcessingResult(
-                errors=[
-                    f"Date is {date} which is not in the required format YYYY-MM-DD. Parsing error: {e}"
-                ],
+            return processing_error(
+                f"Date is {date} which is not in the required format YYYY-MM-DD. Parsing error: {e}"
             )
 
     @staticmethod

From e648833df79bc1d5e53a7945ef58ae280e80df7e Mon Sep 17 00:00:00 2001
From: anna-parker <50943381+anna-parker@users.noreply.github.com>
Date: Thu, 2 Jul 2026 08:41:38 +0200
Subject: [PATCH 13/18] fixup

---
 .../src/loculus_preprocessing/processing_functions.py         | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index 48ee20ad94..8792c760ac 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -1347,9 +1347,7 @@ def scientific_name_from_id(
 
         tax_id: str | None = input_data.get("hostTaxonId")
         if not tax_id:
-            return RawProcessingResult(
-                datum=None,
-            )
+            return RawProcessingResult()
 
         url = f"{tax_service}/taxa/{tax_id}"
         try:

From 5f649af00c7cd387d846f51cc33945a30387fe60 Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Thu, 2 Jul 2026 08:43:28 +0200
Subject: [PATCH 14/18] feat(prepro): ensure all internal errors are properly
 logged and include proper information (#6801)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…lude proper information

Follow up to https://github.com/loculus-project/loculus/pull/6537 -
ensure that all internal errors thrown by prepro (these are related to
invalid config, unexpected nextclade format etc.) are properly annotated
with the prefix "Internal Error:" and the suffix "Please contact the
administrator."

### Screenshot

### PR Checklist
- [x] All necessary documentation has been adapted.
- [x] The implemented feature is covered by appropriate, automated
tests.
- [ ] Any manual testing that has been done is documented (i.e. what
exactly was tested?)

🚀 Preview: Add `preview` label to enable

---------

Co-authored-by: maverbiest <59956362+maverbiest@users.noreply.github.com>
---
 .../processing_functions.py                   | 201 +++++++-----------
 .../tests/test_nextclade_preprocessing.py     |   5 +-
 2 files changed, 79 insertions(+), 127 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index 8792c760ac..ca73153cd2 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -23,7 +23,6 @@
 from urllib3.util.retry import Retry
 
 from .datatypes import (
-    AnnotationSource,
     AnnotationSourceType,
     FunctionArgs,
     InputData,
@@ -202,20 +201,26 @@ def format_authors(authors: str) -> str:
     return "; ".join(loculus_authors).strip()
 
 
+def _internal_error_message(message: str) -> str:
+    full = f"Internal Error. {message} Please contact the administrator."
+    logger.error(full)
+    return full
+
+
+def raw_internal_error(message: str) -> RawProcessingResult:
+    return processing_error(_internal_error_message(message))
+
+
 def regex_error(
     function_name: str, function_arg: str, input_data: InputMetadata, args: FunctionArgs
-) -> str:
-    return (
-        f"Internal Error: Function {function_name} did not receive valid "
-        f"regex {function_arg}, with input {input_data} and args {args}, "
-        "please contact the administrator."
+) -> RawProcessingResult:
+    return raw_internal_error(
+        f"{function_name} did not receive a valid {function_arg}, with input {input_data} and args {args}."
     )
 
 
 def missing_taxonomy_service_error() -> RawProcessingResult:
-    return processing_error(
-        "Configuration error: taxonomy_service_url was None. Please contact the administrator."
-    )
+    return raw_internal_error("taxonomy_service_url was not configured.")
 
 
 def taxonomy_network_error(
@@ -223,9 +228,7 @@ def taxonomy_network_error(
     action: str,
     e: Exception,
 ) -> RawProcessingResult:
-    return processing_error(
-        f"Internal error: network error while {action} '{subject}': {e}. Please contact the administrator."
-    )
+    return raw_internal_error(f"Network error while {action} '{subject}': {e}.")
 
 
 @dataclass
@@ -337,28 +340,17 @@ def call_function(
         try:
             result = func(input_data, output_field, input_fields=input_fields, args=args)
         except Exception as e:
-            message = (
-                f"Error calling function {function_name} for output field {output_field} "
-                f"with input {input_data} and args {args}: {e}"
-            )
+            message = f"Internal Error. {function_name} raised an unexpected exception for output field '{output_field}': {e}. Please contact the administrator."
             logger.exception(message)
             return ProcessingResult(
                 datum=None,
                 warnings=[],
                 errors=[
-                    ProcessingAnnotation(
-                        processedFields=[
-                            AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
-                        ],
-                        unprocessedFields=[
-                            AnnotationSource(name=field, type=AnnotationSourceType.METADATA)
-                            for field in input_fields
-                        ],
-                        message=(
-                            f"Internal Error: Function {function_name} did not return "
-                            f"ProcessingResult with input {input_data} and args {args}, "
-                            "please contact the administrator."
-                        ),
+                    ProcessingAnnotation.from_fields(
+                        input_fields,
+                        [output_field],
+                        AnnotationSourceType.METADATA,
+                        message=message,
                     )
                 ],
             )
@@ -385,11 +377,8 @@ def call_function(
                 ],
             )
         if not isinstance(result, ProcessingResult):
-            logger.error(
-                f"ERROR: Function {function_name} did not return ProcessingResult "
-                f"given input {input_data} and args {args}. "
-                "This is likely a preprocessing bug."
-            )
+            message = f"Internal Error. {function_name} returned an unexpected type '{type(result).__name__}'. Please contact the administrator."
+            logger.error(message)
             return ProcessingResult(
                 datum=None,
                 warnings=[],
@@ -398,11 +387,7 @@ def call_function(
                         input_fields,
                         [output_field],
                         AnnotationSourceType.METADATA,
-                        message=(
-                            f"Internal Error: Function {function_name} did not return "
-                            f"ProcessingResult with input {input_data} and args {args}, "
-                            "please contact the administrator."
-                        ),
+                        message=message,
                     )
                 ],
             )
@@ -470,10 +455,8 @@ def parse_date_into_range(  # noqa: C901, PLR0912, PLR0915
         try:
             submitted_at = datetime.fromtimestamp(float(str(args["submittedAt"])), tz=pytz.utc)
         except Exception:
-            return processing_error(
-                f"Internal Error: Function parse_into_ranges did not receive valid "
-                f"submittedAt date, with input {input_data} and args {args}, "
-                "please contact the administrator."
+            return raw_internal_error(
+                f"parse_into_ranges did not receive a valid submittedAt date, with input {input_data} and args {args}."
             )
 
         max_upper_limit = min(submitted_at, release_date) if release_date else submitted_at
@@ -694,10 +677,8 @@ def concatenate(
         errors: list[str] = []
 
         if not isinstance(args["ACCESSION_VERSION"], str):
-            return processing_error(
-                "Internal Error: Function concatenate did not receive "
-                f"accession_version ProcessingResult with input {input_data} "
-                f"and args {args}, please contact the administrator."
+            return raw_internal_error(
+                f"concatenate did not receive a valid ACCESSION_VERSION (got: {args['ACCESSION_VERSION']!r})."
             )
 
         accession_version: str = args["ACCESSION_VERSION"]
@@ -707,18 +688,10 @@ def concatenate(
             str(args["fallback_value"]).strip() if args.get("fallback_value") is not None else ""
         )
 
-        error_msg = (
-            "Concatenation failed."
-            "This may be a configuration error, please contact the administrator."
-        )
-
         if not isinstance(order, list):
-            logger.error(
-                f"Concatenate: Expected order field to be a list. "
-                f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})"
+            return raw_internal_error(
+                f"concatenate expected 'order' to be a list (ACCESSION_VERSION: {accession_version})."
             )
-            errors.append(error_msg)
-            return RawProcessingResult(errors=errors)
 
         n_inputs = len(input_data.keys())
         # exclude ACCESSION_VERSION as it's provided by _call_preprocessing_function() and should not be an input_metadata field
@@ -726,19 +699,13 @@ def concatenate(
             [i for i in order if i != "ACCESSION_VERSION" and not i.startswith("ARG:")]
         )
         if n_inputs < n_expected:
-            logger.error(
-                f"Concatenate: Expected {n_expected} fields, got {n_inputs}. "
-                f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})"
+            return raw_internal_error(
+                f"concatenate expected {n_expected} input fields but got {n_inputs} (ACCESSION_VERSION: {accession_version})."
             )
-            errors.append(error_msg)
-            return RawProcessingResult(errors=errors)
         if not isinstance(field_types, list):
-            logger.error(
-                f"Concatenate: Expected type field to be a list. "
-                f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})"
+            return raw_internal_error(
+                f"concatenate expected 'type' to be a list (ACCESSION_VERSION: {accession_version})."
             )
-            errors.append(error_msg)
-            return RawProcessingResult(errors=errors)
 
         formatted_input_data: list[str] = []
         try:
@@ -761,7 +728,11 @@ def concatenate(
                     raw_value = str(raw).strip()
                     if raw_value.count("/") > 1:
                         date_string = None
-                        errors.append(error_msg)
+                        errors.append(
+                            _internal_error_message(
+                                f"dateRangeString field '{order[i]}' has an unexpected format: '{raw_value}' (ACCESSION_VERSION: {accession_version})."
+                            )
+                        )
                     else:
                         date_string = raw_value.replace("/", " TO ")
                     formatted_input_data.append(
@@ -780,12 +751,9 @@ def concatenate(
                     formatted_input_data.append(accession_version)
                 elif field_types[i].startswith("ARG:"):
                     if field_types[i][4:] not in args:
-                        logger.error(
-                            f"Concatenate: Missing argument {field_types[i][4:]} in args. "
-                            f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})"
+                        return raw_internal_error(
+                            f"concatenate is missing argument '{field_types[i][4:]}' in args (ACCESSION_VERSION: {accession_version})."
                         )
-                        errors.append(error_msg)
-                        return RawProcessingResult(errors=errors, warnings=warnings)
                     formatted_input_data.append(str(args[field_types[i][4:]]))
                 elif order[i] in input_data:
                     formatted_input_data.append(
@@ -794,12 +762,9 @@ def concatenate(
                         else str(input_data[order[i]]).strip()
                     )
                 else:
-                    logger.error(
-                        f"Concatenate: cannot find field {order[i]} of {field_types[i]} in input_data"
-                        f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})"
+                    return raw_internal_error(
+                        f"concatenate could not find field '{order[i]}' (type: {field_types[i]}) in input_data (ACCESSION_VERSION: {accession_version})."
                     )
-                    errors.append(error_msg)
-                    return RawProcessingResult(errors=errors, warnings=warnings)
 
             result = "/".join(formatted_input_data)
             # To avoid downstream issues do not let the result start or end in a "/"
@@ -808,12 +773,15 @@ def concatenate(
 
             return RawProcessingResult(datum=result, warnings=warnings, errors=errors)
         except ValueError as e:
-            logger.error(f"Concatenate failed with {e} (ACCESSION_VERSION: {accession_version})")
-            errors.append(
-                f"Concatenation failed for {output_field}. This is a technical error, "
-                "please contact the administrator."
+            return RawProcessingResult(
+                warnings=warnings,
+                errors=[
+                    *errors,
+                    _internal_error_message(
+                        f"Concatenation failed for '{output_field}' with error: {e} (ACCESSION_VERSION: {accession_version})."
+                    ),
+                ],
             )
-            return RawProcessingResult(errors=errors, warnings=warnings)
 
     @staticmethod
     def check_authors(
@@ -889,9 +857,9 @@ def extract_regex(
         if not regex_field:
             return RawProcessingResult()
         if not isinstance(pattern, str):
-            return processing_error(regex_error("extract_regex", "pattern", input_data, args))
+            return regex_error("extract_regex", "pattern", input_data, args)
         if not isinstance(capture_group, str):
-            return processing_error(regex_error("extract_regex", "capture_group", input_data, args))
+            return regex_error("extract_regex", "capture_group", input_data, args)
         match = re.match(pattern, regex_field.strip())
         if match:
             try:
@@ -900,10 +868,8 @@ def extract_regex(
                     result = result.upper()
                 return RawProcessingResult(datum=result)
             except IndexError:
-                errors.append(
-                    f"The pattern '{pattern}' does not contain a capture group: "
-                    f"'{capture_group}'- this is an internal error,"
-                    " please contact your local administrator."
+                return raw_internal_error(
+                    f"Pattern '{pattern}' does not contain capture group '{capture_group}'."
                 )
         else:
             errors.append(
@@ -929,7 +895,7 @@ def check_regex(
         if not regex_field:
             return RawProcessingResult()
         if not isinstance(pattern, str):
-            return processing_error(regex_error("check_regex", "pattern", input_data, args))
+            return regex_error("check_regex", "pattern", input_data, args)
         regex_field = regex_field.strip()
         if re.match(pattern, regex_field):
             return RawProcessingResult(datum=regex_field)
@@ -1183,9 +1149,7 @@ def build_display_name(  # noqa: C901
         submission_id = input_data.get("submissionId", None)
         warnings: list[str] = []
         if submission_id is None:
-            return processing_error(
-                "Internal Error: 'submissionId' must not be None for build_display_name(). Please contact the administrator."
-            )
+            return raw_internal_error("'submissionId' must not be None for build_display_name().")
 
         order = args.get("order")
         field_types = args.get("type")
@@ -1195,8 +1159,8 @@ def build_display_name(  # noqa: C901
             or len(order) != len(field_types)
             or "IDENTIFIER" not in order
         ):
-            return processing_error(
-                "Internal Error: 'order' and 'type' must be lists of equal length, and 'order' must contain IDENTIFIER - this is required for build_display_name to function. Please contact the administrator."
+            return raw_internal_error(
+                "'order' and 'type' must be lists of equal length and 'order' must contain IDENTIFIER."
             )
 
         regex_pattern = args.get("regex_pattern")
@@ -1204,8 +1168,8 @@ def build_display_name(  # noqa: C901
             regex_pattern is not None
             and "identifier" not in re.compile(str(regex_pattern)).groupindex
         ):
-            return processing_error(
-                "Internal Error: if provided, 'regex_pattern' must contain a named capture group called 'identifier'"
+            return raw_internal_error(
+                "If provided, 'regex_pattern' must contain a named capture group called 'identifier'."
             )
 
         concatenate_order = order.copy()
@@ -1326,10 +1290,8 @@ def resolve_host_taxon_id(
 
         tax_id = taxon.get("tax_id")
         if tax_id is None:
-            return processing_error(
-                f"Internal error: host validation for '{unvalidated_host}' "
-                f"was successful but response json 'tax_id' was missing. "
-                f"Please contact the administrator"
+            return raw_internal_error(
+                f"Host validation for '{unvalidated_host}' was successful but response json 'tax_id' was missing."
             )
 
         return RawProcessingResult(datum=str(tax_id))
@@ -1367,8 +1329,8 @@ def scientific_name_from_id(
 
         scientific_name = body.get("scientific_name")
         if scientific_name is None:
-            return processing_error(
-                f"Internal error: '{tax_id}' is a valid taxon ID but response json had no 'scientific_name'. Please contact the administrator"
+            return raw_internal_error(
+                f"'{tax_id}' is a valid taxon ID but response json had no 'scientific_name'."
             )
 
         return RawProcessingResult(datum=scientific_name)
@@ -1404,8 +1366,8 @@ def common_name_from_id(
 
         common_name = body.get("common_name")
         if common_name is None:
-            return processing_error(
-                f"Internal error: taxonomy service indicated common name was found for hostTaxonId '{tax_id}', but failed to return it. Please contact the administrator."
+            return raw_internal_error(
+                f"Taxonomy service indicated common name was found for hostTaxonId '{tax_id}', but failed to return it."
             )
 
         return RawProcessingResult(datum=common_name)
@@ -1430,10 +1392,9 @@ def process_frameshifts(input: str | None) -> InputData:
         return InputData(datum=format_frameshift(input))
     except Exception as e:
         msg = (
-            "Was unable to format frameshift - this is likely an internal error. "
-            "Please contact the administrator."
+            f"Internal Error. Frameshift formatting failed: {e}. Please contact the administrator."
         )
-        logger.error(msg + f" Error: {e}")
+        logger.error(msg)
         return InputData(
             datum=None,
             errors=single_metadata_annotation(
@@ -1514,10 +1475,9 @@ def process_stop_codons(input: str | None) -> InputData:
         return InputData(datum=format_stop_codon(input))
     except Exception as e:
         msg = (
-            "Was unable to format stop codon - this is likely an internal error. "
-            "Please contact the administrator."
+            f"Internal Error. Stop codon formatting failed: {e}. Please contact the administrator."
         )
-        logger.error(msg + f" Error: {e}")
+        logger.error(msg)
         return InputData(
             datum=None,
             errors=single_metadata_annotation(
@@ -1573,11 +1533,8 @@ def process_mutations_from_clade_founder(input: str | None, args: FunctionArgs |
         if mutations:
             return InputData(datum=" ".join(mutations))
     except Exception as e:
-        msg = (
-            "Was unable to process mutations from clade founder - this is likely an internal error. "
-            "Please contact the administrator."
-        )
-        logger.error(msg + f" Error: {e}")
+        msg = f"Internal Error. Clade founder mutation processing failed: {e}. Please contact the administrator."
+        logger.error(msg)
         return InputData(
             datum=None,
             errors=single_metadata_annotation(
@@ -1607,11 +1564,7 @@ def process_labeled_mutations(input: str | None, args: FunctionArgs | None) -> I
         if mutations:
             return InputData(datum=" ".join(mutations))
     except Exception as e:
-        msg = (
-            "Was unable to process labeled mutations - this is likely an internal error. "
-            "Please contact the administrator."
-        )
-        logger.error(msg + f" Error: {e}")
+        msg = _internal_error_message(f"Labeled mutation processing failed: {e}. ")
         return InputData(
             datum=None,
             errors=single_metadata_annotation(
@@ -1634,11 +1587,7 @@ def process_phenotype_values(input: str | None, args: FunctionArgs | None) -> In
                 value = entry.get("value")
                 return InputData(datum=str(value) if value is not None else None)
     except Exception as e:
-        msg = (
-            "Was unable to process phenotype values - this is likely an internal error. "
-            "Please contact the administrator."
-        )
-        logger.error(msg + f" Error: {e}")
+        msg = _internal_error_message(f"Labeled mutation processing failed: {e}. ")
         return InputData(
             datum=None,
             errors=single_metadata_annotation(
diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
index 05edf3a4cf..a32d5898d3 100644
--- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
+++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py
@@ -1405,7 +1405,10 @@ def test_process_phenotype_values():
     assert process_phenotype_values('[{"name": "NAI","cds": "NA","value": 0.0}]', {}).datum is None
     invalid = process_phenotype_values("Malformed JSON", {"name": "NAI"})
     assert invalid.datum is None
-    assert "Was unable to process phenotype values" in invalid.errors[0].message
+    assert (
+        "Internal Error. Phenotype value processing failed: invalid syntax"
+        in invalid.errors[0].message
+    )
 
 
 def test_reformat_authors_from_loculus_to_embl_style():

From 2cc446f842d58e5e732704b4ef488b49491734b2 Mon Sep 17 00:00:00 2001
From: anna-parker <50943381+anna-parker@users.noreply.github.com>
Date: Thu, 2 Jul 2026 08:55:36 +0200
Subject: [PATCH 15/18] use internal error in more places

---
 .../processing_functions.py                   | 30 +++++++------------
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index ca73153cd2..4626836992 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -340,8 +340,6 @@ def call_function(
         try:
             result = func(input_data, output_field, input_fields=input_fields, args=args)
         except Exception as e:
-            message = f"Internal Error. {function_name} raised an unexpected exception for output field '{output_field}': {e}. Please contact the administrator."
-            logger.exception(message)
             return ProcessingResult(
                 datum=None,
                 warnings=[],
@@ -350,7 +348,9 @@ def call_function(
                         input_fields,
                         [output_field],
                         AnnotationSourceType.METADATA,
-                        message=message,
+                        message=_internal_error_message(
+                            f"{function_name} raised an unexpected exception for output field '{output_field}': {e}. "
+                        ),
                     )
                 ],
             )
@@ -377,8 +377,6 @@ def call_function(
                 ],
             )
         if not isinstance(result, ProcessingResult):
-            message = f"Internal Error. {function_name} returned an unexpected type '{type(result).__name__}'. Please contact the administrator."
-            logger.error(message)
             return ProcessingResult(
                 datum=None,
                 warnings=[],
@@ -387,7 +385,9 @@ def call_function(
                         input_fields,
                         [output_field],
                         AnnotationSourceType.METADATA,
-                        message=message,
+                        message=_internal_error_message(
+                            f"{function_name} returned an unexpected type '{type(result).__name__}. "
+                        ),
                     )
                 ],
             )
@@ -1391,15 +1391,11 @@ def process_frameshifts(input: str | None) -> InputData:
     try:
         return InputData(datum=format_frameshift(input))
     except Exception as e:
-        msg = (
-            f"Internal Error. Frameshift formatting failed: {e}. Please contact the administrator."
-        )
-        logger.error(msg)
         return InputData(
             datum=None,
             errors=single_metadata_annotation(
                 "frameshifts",
-                msg,
+                _internal_error_message(f"Frameshift formatting failed: {e}. "),
             ),
         )
 
@@ -1474,15 +1470,11 @@ def process_stop_codons(input: str | None) -> InputData:
     try:
         return InputData(datum=format_stop_codon(input))
     except Exception as e:
-        msg = (
-            f"Internal Error. Stop codon formatting failed: {e}. Please contact the administrator."
-        )
-        logger.error(msg)
         return InputData(
             datum=None,
             errors=single_metadata_annotation(
                 "stopCodons",
-                msg,
+                _internal_error_message(f"Stop codon formatting failed: {e}. "),
             ),
         )
 
@@ -1533,13 +1525,11 @@ def process_mutations_from_clade_founder(input: str | None, args: FunctionArgs |
         if mutations:
             return InputData(datum=" ".join(mutations))
     except Exception as e:
-        msg = f"Internal Error. Clade founder mutation processing failed: {e}. Please contact the administrator."
-        logger.error(msg)
         return InputData(
             datum=None,
             errors=single_metadata_annotation(
                 "mutationsFromCladeFounder",
-                msg,
+                _internal_error_message(f"Clade founder mutation processing failed: {e}. "),
             ),
         )
     return InputData(datum=None)
@@ -1587,7 +1577,7 @@ def process_phenotype_values(input: str | None, args: FunctionArgs | None) -> In
                 value = entry.get("value")
                 return InputData(datum=str(value) if value is not None else None)
     except Exception as e:
-        msg = _internal_error_message(f"Labeled mutation processing failed: {e}. ")
+        msg = _internal_error_message(f"Phenotype value processing failed: {e}. ")
         return InputData(
             datum=None,
             errors=single_metadata_annotation(

From 29efe70adc8ec2408895140581bd9a457ec76972 Mon Sep 17 00:00:00 2001
From: anna-parker <50943381+anna-parker@users.noreply.github.com>
Date: Thu, 2 Jul 2026 08:59:28 +0200
Subject: [PATCH 16/18] reduce

---
 .../processing_functions.py                     | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index 4626836992..f59857a614 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -340,19 +340,10 @@ def call_function(
         try:
             result = func(input_data, output_field, input_fields=input_fields, args=args)
         except Exception as e:
-            return ProcessingResult(
-                datum=None,
-                warnings=[],
-                errors=[
-                    ProcessingAnnotation.from_fields(
-                        input_fields,
-                        [output_field],
-                        AnnotationSourceType.METADATA,
-                        message=_internal_error_message(
-                            f"{function_name} raised an unexpected exception for output field '{output_field}': {e}. "
-                        ),
-                    )
-                ],
+            result = processing_error(
+                _internal_error_message(
+                    f"{function_name} raised an unexpected exception for output field '{output_field}': {e}. "
+                )
             )
         if isinstance(result, RawProcessingResult):
             return ProcessingResult(

From 1523f29613fddc235d14703f15c980dc2bd483d5 Mon Sep 17 00:00:00 2001
From: anna-parker <50943381+anna-parker@users.noreply.github.com>
Date: Thu, 2 Jul 2026 09:07:37 +0200
Subject: [PATCH 17/18] improve test

---
 .../processing_functions.py                   |  6 +--
 .../tests/test_host_name_validation.py        | 40 +------------------
 .../test_metadata_processing_functions.py     | 34 ++++++++++++++++
 3 files changed, 37 insertions(+), 43 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index f59857a614..c307739e7a 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -340,10 +340,8 @@ def call_function(
         try:
             result = func(input_data, output_field, input_fields=input_fields, args=args)
         except Exception as e:
-            result = processing_error(
-                _internal_error_message(
-                    f"{function_name} raised an unexpected exception for output field '{output_field}': {e}. "
-                )
+            result = raw_internal_error(
+                f"{function_name} raised an unexpected exception for output field '{output_field}': {e}. "
             )
         if isinstance(result, RawProcessingResult):
             return ProcessingResult(
diff --git a/preprocessing/nextclade/tests/test_host_name_validation.py b/preprocessing/nextclade/tests/test_host_name_validation.py
index d84b53f662..bcd4bdee57 100644
--- a/preprocessing/nextclade/tests/test_host_name_validation.py
+++ b/preprocessing/nextclade/tests/test_host_name_validation.py
@@ -7,14 +7,11 @@
 from loculus_preprocessing import processing_functions
 from loculus_preprocessing.config import get_config
 from loculus_preprocessing.datatypes import (
-    AnnotationSource,
-    AnnotationSourceType,
-    FunctionArgs,
     UnprocessedData,
     UnprocessedEntry,
 )
 from loculus_preprocessing.prepro import process_all
-from loculus_preprocessing.processing_functions import ProcessingFunctions
+
 
 HOST_PROCESSING_CONFIG = "tests/host_processing_config.yaml"
 
@@ -195,38 +192,3 @@ def test_host_processing_invalid_host_direct(mock_session: MagicMock) -> None:
     assert result[0].processed_entry.warnings == []
     assert len(result[0].processed_entry.errors) == 1
     assert "Host validation for" in result[0].processed_entry.errors[0].message
-
-
-@patch.object(processing_functions.taxonomy_cache, "session")
-def test_call_function_converts_raw_errors_to_annotations(mock_session: MagicMock) -> None:
-    """call_function must convert RawProcessingResult string errors into ProcessingAnnotations
-    with correct message and field linkage."""
-    mock_session.get.return_value = make_response(404, {"detail": "not found"})
-
-    input_fields = ["host"]
-    output_field = "hostTaxonId"
-    args: FunctionArgs = {
-        "taxonomy_service_url": "http://localhost:5000",
-        "is_insdc_ingest_group": False,
-    }
-
-    result = ProcessingFunctions.call_function(
-        "resolve_host_taxon_id",
-        args=args,
-        input_data={"host": "not a real species"},
-        output_field=output_field,
-        input_fields=input_fields,
-    )
-
-    assert result.datum is None
-    assert result.warnings == []
-    assert len(result.errors) == 1
-
-    annotation = result.errors[0]
-    assert "Host validation for" in annotation.message
-    assert annotation.processedFields == (
-        AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA),
-    )
-    assert annotation.unprocessedFields == (
-        AnnotationSource(name="host", type=AnnotationSourceType.METADATA),
-    )
diff --git a/preprocessing/nextclade/tests/test_metadata_processing_functions.py b/preprocessing/nextclade/tests/test_metadata_processing_functions.py
index 9e6383e63d..de20b2c240 100644
--- a/preprocessing/nextclade/tests/test_metadata_processing_functions.py
+++ b/preprocessing/nextclade/tests/test_metadata_processing_functions.py
@@ -12,6 +12,8 @@
 
 from loculus_preprocessing.config import Config, get_config, get_processing_order
 from loculus_preprocessing.datatypes import (
+    AnnotationSource,
+    AnnotationSourceType,
     FunctionArgs,
     InputMetadata,
     ProcessedEntry,
@@ -1405,5 +1407,37 @@ def args_insdc():
     )
 
 
+def test_call_function_converts_raw_errors_to_annotations() -> None:
+    """call_function must convert RawProcessingResult string errors into ProcessingAnnotations
+    with correct message and field linkage."""
+    input_fields = ["myField"]
+    output_field = "myField"
+    args: FunctionArgs = {
+        "options": ["OptionA", "OptionB"],
+        "is_insdc_ingest_group": False,
+    }
+
+    result = ProcessingFunctions.call_function(
+        "process_options",
+        args=args,
+        input_data={"input": "NotAnOption"},
+        output_field=output_field,
+        input_fields=input_fields,
+    )
+
+    assert result.datum is None
+    assert result.warnings == []
+    assert len(result.errors) == 1
+
+    annotation = result.errors[0]
+    assert "not in list of accepted options" in annotation.message
+    assert annotation.processedFields == (
+        AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA),
+    )
+    assert annotation.unprocessedFields == (
+        AnnotationSource(name="myField", type=AnnotationSourceType.METADATA),
+    )
+
+
 if __name__ == "__main__":
     pytest.main()

From de417ce99cab170c7d4229e1a600c4f79d582323 Mon Sep 17 00:00:00 2001
From: anna-parker <50943381+anna-parker@users.noreply.github.com>
Date: Thu, 2 Jul 2026 09:09:08 +0200
Subject: [PATCH 18/18] weird

---
 preprocessing/nextclade/tests/test_host_name_validation.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/preprocessing/nextclade/tests/test_host_name_validation.py b/preprocessing/nextclade/tests/test_host_name_validation.py
index bcd4bdee57..adef09a0af 100644
--- a/preprocessing/nextclade/tests/test_host_name_validation.py
+++ b/preprocessing/nextclade/tests/test_host_name_validation.py
@@ -12,7 +12,6 @@
 )
 from loculus_preprocessing.prepro import process_all
 
-
 HOST_PROCESSING_CONFIG = "tests/host_processing_config.yaml"