From 8d7830cccf6db1d7621204e2d3fb25f52bdb8504 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 29 May 2026 10:18:21 +0000 Subject: [PATCH 01/18] Initial plan From 030c4fc10213efc906a46bc89e4dce6f2febac79 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 29 May 2026 10:31:45 +0000 Subject: [PATCH 02/18] Refactor processing functions to use RawProcessingResult for less verbose error/warning handling - Add RawProcessingResult dataclass with plain string warnings/errors - Add processing_error() and processing_warning() helper functions - Modify call_function to handle both ProcessingResult and RawProcessingResult - Convert all processing functions to return RawProcessingResult - Simplify check_latin_characters and taxonomy helper functions - Remove unused invalid_value_annotation function - Update tests to match new return types (errors/warnings are now strings) This reduces the file from 2208 to 1719 lines by eliminating repetitive ProcessingAnnotation.from_fields(...) boilerplate in every error/warning. --- .../src/loculus_preprocessing/datatypes.py | 24 + .../processing_functions.py | 903 ++++-------------- .../tests/test_assign_custom_lineage.py | 2 +- .../test_metadata_processing_functions.py | 10 +- 4 files changed, 237 insertions(+), 702 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py index f56dd0cb3a..7004f81354 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py @@ -176,6 +176,30 @@ class ProcessingResult: errors: list[ProcessingAnnotation] = field(default_factory=list) +@dataclass +class RawProcessingResult: + """A simplified ProcessingResult where warnings/errors are plain strings. + + Processing functions return this type; the caller (call_function) converts + it into a full ProcessingResult by attaching input_fields, output_field, + and annotation source type information. + """ + + datum: ProcessedMetadataValue = None + warnings: list[str] = field(default_factory=list) + errors: list[str] = field(default_factory=list) + + +def processing_error(message: str) -> "RawProcessingResult": + """Helper to create a RawProcessingResult with a single error and no datum.""" + return RawProcessingResult(datum=None, errors=[message]) + + +def processing_warning(message: str, datum: ProcessedMetadataValue = None) -> "RawProcessingResult": + """Helper to create a RawProcessingResult with a single warning.""" + return RawProcessingResult(datum=datum, warnings=[message]) + + @unique class SegmentClassificationMethod(StrEnum): """ diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index e2428bd2b7..3adb56c556 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -31,6 +31,8 @@ ProcessedMetadataValue, ProcessingAnnotation, ProcessingResult, + RawProcessingResult, + processing_error, ) logger = logging.getLogger(__name__) @@ -102,19 +104,6 @@ def standardize_option(option): return " ".join(option.lower().split()) -def invalid_value_annotation( - input_datum, output_field, input_fields, value_type -) -> ProcessingAnnotation: - return ProcessingAnnotation( - processedFields=[AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)], - unprocessedFields=[ - AnnotationSource(name=field, type=AnnotationSourceType.METADATA) - for field in input_fields - ], - message=f"Invalid {value_type} value: {input_datum} for field {output_field}.", - ) - - def valid_name() -> str: chars = ( r"\u0041-\u005A" # A-Z @@ -163,9 +152,9 @@ def reformat_authors_from_latin_to_ascii(authors: str) -> str: def check_latin_characters( authors: str, input_fields: list[str], output_field: str -) -> tuple[list[ProcessingAnnotation], list[ProcessingAnnotation]]: - warnings: list[ProcessingAnnotation] = [] - errors: list[ProcessingAnnotation] = [] +) -> tuple[list[str], list[str]]: + warnings: list[str] = [] + errors: list[str] = [] # Check if all characters in the authors string are Latin letters or spaces # (transformable to ASCII) for char in authors: @@ -174,18 +163,7 @@ def check_latin_characters( continue if char.isalpha() and not (0x0000 <= ord(char) <= 0x024F): errors = [ - ProcessingAnnotation( - processedFields=[ - AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) - ], - unprocessedFields=[ - AnnotationSource(name=field, type=AnnotationSourceType.METADATA) - for field in input_fields - ], - message=( - f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X})." - ), - ) + f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X})." ] return (errors, warnings) @@ -236,18 +214,9 @@ def regex_error( ) -def missing_taxonomy_service_error(input_fields: list[str], output_field: str) -> ProcessingResult: - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message="Configuration error: taxonomy_service_url was None. Please contact the administrator.", - ) - ], +def missing_taxonomy_service_error(input_fields: list[str], output_field: str) -> RawProcessingResult: + return processing_error( + "Configuration error: taxonomy_service_url was None. Please contact the administrator." ) @@ -257,18 +226,9 @@ def taxonomy_network_error( e: Exception, input_fields: list[str], output_field: str, -) -> ProcessingResult: - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=f"Internal error: network error while {action} '{subject}': {e}. Please contact the administrator.", - ) - ], +) -> RawProcessingResult: + return processing_error( + f"Internal error: network error while {action} '{subject}': {e}. Please contact the administrator." ) @@ -406,6 +366,28 @@ def call_function( ) ], ) + if isinstance(result, RawProcessingResult): + return ProcessingResult( + datum=result.datum, + warnings=[ + ProcessingAnnotation.from_fields( + input_fields, + [output_field], + AnnotationSourceType.METADATA, + message=msg, + ) + for msg in result.warnings + ], + errors=[ + ProcessingAnnotation.from_fields( + input_fields, + [output_field], + AnnotationSourceType.METADATA, + message=msg, + ) + for msg in result.errors + ], + ) if not isinstance(result, ProcessingResult): logger.error( f"ERROR: Function {function_name} did not return ProcessingResult " @@ -436,7 +418,7 @@ def check_date( output_field: str, input_fields: list[str], args: FunctionArgs, # args is essential - even if Pylance says it's not used - ) -> ProcessingResult: + ) -> RawProcessingResult: """Check that date is complete YYYY-MM-DD If not according to format return error If in future, return warning @@ -445,40 +427,18 @@ def check_date( date = input_data["date"] if not date: - return ProcessingResult( - datum=None, - warnings=[], - errors=[], - ) + return RawProcessingResult() - warnings: list[ProcessingAnnotation] = [] - errors: list[ProcessingAnnotation] = [] + warnings: list[str] = [] try: parsed_date = datetime.strptime(date, "%Y-%m-%d").astimezone(pytz.utc) if parsed_date > datetime.now(tz=pytz.utc): - warnings.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message="Date is in the future.", - ) - ) - return ProcessingResult(datum=date, warnings=warnings, errors=errors) + warnings.append("Date is in the future.") + return RawProcessingResult(datum=date, warnings=warnings) except ValueError as e: - error_message = ( - f"Date is {date} which is not in the required format YYYY-MM-DD. Parsing error: {e}" - ) - return ProcessingResult( - datum=None, - warnings=warnings, + return RawProcessingResult( errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=error_message, - ) + f"Date is {date} which is not in the required format YYYY-MM-DD. Parsing error: {e}" ], ) @@ -488,7 +448,7 @@ def parse_date_into_range( # noqa: C901, PLR0912, PLR0915 output_field: str, input_fields: list[str], args: FunctionArgs, # args is essential - even if Pylance says it's not used - ) -> ProcessingResult: + ) -> RawProcessingResult: """ Parse date string (`input.date`) with input formats: - YYYY @@ -516,36 +476,23 @@ def parse_date_into_range( # noqa: C901, PLR0912, PLR0915 try: submitted_at = datetime.fromtimestamp(float(str(args["submittedAt"])), tz=pytz.utc) except Exception: - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - f"Internal Error: Function parse_into_ranges did not receive valid " - f"submittedAt date, with input {input_data} and args {args}, " - "please contact the administrator." - ), - ) - ], + return processing_error( + f"Internal Error: Function parse_into_ranges did not receive valid " + f"submittedAt date, with input {input_data} and args {args}, " + "please contact the administrator." ) max_upper_limit = min(submitted_at, release_date) if release_date else submitted_at if not input_date_str: - return ProcessingResult( + return RawProcessingResult( datum=max_upper_limit.strftime("%Y-%m-%d") if args["fieldType"] == "dateRangeUpper" else None, - warnings=[], - errors=[], ) - warnings: list[ProcessingAnnotation] = [] - errors: list[ProcessingAnnotation] = [] + warnings: list[str] = [] + errors: list[str] = [] datum = convert_to_date_range(input_date_str) @@ -566,18 +513,9 @@ def parse_date_into_range( # noqa: C901, PLR0912, PLR0915 upper_date = convert_to_date_range(match.group(2)) if lower_date is None or upper_date is None: - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=f"Metadata field {output_field}: " - f"Detected date range but could not parse date: {input_date_str}.", - ) - ], + return processing_error( + f"Metadata field {output_field}: " + f"Detected date range but could not parse date: {input_date_str}." ) msg = None if lower_date.message or upper_date.message: @@ -601,27 +539,13 @@ def parse_date_into_range( # noqa: C901, PLR0912, PLR0915 if datum and datum.message: warnings.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=f"Metadata field {output_field}:'{input_date_str}' - " + datum.message, - ) + f"Metadata field {output_field}:'{input_date_str}' - " + datum.message ) if datum is None: - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=f"Metadata field {output_field}: " - f"Date {input_date_str} could not be parsed.", - ) - ], + return processing_error( + f"Metadata field {output_field}: " + f"Date {input_date_str} could not be parsed." ) logger.debug(f"parsed_date: {datum}") @@ -631,16 +555,9 @@ def parse_date_into_range( # noqa: C901, PLR0912, PLR0915 f"Lower range of date: {datum.date_range_lower} > upper: {datum.date_range_upper}" ) errors.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - f"Metadata field {output_field}:'{input_date_str}' is an invalid date " - f"range. Lower bound: {datum.date_range_lower} is after upper " - f"bound: {datum.date_range_upper}." - ), - ) + f"Metadata field {output_field}:'{input_date_str}' is an invalid date " + f"range. Lower bound: {datum.date_range_lower} is after upper " + f"bound: {datum.date_range_upper}." ) if datum.date_range_upper > max_upper_limit: @@ -656,12 +573,7 @@ def parse_date_into_range( # noqa: C901, PLR0912, PLR0915 f"Lower range of date: {datum.date_range_lower} > {datetime.now(tz=pytz.utc)}" ) errors.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=(f"Metadata field {output_field}:'{input_date_str}' is in the future."), - ) + f"Metadata field {output_field}:'{input_date_str}' is in the future." ) if release_date and datum.date_range_lower and (datum.date_range_lower > release_date): @@ -669,14 +581,7 @@ def parse_date_into_range( # noqa: C901, PLR0912, PLR0915 f"Lower range of date: {datum.date_range_lower} > release_date: {release_date}" ) errors.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - f"Metadata field {output_field}:'{input_date_str}' is after release date." - ), - ) + f"Metadata field {output_field}:'{input_date_str}' is after release date." ) match args["fieldType"]: @@ -697,7 +602,7 @@ def parse_date_into_range( # noqa: C901, PLR0912, PLR0915 msg = f"Config error: Unknown fieldType: {args['fieldType']}" raise ValueError(msg) - return ProcessingResult(datum=return_value, warnings=warnings, errors=errors) + return RawProcessingResult(datum=return_value, warnings=warnings, errors=errors) @staticmethod def parse_and_assert_past_date( # noqa: C901 @@ -705,7 +610,7 @@ def parse_and_assert_past_date( # noqa: C901 output_field, input_fields: list[str], args: FunctionArgs, # args is essential - even if Pylance says it's not used - ) -> ProcessingResult: + ) -> RawProcessingResult: """Parse date string. If it's incomplete, add 01-01, if no year, return null and error input_data: date: str, date string to parse @@ -714,11 +619,7 @@ def parse_and_assert_past_date( # noqa: C901 date_str = input_data["date"] if not date_str: - return ProcessingResult( - datum=None, - warnings=[], - errors=[], - ) + return RawProcessingResult() release_date_str = input_data.get("release_date", "") or "" try: release_date = dateutil.parse(release_date_str) @@ -731,8 +632,8 @@ def parse_and_assert_past_date( # noqa: C901 "%Y": "Month and day are missing. Assuming January 1st.", } - warnings = [] - errors = [] + warnings: list[str] = [] + errors: list[str] = [] for format, message in formats_to_messages.items(): try: @@ -749,54 +650,28 @@ def parse_and_assert_past_date( # noqa: C901 if message: warnings.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=f"Metadata field {output_field}:'{date_str}' - " + message, - ) + f"Metadata field {output_field}:'{date_str}' - " + message ) if parsed_date > datetime.now(tz=pytz.utc): logger.debug(f"parsed_date: {parsed_date} > {datetime.now(tz=pytz.utc)}") errors.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=f"Metadata field {output_field}:'{date_str}' is in the future.", - ) + f"Metadata field {output_field}:'{date_str}' is in the future." ) if release_date and parsed_date > release_date: logger.debug(f"parsed_date: {parsed_date} > release_date: {release_date}") errors.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - f"Metadata field {output_field}:'{date_str}'is after release date." - ), - ) + f"Metadata field {output_field}:'{date_str}'is after release date." ) - return ProcessingResult(datum=datum, warnings=warnings, errors=errors) + return RawProcessingResult(datum=datum, warnings=warnings, errors=errors) except ValueError: continue # If all parsing attempts fail, it's an unrecognized format - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=f"Metadata field {output_field}: Date format is not recognized.", - ) - ], + return processing_error( + f"Metadata field {output_field}: Date format is not recognized." ) @staticmethod @@ -805,43 +680,20 @@ def parse_timestamp( output_field: str, input_fields: list[str], args: FunctionArgs, # args is essential - even if Pylance says it's not used - ) -> ProcessingResult: + ) -> RawProcessingResult: """Parse a timestamp string, e.g. 2022-11-01T00:00:00Z and return a YYYY-MM-DD string""" timestamp = input_data["timestamp"] if not timestamp: - return ProcessingResult( - datum=None, - warnings=[], - errors=[], - ) + return RawProcessingResult() - # Parse timestamp - warnings: list[ProcessingAnnotation] = [] - errors: list[ProcessingAnnotation] = [] try: parsed_timestamp = dateutil.parse(timestamp) - return ProcessingResult( - datum=parsed_timestamp.strftime("%Y-%m-%d"), - warnings=warnings, - errors=errors, - ) + return RawProcessingResult(datum=parsed_timestamp.strftime("%Y-%m-%d")) except ValueError as e: - error_message = ( + return processing_error( f"Timestamp is {timestamp} which is not in parseable YYYY-MM-DD. Parsing error: {e}" ) - return ProcessingResult( - datum=None, - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=error_message, - ) - ], - warnings=warnings, - ) @staticmethod def concatenate( @@ -849,31 +701,20 @@ def concatenate( output_field: str, input_fields: list[str], args: FunctionArgs, - ) -> ProcessingResult: + ) -> RawProcessingResult: """Concatenates input fields using the "/" separator in the order specified by the order argument. Optionally, a 'fallback_value' argument can be provided. This should be a string to use in place of metadata that is not available. If fallback_value is not provided, the empty string will be used in place of missing metadata. """ - warnings: list[ProcessingAnnotation] = [] - errors: list[ProcessingAnnotation] = [] + warnings: list[str] = [] + errors: list[str] = [] if not isinstance(args["ACCESSION_VERSION"], str): - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - "Internal Error: Function concatenate did not receive " - f"accession_version ProcessingResult with input {input_data} " - f"and args {args}, please contact the administrator." - ), - ) - ], + return processing_error( + "Internal Error: Function concatenate did not receive " + f"accession_version ProcessingResult with input {input_data} " + f"and args {args}, please contact the administrator." ) accession_version: str = args["ACCESSION_VERSION"] @@ -883,28 +724,18 @@ def concatenate( str(args["fallback_value"]).strip() if args.get("fallback_value") is not None else "" ) - def add_errors(): - errors.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message="Concatenation failed." - "This may be a configuration error, please contact the administrator.", - ) - ) + error_msg = ( + "Concatenation failed." + "This may be a configuration error, please contact the administrator." + ) if not isinstance(order, list): logger.error( f"Concatenate: Expected order field to be a list. " f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})" ) - add_errors() - return ProcessingResult( - datum=None, - warnings=warnings, - errors=errors, - ) + errors.append(error_msg) + return RawProcessingResult(errors=errors) n_inputs = len(input_data.keys()) # exclude ACCESSION_VERSION as it's provided by _call_preprocessing_function() and should not be an input_metadata field @@ -916,23 +747,15 @@ def add_errors(): f"Concatenate: Expected {n_expected} fields, got {n_inputs}. " f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})" ) - add_errors() - return ProcessingResult( - datum=None, - warnings=warnings, - errors=errors, - ) + errors.append(error_msg) + return RawProcessingResult(errors=errors) if not isinstance(field_types, list): logger.error( f"Concatenate: Expected type field to be a list. " f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})" ) - add_errors() - return ProcessingResult( - datum=None, - warnings=warnings, - errors=errors, - ) + errors.append(error_msg) + return RawProcessingResult(errors=errors) formatted_input_data: list[str] = [] try: @@ -955,7 +778,7 @@ def add_errors(): raw_value = str(raw).strip() if raw_value.count("/") > 1: date_string = None - add_errors() + errors.append(error_msg) else: date_string = raw_value.replace("/", " TO ") formatted_input_data.append( @@ -978,12 +801,8 @@ def add_errors(): f"Concatenate: Missing argument {field_types[i][4:]} in args. " f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})" ) - add_errors() - return ProcessingResult( - datum=None, - warnings=warnings, - errors=errors, - ) + errors.append(error_msg) + return RawProcessingResult(errors=errors, warnings=warnings) formatted_input_data.append(str(args[field_types[i][4:]])) elif order[i] in input_data: formatted_input_data.append( @@ -996,37 +815,22 @@ def add_errors(): f"Concatenate: cannot find field {order[i]} of {field_types[i]} in input_data" f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})" ) - add_errors() - return ProcessingResult( - datum=None, - warnings=warnings, - errors=errors, - ) + errors.append(error_msg) + return RawProcessingResult(errors=errors, warnings=warnings) result = "/".join(formatted_input_data) # To avoid downstream issues do not let the result start or end in a "/" # Also replace white space with '_' result = result.strip("/").replace(" ", "_") - return ProcessingResult(datum=result, warnings=warnings, errors=errors) + return RawProcessingResult(datum=result, warnings=warnings, errors=errors) except ValueError as e: logger.error(f"Concatenate failed with {e} (ACCESSION_VERSION: {accession_version})") errors.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - f"Concatenation failed for {output_field}. This is a technical error, " - "please contact the administrator." - ), - ) - ) - return ProcessingResult( - datum=None, - errors=errors, - warnings=warnings, + f"Concatenation failed for {output_field}. This is a technical error, " + "please contact the administrator." ) + return RawProcessingResult(errors=errors, warnings=warnings) @staticmethod def check_authors( @@ -1034,7 +838,7 @@ def check_authors( output_field: str, input_fields: list[str], args: FunctionArgs, - ) -> ProcessingResult: + ) -> RawProcessingResult: authors = input_data["authors"] author_format_description = ( @@ -1045,57 +849,29 @@ def check_authors( "are allowed. For example: 'Smith, Anna; Perez, Tom J.; Xu, X.L.;' " "or 'Xu,;' if the first name is unknown." ) - warnings: list[ProcessingAnnotation] = [] - errors: list[ProcessingAnnotation] = [] + warnings: list[str] = [] + errors: list[str] = [] if not authors: - return ProcessingResult( - datum=None, - warnings=warnings, - errors=errors, - ) + return RawProcessingResult() errors, warnings = check_latin_characters(authors, input_fields, output_field) if errors or warnings: - return ProcessingResult( - datum=None, - warnings=warnings, - errors=errors, - ) + return RawProcessingResult(warnings=warnings, errors=errors) if valid_authors(authors): formatted_authors = format_authors(authors) if warn_potentially_invalid_authors(authors): - warning_message = ( + warnings.append( "The authors list might not be using the Loculus format. " + author_format_description ) - warnings.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=warning_message, - ) - ) if " and " in authors: - warning_message = ( + warnings.append( "Authors list contains 'and'. This may indicate a misformatted " "authors list. Authors should always be separated by semi-colons only e.g. " "`Smith, Anna; Perez, Tom J.; Xu, X.L.`." ) - warnings.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=warning_message, - ) - ) - return ProcessingResult( - datum=formatted_authors, - warnings=warnings, - errors=errors, - ) + return RawProcessingResult(datum=formatted_authors, warnings=warnings) invalid_names = get_invalid_author_names(authors) if invalid_names: names_to_show = "; ".join(f"'{name}'" for name in invalid_names[:3]) @@ -1106,18 +882,7 @@ def check_authors( error_message = ( "The authors list is not in a recognized format. " + author_format_description ) - return ProcessingResult( - datum=None, - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=error_message, - ) - ], - warnings=warnings, - ) + return RawProcessingResult(errors=[error_message], warnings=warnings) @staticmethod def extract_regex( @@ -1125,7 +890,7 @@ def extract_regex( output_field: str, input_fields: list[str], args: FunctionArgs, - ) -> ProcessingResult: + ) -> RawProcessingResult: """ Extracts a substring from the `regex_field` using the provided regex `pattern` with a `capture_group`, if `uppercase` is set to true the extracted value is capitalized. @@ -1134,68 +899,41 @@ def extract_regex( """ regex_field = input_data["regex_field"] - warnings: list[ProcessingAnnotation] = [] - errors: list[ProcessingAnnotation] = [] + errors: list[str] = [] pattern = args.get("pattern") capture_group = args.get("capture_group") uppercase = args.get("uppercase", False) if not regex_field: - return ProcessingResult(datum=None, warnings=warnings, errors=errors) + return RawProcessingResult() if not isinstance(pattern, str): - errors.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=regex_error("extract_regex", "pattern", input_data, args), - ) + return processing_error( + regex_error("extract_regex", "pattern", input_data, args) ) - return ProcessingResult(datum=None, warnings=warnings, errors=errors) if not isinstance(capture_group, str): - errors.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=regex_error("extract_regex", "capture_group", input_data, args), - ) + return processing_error( + regex_error("extract_regex", "capture_group", input_data, args) ) - return ProcessingResult(datum=None, warnings=warnings, errors=errors) match = re.match(pattern, regex_field.strip()) if match: try: result = match.group(capture_group) if result is not None and uppercase: result = result.upper() - return ProcessingResult(datum=result, warnings=warnings, errors=errors) + return RawProcessingResult(datum=result) except IndexError: errors.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - f"The pattern '{pattern}' does not contain a capture group: " - f"'{capture_group}'- this is an internal error," - " please contact your local administrator." - ), - ) + f"The pattern '{pattern}' does not contain a capture group: " + f"'{capture_group}'- this is an internal error," + " please contact your local administrator." ) else: errors.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - f"The value '{regex_field}' does not match the expected regex " - f"pattern: '{pattern}'." - ), - ) + f"The value '{regex_field}' does not match the expected regex " + f"pattern: '{pattern}'." ) - return ProcessingResult(datum=None, warnings=warnings, errors=errors) + return RawProcessingResult(errors=errors) @staticmethod def check_regex( @@ -1203,69 +941,41 @@ def check_regex( output_field: str, input_fields: list[str], args: FunctionArgs, - ) -> ProcessingResult: + ) -> RawProcessingResult: """ Validates that the field regex_field matches the regex expression. If not return error """ regex_field = input_data["regex_field"] - warnings: list[ProcessingAnnotation] = [] - errors: list[ProcessingAnnotation] = [] - pattern = args["pattern"] if not regex_field: - return ProcessingResult(datum=None, warnings=warnings, errors=errors) + return RawProcessingResult() if not isinstance(pattern, str): - errors.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=regex_error("check_regex", "pattern", input_data, args), - ) + return processing_error( + regex_error("check_regex", "pattern", input_data, args) ) - return ProcessingResult(datum=None, warnings=warnings, errors=errors) regex_field = regex_field.strip() if re.match(pattern, regex_field): - return ProcessingResult(datum=regex_field, warnings=warnings, errors=errors) - errors.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - f"The value '{regex_field}' does not match the expected regex " - f"pattern: '{pattern}'." - ), - ) + return RawProcessingResult(datum=regex_field) + return processing_error( + f"The value '{regex_field}' does not match the expected regex " + f"pattern: '{pattern}'." ) - return ProcessingResult(datum=None, warnings=warnings, errors=errors) @staticmethod def identity( # noqa: C901, PLR0912 input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs - ) -> ProcessingResult: + ) -> RawProcessingResult: """Identity function, takes input_data["input"] and returns it as output""" if "input" not in input_data: - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=f"No data found for output field: {output_field}", - ) - ], - ) + return processing_error(f"No data found for output field: {output_field}") input_datum = input_data["input"] if not input_datum: - return ProcessingResult(datum=None, warnings=[], errors=[]) + return RawProcessingResult() - errors: list[ProcessingAnnotation] = [] + errors: list[str] = [] output_datum: ProcessedMetadataValue if args and "type" in args: match args["type"]: @@ -1275,7 +985,7 @@ def identity( # noqa: C901, PLR0912 except ValueError: output_datum = None errors.append( - invalid_value_annotation(input_datum, output_field, input_fields, "int") + f"Invalid int value: {input_datum} for field {output_field}." ) case "float": try: @@ -1287,9 +997,7 @@ def identity( # noqa: C901, PLR0912 except ValueError: output_datum = None errors.append( - invalid_value_annotation( - input_datum, output_field, input_fields, "float" - ) + f"Invalid float value: {input_datum} for field {output_field}." ) case "boolean": if input_datum.lower() == "true": @@ -1299,9 +1007,7 @@ def identity( # noqa: C901, PLR0912 else: output_datum = None errors.append( - invalid_value_annotation( - input_datum, output_field, input_fields, "boolean" - ) + f"Invalid boolean value: {input_datum} for field {output_field}." ) case _: if isinstance(input_datum, str): @@ -1310,32 +1016,21 @@ def identity( # noqa: C901, PLR0912 output_datum = input_datum else: output_datum = input_datum - return ProcessingResult(datum=output_datum, warnings=[], errors=errors) + return RawProcessingResult(datum=output_datum, errors=errors) @staticmethod def process_options( input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs - ) -> ProcessingResult: + ) -> RawProcessingResult: """Checks that option is in options""" if "options" not in args or not isinstance(args["options"], list): - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - "Website configuration error: no options list specified for field " - f"{output_field}, please contact an administrator." - ), - ) - ], + return processing_error( + "Website configuration error: no options list specified for field " + f"{output_field}, please contact an administrator." ) input_datum = input_data["input"] if not input_datum: - return ProcessingResult(datum=None, warnings=[], errors=[]) + return RawProcessingResult() output_datum: ProcessedMetadataValue standardized_input_datum = standardize_option(input_datum) @@ -1350,102 +1045,47 @@ def process_options( output_datum = options[standardized_input_datum] # Allow ingested data to include fields not in options elif args["is_insdc_ingest_group"]: - return ProcessingResult( - datum=input_datum, - warnings=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=error_msg, - ) - ], - errors=[], - ) + return RawProcessingResult(datum=input_datum, warnings=[error_msg]) else: - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=error_msg, - ) - ], - ) - return ProcessingResult(datum=output_datum, warnings=[], errors=[]) + return processing_error(error_msg) + return RawProcessingResult(datum=output_datum) @staticmethod def is_above_threshold( input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs - ) -> ProcessingResult: + ) -> RawProcessingResult: """Flag if input value is above a threshold specified in args""" if "threshold" not in args: - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - f"Field {output_field} is missing threshold argument." - " Please report this error to the administrator." - ), - ) - ], + return processing_error( + f"Field {output_field} is missing threshold argument." + " Please report this error to the administrator." ) input_datum = input_data["input"] if input_datum is None or (isinstance(input_datum, str) and not input_datum.strip()): - return ProcessingResult(datum=None, warnings=[], errors=[]) + return RawProcessingResult() try: threshold = float(args["threshold"]) # type: ignore input = float(input_datum) except (ValueError, TypeError): msg = f"Field {output_field} has non-numeric threshold value." logger.error(msg) - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=(msg), - ) - ], - ) - return ProcessingResult(datum=(input > threshold), warnings=[], errors=[]) + return processing_error(msg) + return RawProcessingResult(datum=(input > threshold)) @staticmethod def is_variant( input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs - ) -> ProcessingResult: + ) -> RawProcessingResult: """Flag if number of mutations is above mutation rate (specified in args) times length""" if "mu" not in args: - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - f"Field {output_field} is missing mu argument." - " Please report this error to the administrator." - ), - ) - ], + return processing_error( + f"Field {output_field} is missing mu argument." + " Please report this error to the administrator." ) length_datum = input_data.get("length") num_mutations_datum = input_data.get("numMutations") if length_datum is None or num_mutations_datum is None: - return ProcessingResult(datum=None, warnings=[], errors=[]) + return RawProcessingResult() try: mu = float(args["mu"]) # type: ignore length = float(length_datum) @@ -1457,21 +1097,10 @@ def is_variant( args={"threshold": threshold}, ) except (ValueError, TypeError): - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - f"Field {output_field} has non-numeric length or numMutations value." - ), - ) - ], + return processing_error( + f"Field {output_field} has non-numeric length or numMutations value." ) - return ProcessingResult( + return RawProcessingResult( datum=is_above_threshold_result.datum, warnings=is_above_threshold_result.warnings, errors=is_above_threshold_result.errors, @@ -1480,7 +1109,7 @@ def is_variant( @staticmethod def assign_custom_lineage( # noqa: C901 input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs - ) -> ProcessingResult: + ) -> RawProcessingResult: """ Assign flu lineage based on seg4 and seg6. Add reassortant flag if different lineages are detected for internal segments, @@ -1501,7 +1130,7 @@ def assign_custom_lineage( # noqa: C901 f"Starting custom lineage assignment with input_data: {input_data} and args: {args}" ) if not input_data: - return ProcessingResult(datum=None, warnings=[], errors=[]) + return RawProcessingResult() ha_subtype = input_data.get("subtype_seg4") na_subtype = input_data.get("subtype_seg6") references: dict[str, str | None] = {} @@ -1532,7 +1161,7 @@ def assign_custom_lineage( # noqa: C901 ).datum logger.debug(f"Extracted lineages: {extracted_lineages} from references: {references}") if not ha_subtype or not na_subtype: - return ProcessingResult(datum=None, warnings=[], errors=[]) + return RawProcessingResult() lineage = f"{ha_subtype}{na_subtype}" if ( extracted_lineages.get("seg4") == "H1N1PDM" @@ -1551,23 +1180,12 @@ def assign_custom_lineage( # noqa: C901 lineage += " reassortant" if any(v for v in variant.values() if v): lineage += " (variant)" - return ProcessingResult(datum=lineage, warnings=[], errors=[]) + return RawProcessingResult(datum=lineage) except (ValueError, TypeError): - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - f"Internal error processing custom lineage for field {output_field}." - ), - ) - ], + return processing_error( + f"Internal error processing custom lineage for field {output_field}." ) - return ProcessingResult(datum=None, warnings=[], errors=[]) + return RawProcessingResult() @staticmethod def build_display_name( # noqa: C901 @@ -1575,7 +1193,7 @@ def build_display_name( # noqa: C901 output_field: str, input_fields: list[str], args: FunctionArgs, - ) -> ProcessingResult: + ) -> RawProcessingResult: """Builds a displayName from input_fields. The identifier field in the displayName is based on specimenCollectorSampleId or - if it is not set - submissionId. @@ -1592,21 +1210,10 @@ def build_display_name( # noqa: C901 """ collector_id = input_data.get("specimenCollectorSampleId", None) submission_id = input_data.get("submissionId", None) - warnings: list[ProcessingAnnotation] = [] + warnings: list[str] = [] if submission_id is None: - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - "Internal Error: 'submissionId' must not be None for build_display_name(). Please contact the administrator." - ), - ) - ], + return processing_error( + "Internal Error: 'submissionId' must not be None for build_display_name(). Please contact the administrator." ) order = args.get("order") @@ -1617,19 +1224,8 @@ def build_display_name( # noqa: C901 or len(order) != len(field_types) or "IDENTIFIER" not in order ): - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - "Internal Error: 'order' and 'type' must be lists of equal length, and 'order' must contain IDENTIFIER - this is required for build_display_name to function. Please contact the administrator." - ), - ) - ], + return processing_error( + "Internal Error: 'order' and 'type' must be lists of equal length, and 'order' must contain IDENTIFIER - this is required for build_display_name to function. Please contact the administrator." ) regex_pattern = args.get("regex_pattern") @@ -1637,19 +1233,8 @@ def build_display_name( # noqa: C901 regex_pattern is not None and "identifier" not in re.compile(str(regex_pattern)).groupindex ): - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - "Internal Error: if provided, 'regex_pattern' must contain a named capture group called 'identifier'" - ), - ) - ], + return processing_error( + "Internal Error: if provided, 'regex_pattern' must contain a named capture group called 'identifier'" ) concatenate_order = order.copy() @@ -1680,14 +1265,7 @@ def replace_identifier(values, replacement): if extract_result.datum is None: # regex extraction of ID field failed, fall back to ACCESSION_VERSION warnings.append( - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=( - f"identifier string '{identifier}' could not be parsed, using ACCESSION_VERSION in displayName instead" - ), - ) + f"identifier string '{identifier}' could not be parsed, using ACCESSION_VERSION in displayName instead" ) identifier = extract_result.datum @@ -1717,7 +1295,7 @@ def replace_identifier(values, replacement): new_args, ) - return ProcessingResult( + return RawProcessingResult( datum=concat_result.datum, warnings=warnings + concat_result.warnings, errors=concat_result.errors, @@ -1729,7 +1307,7 @@ def resolve_host_taxon_id( output_field: str, input_fields: list[str], args: FunctionArgs, - ) -> ProcessingResult: + ) -> RawProcessingResult: """Validates that the host exists in NCBI's taxonomy. Checks either the hostTaxonId or the hostNameScientific, depending on the input. @@ -1746,11 +1324,7 @@ def resolve_host_taxon_id( unvalidated_host = input_data.get("host") if not unvalidated_host: - return ProcessingResult( - datum=None, - warnings=[], - errors=[], - ) + return RawProcessingResult() if unvalidated_host.isdigit(): url = f"{tax_service}/taxa/{unvalidated_host}" @@ -1768,13 +1342,8 @@ def resolve_host_taxon_id( if response.status_code != requests.codes.ok: # an invalid host organism is a warning for INSDC ingested sequences, but an error for everyone else - message = ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=f"Host validation for '{unvalidated_host}' failed with code {response.status_code}: {body.get('detail', '')}", - ) - return ProcessingResult( + message = f"Host validation for '{unvalidated_host}' failed with code {response.status_code}: {body.get('detail', '')}" + return RawProcessingResult( datum=None, warnings=[message] if args["is_insdc_ingest_group"] else [], errors=[message] if not args["is_insdc_ingest_group"] else [], @@ -1788,26 +1357,13 @@ def resolve_host_taxon_id( tax_id = taxon.get("tax_id") if tax_id is None: - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=f"Internal error: host validation for '{unvalidated_host}' " - f"was successful but response json 'tax_id' was missing. " - f"Please contact the administrator", - ) - ], + return processing_error( + f"Internal error: host validation for '{unvalidated_host}' " + f"was successful but response json 'tax_id' was missing. " + f"Please contact the administrator" ) - return ProcessingResult( - datum=str(tax_id), - warnings=[], - errors=[], - ) + return RawProcessingResult(datum=str(tax_id)) @staticmethod def scientific_name_from_id( @@ -1815,17 +1371,17 @@ def scientific_name_from_id( output_field: str, input_fields: list[str], args: FunctionArgs, - ) -> ProcessingResult: + ) -> RawProcessingResult: tax_service = args.get("taxonomy_service_url") if not tax_service: return missing_taxonomy_service_error(input_fields, output_field) tax_id: str | None = input_data.get("hostTaxonId") if not tax_id: - return ProcessingResult( - datum=None, - warnings=[], - errors=[], + return RawProcessingResult( + datum=input_data.get("hostNameScientific") + if args["is_insdc_ingest_group"] + else None, ) url = f"{tax_service}/taxa/{tax_id}" @@ -1838,38 +1394,19 @@ def scientific_name_from_id( if response.status_code != requests.codes.ok: message = f"Could not map '{tax_id}' to scientific name. Code {response.status_code}: {body.get('detail', '')}" logger.warning(message) - processing_annotation = ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=message, - ) - return ProcessingResult( + return RawProcessingResult( datum=None, - warnings=[processing_annotation] if args["is_insdc_ingest_group"] else [], - errors=[processing_annotation] if not args["is_insdc_ingest_group"] else [], + warnings=[message] if args["is_insdc_ingest_group"] else [], + errors=[message] if not args["is_insdc_ingest_group"] else [], ) scientific_name = body.get("scientific_name") if scientific_name is None: - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=f"Internal error: '{tax_id}' is a valid taxon ID but response json had no 'scientific_name'. Please contact the administrator", - ) - ], + return processing_error( + f"Internal error: '{tax_id}' is a valid taxon ID but response json had no 'scientific_name'. Please contact the administrator" ) - return ProcessingResult( - datum=scientific_name, - warnings=[], - errors=[], - ) + return RawProcessingResult(datum=scientific_name) @staticmethod def common_name_from_id( @@ -1877,18 +1414,14 @@ def common_name_from_id( output_field: str, input_fields: list[str], args: FunctionArgs, - ) -> ProcessingResult: + ) -> RawProcessingResult: tax_service = args.get("taxonomy_service_url") if not tax_service: return missing_taxonomy_service_error(input_fields, output_field) tax_id: str | None = input_data.get("hostTaxonId") if not tax_id: - return ProcessingResult( - datum=None, - warnings=[], - errors=[], - ) + return RawProcessingResult() url = f"{tax_service}/taxa/{tax_id}?find_common_name=true" try: @@ -1900,39 +1433,19 @@ def common_name_from_id( ) if response.status_code != requests.codes.ok: - return ProcessingResult( - datum=None, + return RawProcessingResult( warnings=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=f"Could not map '{tax_id}' to common name. Code {response.status_code}: {body.get('detail', '')}", - ) + f"Could not map '{tax_id}' to common name. Code {response.status_code}: {body.get('detail', '')}" ], - errors=[], ) common_name = body.get("common_name") if common_name is None: - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=f"Internal error: taxonomy service indicated common name was found for hostTaxonId '{tax_id}', but failed to return it. Please contact the administrator.", - ) - ], + return processing_error( + f"Internal error: taxonomy service indicated common name was found for hostTaxonId '{tax_id}', but failed to return it. Please contact the administrator." ) - return ProcessingResult( - datum=common_name, - warnings=[], - errors=[], - ) + return RawProcessingResult(datum=common_name) def single_metadata_annotation( diff --git a/preprocessing/nextclade/tests/test_assign_custom_lineage.py b/preprocessing/nextclade/tests/test_assign_custom_lineage.py index 8ffec69a15..36974815b3 100644 --- a/preprocessing/nextclade/tests/test_assign_custom_lineage.py +++ b/preprocessing/nextclade/tests/test_assign_custom_lineage.py @@ -234,7 +234,7 @@ def test_missing_mu_arg_returns_error(): ) assert result.datum is None assert len(result.errors) == 1 - assert "missing mu argument" in result.errors[0].message + assert "missing mu argument" in result.errors[0] @staticmethod def test_non_numeric_inputs_return_error(): diff --git a/preprocessing/nextclade/tests/test_metadata_processing_functions.py b/preprocessing/nextclade/tests/test_metadata_processing_functions.py index d17ae42244..a03c3b2826 100644 --- a/preprocessing/nextclade/tests/test_metadata_processing_functions.py +++ b/preprocessing/nextclade/tests/test_metadata_processing_functions.py @@ -1109,7 +1109,6 @@ def test_parse_date_into_range() -> None: }, ) .errors[0] - .message == "Metadata field field_name: Detected date range but could not parse date: 20-01-2020/2021-06-30." ), "Invalid date range format errors." assert ( @@ -1123,7 +1122,6 @@ def test_parse_date_into_range() -> None: }, ) .errors[0] - .message == "Metadata field field_name:'2022-01-01/2021-06-30' is an invalid date range. Lower bound: 2022-01-01 00:00:00+00:00 is after upper bound: 2021-06-30 00:00:00+00:00." ), "Invalid date range format errors." assert ( @@ -1362,7 +1360,7 @@ def args_insdc(): assert res.datum == "DENV-1/unknown/version.1/2025" assert len(res.warnings) == 1 assert ( - res.warnings[0].message + res.warnings[0] == "identifier string 'hDENV1/myExtractedSample/2025' could not be parsed, using ACCESSION_VERSION in displayName instead" ) assert res_insdc.datum == "DENV-1/unknown/version.1/2025" @@ -1370,7 +1368,7 @@ def args_insdc(): assert res_prefix.datum == "hYF/unknown/version.1/2025" assert len(res_prefix.warnings) == 1 assert ( - res_prefix.warnings[0].message + res_prefix.warnings[0] == "identifier string 'hDENV1/myExtractedSample/2025' could not be parsed, using ACCESSION_VERSION in displayName instead" ) @@ -1396,7 +1394,7 @@ def args_insdc(): assert res.datum == "DENV-1/another_fallback/version.1/2025" assert len(res.warnings) == 1 assert ( - res.warnings[0].message + res.warnings[0] == "identifier string 'hDENV1/myExtractedSample/2025' could not be parsed, using ACCESSION_VERSION in displayName instead" ) assert res_insdc.datum == "DENV-1/another_fallback/version.1/2025" @@ -1404,7 +1402,7 @@ def args_insdc(): assert res_prefix.datum == "hYF/another_fallback/version.1/2025" assert len(res_prefix.warnings) == 1 assert ( - res_prefix.warnings[0].message + res_prefix.warnings[0] == "identifier string 'hDENV1/myExtractedSample/2025' could not be parsed, using ACCESSION_VERSION in displayName instead" ) From bcbf40617c2446f977d414363ce2347441fb961c Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Tue, 30 Jun 2026 09:34:15 +0200 Subject: [PATCH 03/18] format --- .../processing_functions.py | 53 ++++++------------- .../test_metadata_processing_functions.py | 6 +-- 2 files changed, 18 insertions(+), 41 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 3adb56c556..78b7f34511 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -162,9 +162,7 @@ def check_latin_characters( if ord(char) < 128: continue if char.isalpha() and not (0x0000 <= ord(char) <= 0x024F): - errors = [ - f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X})." - ] + errors = [f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X})."] return (errors, warnings) @@ -214,7 +212,9 @@ def regex_error( ) -def missing_taxonomy_service_error(input_fields: list[str], output_field: str) -> RawProcessingResult: +def missing_taxonomy_service_error( + input_fields: list[str], output_field: str +) -> RawProcessingResult: return processing_error( "Configuration error: taxonomy_service_url was None. Please contact the administrator." ) @@ -538,14 +538,11 @@ def parse_date_into_range( # noqa: C901, PLR0912, PLR0915 ) if datum and datum.message: - warnings.append( - f"Metadata field {output_field}:'{input_date_str}' - " + datum.message - ) + warnings.append(f"Metadata field {output_field}:'{input_date_str}' - " + datum.message) if datum is None: return processing_error( - f"Metadata field {output_field}: " - f"Date {input_date_str} could not be parsed." + f"Metadata field {output_field}: Date {input_date_str} could not be parsed." ) logger.debug(f"parsed_date: {datum}") @@ -572,9 +569,7 @@ def parse_date_into_range( # noqa: C901, PLR0912, PLR0915 logger.debug( f"Lower range of date: {datum.date_range_lower} > {datetime.now(tz=pytz.utc)}" ) - errors.append( - f"Metadata field {output_field}:'{input_date_str}' is in the future." - ) + errors.append(f"Metadata field {output_field}:'{input_date_str}' is in the future.") if release_date and datum.date_range_lower and (datum.date_range_lower > release_date): logger.debug( @@ -649,15 +644,11 @@ def parse_and_assert_past_date( # noqa: C901 logger.debug(f"parsed_date: {parsed_date}") if message: - warnings.append( - f"Metadata field {output_field}:'{date_str}' - " + message - ) + warnings.append(f"Metadata field {output_field}:'{date_str}' - " + message) if parsed_date > datetime.now(tz=pytz.utc): logger.debug(f"parsed_date: {parsed_date} > {datetime.now(tz=pytz.utc)}") - errors.append( - f"Metadata field {output_field}:'{date_str}' is in the future." - ) + errors.append(f"Metadata field {output_field}:'{date_str}' is in the future.") if release_date and parsed_date > release_date: logger.debug(f"parsed_date: {parsed_date} > release_date: {release_date}") @@ -670,9 +661,7 @@ def parse_and_assert_past_date( # noqa: C901 continue # If all parsing attempts fail, it's an unrecognized format - return processing_error( - f"Metadata field {output_field}: Date format is not recognized." - ) + return processing_error(f"Metadata field {output_field}: Date format is not recognized.") @staticmethod def parse_timestamp( @@ -908,13 +897,9 @@ def extract_regex( if not regex_field: return RawProcessingResult() if not isinstance(pattern, str): - return processing_error( - regex_error("extract_regex", "pattern", input_data, args) - ) + return processing_error(regex_error("extract_regex", "pattern", input_data, args)) if not isinstance(capture_group, str): - return processing_error( - regex_error("extract_regex", "capture_group", input_data, args) - ) + return processing_error(regex_error("extract_regex", "capture_group", input_data, args)) match = re.match(pattern, regex_field.strip()) if match: try: @@ -930,8 +915,7 @@ def extract_regex( ) else: errors.append( - f"The value '{regex_field}' does not match the expected regex " - f"pattern: '{pattern}'." + f"The value '{regex_field}' does not match the expected regex pattern: '{pattern}'." ) return RawProcessingResult(errors=errors) @@ -953,15 +937,12 @@ def check_regex( if not regex_field: return RawProcessingResult() if not isinstance(pattern, str): - return processing_error( - regex_error("check_regex", "pattern", input_data, args) - ) + return processing_error(regex_error("check_regex", "pattern", input_data, args)) regex_field = regex_field.strip() if re.match(pattern, regex_field): return RawProcessingResult(datum=regex_field) return processing_error( - f"The value '{regex_field}' does not match the expected regex " - f"pattern: '{pattern}'." + f"The value '{regex_field}' does not match the expected regex pattern: '{pattern}'." ) @staticmethod @@ -984,9 +965,7 @@ def identity( # noqa: C901, PLR0912 output_datum = int(input_datum) except ValueError: output_datum = None - errors.append( - f"Invalid int value: {input_datum} for field {output_field}." - ) + errors.append(f"Invalid int value: {input_datum} for field {output_field}.") case "float": try: output_datum = float(input_datum) diff --git a/preprocessing/nextclade/tests/test_metadata_processing_functions.py b/preprocessing/nextclade/tests/test_metadata_processing_functions.py index a03c3b2826..9e6383e63d 100644 --- a/preprocessing/nextclade/tests/test_metadata_processing_functions.py +++ b/preprocessing/nextclade/tests/test_metadata_processing_functions.py @@ -1107,8 +1107,7 @@ def test_parse_date_into_range() -> None: "fieldType": "dateRangeString", "submittedAt": ts_from_ymd(2022, 1, 1), }, - ) - .errors[0] + ).errors[0] == "Metadata field field_name: Detected date range but could not parse date: 20-01-2020/2021-06-30." ), "Invalid date range format errors." assert ( @@ -1120,8 +1119,7 @@ def test_parse_date_into_range() -> None: "fieldType": "dateRangeString", "submittedAt": ts_from_ymd(2022, 1, 1), }, - ) - .errors[0] + ).errors[0] == "Metadata field field_name:'2022-01-01/2021-06-30' is an invalid date range. Lower bound: 2022-01-01 00:00:00+00:00 is after upper bound: 2021-06-30 00:00:00+00:00." ), "Invalid date range format errors." assert ( From 9c7eb5dbedec48951888a3548b73054f1e88e1fb Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Tue, 30 Jun 2026 09:39:06 +0200 Subject: [PATCH 04/18] merge conflict --- .../src/loculus_preprocessing/processing_functions.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 78b7f34511..9153bd3e38 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -1358,9 +1358,7 @@ def scientific_name_from_id( tax_id: str | None = input_data.get("hostTaxonId") if not tax_id: return RawProcessingResult( - datum=input_data.get("hostNameScientific") - if args["is_insdc_ingest_group"] - else None, + datum=None, ) url = f"{tax_service}/taxa/{tax_id}" From 10eb6e493a0137eec13cc4bbade120af5f92cac4 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Tue, 30 Jun 2026 09:44:25 +0200 Subject: [PATCH 05/18] fixup --- .../nextclade/src/loculus_preprocessing/processing_functions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 9153bd3e38..80765179f8 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -838,8 +838,6 @@ def check_authors( "are allowed. For example: 'Smith, Anna; Perez, Tom J.; Xu, X.L.;' " "or 'Xu,;' if the first name is unknown." ) - warnings: list[str] = [] - errors: list[str] = [] if not authors: return RawProcessingResult() From 8ecdff0f72f2518a62a16e7787fa12f88705189d Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Tue, 30 Jun 2026 09:49:18 +0200 Subject: [PATCH 06/18] clean up more --- .../processing_functions.py | 28 ++++++------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 80765179f8..d821f9ecc1 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -150,9 +150,7 @@ def reformat_authors_from_latin_to_ascii(authors: str) -> str: return unicodedata.normalize("NFKD", authors).encode("ascii", "ignore").decode("ascii") -def check_latin_characters( - authors: str, input_fields: list[str], output_field: str -) -> tuple[list[str], list[str]]: +def check_latin_characters(authors: str) -> tuple[list[str], list[str]]: warnings: list[str] = [] errors: list[str] = [] # Check if all characters in the authors string are Latin letters or spaces @@ -212,9 +210,7 @@ def regex_error( ) -def missing_taxonomy_service_error( - input_fields: list[str], output_field: str -) -> RawProcessingResult: +def missing_taxonomy_service_error() -> RawProcessingResult: return processing_error( "Configuration error: taxonomy_service_url was None. Please contact the administrator." ) @@ -224,8 +220,6 @@ def taxonomy_network_error( subject: str, action: str, e: Exception, - input_fields: list[str], - output_field: str, ) -> RawProcessingResult: return processing_error( f"Internal error: network error while {action} '{subject}': {e}. Please contact the administrator." @@ -841,7 +835,7 @@ def check_authors( if not authors: return RawProcessingResult() - errors, warnings = check_latin_characters(authors, input_fields, output_field) + errors, warnings = check_latin_characters(authors) if errors or warnings: return RawProcessingResult(warnings=warnings, errors=errors) @@ -1297,7 +1291,7 @@ def resolve_host_taxon_id( """ tax_service = args.get("taxonomy_service_url") if not tax_service: - return missing_taxonomy_service_error(input_fields, output_field) + return missing_taxonomy_service_error() unvalidated_host = input_data.get("host") if not unvalidated_host: @@ -1313,9 +1307,7 @@ def resolve_host_taxon_id( response = taxonomy_cache.get_or_fetch(url) body = response.json() except requests.exceptions.RequestException as e: - return taxonomy_network_error( - unvalidated_host, "validating", e, input_fields, output_field - ) + return taxonomy_network_error(unvalidated_host, "validating", e) if response.status_code != requests.codes.ok: # an invalid host organism is a warning for INSDC ingested sequences, but an error for everyone else @@ -1351,7 +1343,7 @@ def scientific_name_from_id( ) -> RawProcessingResult: tax_service = args.get("taxonomy_service_url") if not tax_service: - return missing_taxonomy_service_error(input_fields, output_field) + return missing_taxonomy_service_error() tax_id: str | None = input_data.get("hostTaxonId") if not tax_id: @@ -1364,7 +1356,7 @@ def scientific_name_from_id( response = taxonomy_cache.get_or_fetch(url) body = response.json() except requests.exceptions.RequestException as e: - return taxonomy_network_error(tax_id, "validating", e, input_fields, output_field) + return taxonomy_network_error(tax_id, "validating", e) if response.status_code != requests.codes.ok: message = f"Could not map '{tax_id}' to scientific name. Code {response.status_code}: {body.get('detail', '')}" @@ -1392,7 +1384,7 @@ def common_name_from_id( ) -> RawProcessingResult: tax_service = args.get("taxonomy_service_url") if not tax_service: - return missing_taxonomy_service_error(input_fields, output_field) + return missing_taxonomy_service_error() tax_id: str | None = input_data.get("hostTaxonId") if not tax_id: @@ -1403,9 +1395,7 @@ def common_name_from_id( response = taxonomy_cache.get_or_fetch(url) body = response.json() except requests.exceptions.RequestException as e: - return taxonomy_network_error( - tax_id, "getting common name for", e, input_fields, output_field - ) + return taxonomy_network_error(tax_id, "getting common name for", e) if response.status_code != requests.codes.ok: return RawProcessingResult( From 0d0023f392f5040fc439b487b4a832e6da6558b6 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Tue, 30 Jun 2026 09:54:32 +0200 Subject: [PATCH 07/18] feat: add test for conversion Co-authored-by: Cornelius Roemer --- .../tests/test_host_name_validation.py | 40 ++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/preprocessing/nextclade/tests/test_host_name_validation.py b/preprocessing/nextclade/tests/test_host_name_validation.py index d97cdbe97e..9f8fd81588 100644 --- a/preprocessing/nextclade/tests/test_host_name_validation.py +++ b/preprocessing/nextclade/tests/test_host_name_validation.py @@ -6,8 +6,14 @@ from loculus_preprocessing import processing_functions from loculus_preprocessing.config import get_config -from loculus_preprocessing.datatypes import UnprocessedData, UnprocessedEntry +from loculus_preprocessing.datatypes import ( + AnnotationSource, + AnnotationSourceType, + UnprocessedData, + UnprocessedEntry, +) from loculus_preprocessing.prepro import process_all +from loculus_preprocessing.processing_functions import ProcessingFunctions HOST_PROCESSING_CONFIG = "tests/host_processing_config.yaml" @@ -188,3 +194,35 @@ def test_host_processing_invalid_host_direct(mock_session: MagicMock) -> None: assert result[0].processed_entry.warnings == [] assert len(result[0].processed_entry.errors) == 1 assert "Host validation for" in result[0].processed_entry.errors[0].message + + +@patch.object(processing_functions.taxonomy_cache, "session") +def test_call_function_converts_raw_errors_to_annotations(mock_session: MagicMock) -> None: + """call_function must convert RawProcessingResult string errors into ProcessingAnnotations + with correct message and field linkage.""" + mock_session.get.return_value = make_response(404, {"detail": "not found"}) + + input_fields = ["host"] + output_field = "hostTaxonId" + args = {"taxonomy_service_url": "http://localhost:5000", "is_insdc_ingest_group": False} + + result = ProcessingFunctions.call_function( + "resolve_host_taxon_id", + args=args, + input_data={"host": "not a real species"}, + output_field=output_field, + input_fields=input_fields, + ) + + assert result.datum is None + assert result.warnings == [] + assert len(result.errors) == 1 + + annotation = result.errors[0] + assert "Host validation for" in annotation.message + assert annotation.processedFields == ( + AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA), + ) + assert annotation.unprocessedFields == ( + AnnotationSource(name="host", type=AnnotationSourceType.METADATA), + ) From b10a363fac6e423349c35704aae6430c5928abb4 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Tue, 30 Jun 2026 10:42:16 +0200 Subject: [PATCH 08/18] fixup --- preprocessing/nextclade/tests/test_host_name_validation.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/preprocessing/nextclade/tests/test_host_name_validation.py b/preprocessing/nextclade/tests/test_host_name_validation.py index 9f8fd81588..d84b53f662 100644 --- a/preprocessing/nextclade/tests/test_host_name_validation.py +++ b/preprocessing/nextclade/tests/test_host_name_validation.py @@ -9,6 +9,7 @@ from loculus_preprocessing.datatypes import ( AnnotationSource, AnnotationSourceType, + FunctionArgs, UnprocessedData, UnprocessedEntry, ) @@ -204,7 +205,10 @@ def test_call_function_converts_raw_errors_to_annotations(mock_session: MagicMoc input_fields = ["host"] output_field = "hostTaxonId" - args = {"taxonomy_service_url": "http://localhost:5000", "is_insdc_ingest_group": False} + args: FunctionArgs = { + "taxonomy_service_url": "http://localhost:5000", + "is_insdc_ingest_group": False, + } result = ProcessingFunctions.call_function( "resolve_host_taxon_id", From 62cf0a34da012328de0f5cb226e866514da6dd05 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Tue, 30 Jun 2026 11:13:29 +0200 Subject: [PATCH 09/18] append --- .../nextclade/src/loculus_preprocessing/processing_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index d821f9ecc1..f8b1c912ff 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -160,7 +160,7 @@ def check_latin_characters(authors: str) -> tuple[list[str], list[str]]: if ord(char) < 128: continue if char.isalpha() and not (0x0000 <= ord(char) <= 0x024F): - errors = [f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X})."] + errors.append(f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X}).") return (errors, warnings) From 0f040ceb3021cbf1c7ef5eab82717e4d34fda67a Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Tue, 30 Jun 2026 11:15:24 +0200 Subject: [PATCH 10/18] format --- .../src/loculus_preprocessing/processing_functions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index f8b1c912ff..1fb74ced7c 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -160,7 +160,9 @@ def check_latin_characters(authors: str) -> tuple[list[str], list[str]]: if ord(char) < 128: continue if char.isalpha() and not (0x0000 <= ord(char) <= 0x024F): - errors.append(f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X}).") + errors.append( + f"Unsupported non-Latin character encountered: {char} (U+{ord(char):04X})." + ) return (errors, warnings) From ee958cd43d9dc2c61d3e4022c48a95725a519659 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Tue, 30 Jun 2026 11:59:11 +0200 Subject: [PATCH 11/18] Update preprocessing/nextclade/src/loculus_preprocessing/datatypes.py Co-authored-by: maverbiest <59956362+maverbiest@users.noreply.github.com> --- .../nextclade/src/loculus_preprocessing/datatypes.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py index 7004f81354..ffc183afca 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py @@ -195,11 +195,6 @@ def processing_error(message: str) -> "RawProcessingResult": return RawProcessingResult(datum=None, errors=[message]) -def processing_warning(message: str, datum: ProcessedMetadataValue = None) -> "RawProcessingResult": - """Helper to create a RawProcessingResult with a single warning.""" - return RawProcessingResult(datum=datum, warnings=[message]) - - @unique class SegmentClassificationMethod(StrEnum): """ From 92b72b360254e5baaf93151d654da660fd41f9e1 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Tue, 30 Jun 2026 12:06:07 +0200 Subject: [PATCH 12/18] fix up --- .../src/loculus_preprocessing/processing_functions.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 1fb74ced7c..48ee20ad94 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -432,10 +432,8 @@ def check_date( warnings.append("Date is in the future.") return RawProcessingResult(datum=date, warnings=warnings) except ValueError as e: - return RawProcessingResult( - errors=[ - f"Date is {date} which is not in the required format YYYY-MM-DD. Parsing error: {e}" - ], + return processing_error( + f"Date is {date} which is not in the required format YYYY-MM-DD. Parsing error: {e}" ) @staticmethod From e648833df79bc1d5e53a7945ef58ae280e80df7e Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Thu, 2 Jul 2026 08:41:38 +0200 Subject: [PATCH 13/18] fixup --- .../src/loculus_preprocessing/processing_functions.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 48ee20ad94..8792c760ac 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -1347,9 +1347,7 @@ def scientific_name_from_id( tax_id: str | None = input_data.get("hostTaxonId") if not tax_id: - return RawProcessingResult( - datum=None, - ) + return RawProcessingResult() url = f"{tax_service}/taxa/{tax_id}" try: From 5f649af00c7cd387d846f51cc33945a30387fe60 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 2 Jul 2026 08:43:28 +0200 Subject: [PATCH 14/18] feat(prepro): ensure all internal errors are properly logged and include proper information (#6801) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …lude proper information Follow up to https://github.com/loculus-project/loculus/pull/6537 - ensure that all internal errors thrown by prepro (these are related to invalid config, unexpected nextclade format etc.) are properly annotated with the prefix "Internal Error:" and the suffix "Please contact the administrator." ### Screenshot ### PR Checklist - [x] All necessary documentation has been adapted. - [x] The implemented feature is covered by appropriate, automated tests. - [ ] Any manual testing that has been done is documented (i.e. what exactly was tested?) 🚀 Preview: Add `preview` label to enable --------- Co-authored-by: maverbiest <59956362+maverbiest@users.noreply.github.com> --- .../processing_functions.py | 201 +++++++----------- .../tests/test_nextclade_preprocessing.py | 5 +- 2 files changed, 79 insertions(+), 127 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 8792c760ac..ca73153cd2 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -23,7 +23,6 @@ from urllib3.util.retry import Retry from .datatypes import ( - AnnotationSource, AnnotationSourceType, FunctionArgs, InputData, @@ -202,20 +201,26 @@ def format_authors(authors: str) -> str: return "; ".join(loculus_authors).strip() +def _internal_error_message(message: str) -> str: + full = f"Internal Error. {message} Please contact the administrator." + logger.error(full) + return full + + +def raw_internal_error(message: str) -> RawProcessingResult: + return processing_error(_internal_error_message(message)) + + def regex_error( function_name: str, function_arg: str, input_data: InputMetadata, args: FunctionArgs -) -> str: - return ( - f"Internal Error: Function {function_name} did not receive valid " - f"regex {function_arg}, with input {input_data} and args {args}, " - "please contact the administrator." +) -> RawProcessingResult: + return raw_internal_error( + f"{function_name} did not receive a valid {function_arg}, with input {input_data} and args {args}." ) def missing_taxonomy_service_error() -> RawProcessingResult: - return processing_error( - "Configuration error: taxonomy_service_url was None. Please contact the administrator." - ) + return raw_internal_error("taxonomy_service_url was not configured.") def taxonomy_network_error( @@ -223,9 +228,7 @@ def taxonomy_network_error( action: str, e: Exception, ) -> RawProcessingResult: - return processing_error( - f"Internal error: network error while {action} '{subject}': {e}. Please contact the administrator." - ) + return raw_internal_error(f"Network error while {action} '{subject}': {e}.") @dataclass @@ -337,28 +340,17 @@ def call_function( try: result = func(input_data, output_field, input_fields=input_fields, args=args) except Exception as e: - message = ( - f"Error calling function {function_name} for output field {output_field} " - f"with input {input_data} and args {args}: {e}" - ) + message = f"Internal Error. {function_name} raised an unexpected exception for output field '{output_field}': {e}. Please contact the administrator." logger.exception(message) return ProcessingResult( datum=None, warnings=[], errors=[ - ProcessingAnnotation( - processedFields=[ - AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) - ], - unprocessedFields=[ - AnnotationSource(name=field, type=AnnotationSourceType.METADATA) - for field in input_fields - ], - message=( - f"Internal Error: Function {function_name} did not return " - f"ProcessingResult with input {input_data} and args {args}, " - "please contact the administrator." - ), + ProcessingAnnotation.from_fields( + input_fields, + [output_field], + AnnotationSourceType.METADATA, + message=message, ) ], ) @@ -385,11 +377,8 @@ def call_function( ], ) if not isinstance(result, ProcessingResult): - logger.error( - f"ERROR: Function {function_name} did not return ProcessingResult " - f"given input {input_data} and args {args}. " - "This is likely a preprocessing bug." - ) + message = f"Internal Error. {function_name} returned an unexpected type '{type(result).__name__}'. Please contact the administrator." + logger.error(message) return ProcessingResult( datum=None, warnings=[], @@ -398,11 +387,7 @@ def call_function( input_fields, [output_field], AnnotationSourceType.METADATA, - message=( - f"Internal Error: Function {function_name} did not return " - f"ProcessingResult with input {input_data} and args {args}, " - "please contact the administrator." - ), + message=message, ) ], ) @@ -470,10 +455,8 @@ def parse_date_into_range( # noqa: C901, PLR0912, PLR0915 try: submitted_at = datetime.fromtimestamp(float(str(args["submittedAt"])), tz=pytz.utc) except Exception: - return processing_error( - f"Internal Error: Function parse_into_ranges did not receive valid " - f"submittedAt date, with input {input_data} and args {args}, " - "please contact the administrator." + return raw_internal_error( + f"parse_into_ranges did not receive a valid submittedAt date, with input {input_data} and args {args}." ) max_upper_limit = min(submitted_at, release_date) if release_date else submitted_at @@ -694,10 +677,8 @@ def concatenate( errors: list[str] = [] if not isinstance(args["ACCESSION_VERSION"], str): - return processing_error( - "Internal Error: Function concatenate did not receive " - f"accession_version ProcessingResult with input {input_data} " - f"and args {args}, please contact the administrator." + return raw_internal_error( + f"concatenate did not receive a valid ACCESSION_VERSION (got: {args['ACCESSION_VERSION']!r})." ) accession_version: str = args["ACCESSION_VERSION"] @@ -707,18 +688,10 @@ def concatenate( str(args["fallback_value"]).strip() if args.get("fallback_value") is not None else "" ) - error_msg = ( - "Concatenation failed." - "This may be a configuration error, please contact the administrator." - ) - if not isinstance(order, list): - logger.error( - f"Concatenate: Expected order field to be a list. " - f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})" + return raw_internal_error( + f"concatenate expected 'order' to be a list (ACCESSION_VERSION: {accession_version})." ) - errors.append(error_msg) - return RawProcessingResult(errors=errors) n_inputs = len(input_data.keys()) # exclude ACCESSION_VERSION as it's provided by _call_preprocessing_function() and should not be an input_metadata field @@ -726,19 +699,13 @@ def concatenate( [i for i in order if i != "ACCESSION_VERSION" and not i.startswith("ARG:")] ) if n_inputs < n_expected: - logger.error( - f"Concatenate: Expected {n_expected} fields, got {n_inputs}. " - f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})" + return raw_internal_error( + f"concatenate expected {n_expected} input fields but got {n_inputs} (ACCESSION_VERSION: {accession_version})." ) - errors.append(error_msg) - return RawProcessingResult(errors=errors) if not isinstance(field_types, list): - logger.error( - f"Concatenate: Expected type field to be a list. " - f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})" + return raw_internal_error( + f"concatenate expected 'type' to be a list (ACCESSION_VERSION: {accession_version})." ) - errors.append(error_msg) - return RawProcessingResult(errors=errors) formatted_input_data: list[str] = [] try: @@ -761,7 +728,11 @@ def concatenate( raw_value = str(raw).strip() if raw_value.count("/") > 1: date_string = None - errors.append(error_msg) + errors.append( + _internal_error_message( + f"dateRangeString field '{order[i]}' has an unexpected format: '{raw_value}' (ACCESSION_VERSION: {accession_version})." + ) + ) else: date_string = raw_value.replace("/", " TO ") formatted_input_data.append( @@ -780,12 +751,9 @@ def concatenate( formatted_input_data.append(accession_version) elif field_types[i].startswith("ARG:"): if field_types[i][4:] not in args: - logger.error( - f"Concatenate: Missing argument {field_types[i][4:]} in args. " - f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})" + return raw_internal_error( + f"concatenate is missing argument '{field_types[i][4:]}' in args (ACCESSION_VERSION: {accession_version})." ) - errors.append(error_msg) - return RawProcessingResult(errors=errors, warnings=warnings) formatted_input_data.append(str(args[field_types[i][4:]])) elif order[i] in input_data: formatted_input_data.append( @@ -794,12 +762,9 @@ def concatenate( else str(input_data[order[i]]).strip() ) else: - logger.error( - f"Concatenate: cannot find field {order[i]} of {field_types[i]} in input_data" - f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})" + return raw_internal_error( + f"concatenate could not find field '{order[i]}' (type: {field_types[i]}) in input_data (ACCESSION_VERSION: {accession_version})." ) - errors.append(error_msg) - return RawProcessingResult(errors=errors, warnings=warnings) result = "/".join(formatted_input_data) # To avoid downstream issues do not let the result start or end in a "/" @@ -808,12 +773,15 @@ def concatenate( return RawProcessingResult(datum=result, warnings=warnings, errors=errors) except ValueError as e: - logger.error(f"Concatenate failed with {e} (ACCESSION_VERSION: {accession_version})") - errors.append( - f"Concatenation failed for {output_field}. This is a technical error, " - "please contact the administrator." + return RawProcessingResult( + warnings=warnings, + errors=[ + *errors, + _internal_error_message( + f"Concatenation failed for '{output_field}' with error: {e} (ACCESSION_VERSION: {accession_version})." + ), + ], ) - return RawProcessingResult(errors=errors, warnings=warnings) @staticmethod def check_authors( @@ -889,9 +857,9 @@ def extract_regex( if not regex_field: return RawProcessingResult() if not isinstance(pattern, str): - return processing_error(regex_error("extract_regex", "pattern", input_data, args)) + return regex_error("extract_regex", "pattern", input_data, args) if not isinstance(capture_group, str): - return processing_error(regex_error("extract_regex", "capture_group", input_data, args)) + return regex_error("extract_regex", "capture_group", input_data, args) match = re.match(pattern, regex_field.strip()) if match: try: @@ -900,10 +868,8 @@ def extract_regex( result = result.upper() return RawProcessingResult(datum=result) except IndexError: - errors.append( - f"The pattern '{pattern}' does not contain a capture group: " - f"'{capture_group}'- this is an internal error," - " please contact your local administrator." + return raw_internal_error( + f"Pattern '{pattern}' does not contain capture group '{capture_group}'." ) else: errors.append( @@ -929,7 +895,7 @@ def check_regex( if not regex_field: return RawProcessingResult() if not isinstance(pattern, str): - return processing_error(regex_error("check_regex", "pattern", input_data, args)) + return regex_error("check_regex", "pattern", input_data, args) regex_field = regex_field.strip() if re.match(pattern, regex_field): return RawProcessingResult(datum=regex_field) @@ -1183,9 +1149,7 @@ def build_display_name( # noqa: C901 submission_id = input_data.get("submissionId", None) warnings: list[str] = [] if submission_id is None: - return processing_error( - "Internal Error: 'submissionId' must not be None for build_display_name(). Please contact the administrator." - ) + return raw_internal_error("'submissionId' must not be None for build_display_name().") order = args.get("order") field_types = args.get("type") @@ -1195,8 +1159,8 @@ def build_display_name( # noqa: C901 or len(order) != len(field_types) or "IDENTIFIER" not in order ): - return processing_error( - "Internal Error: 'order' and 'type' must be lists of equal length, and 'order' must contain IDENTIFIER - this is required for build_display_name to function. Please contact the administrator." + return raw_internal_error( + "'order' and 'type' must be lists of equal length and 'order' must contain IDENTIFIER." ) regex_pattern = args.get("regex_pattern") @@ -1204,8 +1168,8 @@ def build_display_name( # noqa: C901 regex_pattern is not None and "identifier" not in re.compile(str(regex_pattern)).groupindex ): - return processing_error( - "Internal Error: if provided, 'regex_pattern' must contain a named capture group called 'identifier'" + return raw_internal_error( + "If provided, 'regex_pattern' must contain a named capture group called 'identifier'." ) concatenate_order = order.copy() @@ -1326,10 +1290,8 @@ def resolve_host_taxon_id( tax_id = taxon.get("tax_id") if tax_id is None: - return processing_error( - f"Internal error: host validation for '{unvalidated_host}' " - f"was successful but response json 'tax_id' was missing. " - f"Please contact the administrator" + return raw_internal_error( + f"Host validation for '{unvalidated_host}' was successful but response json 'tax_id' was missing." ) return RawProcessingResult(datum=str(tax_id)) @@ -1367,8 +1329,8 @@ def scientific_name_from_id( scientific_name = body.get("scientific_name") if scientific_name is None: - return processing_error( - f"Internal error: '{tax_id}' is a valid taxon ID but response json had no 'scientific_name'. Please contact the administrator" + return raw_internal_error( + f"'{tax_id}' is a valid taxon ID but response json had no 'scientific_name'." ) return RawProcessingResult(datum=scientific_name) @@ -1404,8 +1366,8 @@ def common_name_from_id( common_name = body.get("common_name") if common_name is None: - return processing_error( - f"Internal error: taxonomy service indicated common name was found for hostTaxonId '{tax_id}', but failed to return it. Please contact the administrator." + return raw_internal_error( + f"Taxonomy service indicated common name was found for hostTaxonId '{tax_id}', but failed to return it." ) return RawProcessingResult(datum=common_name) @@ -1430,10 +1392,9 @@ def process_frameshifts(input: str | None) -> InputData: return InputData(datum=format_frameshift(input)) except Exception as e: msg = ( - "Was unable to format frameshift - this is likely an internal error. " - "Please contact the administrator." + f"Internal Error. Frameshift formatting failed: {e}. Please contact the administrator." ) - logger.error(msg + f" Error: {e}") + logger.error(msg) return InputData( datum=None, errors=single_metadata_annotation( @@ -1514,10 +1475,9 @@ def process_stop_codons(input: str | None) -> InputData: return InputData(datum=format_stop_codon(input)) except Exception as e: msg = ( - "Was unable to format stop codon - this is likely an internal error. " - "Please contact the administrator." + f"Internal Error. Stop codon formatting failed: {e}. Please contact the administrator." ) - logger.error(msg + f" Error: {e}") + logger.error(msg) return InputData( datum=None, errors=single_metadata_annotation( @@ -1573,11 +1533,8 @@ def process_mutations_from_clade_founder(input: str | None, args: FunctionArgs | if mutations: return InputData(datum=" ".join(mutations)) except Exception as e: - msg = ( - "Was unable to process mutations from clade founder - this is likely an internal error. " - "Please contact the administrator." - ) - logger.error(msg + f" Error: {e}") + msg = f"Internal Error. Clade founder mutation processing failed: {e}. Please contact the administrator." + logger.error(msg) return InputData( datum=None, errors=single_metadata_annotation( @@ -1607,11 +1564,7 @@ def process_labeled_mutations(input: str | None, args: FunctionArgs | None) -> I if mutations: return InputData(datum=" ".join(mutations)) except Exception as e: - msg = ( - "Was unable to process labeled mutations - this is likely an internal error. " - "Please contact the administrator." - ) - logger.error(msg + f" Error: {e}") + msg = _internal_error_message(f"Labeled mutation processing failed: {e}. ") return InputData( datum=None, errors=single_metadata_annotation( @@ -1634,11 +1587,7 @@ def process_phenotype_values(input: str | None, args: FunctionArgs | None) -> In value = entry.get("value") return InputData(datum=str(value) if value is not None else None) except Exception as e: - msg = ( - "Was unable to process phenotype values - this is likely an internal error. " - "Please contact the administrator." - ) - logger.error(msg + f" Error: {e}") + msg = _internal_error_message(f"Labeled mutation processing failed: {e}. ") return InputData( datum=None, errors=single_metadata_annotation( diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index 05edf3a4cf..a32d5898d3 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -1405,7 +1405,10 @@ def test_process_phenotype_values(): assert process_phenotype_values('[{"name": "NAI","cds": "NA","value": 0.0}]', {}).datum is None invalid = process_phenotype_values("Malformed JSON", {"name": "NAI"}) assert invalid.datum is None - assert "Was unable to process phenotype values" in invalid.errors[0].message + assert ( + "Internal Error. Phenotype value processing failed: invalid syntax" + in invalid.errors[0].message + ) def test_reformat_authors_from_loculus_to_embl_style(): From 2cc446f842d58e5e732704b4ef488b49491734b2 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Thu, 2 Jul 2026 08:55:36 +0200 Subject: [PATCH 15/18] use internal error in more places --- .../processing_functions.py | 30 +++++++------------ 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index ca73153cd2..4626836992 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -340,8 +340,6 @@ def call_function( try: result = func(input_data, output_field, input_fields=input_fields, args=args) except Exception as e: - message = f"Internal Error. {function_name} raised an unexpected exception for output field '{output_field}': {e}. Please contact the administrator." - logger.exception(message) return ProcessingResult( datum=None, warnings=[], @@ -350,7 +348,9 @@ def call_function( input_fields, [output_field], AnnotationSourceType.METADATA, - message=message, + message=_internal_error_message( + f"{function_name} raised an unexpected exception for output field '{output_field}': {e}. " + ), ) ], ) @@ -377,8 +377,6 @@ def call_function( ], ) if not isinstance(result, ProcessingResult): - message = f"Internal Error. {function_name} returned an unexpected type '{type(result).__name__}'. Please contact the administrator." - logger.error(message) return ProcessingResult( datum=None, warnings=[], @@ -387,7 +385,9 @@ def call_function( input_fields, [output_field], AnnotationSourceType.METADATA, - message=message, + message=_internal_error_message( + f"{function_name} returned an unexpected type '{type(result).__name__}. " + ), ) ], ) @@ -1391,15 +1391,11 @@ def process_frameshifts(input: str | None) -> InputData: try: return InputData(datum=format_frameshift(input)) except Exception as e: - msg = ( - f"Internal Error. Frameshift formatting failed: {e}. Please contact the administrator." - ) - logger.error(msg) return InputData( datum=None, errors=single_metadata_annotation( "frameshifts", - msg, + _internal_error_message(f"Frameshift formatting failed: {e}. "), ), ) @@ -1474,15 +1470,11 @@ def process_stop_codons(input: str | None) -> InputData: try: return InputData(datum=format_stop_codon(input)) except Exception as e: - msg = ( - f"Internal Error. Stop codon formatting failed: {e}. Please contact the administrator." - ) - logger.error(msg) return InputData( datum=None, errors=single_metadata_annotation( "stopCodons", - msg, + _internal_error_message(f"Stop codon formatting failed: {e}. "), ), ) @@ -1533,13 +1525,11 @@ def process_mutations_from_clade_founder(input: str | None, args: FunctionArgs | if mutations: return InputData(datum=" ".join(mutations)) except Exception as e: - msg = f"Internal Error. Clade founder mutation processing failed: {e}. Please contact the administrator." - logger.error(msg) return InputData( datum=None, errors=single_metadata_annotation( "mutationsFromCladeFounder", - msg, + _internal_error_message(f"Clade founder mutation processing failed: {e}. "), ), ) return InputData(datum=None) @@ -1587,7 +1577,7 @@ def process_phenotype_values(input: str | None, args: FunctionArgs | None) -> In value = entry.get("value") return InputData(datum=str(value) if value is not None else None) except Exception as e: - msg = _internal_error_message(f"Labeled mutation processing failed: {e}. ") + msg = _internal_error_message(f"Phenotype value processing failed: {e}. ") return InputData( datum=None, errors=single_metadata_annotation( From 29efe70adc8ec2408895140581bd9a457ec76972 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Thu, 2 Jul 2026 08:59:28 +0200 Subject: [PATCH 16/18] reduce --- .../processing_functions.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 4626836992..f59857a614 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -340,19 +340,10 @@ def call_function( try: result = func(input_data, output_field, input_fields=input_fields, args=args) except Exception as e: - return ProcessingResult( - datum=None, - warnings=[], - errors=[ - ProcessingAnnotation.from_fields( - input_fields, - [output_field], - AnnotationSourceType.METADATA, - message=_internal_error_message( - f"{function_name} raised an unexpected exception for output field '{output_field}': {e}. " - ), - ) - ], + result = processing_error( + _internal_error_message( + f"{function_name} raised an unexpected exception for output field '{output_field}': {e}. " + ) ) if isinstance(result, RawProcessingResult): return ProcessingResult( From 1523f29613fddc235d14703f15c980dc2bd483d5 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Thu, 2 Jul 2026 09:07:37 +0200 Subject: [PATCH 17/18] improve test --- .../processing_functions.py | 6 +-- .../tests/test_host_name_validation.py | 40 +------------------ .../test_metadata_processing_functions.py | 34 ++++++++++++++++ 3 files changed, 37 insertions(+), 43 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index f59857a614..c307739e7a 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -340,10 +340,8 @@ def call_function( try: result = func(input_data, output_field, input_fields=input_fields, args=args) except Exception as e: - result = processing_error( - _internal_error_message( - f"{function_name} raised an unexpected exception for output field '{output_field}': {e}. " - ) + result = raw_internal_error( + f"{function_name} raised an unexpected exception for output field '{output_field}': {e}. " ) if isinstance(result, RawProcessingResult): return ProcessingResult( diff --git a/preprocessing/nextclade/tests/test_host_name_validation.py b/preprocessing/nextclade/tests/test_host_name_validation.py index d84b53f662..bcd4bdee57 100644 --- a/preprocessing/nextclade/tests/test_host_name_validation.py +++ b/preprocessing/nextclade/tests/test_host_name_validation.py @@ -7,14 +7,11 @@ from loculus_preprocessing import processing_functions from loculus_preprocessing.config import get_config from loculus_preprocessing.datatypes import ( - AnnotationSource, - AnnotationSourceType, - FunctionArgs, UnprocessedData, UnprocessedEntry, ) from loculus_preprocessing.prepro import process_all -from loculus_preprocessing.processing_functions import ProcessingFunctions + HOST_PROCESSING_CONFIG = "tests/host_processing_config.yaml" @@ -195,38 +192,3 @@ def test_host_processing_invalid_host_direct(mock_session: MagicMock) -> None: assert result[0].processed_entry.warnings == [] assert len(result[0].processed_entry.errors) == 1 assert "Host validation for" in result[0].processed_entry.errors[0].message - - -@patch.object(processing_functions.taxonomy_cache, "session") -def test_call_function_converts_raw_errors_to_annotations(mock_session: MagicMock) -> None: - """call_function must convert RawProcessingResult string errors into ProcessingAnnotations - with correct message and field linkage.""" - mock_session.get.return_value = make_response(404, {"detail": "not found"}) - - input_fields = ["host"] - output_field = "hostTaxonId" - args: FunctionArgs = { - "taxonomy_service_url": "http://localhost:5000", - "is_insdc_ingest_group": False, - } - - result = ProcessingFunctions.call_function( - "resolve_host_taxon_id", - args=args, - input_data={"host": "not a real species"}, - output_field=output_field, - input_fields=input_fields, - ) - - assert result.datum is None - assert result.warnings == [] - assert len(result.errors) == 1 - - annotation = result.errors[0] - assert "Host validation for" in annotation.message - assert annotation.processedFields == ( - AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA), - ) - assert annotation.unprocessedFields == ( - AnnotationSource(name="host", type=AnnotationSourceType.METADATA), - ) diff --git a/preprocessing/nextclade/tests/test_metadata_processing_functions.py b/preprocessing/nextclade/tests/test_metadata_processing_functions.py index 9e6383e63d..de20b2c240 100644 --- a/preprocessing/nextclade/tests/test_metadata_processing_functions.py +++ b/preprocessing/nextclade/tests/test_metadata_processing_functions.py @@ -12,6 +12,8 @@ from loculus_preprocessing.config import Config, get_config, get_processing_order from loculus_preprocessing.datatypes import ( + AnnotationSource, + AnnotationSourceType, FunctionArgs, InputMetadata, ProcessedEntry, @@ -1405,5 +1407,37 @@ def args_insdc(): ) +def test_call_function_converts_raw_errors_to_annotations() -> None: + """call_function must convert RawProcessingResult string errors into ProcessingAnnotations + with correct message and field linkage.""" + input_fields = ["myField"] + output_field = "myField" + args: FunctionArgs = { + "options": ["OptionA", "OptionB"], + "is_insdc_ingest_group": False, + } + + result = ProcessingFunctions.call_function( + "process_options", + args=args, + input_data={"input": "NotAnOption"}, + output_field=output_field, + input_fields=input_fields, + ) + + assert result.datum is None + assert result.warnings == [] + assert len(result.errors) == 1 + + annotation = result.errors[0] + assert "not in list of accepted options" in annotation.message + assert annotation.processedFields == ( + AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA), + ) + assert annotation.unprocessedFields == ( + AnnotationSource(name="myField", type=AnnotationSourceType.METADATA), + ) + + if __name__ == "__main__": pytest.main() From de417ce99cab170c7d4229e1a600c4f79d582323 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Thu, 2 Jul 2026 09:09:08 +0200 Subject: [PATCH 18/18] weird --- preprocessing/nextclade/tests/test_host_name_validation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/preprocessing/nextclade/tests/test_host_name_validation.py b/preprocessing/nextclade/tests/test_host_name_validation.py index bcd4bdee57..adef09a0af 100644 --- a/preprocessing/nextclade/tests/test_host_name_validation.py +++ b/preprocessing/nextclade/tests/test_host_name_validation.py @@ -12,7 +12,6 @@ ) from loculus_preprocessing.prepro import process_all - HOST_PROCESSING_CONFIG = "tests/host_processing_config.yaml"