PlasmoGenEpi · kathrynmurie · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/src/pmotools/pmo_builder/merge_to_pmo.py b/src/pmotools/pmo_builder/merge_to_pmo.py
@@ -7,6 +7,7 @@
 from pmotools.pmo_builder.mhap_table_to_pmo import (
     create_minimum_library_specimen_dict_from_mhap_table,
 )
+import warnings
 
 
 def _convert_numpy_scalars(obj):
@@ -86,16 +87,17 @@ def merge_to_pmo(
         specimen_names = {item["specimen_name"] for item in specimen_info}
 
         # Check for names in library_sample_info that are not in specimen_info
+        # Check for names in specimen_info that are not in library_sample_info
         missing_in_specimen = library_sample_names - specimen_names
+        missing_in_library = specimen_names - library_sample_names
+        for missing_lib_name in missing_in_specimen:
+            specimen_info.append({"specimen_name": missing_lib_name})
         if missing_in_specimen:
-            raise ValueError(
-                f"library_sample_names found in the detected_microhaplotypes that don't have corresponding supplied specimen_names: {sorted(missing_in_specimen)}"
+            warnings.warn(
+                f"library_sample_names found in the detected_microhaplotypes that don't have corresponding supplied specimen_names: {sorted(missing_in_specimen)}, will be added to specimen_info with no meta"
             )
-
-        # Check for names in specimen_info that are not in library_sample_info
-        missing_in_library = specimen_names - library_sample_names
         if missing_in_library:
-            raise ValueError(
+            warnings.warn(
                 f"specimen_name were supplied that don't have corresponding library_sample_names in detected_microhaplotypes: {sorted(missing_in_library)}"
             )
 
@@ -289,12 +291,17 @@ def _replace_names_with_IDs(
 ):
     # SPECIMEN INFO
     # replace name with project ID
+    any_missing_project_names = False
+    missing_projects = []
     if project_info is not None:
-        missing_projects = _replace_key_with_id(
-            specimen_info, project_info, "project_name", "project_id"
-        )
-    else:
-        missing_projects = []
+        for spec in specimen_info:
+            if "project_name" not in spec:
+                any_missing_project_names = True
+                break
+        if not any_missing_project_names:
+            missing_projects = _replace_key_with_id(
+                specimen_info, project_info, "project_name", "project_id"
+            )
 
     # LIBRARY SAMPLE INFO
     # replace with sequencing_info_id, specimen_id, panel_id
@@ -307,15 +314,20 @@ def _replace_names_with_IDs(
         "panel_name",
         "panel_id",
     )
+    any_missing_sequence_info_names = False
+    missing_sequencing = []
     if sequencing_info is not None:
-        missing_sequencing = _replace_key_with_id(
-            library_sample_info,
-            sequencing_info,
-            "sequencing_info_name",
-            "sequencing_info_id",
-        )
-    else:
-        missing_sequencing = []
+        for lib_sample in library_sample_info:
+            if "sequencing_info_name" not in lib_sample:
+                any_missing_sequence_info_names = True
+                break
+        if not any_missing_sequence_info_names:
+            missing_sequencing = _replace_key_with_id(
+                library_sample_info,
+                sequencing_info,
+                "sequencing_info_name",
+                "sequencing_info_id",
+            )
 
     # REP MHAPS
     # replace target_name with ID
@@ -328,15 +340,21 @@ def _replace_names_with_IDs(
 
     # DETECTED MHAPS
     # Replace library_sample_name and bioinformatics_run_name
+    any_missing_bioinfo_run_names = False
+    missing_bioinfo_runs = []
     if bioinfo_run_info is not None:
-        missing_bioinfo_runs = _replace_key_with_id(
-            mhap_info["detected_microhaplotypes"],
-            bioinfo_run_info,
-            "bioinformatics_run_name",
-            "bioinformatics_run_id",
-        )
-    else:
-        missing_bioinfo_runs = []
+        for detected in mhap_info["detected_microhaplotypes"]:
+            if "bioinformatics_run_name" not in detected:
+                any_missing_bioinfo_run_names = True
+                break
+        if not any_missing_bioinfo_run_names:
+            missing_bioinfo_runs = _replace_key_with_id(
+                mhap_info["detected_microhaplotypes"],
+                bioinfo_run_info,
+                "bioinformatics_run_name",
+                "bioinformatics_run_id",
+            )
+
     lib_sample_lookup = _make_lookup(library_sample_info, "library_sample_name")
     missing_libs = []
     for detected in mhap_info["detected_microhaplotypes"]:
@@ -354,17 +372,23 @@ def _replace_names_with_IDs(
     missing_read_counts_libs = []
     missing_read_counts_targets = []
     target_lookup = _make_lookup(panel_target_info["target_info"], "target_name")
+    any_read_counts_by_stage_missing_bioinfo_run_names = False
     if read_counts_by_stage_info is not None:
-        if bioinfo_run_info is not None:
-            # Replace bioinformatics_run_name with bioinformatics_run_id
-            missing_read_counts_bioinfo_runs = _replace_key_with_id(
-                read_counts_by_stage_info,
-                bioinfo_run_info,
-                "bioinformatics_run_name",
-                "bioinformatics_run_id",
-            )
-        else:
-            missing_read_counts_bioinfo_runs = []
+        for read_counts_run in read_counts_by_stage_info:
+            if "bioinformatics_run_name" not in read_counts_run:
+                any_read_counts_by_stage_missing_bioinfo_run_names = True
+                break
+        if not any_read_counts_by_stage_missing_bioinfo_run_names:
+            if bioinfo_run_info is not None:
+                # Replace bioinformatics_run_name with bioinformatics_run_id
+                missing_read_counts_bioinfo_runs = _replace_key_with_id(
+                    read_counts_by_stage_info,
+                    bioinfo_run_info,
+                    "bioinformatics_run_name",
+                    "bioinformatics_run_id",
+                )
+            else:
+                missing_read_counts_bioinfo_runs = []
 
         # Replace library_sample_name with library_sample_id in each run and map targets
         for read_counts_run in read_counts_by_stage_info:
@@ -388,7 +412,28 @@ def _replace_names_with_IDs(
                         target_entry["target_id"] = target_lookup[target_name]
                     else:
                         missing_read_counts_targets.append(target_name)
-
+    merging_warnings = []
+    if any_missing_project_names and project_info:
+        merging_warnings.append(
+            "project_info provided but there are specimens missing project_name field"
+        )
+    if any_missing_sequence_info_names and sequencing_info:
+        merging_warnings.append(
+            "sequencing_info provided but there are library samples missing sequencing_info_name field"
+        )
+    if any_missing_bioinfo_run_names and bioinfo_run_info:
+        merging_warnings.append(
+            "bioinformatics_run_info provided but there are detected microhaplotypes missing bioinformatics_run_name field"
+        )
+    if any_read_counts_by_stage_missing_bioinfo_run_names and bioinfo_run_info:
+        merging_warnings.append(
+            "bioinformatics_run_info provided but there are read counts by stage missing bioinformatics_run_name field"
+        )
+    if merging_warnings:
+        warnings_text = "\n".join(merging_warnings)
+        raise Exception(
+            f"The following warnings were encountered during merging:\n{warnings_text}"
+        )
     # If any names were missing from reference tables error
     _report_missing_IDs(
         missing_projects,

diff --git a/src/pmotools/pmo_builder/read_count_by_stage_table_to_pmo.py b/src/pmotools/pmo_builder/read_count_by_stage_table_to_pmo.py
@@ -5,8 +5,8 @@
 
 
 def read_count_by_stage_table_to_pmo(
-    bioinformatics_run_name: str,
     total_raw_count_table: pd.DataFrame,
+    bioinformatics_run_name: str | None = None,
     reads_by_stage_table: pd.DataFrame | None = None,
     library_sample_name_col: str = "library_sample_name",
     target_name_col: str = "target_name",
@@ -19,10 +19,11 @@ def read_count_by_stage_table_to_pmo(
     """
     Convert tables of read counts by stage into PMO read_counts_by_stage format.
 
-    :param bioinformatics_run_name: name for the bioinformatics run (column name or individual run name)
-    :type bioinformatics_run_name: str
+
     :param total_raw_count_table: table with total raw counts per sample
     :type total_raw_count_table: pd.DataFrame
+    :param bioinformatics_run_name: name for the bioinformatics run (column name or individual run name). Defaults to None, in which case the per-run column lookup in total_raw_count_table is skipped
+    :type bioinformatics_run_name: str, optional
     :param reads_by_stage_table: table of reads per sample, per locus, per stage. Can be long
         format (single stage column) or wide format (multiple stage columns)
     :type reads_by_stage_table: pd.DataFrame, optional
@@ -70,7 +71,10 @@ def read_count_by_stage_table_to_pmo(
         check_additional_columns_exist(reads_by_stage_table, additional_target_cols)
 
     # Check if bioinformatics_run_name is a column in total_raw_count_table
-    if bioinformatics_run_name in total_raw_count_table.columns:
+    if (
+        bioinformatics_run_name is not None
+        and bioinformatics_run_name in total_raw_count_table.columns
+    ):
         # Create separate entries for each unique run
         output_data_list = []
         unique_runs = total_raw_count_table[bioinformatics_run_name].unique()
@@ -292,14 +296,14 @@ def _process_reads_by_stage_table(
 def _build_read_counts_by_stage_output(
     library_sample_data: dict,
     reads_by_stage_data: dict | None,
-    bioinformatics_run_name: str,
+    bioinformatics_run_name: str | None = None,
 ) -> dict:
     """
     Build the final output structure for read_counts_by_stage.
 
     :param library_sample_data: Dictionary with library sample data
     :param reads_by_stage_data: Optional dictionary with reads by stage data
-    :param bioinformatics_run_name: Name for the bioinformatics run
+    :param bioinformatics_run_name: Optional name for the bioinformatics run; when None, the "bioinformatics_run_name" key is omitted from the output
 
     :return: Dictionary formatted for PMO read_counts_by_stage section
     """
@@ -344,8 +348,8 @@ def _build_read_counts_by_stage_output(
 
     # Build the final output structure
     output_data = {
-        "bioinformatics_run_name": bioinformatics_run_name,
         "read_counts_by_library_sample_by_stage": read_counts_by_library_sample_by_stage,
     }
-
+    if bioinformatics_run_name is not None:
+        output_data["bioinformatics_run_name"] = bioinformatics_run_name
     return output_data
diff --git a/tests/test_pmo_builder/test_merge_to_pmo.py b/tests/test_pmo_builder/test_merge_to_pmo.py
@@ -403,8 +403,8 @@ def test_merge_to_pmo_specimen_info_matching_names_passes(self):
         output_names = {s["library_sample_name"] for s in result["library_sample_info"]}
         assert output_names == expected_names
 
-    def test_merge_to_pmo_specimen_info_missing_names_raises(self):
-        """Raises ValueError when specimen_info is missing names that exist in detected_microhaplotypes."""
+    def test_merge_to_pmo_specimen_info_missing_names_warns(self):
+        """Warns (does not raise) when specimen_info is missing names that exist in detected_microhaplotypes."""
         incomplete_specimen_info = [
             {
                 "specimen_name": "samp_A",
@@ -413,20 +413,24 @@ def test_merge_to_pmo_specimen_info_missing_names_raises(self):
             },
             # samp_B is missing
         ]
-        with self.assertRaises(ValueError) as context:
-            merge_to_pmo(
+        with self.assertWarns(UserWarning) as context:
+            result = merge_to_pmo(
                 panel_target_info=self.for_merging_spec_info_panel_info,
                 mhap_info=self.for_merging_spec_info_mhap_info,
                 specimen_info=incomplete_specimen_info,
             )
-        self.assertIn("samp_B", str(context.exception))
+
+        warning_message = str(context.warning)
+        self.assertIn("samp_B", warning_message)
         self.assertIn(
             "library_sample_names found in the detected_microhaplotypes that don't have corresponding supplied specimen_names",
-            str(context.exception),
+            warning_message,
         )
+        # merge should still succeed and produce a result
+        self.assertIsNotNone(result)
 
-    def test_merge_to_pmo_specimen_info_extra_names_raises(self):
-        """Raises ValueError when specimen_info has names not present in detected_microhaplotypes."""
+    def test_merge_to_pmo_specimen_info_extra_names_warns(self):
+        """Warns (does not raise) when specimen_info has names not present in detected_microhaplotypes."""
         excess_specimen_info = [
             {
                 "specimen_name": "samp_A",
@@ -444,17 +448,20 @@ def test_merge_to_pmo_specimen_info_extra_names_raises(self):
                 "collection_date": "2022-03-01",
             },
         ]
-        with self.assertRaises(ValueError) as context:
-            merge_to_pmo(
+        with self.assertWarns(UserWarning) as context:
+            result = merge_to_pmo(
                 panel_target_info=self.for_merging_spec_info_panel_info,
                 mhap_info=self.for_merging_spec_info_mhap_info,
                 specimen_info=excess_specimen_info,
             )
-        self.assertIn("samp_C", str(context.exception))
+        warning_message = str(context.warning)
+        self.assertIn("samp_C", warning_message)
         self.assertIn(
             "specimen_name were supplied that don't have corresponding library_sample_names",
-            str(context.exception),
+            warning_message,
         )
+        # merge should still succeed and produce a result
+        self.assertIsNotNone(result)
 
 
 if __name__ == "__main__":