Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 83 additions & 38 deletions src/pmotools/pmo_builder/merge_to_pmo.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pmotools.pmo_builder.mhap_table_to_pmo import (
create_minimum_library_specimen_dict_from_mhap_table,
)
import warnings


def _convert_numpy_scalars(obj):
Expand Down Expand Up @@ -86,16 +87,17 @@ def merge_to_pmo(
specimen_names = {item["specimen_name"] for item in specimen_info}

# Check for names in library_sample_info that are not in specimen_info
# Check for names in specimen_info that are not in library_sample_info
missing_in_specimen = library_sample_names - specimen_names
missing_in_library = specimen_names - library_sample_names
for missing_lib_name in missing_in_specimen:
specimen_info.append({"specimen_name": missing_lib_name})
if missing_in_specimen:
raise ValueError(
f"library_sample_names found in the detected_microhaplotypes that don't have corresponding supplied specimen_names: {sorted(missing_in_specimen)}"
warnings.warn(
f"library_sample_names found in the detected_microhaplotypes that don't have corresponding supplied specimen_names: {sorted(missing_in_specimen)}, will be added to specimen_info with no meta"
)

# Check for names in specimen_info that are not in library_sample_info
missing_in_library = specimen_names - library_sample_names
if missing_in_library:
raise ValueError(
warnings.warn(
f"specimen_name were supplied that don't have corresponding library_sample_names in detected_microhaplotypes: {sorted(missing_in_library)}"
)

Expand Down Expand Up @@ -289,12 +291,17 @@ def _replace_names_with_IDs(
):
# SPECIMEN INFO
# replace name with project ID
any_missing_project_names = False
missing_projects = []
if project_info is not None:
missing_projects = _replace_key_with_id(
specimen_info, project_info, "project_name", "project_id"
)
else:
missing_projects = []
for spec in specimen_info:
if "project_name" not in spec:
any_missing_project_names = True
break
if not any_missing_project_names:
missing_projects = _replace_key_with_id(
specimen_info, project_info, "project_name", "project_id"
)

# LIBRARY SAMPLE INFO
# replace with sequencing_info_id, specimen_id, panel_id
Expand All @@ -307,15 +314,20 @@ def _replace_names_with_IDs(
"panel_name",
"panel_id",
)
any_missing_sequence_info_names = False
missing_sequencing = []
if sequencing_info is not None:
missing_sequencing = _replace_key_with_id(
library_sample_info,
sequencing_info,
"sequencing_info_name",
"sequencing_info_id",
)
else:
missing_sequencing = []
for lib_sample in library_sample_info:
if "sequencing_info_name" not in lib_sample:
any_missing_sequence_info_names = True
break
if not any_missing_sequence_info_names:
missing_sequencing = _replace_key_with_id(
library_sample_info,
sequencing_info,
"sequencing_info_name",
"sequencing_info_id",
)

# REP MHAPS
# replace target_name with ID
Expand All @@ -328,15 +340,21 @@ def _replace_names_with_IDs(

# DETECTED MHAPS
# Replace library_sample_name and bioinformatics_run_name
any_missing_bioinfo_run_names = False
missing_bioinfo_runs = []
if bioinfo_run_info is not None:
missing_bioinfo_runs = _replace_key_with_id(
mhap_info["detected_microhaplotypes"],
bioinfo_run_info,
"bioinformatics_run_name",
"bioinformatics_run_id",
)
else:
missing_bioinfo_runs = []
for detected in mhap_info["detected_microhaplotypes"]:
if "bioinformatics_run_name" not in detected:
any_missing_bioinfo_run_names = True
break
if not any_missing_bioinfo_run_names:
missing_bioinfo_runs = _replace_key_with_id(
mhap_info["detected_microhaplotypes"],
bioinfo_run_info,
"bioinformatics_run_name",
"bioinformatics_run_id",
)

lib_sample_lookup = _make_lookup(library_sample_info, "library_sample_name")
missing_libs = []
for detected in mhap_info["detected_microhaplotypes"]:
Expand All @@ -354,17 +372,23 @@ def _replace_names_with_IDs(
missing_read_counts_libs = []
missing_read_counts_targets = []
target_lookup = _make_lookup(panel_target_info["target_info"], "target_name")
any_read_counts_by_stage_missing_bioinfo_run_names = False
if read_counts_by_stage_info is not None:
if bioinfo_run_info is not None:
# Replace bioinformatics_run_name with bioinformatics_run_id
missing_read_counts_bioinfo_runs = _replace_key_with_id(
read_counts_by_stage_info,
bioinfo_run_info,
"bioinformatics_run_name",
"bioinformatics_run_id",
)
else:
missing_read_counts_bioinfo_runs = []
for read_counts_run in read_counts_by_stage_info:
if "bioinformatics_run_name" not in read_counts_run:
any_read_counts_by_stage_missing_bioinfo_run_names = True
break
if not any_read_counts_by_stage_missing_bioinfo_run_names:
if bioinfo_run_info is not None:
# Replace bioinformatics_run_name with bioinformatics_run_id
missing_read_counts_bioinfo_runs = _replace_key_with_id(
read_counts_by_stage_info,
bioinfo_run_info,
"bioinformatics_run_name",
"bioinformatics_run_id",
)
else:
missing_read_counts_bioinfo_runs = []

# Replace library_sample_name with library_sample_id in each run and map targets
for read_counts_run in read_counts_by_stage_info:
Expand All @@ -388,7 +412,28 @@ def _replace_names_with_IDs(
target_entry["target_id"] = target_lookup[target_name]
else:
missing_read_counts_targets.append(target_name)

merging_warnings = []
if any_missing_project_names and project_info:
merging_warnings.append(
"project_info provided but there are specimens missing project_name field"
)
if any_missing_sequence_info_names and sequencing_info:
merging_warnings.append(
"sequencing_info provided but there are library samples missing sequencing_info_name field"
)
if any_missing_bioinfo_run_names and bioinfo_run_info:
merging_warnings.append(
"bioinformatics_run_info provided but there are detected microhaplotypes missing bioinformatics_run_name field"
)
if any_read_counts_by_stage_missing_bioinfo_run_names and bioinfo_run_info:
merging_warnings.append(
"bioinformatics_run_info provided but there are read counts by stage missing bioinformatics_run_name field"
)
if merging_warnings:
warnings_text = "\n".join(merging_warnings)
raise Exception(
f"The following warnings were encountered during merging:\n{warnings_text}"
)
# If any names were missing from the reference tables, raise an error
_report_missing_IDs(
missing_projects,
Expand Down
20 changes: 12 additions & 8 deletions src/pmotools/pmo_builder/read_count_by_stage_table_to_pmo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@


def read_count_by_stage_table_to_pmo(
bioinformatics_run_name: str,
total_raw_count_table: pd.DataFrame,
bioinformatics_run_name: str | None = None,
reads_by_stage_table: pd.DataFrame | None = None,
library_sample_name_col: str = "library_sample_name",
target_name_col: str = "target_name",
Expand All @@ -19,10 +19,11 @@ def read_count_by_stage_table_to_pmo(
"""
Convert tables of read counts by stage into PMO read_counts_by_stage format.

:param bioinformatics_run_name: name for the bioinformatics run (column name or individual run name)
:type bioinformatics_run_name: str

:param total_raw_count_table: table with total raw counts per sample
:type total_raw_count_table: pd.DataFrame
:param bioinformatics_run_name: name for the bioinformatics run (column name or individual run name)
:type bioinformatics_run_name: str, optional
:param reads_by_stage_table: table of reads per sample, per locus, per stage. Can be long
format (single stage column) or wide format (multiple stage columns)
:type reads_by_stage_table: pd.DataFrame, optional
Expand Down Expand Up @@ -70,7 +71,10 @@ def read_count_by_stage_table_to_pmo(
check_additional_columns_exist(reads_by_stage_table, additional_target_cols)

# Check if bioinformatics_run_name is a column in total_raw_count_table
if bioinformatics_run_name in total_raw_count_table.columns:
if (
bioinformatics_run_name is not None
and bioinformatics_run_name in total_raw_count_table.columns
):
# Create separate entries for each unique run
output_data_list = []
unique_runs = total_raw_count_table[bioinformatics_run_name].unique()
Expand Down Expand Up @@ -292,14 +296,14 @@ def _process_reads_by_stage_table(
def _build_read_counts_by_stage_output(
library_sample_data: dict,
reads_by_stage_data: dict | None,
bioinformatics_run_name: str,
bioinformatics_run_name: str | None = None,
) -> dict:
"""
Build the final output structure for read_counts_by_stage.

:param library_sample_data: Dictionary with library sample data
:param reads_by_stage_data: Optional dictionary with reads by stage data
:param bioinformatics_run_name: Name for the bioinformatics run
:param bioinformatics_run_name: Optional name for the bioinformatics run

:return: Dictionary formatted for PMO read_counts_by_stage section
"""
Expand Down Expand Up @@ -344,8 +348,8 @@ def _build_read_counts_by_stage_output(

# Build the final output structure
output_data = {
"bioinformatics_run_name": bioinformatics_run_name,
"read_counts_by_library_sample_by_stage": read_counts_by_library_sample_by_stage,
}

if bioinformatics_run_name is not None:
output_data["bioinformatics_run_name"] = bioinformatics_run_name
return output_data
31 changes: 19 additions & 12 deletions tests/test_pmo_builder/test_merge_to_pmo.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,8 +403,8 @@ def test_merge_to_pmo_specimen_info_matching_names_passes(self):
output_names = {s["library_sample_name"] for s in result["library_sample_info"]}
assert output_names == expected_names

def test_merge_to_pmo_specimen_info_missing_names_raises(self):
"""Raises ValueError when specimen_info is missing names that exist in detected_microhaplotypes."""
def test_merge_to_pmo_specimen_info_missing_names_warns(self):
"""Warns (does not raise) when specimen_info is missing names that exist in detected_microhaplotypes."""
incomplete_specimen_info = [
{
"specimen_name": "samp_A",
Expand All @@ -413,20 +413,24 @@ def test_merge_to_pmo_specimen_info_missing_names_raises(self):
},
# samp_B is missing
]
with self.assertRaises(ValueError) as context:
merge_to_pmo(
with self.assertWarns(UserWarning) as context:
result = merge_to_pmo(
panel_target_info=self.for_merging_spec_info_panel_info,
mhap_info=self.for_merging_spec_info_mhap_info,
specimen_info=incomplete_specimen_info,
)
self.assertIn("samp_B", str(context.exception))

warning_message = str(context.warning)
self.assertIn("samp_B", warning_message)
self.assertIn(
"library_sample_names found in the detected_microhaplotypes that don't have corresponding supplied specimen_names",
str(context.exception),
warning_message,
)
# merge should still succeed and produce a result
self.assertIsNotNone(result)

def test_merge_to_pmo_specimen_info_extra_names_raises(self):
"""Raises ValueError when specimen_info has names not present in detected_microhaplotypes."""
def test_merge_to_pmo_specimen_info_extra_names_warns(self):
"""Warns (does not raise) when specimen_info has names not present in detected_microhaplotypes."""
excess_specimen_info = [
{
"specimen_name": "samp_A",
Expand All @@ -444,17 +448,20 @@ def test_merge_to_pmo_specimen_info_extra_names_raises(self):
"collection_date": "2022-03-01",
},
]
with self.assertRaises(ValueError) as context:
merge_to_pmo(
with self.assertWarns(UserWarning) as context:
result = merge_to_pmo(
panel_target_info=self.for_merging_spec_info_panel_info,
mhap_info=self.for_merging_spec_info_mhap_info,
specimen_info=excess_specimen_info,
)
self.assertIn("samp_C", str(context.exception))
warning_message = str(context.warning)
self.assertIn("samp_C", warning_message)
self.assertIn(
"specimen_name were supplied that don't have corresponding library_sample_names",
str(context.exception),
warning_message,
)
# merge should still succeed and produce a result
self.assertIsNotNone(result)


if __name__ == "__main__":
Expand Down
Loading