From c9789b2afc9d7093ea6d69d1001a173dcbec33b2 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Fri, 15 May 2026 11:19:50 -0400 Subject: [PATCH 01/17] Add combine_predictor_results helper --- CHANGELOG.md | 17 +++ docs/ranking.md | 39 ++++++ tests/test_combine_predictor_results.py | 153 +++++++++++++++++++++ topiary/__init__.py | 5 +- topiary/predictor.py | 52 ++++++- topiary/result.py | 174 ++++++++++++++++++++++++ 6 files changed, 434 insertions(+), 6 deletions(-) create mode 100644 tests/test_combine_predictor_results.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f53d7a7..0de3610 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,22 @@ # Changelog +## 5.16.2 + +**Combine separate predictor runs (#170):** + +`topiary.combine_predictor_results([a, b, ...])` stacks separate +single-allele predictor outputs into the same long-form shape produced +by running those predictors together. It accepts `TopiaryResult` or +fresh `TopiaryPredictor` DataFrame outputs, strictly validates that all +inputs cover the same `(peptide, allele)` identity set, rejects +duplicate `prediction_method_name` values across inputs, and merges +model / `kind_support` metadata when present. + +Fresh `TopiaryPredictor` DataFrames now carry lightweight +`DataFrame.attrs` metadata (`topiary_models`, `topiary_kind_support`) so +this helper can preserve predictor provenance without changing the +public return type. + ## 5.16.1 **pirlygenes 5.1.0 integration:** diff --git a/docs/ranking.md b/docs/ranking.md index 742a642..bfd2307 100644 --- a/docs/ranking.md +++ b/docs/ranking.md @@ -356,6 +356,45 @@ should use the recommended forms above. default is `auto`: raw affinity values and percentile ranks sort ascending, while all other sort expressions sort descending. +## Combining Separate Predictor Runs + +Run predictors together when that is convenient: + +```python +from mhctools import NetMHCpan, MHCflurry +from topiary import TopiaryPredictor + +combined = TopiaryPredictor( + models=[NetMHCpan, MHCflurry], + alleles=["HLA-A*02:01", "HLA-B*07:02"], +).predict_from_named_peptides(peptides) +``` + +When predictors need to run separately, use `combine_predictor_results` to +stack the outputs back into the same long-form shape: + +```python +from mhctools import NetMHCpan, MHCflurry +from topiary import TopiaryPredictor, combine_predictor_results + +netmhcpan_rows = TopiaryPredictor( + models=NetMHCpan, + alleles=["HLA-A*02:01", "HLA-B*07:02"], +).predict_from_named_peptides(peptides) + +mhcflurry_rows = TopiaryPredictor( + models=MHCflurry, + alleles=["HLA-A*02:01", "HLA-B*07:02"], +).predict_from_named_peptides(peptides) + +combined = combine_predictor_results([netmhcpan_rows, mhcflurry_rows]) +``` + +The helper is intentionally strict. Every input must cover the same +`(peptide, allele)` keys, and each `prediction_method_name` may appear in only +one input. It supports the common single-allele predictor case today; haplotype +mode needs the allele-set schema work tracked separately. + ## Putting it together ```python diff --git a/tests/test_combine_predictor_results.py b/tests/test_combine_predictor_results.py new file mode 100644 index 0000000..d405c02 --- /dev/null +++ b/tests/test_combine_predictor_results.py @@ -0,0 +1,153 @@ +"""Tests for combining separately-run predictor outputs.""" + +import pandas as pd +import pytest + +from topiary import TopiaryPredictor, TopiaryResult, combine_predictor_results +from topiary.io import Metadata + + +class ToyAffinityPredictor: + default_peptide_lengths = [9] + supported_kinds = ("pMHC_affinity",) + + def __init__(self, name, version, alleles, offset): + self.prediction_method_name = name + self.predictor_version = version + self.alleles = alleles + self.offset = offset + + def kind_support(self): + return { + "pMHC_affinity": { + "mhc_dependence": "single_allele", + "mhc_class": "I", + } + } + + def predict_dataframe(self, peptides): + rows = [] + for peptide_i, peptide in enumerate(peptides): + for allele_i, allele in enumerate(self.alleles): + affinity = self.offset + 10 * peptide_i + allele_i + rows.append({ + "peptide": peptide, + "allele": allele, + "kind": "pMHC_affinity", + "value": float(affinity), + "score": 1.0 / affinity, + "percentile_rank": affinity / 100.0, + "predictor_name": self.prediction_method_name, + "predictor_version": self.predictor_version, + }) + return pd.DataFrame(rows) + + +def _sort_predictions(df): + cols = [ + "source_sequence_name", "peptide", "allele", "kind", + "prediction_method_name", "predictor_version", + ] + return ( + df.sort_values(cols) + .reset_index(drop=True) + .loc[:, sorted(df.columns)] + ) + + +def _simple_result(method="netmhcpan", peptide="SIINFEKLA", allele="HLA-A*02:01"): + df = pd.DataFrame([{ + "source_sequence_name": "pep1", + "peptide": peptide, + "peptide_offset": 0, + "peptide_length": len(peptide), + "allele": allele, + "kind": "pMHC_affinity", + "value": 100.0, + "score": 0.9, + "percentile_rank": 1.0, + "affinity": 100.0, + "prediction_method_name": method, + "predictor_version": "1.0", + }]) + return TopiaryResult( + df, + Metadata( + form="long", + models={method: "1.0"}, + sources=[method], + extra={ + "kind_support": { + method: { + "pMHC_affinity": { + "mhc_dependence": "single_allele", + "mhc_class": "I", + } + } + } + }, + ), + ) + + +def test_combine_separate_predictor_runs_matches_combined_run(): + peptides = {"pep1": "SIINFEKLA", "pep2": "ELAGIGILT"} + alleles = ["HLA-A*02:01", "HLA-B*07:02"] + netmhcpan = ToyAffinityPredictor("netmhcpan", "4.1b", alleles, offset=100) + mhcflurry = ToyAffinityPredictor("mhcflurry", "2.1.1", alleles, offset=200) + + direct = TopiaryPredictor( + models=[netmhcpan, mhcflurry] + ).predict_from_named_peptides(peptides) + net_only = TopiaryPredictor(models=netmhcpan).predict_from_named_peptides(peptides) + flurry_only = TopiaryPredictor(models=mhcflurry).predict_from_named_peptides(peptides) + + combined = combine_predictor_results([net_only, flurry_only]) + + pd.testing.assert_frame_equal( + _sort_predictions(combined.df), + _sort_predictions(direct), + ) + assert combined.models == {"netmhcpan": "4.1b", "mhcflurry": "2.1.1"} + assert combined.extra["kind_support"] == direct.attrs["topiary_kind_support"] + + +def test_combine_rejects_different_identity_sets(): + r1 = _simple_result("netmhcpan", peptide="SIINFEKLA") + r2 = _simple_result("mhcflurry", peptide="ELAGIGILT") + + with pytest.raises(ValueError, match="same .* keys"): + combine_predictor_results([r1, r2]) + + +def test_combine_rejects_duplicate_prediction_methods(): + r1 = _simple_result("netmhcpan") + r2 = _simple_result("netmhcpan") + + with pytest.raises(ValueError, match="duplicate prediction method"): + combine_predictor_results([r1, r2]) + + +def test_combine_rejects_haplotype_kind_support(): + haplotype = _simple_result("mhcflurry") + haplotype.extra["kind_support"]["mhcflurry"]["pMHC_affinity"] = { + "mhc_dependence": "haplotype", + "mhc_class": "I", + } + single_allele = _simple_result("netmhcpan") + + with pytest.raises(ValueError, match="#168/#169"): + combine_predictor_results([haplotype, single_allele]) + + +def test_topiary_result_reads_predictor_dataframe_attrs(): + peptides = {"pep1": "SIINFEKLA"} + predictor = ToyAffinityPredictor( + "netmhcpan", "4.1b", ["HLA-A*02:01"], offset=100, + ) + df = TopiaryPredictor(models=predictor).predict_from_named_peptides(peptides) + + result = TopiaryResult(df) + + assert result.models == {"netmhcpan": "4.1b"} + assert result.extra["kind_support"] == df.attrs["topiary_kind_support"] diff --git a/topiary/__init__.py b/topiary/__init__.py index cc578b0..692c86f 100644 --- a/topiary/__init__.py +++ b/topiary/__init__.py @@ -68,10 +68,10 @@ melt_pvacseq_algorithms, read_pvacseq, ) -from .result import TopiaryResult, concat +from .result import TopiaryResult, combine_predictor_results, concat from .wide import detect_form, from_wide, to_wide -__version__ = "5.16.1" +__version__ = "5.16.2" __all__ = [ "TopiaryPredictor", @@ -149,5 +149,6 @@ "from_wide", "to_wide", "TopiaryResult", + "combine_predictor_results", "concat", ] diff --git a/topiary/predictor.py b/topiary/predictor.py index f651ce8..a22b990 100644 --- a/topiary/predictor.py +++ b/topiary/predictor.py @@ -579,7 +579,9 @@ def predict_from_named_sequences(self, name_to_sequence_dict): prediction_method_name, predictor_version, n_flank, c_flank """ df = self._predict_raw(name_to_sequence_dict) - return self._strip_internal_columns(self._apply_filter(df)) + return self._attach_result_attrs( + self._strip_internal_columns(self._apply_filter(df)) + ) def predict_from_named_peptides(self, name_to_peptide_dict): """ @@ -596,7 +598,9 @@ def predict_from_named_peptides(self, name_to_peptide_dict): prediction_method_name, predictor_version, n_flank, c_flank """ df = self._predict_raw_peptides(name_to_peptide_dict) - return self._strip_internal_columns(self._apply_filter(df)) + return self._attach_result_attrs( + self._strip_internal_columns(self._apply_filter(df)) + ) def _predict_raw(self, name_to_sequence_dict): """Run models and format output, without applying filter/ranking.""" @@ -670,6 +674,46 @@ def _strip_internal_columns(self, df): errors="ignore", ) + def _attach_result_attrs(self, df): + """Attach lightweight metadata to public DataFrame outputs.""" + model_versions = {} + if "prediction_method_name" in df.columns and "predictor_version" in df.columns: + for method, version in ( + df.dropna(subset=["prediction_method_name"]) + .groupby("prediction_method_name")["predictor_version"] + .first() + .items() + ): + if pd.notna(version) and str(version): + model_versions[str(method)] = str(version) + if not model_versions: + for model in self.models: + name = ( + getattr(model, "prediction_method_name", None) + or getattr(model, "predictor_name", None) + or getattr(model, "name", None) + or type(model).__name__ + ) + version = getattr(model, "predictor_version", None) + if name and version: + model_versions[str(name)] = str(version) + if model_versions: + df.attrs["topiary_models"] = model_versions + kind_support = self._result_kind_support() + if kind_support: + df.attrs["topiary_kind_support"] = kind_support + return df + + def _result_kind_support(self): + """Return kind_support for public DataFrame attrs when available.""" + support = {} + for key, model in zip(self._model_keys, self.models): + kind_support = getattr(model, "kind_support", None) + if kind_support is None: + return {} + support[key] = dict(kind_support()) + return support + def _expand_named_peptide_predictions(self, model_df, peptide_names_df): """Attach the original peptide names to model predictions.""" if model_df.empty: @@ -914,13 +958,13 @@ def _finalize_rows(self, df, fragments=None): ``only_novel_epitopes`` is set, and reset the index. Shared tail for every ProteinFragment-producing entry point.""" if df.empty: - return df + return self._attach_result_attrs(df) df = self._maybe_predict_wt_peptides(df, fragments=fragments) df = self._apply_filter(df) if self.only_novel_epitopes: df = df[df["contains_mutant_residues"].eq(True)] df = self._strip_internal_columns(df) - return df.reset_index(drop=True) + return self._attach_result_attrs(df.reset_index(drop=True)) def predict_from_fragments(self, fragments): """Predict MHC binding for peptides derived from a collection of diff --git a/topiary/result.py b/topiary/result.py index 6643c60..58a669b 100644 --- a/topiary/result.py +++ b/topiary/result.py @@ -77,6 +77,13 @@ def __init__( sort_by_str = sort_by_str or getattr(metadata, "sort_by", None) extra = extra if extra is not None else metadata.extra + if models is None and hasattr(df, "attrs"): + models = df.attrs.get("topiary_models") + if extra is None and hasattr(df, "attrs"): + kind_support = df.attrs.get("topiary_kind_support") + if kind_support: + extra = OrderedDict([("kind_support", kind_support)]) + self.df = df self.topiary_version = topiary_version self.form = form or detect_form(df) @@ -433,3 +440,170 @@ def concat(results): sort_by_ast=sort_by_ast, extra=merged_extra, ) + + +def combine_predictor_results(results, on=("peptide", "allele")): + """Combine separate predictor outputs into one predictor-equivalent result. + + This is stricter than :func:`concat`: every input must cover the same + identity key set, and no prediction method may appear in more than one + input. It is intended for the common single-allele case where, for example, + NetMHCpan and MHCflurry are run separately and then stacked into the same + long-form shape produced by running both models together. + + Parameters + ---------- + results : iterable of TopiaryResult or pandas.DataFrame + Separate predictor outputs to combine. + on : tuple of str + Columns defining the strict identity set. Defaults to + ``("peptide", "allele")``. + + Returns + ------- + TopiaryResult + Combined long-form result with merged model metadata and, when present + on the inputs, merged ``extra["kind_support"]`` metadata. + """ + results = [_as_topiary_result(r) for r in results] + if not results: + return TopiaryResult(pd.DataFrame()) + + if isinstance(on, str): + on = (on,) + on = tuple(on) + + for i, result in enumerate(results): + _validate_predictor_result(result, i, on) + + _validate_unique_prediction_methods(results) + _validate_same_identity_keys(results, on) + merged_kind_support = _merge_kind_support(results) + + combined = concat(results) + if merged_kind_support: + extra = OrderedDict(combined.extra) + extra["kind_support"] = merged_kind_support + combined.extra = extra + return combined + + +def _as_topiary_result(result): + if isinstance(result, TopiaryResult): + return result + if isinstance(result, pd.DataFrame): + return TopiaryResult(result) + raise TypeError( + "combine_predictor_results expects TopiaryResult or pandas.DataFrame " + f"inputs, got {type(result).__name__}" + ) + + +def _validate_predictor_result(result, index, on): + if result.form != "long": + raise ValueError( + "combine_predictor_results only supports long-form predictor " + f"results; result {index} has form {result.form!r}" + ) + required = set(on) | {"kind", "prediction_method_name"} + missing = sorted(c for c in required if c not in result.df.columns) + if missing: + raise ValueError( + f"combine_predictor_results result {index} is missing required " + f"column(s): {missing}" + ) + if not _prediction_methods(result): + raise ValueError( + f"combine_predictor_results result {index} has no " + "prediction_method_name values" + ) + + +def _prediction_methods(result): + return { + str(method) + for method in result.df["prediction_method_name"].dropna().unique() + } + + +def _validate_unique_prediction_methods(results): + seen = {} + for index, result in enumerate(results): + for method in sorted(_prediction_methods(result)): + if method in seen: + raise ValueError( + "combine_predictor_results cannot combine duplicate " + f"prediction method {method!r}; found in results " + f"{seen[method]} and {index}" + ) + seen[method] = index + + +def _identity_keys(result, on): + return set( + result.df.loc[:, list(on)] + .drop_duplicates() + .itertuples(index=False, name=None) + ) + + +def _validate_same_identity_keys(results, on): + baseline = _identity_keys(results[0], on) + for index, result in enumerate(results[1:], start=1): + current = _identity_keys(result, on) + missing = baseline - current + extra = current - baseline + if missing or extra: + message = [ + "combine_predictor_results requires every input to cover the " + f"same {on!r} keys; result {index} differs from result 0." + ] + if missing: + message.append( + f"Missing from result {index}: {_format_key_examples(missing)}" + ) + if extra: + message.append( + f"Extra in result {index}: {_format_key_examples(extra)}" + ) + raise ValueError(" ".join(message)) + + +def _format_key_examples(keys, limit=5): + ordered = sorted(keys, key=repr) + shown = ordered[:limit] + suffix = "" if len(ordered) <= limit else f" ... +{len(ordered) - limit} more" + return f"{shown}{suffix}" + + +def _merge_kind_support(results): + merged = OrderedDict() + for index, result in enumerate(results): + kind_support = result.extra.get("kind_support") + if not kind_support: + continue + _validate_single_allele_kind_support(kind_support, index) + for model_key, kind_map in kind_support.items(): + if model_key in merged: + raise ValueError( + "combine_predictor_results cannot merge duplicate " + f"kind_support model key {model_key!r}" + ) + merged[model_key] = OrderedDict( + (kind, dict(meta)) for kind, meta in kind_map.items() + ) + return merged + + +def _validate_single_allele_kind_support(kind_support, result_index): + for model_key, kind_map in kind_support.items(): + for kind, meta in kind_map.items(): + dependence = meta.get("mhc_dependence") + if dependence != "single_allele": + raise ValueError( + "combine_predictor_results currently supports only " + "single_allele predictor rows. " + f"Result {result_index} reports {dependence!r} for " + f"{model_key!r}/{kind!r}; haplotype-mode combining " + "depends on #168/#169." + ) From 0e9079e59ada53b47ec1fe384d612499ea1f21d4 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Fri, 15 May 2026 11:38:04 -0400 Subject: [PATCH 02/17] Handle structured kind_support metadata --- tests/test_combine_predictor_results.py | 52 +++++++++++++++++++++- tests/test_io.py | 53 ++++++++++++++++++++++ topiary/io.py | 58 ++++++++++++++++++++++++- topiary/result.py | 37 +++++++++++++++- 4 files changed, 196 insertions(+), 4 deletions(-) diff --git a/tests/test_combine_predictor_results.py b/tests/test_combine_predictor_results.py index d405c02..e8cc6f6 100644 --- a/tests/test_combine_predictor_results.py +++ b/tests/test_combine_predictor_results.py @@ -3,7 +3,13 @@ import pandas as pd import pytest -from topiary import TopiaryPredictor, TopiaryResult, combine_predictor_results +from topiary import ( + TopiaryPredictor, + TopiaryResult, + combine_predictor_results, + read_csv, + read_tsv, +) from topiary.io import Metadata @@ -112,6 +118,41 @@ def test_combine_separate_predictor_runs_matches_combined_run(): assert combined.extra["kind_support"] == direct.attrs["topiary_kind_support"] +def test_combine_roundtripped_topiary_results(tmp_path): + peptides = {"pep1": "SIINFEKLA", "pep2": "ELAGIGILT"} + alleles = ["HLA-A*02:01", "HLA-B*07:02"] + netmhcpan = ToyAffinityPredictor("netmhcpan", "4.1b", alleles, offset=100) + mhcflurry = ToyAffinityPredictor("mhcflurry", "2.1.1", alleles, offset=200) + + direct = TopiaryPredictor( + models=[netmhcpan, mhcflurry] + ).predict_from_named_peptides(peptides) + net_only = TopiaryResult( + TopiaryPredictor(models=netmhcpan).predict_from_named_peptides(peptides) + ) + flurry_only = TopiaryResult( + TopiaryPredictor(models=mhcflurry).predict_from_named_peptides(peptides) + ) + + net_path = tmp_path / "netmhcpan.tsv" + flurry_path = tmp_path / "mhcflurry.csv" + net_only.to_tsv(net_path) + flurry_only.to_csv(flurry_path) + + combined = combine_predictor_results([ + read_tsv(net_path), + read_csv(flurry_path), + ]) + + combined_df = combined.df.drop(columns=["source"], errors="ignore") + pd.testing.assert_frame_equal( + _sort_predictions(combined_df), + _sort_predictions(direct), + ) + assert combined.models == {"netmhcpan": "4.1b", "mhcflurry": "2.1.1"} + assert combined.extra["kind_support"] == direct.attrs["topiary_kind_support"] + + def test_combine_rejects_different_identity_sets(): r1 = _simple_result("netmhcpan", peptide="SIINFEKLA") r2 = _simple_result("mhcflurry", peptide="ELAGIGILT") @@ -128,6 +169,15 @@ def test_combine_rejects_duplicate_prediction_methods(): combine_predictor_results([r1, r2]) +def test_combine_rejects_invalid_kind_support_metadata(): + r1 = _simple_result("netmhcpan") + r1.extra["kind_support"] = "not a mapping" + r2 = _simple_result("mhcflurry") + + with pytest.raises(ValueError, match="kind_support.*mapping"): + combine_predictor_results([r1, r2]) + + def test_combine_rejects_haplotype_kind_support(): haplotype = _simple_result("mhcflurry") haplotype.extra["kind_support"]["mhcflurry"]["pMHC_affinity"] = { diff --git a/tests/test_io.py b/tests/test_io.py index ee09674..1d7f82e 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -47,6 +47,23 @@ def test_unknown_keys_in_extra(self): assert n == 3 assert meta.extra == {"custom_key": "custom_value", "other_key": "other_value"} + def test_kind_support_legacy_literal_extra(self): + lines = [ + "#kind_support={'netmhcpan': {'pMHC_affinity': " + "{'mhc_dependence': 'single_allele', 'mhc_class': 'I'}}}\n", + "peptide\n", + ] + meta, n = _parse_comment_block(lines) + assert n == 1 + assert meta.extra["kind_support"] == { + "netmhcpan": { + "pMHC_affinity": { + "mhc_dependence": "single_allele", + "mhc_class": "I", + }, + }, + } + def test_source_lines(self): lines = [ "#source=patient01.tsv\n", @@ -119,6 +136,24 @@ def test_extra_keys_preserved(self): block = _format_comment_block(meta) assert "#custom_key=custom_value" in block + def test_structured_extra_roundtrip(self): + kind_support = { + "netmhcpan": { + "pMHC_affinity": { + "mhc_dependence": "single_allele", + "mhc_class": "I", + }, + }, + } + meta = Metadata(extra={"kind_support": kind_support}) + block = _format_comment_block(meta) + assert "#kind_support=json:" in block + + lines = [line + "\n" for line in block.split("\n")] + ["data\n"] + parsed, _ = _parse_comment_block(lines) + + assert parsed.extra["kind_support"] == kind_support + def test_sources_formatted(self): meta = Metadata(sources=["patient01.tsv", "patient02.tsv"]) block = _format_comment_block(meta) @@ -211,6 +246,24 @@ def test_metadata_preserved(self, tmp_path): assert "test_cohort" in meta2.sources assert meta2.extra.get("patient") == "PT01" + def test_dataframe_attrs_kind_support_preserved(self, tmp_path): + df = _sample_long_df() + kind_support = { + "netmhcpan": { + "pMHC_affinity": { + "mhc_dependence": "single_allele", + "mhc_class": "I", + }, + }, + } + df.attrs["topiary_kind_support"] = kind_support + path = tmp_path / "attrs.tsv" + + to_tsv(df, path) + result = read_tsv(path) + + assert result.extra["kind_support"] == kind_support + def test_model_versions_auto_extracted(self, tmp_path): df = _sample_long_df() path = tmp_path / "out.tsv" diff --git a/topiary/io.py b/topiary/io.py index 9d7a69c..c4f4241 100644 --- a/topiary/io.py +++ b/topiary/io.py @@ -15,6 +15,8 @@ parse the comment block into a :class:`Metadata` object. """ +import ast +import json from collections import OrderedDict from dataclasses import dataclass, field as dataclass_field from io import StringIO @@ -23,6 +25,9 @@ import pandas as pd +_JSON_EXTRA_PREFIX = "json:" + + @dataclass class Metadata: """Comment-block metadata for a topiary file.""" @@ -72,7 +77,7 @@ def _parse_comment_block(lines): model_name = key[len("model:"):] meta.models[model_name] = value else: - meta.extra[key] = value + meta.extra[key] = _parse_extra_value(key, value) # Also parse bare #model:name lines (no =, version-less). # These were skipped by the "=" check above, so re-scan. @@ -108,10 +113,46 @@ def _format_comment_block(meta): if meta.sort_by: lines.append(f"#sort_by={meta.sort_by}") for key, value in meta.extra.items(): - lines.append(f"#{key}={value}") + lines.append(f"#{key}={_format_extra_value(value)}") return "\n".join(lines) +def _parse_extra_value(key, value): + """Parse a comment-block extra value.""" + if value.startswith(_JSON_EXTRA_PREFIX): + json_value = value[len(_JSON_EXTRA_PREFIX):] + try: + return json.loads(json_value, object_pairs_hook=OrderedDict) + except json.JSONDecodeError: + return value + + if key == "kind_support": + # Compatibility for files written before structured extras used an + # explicit JSON marker. + for parser in ( + lambda v: json.loads(v, object_pairs_hook=OrderedDict), + ast.literal_eval, + ): + try: + parsed = parser(value) + except (SyntaxError, ValueError, TypeError, json.JSONDecodeError): + continue + if isinstance(parsed, dict): + return parsed + + return value + + +def _format_extra_value(value): + """Format a Metadata.extra value for the comment block.""" + if isinstance(value, (dict, list)): + try: + return _JSON_EXTRA_PREFIX + json.dumps(value, separators=(",", ":")) + except TypeError: + pass + return str(value) + + # -- Read ------------------------------------------------------------------ @@ -200,6 +241,19 @@ def _write_delimited(df, path, sep, metadata, index): if not metadata.form: metadata.form = detect_form(df) + if hasattr(df, "attrs"): + if not metadata.models: + attr_models = df.attrs.get("topiary_models") + if attr_models: + metadata.models.update( + (str(model), str(version)) + for model, version in attr_models.items() + ) + if "kind_support" not in metadata.extra: + kind_support = df.attrs.get("topiary_kind_support") + if kind_support: + metadata.extra["kind_support"] = kind_support + # Auto-extract model versions from long-form data. if ( not metadata.models diff --git a/topiary/result.py b/topiary/result.py index 58a669b..62b2885 100644 --- a/topiary/result.py +++ b/topiary/result.py @@ -7,6 +7,7 @@ import warnings from collections import OrderedDict +from collections.abc import Mapping import pandas as pd @@ -579,7 +580,9 @@ def _format_key_examples(keys, limit=5): def _merge_kind_support(results): merged = OrderedDict() for index, result in enumerate(results): - kind_support = result.extra.get("kind_support") + kind_support = _normalize_kind_support( + result.extra.get("kind_support"), index + ) if not kind_support: continue _validate_single_allele_kind_support(kind_support, index) @@ -595,6 +598,38 @@ def _merge_kind_support(results): return merged +def _normalize_kind_support(kind_support, result_index): + if not kind_support: + return OrderedDict() + if not isinstance(kind_support, Mapping): + raise ValueError( + "combine_predictor_results expected " + f"extra['kind_support'] for result {result_index} to be a " + f"mapping, got {type(kind_support).__name__}" + ) + + normalized = OrderedDict() + for model_key, kind_map in kind_support.items(): + if not isinstance(kind_map, Mapping): + raise ValueError( + "combine_predictor_results expected " + f"extra['kind_support'][{model_key!r}] for result " + f"{result_index} to be a mapping, got " + f"{type(kind_map).__name__}" + ) + normalized[model_key] = OrderedDict() + for kind, meta in kind_map.items(): + if not isinstance(meta, Mapping): + raise ValueError( + "combine_predictor_results expected " + f"extra['kind_support'][{model_key!r}][{kind!r}] " + f"for result {result_index} to be a mapping, got " + f"{type(meta).__name__}" + ) + normalized[model_key][kind] = dict(meta) + return normalized + + def _validate_single_allele_kind_support(kind_support, result_index): for model_key, kind_map in kind_support.items(): for kind, meta in kind_map.items(): From ccebe3505109e3f24714997749c4a7aeff59c379 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Fri, 15 May 2026 17:33:05 -0400 Subject: [PATCH 03/17] Stop using kind_support as combine provenance --- CHANGELOG.md | 11 ++-- docs/ranking.md | 5 +- tests/test_combine_predictor_results.py | 43 +++---------- tests/test_io.py | 18 ------ topiary/io.py | 4 -- topiary/predictor.py | 13 ---- topiary/result.py | 80 +------------------------ 7 files changed, 20 insertions(+), 154 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0de3610..2b4befc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,12 +10,15 @@ by running those predictors together. It accepts `TopiaryResult` or fresh `TopiaryPredictor` DataFrame outputs, strictly validates that all inputs cover the same `(peptide, allele)` identity set, rejects duplicate `prediction_method_name` values across inputs, and merges -model / `kind_support` metadata when present. +model metadata when present. Fresh `TopiaryPredictor` DataFrames now carry lightweight -`DataFrame.attrs` metadata (`topiary_models`, `topiary_kind_support`) so -this helper can preserve predictor provenance without changing the -public return type. +`DataFrame.attrs` model-version metadata (`topiary_models`) so this +helper can preserve model provenance without changing the public return +type. The emitted rows remain the source of truth for which predictor +produced which quantities: `prediction_method_name`, `predictor_version`, +`kind`, and the value/rank columns are not duplicated into separate +`kind_support` metadata. ## 5.16.1 diff --git a/docs/ranking.md b/docs/ranking.md index bfd2307..06e6b88 100644 --- a/docs/ranking.md +++ b/docs/ranking.md @@ -392,8 +392,9 @@ combined = combine_predictor_results([netmhcpan_rows, mhcflurry_rows]) The helper is intentionally strict. Every input must cover the same `(peptide, allele)` keys, and each `prediction_method_name` may appear in only -one input. It supports the common single-allele predictor case today; haplotype -mode needs the allele-set schema work tracked separately. +one input. The combined result preserves the original rows: use each row's +`prediction_method_name`, `predictor_version`, `kind`, and value/rank columns +to inspect which predictor produced which quantity. ## Putting it together diff --git a/tests/test_combine_predictor_results.py b/tests/test_combine_predictor_results.py index e8cc6f6..6496d32 100644 --- a/tests/test_combine_predictor_results.py +++ b/tests/test_combine_predictor_results.py @@ -23,14 +23,6 @@ def __init__(self, name, version, alleles, offset): self.alleles = alleles self.offset = offset - def kind_support(self): - return { - "pMHC_affinity": { - "mhc_dependence": "single_allele", - "mhc_class": "I", - } - } - def predict_dataframe(self, peptides): rows = [] for peptide_i, peptide in enumerate(peptides): @@ -82,16 +74,6 @@ def _simple_result(method="netmhcpan", peptide="SIINFEKLA", allele="HLA-A*02:01" form="long", models={method: "1.0"}, sources=[method], - extra={ - "kind_support": { - method: { - "pMHC_affinity": { - "mhc_dependence": "single_allele", - "mhc_class": "I", - } - } - } - }, ), ) @@ -115,7 +97,7 @@ def test_combine_separate_predictor_runs_matches_combined_run(): _sort_predictions(direct), ) assert combined.models == {"netmhcpan": "4.1b", "mhcflurry": "2.1.1"} - assert combined.extra["kind_support"] == direct.attrs["topiary_kind_support"] + assert "kind_support" not in combined.extra def test_combine_roundtripped_topiary_results(tmp_path): @@ -150,7 +132,7 @@ def test_combine_roundtripped_topiary_results(tmp_path): _sort_predictions(direct), ) assert combined.models == {"netmhcpan": "4.1b", "mhcflurry": "2.1.1"} - assert combined.extra["kind_support"] == direct.attrs["topiary_kind_support"] + assert "kind_support" not in combined.extra def test_combine_rejects_different_identity_sets(): @@ -169,28 +151,17 @@ def test_combine_rejects_duplicate_prediction_methods(): combine_predictor_results([r1, r2]) -def test_combine_rejects_invalid_kind_support_metadata(): +def test_combine_ignores_legacy_kind_support_metadata(): r1 = _simple_result("netmhcpan") r1.extra["kind_support"] = "not a mapping" r2 = _simple_result("mhcflurry") - with pytest.raises(ValueError, match="kind_support.*mapping"): - combine_predictor_results([r1, r2]) - - -def test_combine_rejects_haplotype_kind_support(): - haplotype = _simple_result("mhcflurry") - haplotype.extra["kind_support"]["mhcflurry"]["pMHC_affinity"] = { - "mhc_dependence": "haplotype", - "mhc_class": "I", - } - single_allele = _simple_result("netmhcpan") + combined = combine_predictor_results([r1, r2]) - with pytest.raises(ValueError, match="#168/#169"): - combine_predictor_results([haplotype, single_allele]) + assert "kind_support" not in combined.extra -def test_topiary_result_reads_predictor_dataframe_attrs(): +def test_topiary_result_reads_predictor_model_attrs(): peptides = {"pep1": "SIINFEKLA"} predictor = ToyAffinityPredictor( "netmhcpan", "4.1b", ["HLA-A*02:01"], offset=100, @@ -200,4 +171,4 @@ def test_topiary_result_reads_predictor_dataframe_attrs(): result = TopiaryResult(df) assert result.models == {"netmhcpan": "4.1b"} - assert result.extra["kind_support"] == df.attrs["topiary_kind_support"] + assert result.extra == {} diff --git a/tests/test_io.py b/tests/test_io.py index 1d7f82e..860f728 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -246,24 +246,6 @@ def test_metadata_preserved(self, tmp_path): assert "test_cohort" in meta2.sources assert meta2.extra.get("patient") == "PT01" - def test_dataframe_attrs_kind_support_preserved(self, tmp_path): - df = _sample_long_df() - kind_support = { - "netmhcpan": { - "pMHC_affinity": { - "mhc_dependence": "single_allele", - "mhc_class": "I", - }, - }, - } - df.attrs["topiary_kind_support"] = kind_support - path = tmp_path / "attrs.tsv" - - to_tsv(df, path) - result = read_tsv(path) - - assert result.extra["kind_support"] == kind_support - def test_model_versions_auto_extracted(self, tmp_path): df = _sample_long_df() path = tmp_path / "out.tsv" diff --git a/topiary/io.py b/topiary/io.py index c4f4241..b40c224 100644 --- a/topiary/io.py +++ b/topiary/io.py @@ -249,10 +249,6 @@ def _write_delimited(df, path, sep, metadata, index): (str(model), str(version)) for model, version in attr_models.items() ) - if "kind_support" not in metadata.extra: - kind_support = df.attrs.get("topiary_kind_support") - if kind_support: - metadata.extra["kind_support"] = kind_support # Auto-extract model versions from long-form data. if ( diff --git a/topiary/predictor.py b/topiary/predictor.py index a22b990..a42d1ec 100644 --- a/topiary/predictor.py +++ b/topiary/predictor.py @@ -699,21 +699,8 @@ def _attach_result_attrs(self, df): model_versions[str(name)] = str(version) if model_versions: df.attrs["topiary_models"] = model_versions - kind_support = self._result_kind_support() - if kind_support: - df.attrs["topiary_kind_support"] = kind_support return df - def _result_kind_support(self): - """Return kind_support for public DataFrame attrs when available.""" - support = {} - for key, model in zip(self._model_keys, self.models): - kind_support = getattr(model, "kind_support", None) - if kind_support is None: - return {} - support[key] = dict(kind_support()) - return support - def _expand_named_peptide_predictions(self, model_df, peptide_names_df): """Attach the original peptide names to model predictions.""" if model_df.empty: diff --git a/topiary/result.py b/topiary/result.py index 62b2885..fbaa1fd 100644 --- a/topiary/result.py +++ b/topiary/result.py @@ -7,7 +7,6 @@ import warnings from collections import OrderedDict -from collections.abc import Mapping import pandas as pd @@ -80,10 +79,6 @@ def __init__( if models is None and hasattr(df, "attrs"): models = df.attrs.get("topiary_models") - if extra is None and hasattr(df, "attrs"): - kind_support = df.attrs.get("topiary_kind_support") - if kind_support: - extra = OrderedDict([("kind_support", kind_support)]) self.df = df self.topiary_version = topiary_version @@ -463,8 +458,7 @@ def combine_predictor_results(results, on=("peptide", "allele")): Returns ------- TopiaryResult - Combined long-form result with merged model metadata and, when present - on the inputs, merged ``extra["kind_support"]`` metadata. + Combined long-form result with merged model metadata. """ results = [_as_topiary_result(r) for r in results] if not results: @@ -479,12 +473,11 @@ def combine_predictor_results(results, on=("peptide", "allele")): _validate_unique_prediction_methods(results) _validate_same_identity_keys(results, on) - merged_kind_support = _merge_kind_support(results) combined = concat(results) - if merged_kind_support: + if "kind_support" in combined.extra: extra = OrderedDict(combined.extra) - extra["kind_support"] = merged_kind_support + extra.pop("kind_support", None) combined.extra = extra return combined @@ -575,70 +568,3 @@ def _format_key_examples(keys, limit=5): shown = ordered[:limit] suffix = "" if len(ordered) <= limit else f" ... +{len(ordered) - limit} more" return f"{shown}{suffix}" - - -def _merge_kind_support(results): - merged = OrderedDict() - for index, result in enumerate(results): - kind_support = _normalize_kind_support( - result.extra.get("kind_support"), index - ) - if not kind_support: - continue - _validate_single_allele_kind_support(kind_support, index) - for model_key, kind_map in kind_support.items(): - if model_key in merged: - raise ValueError( - "combine_predictor_results cannot merge duplicate " - f"kind_support model key {model_key!r}" - ) - merged[model_key] = OrderedDict( - (kind, dict(meta)) for kind, meta in kind_map.items() - ) - return merged - - -def _normalize_kind_support(kind_support, result_index): - if not kind_support: - return OrderedDict() - if not isinstance(kind_support, Mapping): - raise ValueError( - "combine_predictor_results expected " - f"extra['kind_support'] for result {result_index} to be a " - f"mapping, got {type(kind_support).__name__}" - ) - - normalized = OrderedDict() - for model_key, kind_map in kind_support.items(): - if not isinstance(kind_map, Mapping): - raise ValueError( - "combine_predictor_results expected " - f"extra['kind_support'][{model_key!r}] for result " - f"{result_index} to be a mapping, got " - f"{type(kind_map).__name__}" - ) - normalized[model_key] = OrderedDict() - for kind, meta in kind_map.items(): - if not isinstance(meta, Mapping): - raise ValueError( - "combine_predictor_results expected " - f"extra['kind_support'][{model_key!r}][{kind!r}] " - f"for result {result_index} to be a mapping, got " - f"{type(meta).__name__}" - ) - normalized[model_key][kind] = dict(meta) - return normalized - - -def _validate_single_allele_kind_support(kind_support, result_index): - for model_key, kind_map in kind_support.items(): - for kind, meta in kind_map.items(): - dependence = meta.get("mhc_dependence") - if dependence != "single_allele": - raise ValueError( - "combine_predictor_results currently supports only " - "single_allele predictor rows. " - f"Result {result_index} reports {dependence!r} for " - f"{model_key!r}/{kind!r}; haplotype-mode combining " - "depends on #168/#169." - ) From d1784c87163a77f10367bd9fbc3c5e0d7ae4835e Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Fri, 15 May 2026 20:48:17 -0400 Subject: [PATCH 04/17] Fix combine metadata edge cases --- tests/test_combine_predictor_results.py | 25 ++++++ tests/test_io.py | 34 ++++++++ topiary/io.py | 107 ++++++++++++++++++------ topiary/result.py | 34 ++++++-- 4 files changed, 166 insertions(+), 34 deletions(-) diff --git a/tests/test_combine_predictor_results.py b/tests/test_combine_predictor_results.py index 6496d32..8fa9ffc 100644 --- a/tests/test_combine_predictor_results.py +++ b/tests/test_combine_predictor_results.py @@ -143,6 +143,15 @@ def test_combine_rejects_different_identity_sets(): combine_predictor_results([r1, r2]) +def test_combine_treats_null_identity_keys_as_equal(): + r1 = _simple_result("netmhcpan", allele=pd.NA) + r2 = _simple_result("mhcflurry", allele=pd.NA) + + combined = combine_predictor_results([r1, r2]) + + assert len(combined) == 2 + + def test_combine_rejects_duplicate_prediction_methods(): r1 = _simple_result("netmhcpan") r2 = _simple_result("netmhcpan") @@ -172,3 +181,19 @@ def test_topiary_result_reads_predictor_model_attrs(): assert result.models == {"netmhcpan": "4.1b"} assert result.extra == {} + + +def test_topiary_result_model_attrs_filtered_to_observed_rows(): + peptides = {"pep1": "SIINFEKLA"} + alleles = ["HLA-A*02:01"] + df = TopiaryPredictor( + models=[ + ToyAffinityPredictor("netmhcpan", "4.1b", alleles, offset=100), + ToyAffinityPredictor("mhcflurry", "2.1.1", alleles, offset=200), + ] + ).predict_from_named_peptides(peptides) + filtered = df[df["prediction_method_name"] == "netmhcpan"] + + result = TopiaryResult(filtered) + + assert result.models == {"netmhcpan": "4.1b"} diff --git a/tests/test_io.py b/tests/test_io.py index 860f728..8e9c63d 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -253,6 +253,40 @@ def test_model_versions_auto_extracted(self, tmp_path): meta = read_tsv(path).metadata assert meta.models.get("netmhcpan") == "4.1b" + def test_model_attrs_filtered_to_observed_long_rows(self, tmp_path): + df = _sample_long_df() + df.loc[1, "prediction_method_name"] = "mhcflurry" + df.loc[1, "predictor_version"] = "2.1.1" + df.attrs["topiary_models"] = { + "netmhcpan": "4.1b", + "mhcflurry": "2.1.1", + } + filtered = df[df["prediction_method_name"] == "netmhcpan"] + + path = tmp_path / "filtered.tsv" + to_tsv(filtered, path) + meta = read_tsv(path).metadata + + assert meta.models == {"netmhcpan": "4.1b"} + + def test_result_metadata_filtered_to_observed_long_rows(self, tmp_path): + from topiary import TopiaryResult + + df = _sample_long_df() + df.loc[1, "prediction_method_name"] = "mhcflurry" + df.loc[1, "predictor_version"] = "2.1.1" + result = TopiaryResult( + df, + models={"netmhcpan": "4.1b", "mhcflurry": "2.1.1"}, + ) + filtered = result[result["prediction_method_name"] == "netmhcpan"] + + path = tmp_path / "filtered_result.tsv" + to_tsv(filtered, path) + meta = read_tsv(path).metadata + + assert meta.models == {"netmhcpan": "4.1b"} + class TestReadWriteCSV: def test_csv_roundtrip(self, tmp_path): diff --git a/topiary/io.py b/topiary/io.py index b40c224..a021b5b 100644 --- a/topiary/io.py +++ b/topiary/io.py @@ -153,6 +153,76 @@ def _format_extra_value(value): return str(value) +def _models_from_long_rows(df): + """Extract observed model versions from long-form rows, if possible.""" + if ( + "prediction_method_name" not in df.columns + or "predictor_version" not in df.columns + ): + return None + + models = OrderedDict() + for method, version in ( + df.dropna(subset=["prediction_method_name"]) + .groupby("prediction_method_name", sort=False)["predictor_version"] + .first() + .items() + ): + method_str = str(method).strip() + if not method_str: + continue + version_str = str(version).strip() if pd.notna(version) else "" + models[method_str] = version_str + return models + + +def _observed_model_names(df): + """Return model names visible in prediction columns, or None if unknown.""" + if "prediction_method_name" in df.columns: + return { + str(method).strip() + for method in df["prediction_method_name"].dropna().unique() + if str(method).strip() + } + + from .wide import _parse_wide_column + + models = set() + for column in df.columns: + parsed = _parse_wide_column(str(column)) + if parsed is not None and parsed[0]: + models.add(str(parsed[0])) + return models if models else None + + +def _models_from_attrs(df): + """Extract non-stale model attrs by intersecting with observed models.""" + if not hasattr(df, "attrs"): + return OrderedDict() + + attr_models = df.attrs.get("topiary_models") + if not attr_models: + return OrderedDict() + + observed = _observed_model_names(df) + if observed is None: + return OrderedDict() + + return OrderedDict( + (str(model), str(version)) + for model, version in attr_models.items() + if str(model) in observed + ) + + +def _models_from_dataframe(df): + """Extract model metadata from the DataFrame contents before attrs.""" + row_models = _models_from_long_rows(df) + if row_models is not None: + return row_models + return _models_from_attrs(df) + + # -- Read ------------------------------------------------------------------ @@ -223,11 +293,13 @@ def _write_delimited(df, path, sep, metadata, index): from . import __version__ from .wide import detect_form + metadata_from_result = False # Accept TopiaryResult too — pull out its df and metadata. # Use duck typing to avoid a circular import. if hasattr(df, "df") and hasattr(df, "metadata"): if metadata is None: metadata = df.metadata + metadata_from_result = True df = df.df path = Path(path) @@ -241,33 +313,14 @@ def _write_delimited(df, path, sep, metadata, index): if not metadata.form: metadata.form = detect_form(df) - if hasattr(df, "attrs"): - if not metadata.models: - attr_models = df.attrs.get("topiary_models") - if attr_models: - metadata.models.update( - (str(model), str(version)) - for model, version in attr_models.items() - ) - - # Auto-extract model versions from long-form data. - if ( - not metadata.models - and "prediction_method_name" in df.columns - and "predictor_version" in df.columns - ): - for method, version in ( - df.dropna(subset=["prediction_method_name"]) - .groupby("prediction_method_name")["predictor_version"] - .first() - .items() - ): - version_str = str(version).strip() if pd.notna(version) else "" - if version_str: - metadata.models[str(method)] = version_str - else: - # Record model even without version - metadata.models[str(method)] = "" + row_models = _models_from_long_rows(df) + if metadata_from_result and row_models is not None: + metadata.models = row_models + elif not metadata.models: + if row_models is not None: + metadata.models.update(row_models) + else: + metadata.models.update(_models_from_attrs(df)) comment_block = _format_comment_block(metadata) diff --git a/topiary/result.py b/topiary/result.py index fbaa1fd..670875f 100644 --- a/topiary/result.py +++ b/topiary/result.py @@ -10,10 +10,18 @@ import pandas as pd -from .io import Metadata +from .io import Metadata, _models_from_dataframe from .wide import detect_form +class _MissingIdentityValue: + def __repr__(self): + return "" + + +_MISSING_IDENTITY_VALUE = _MissingIdentityValue() + + class TopiaryResult: """A prediction DataFrame bundled with its provenance and pipeline state. @@ -78,7 +86,7 @@ def __init__( extra = extra if extra is not None else metadata.extra if models is None and hasattr(df, "attrs"): - models = df.attrs.get("topiary_models") + models = _models_from_dataframe(df) self.df = df self.topiary_version = topiary_version @@ -534,11 +542,23 @@ def _validate_unique_prediction_methods(results): def _identity_keys(result, on): - return set( - result.df.loc[:, list(on)] - .drop_duplicates() - .itertuples(index=False, name=None) - ) + return { + tuple(_normalize_identity_value(value) for value in key) + for key in ( + result.df.loc[:, list(on)] + .drop_duplicates() + .itertuples(index=False, name=None) + ) + } + + +def _normalize_identity_value(value): + try: + if pd.isna(value): + return _MISSING_IDENTITY_VALUE + except (TypeError, ValueError): + pass + return value def _validate_same_identity_keys(results, on): From fd8292605c69184623581b92b02e50f19a6d7e17 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Fri, 15 May 2026 22:45:38 -0400 Subject: [PATCH 05/17] Normalize combine predictor metadata --- tests/test_combine_predictor_results.py | 55 ++++++++++++++++++++++++- topiary/result.py | 11 +++++ 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/tests/test_combine_predictor_results.py b/tests/test_combine_predictor_results.py index 8fa9ffc..9d02bb2 100644 --- a/tests/test_combine_predictor_results.py +++ b/tests/test_combine_predictor_results.py @@ -9,6 +9,7 @@ combine_predictor_results, read_csv, read_tsv, + to_wide, ) from topiary.io import Metadata @@ -53,6 +54,18 @@ def _sort_predictions(df): ) +def _sort_wide(df): + cols = [ + c for c in ["source_sequence_name", "peptide", "allele"] + if c in df.columns + ] + return ( + df.sort_values(cols) + .reset_index(drop=True) + .loc[:, sorted(df.columns)] + ) + + def _simple_result(method="netmhcpan", peptide="SIINFEKLA", allele="HLA-A*02:01"): df = pd.DataFrame([{ "source_sequence_name": "pep1", @@ -96,6 +109,10 @@ def test_combine_separate_predictor_runs_matches_combined_run(): _sort_predictions(combined.df), _sort_predictions(direct), ) + pd.testing.assert_frame_equal( + _sort_wide(to_wide(combined.df)), + _sort_wide(to_wide(direct)), + ) assert combined.models == {"netmhcpan": "4.1b", "mhcflurry": "2.1.1"} assert "kind_support" not in combined.extra @@ -126,15 +143,49 @@ def test_combine_roundtripped_topiary_results(tmp_path): read_csv(flurry_path), ]) - combined_df = combined.df.drop(columns=["source"], errors="ignore") + assert "source" not in combined.df.columns + assert combined.sources == ["netmhcpan.tsv", "mhcflurry.csv"] pd.testing.assert_frame_equal( - _sort_predictions(combined_df), + _sort_predictions(combined.df), _sort_predictions(direct), ) + pd.testing.assert_frame_equal( + _sort_wide(to_wide(combined.df)), + _sort_wide(to_wide(direct)), + ) assert combined.models == {"netmhcpan": "4.1b", "mhcflurry": "2.1.1"} assert "kind_support" not in combined.extra +def test_combine_recomputes_models_from_combined_rows(): + peptides = {"pep1": "SIINFEKLA"} + alleles = ["HLA-A*02:01"] + netmhcpan = ToyAffinityPredictor("netmhcpan", "4.1b", alleles, offset=100) + mhcflurry = ToyAffinityPredictor("mhcflurry", "2.1.1", alleles, offset=200) + direct = TopiaryPredictor( + models=[netmhcpan, mhcflurry] + ).predict_from_named_peptides(peptides) + + stale_models = { + "netmhcpan": "4.1b", + "mhcflurry": "2.1.1", + "old_model": "0.1", + } + net_only = TopiaryResult( + direct[direct["prediction_method_name"] == "netmhcpan"], + models=stale_models, + ) + flurry_only = TopiaryResult( + direct[direct["prediction_method_name"] == "mhcflurry"], + models=stale_models, + ) + + combined = combine_predictor_results([net_only, flurry_only]) + + assert combined.models == {"netmhcpan": "4.1b", "mhcflurry": "2.1.1"} + assert combined.metadata.models == combined.models + + def test_combine_rejects_different_identity_sets(): r1 = _simple_result("netmhcpan", peptide="SIINFEKLA") r2 = _simple_result("mhcflurry", peptide="ELAGIGILT") diff --git a/topiary/result.py b/topiary/result.py index 670875f..3e1caaf 100644 --- a/topiary/result.py +++ b/topiary/result.py @@ -482,7 +482,9 @@ def combine_predictor_results(results, on=("peptide", "allele")): _validate_unique_prediction_methods(results) _validate_same_identity_keys(results, on) + results = [_drop_non_identity_source(result, on) for result in results] combined = concat(results) + combined.models = _models_from_dataframe(combined.df) if "kind_support" in combined.extra: extra = OrderedDict(combined.extra) extra.pop("kind_support", None) @@ -541,6 +543,15 @@ def _validate_unique_prediction_methods(results): seen[method] = index +def _drop_non_identity_source(result, on): + if "source" not in result.df.columns or "source" in on: + return result + return TopiaryResult( + result.df.drop(columns=["source"]), + **result._field_kwargs(), + ) + + def _identity_keys(result, on): return { tuple(_normalize_identity_value(value) for value in key) From 8490e154cbe470d32a729abe3912d4a16b030c3d Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Sat, 16 May 2026 10:26:49 -0400 Subject: [PATCH 06/17] Tighten combined predictor identity checks --- tests/test_combine_predictor_results.py | 37 +++++++++++++ topiary/io.py | 4 ++ topiary/result.py | 74 +++++++++++++++++++++++-- 3 files changed, 110 insertions(+), 5 deletions(-) diff --git a/tests/test_combine_predictor_results.py b/tests/test_combine_predictor_results.py index 9d02bb2..cb2f10c 100644 --- a/tests/test_combine_predictor_results.py +++ b/tests/test_combine_predictor_results.py @@ -186,6 +186,25 @@ def test_combine_recomputes_models_from_combined_rows(): assert combined.metadata.models == combined.models +def test_combine_fills_missing_row_versions_from_observed_metadata(): + net_only = _simple_result("netmhcpan") + net_df = net_only.df.drop(columns=["predictor_version"]) + net_df.attrs["topiary_models"] = { + "netmhcpan": "4.1b", + "old_model": "0.1", + } + + flurry_only = _simple_result("mhcflurry") + flurry_df = flurry_only.df.assign(predictor_version="") + flurry_df.attrs["topiary_models"] = {"mhcflurry": "2.1.1"} + + assert TopiaryResult(flurry_df).models == {"mhcflurry": "2.1.1"} + + combined = combine_predictor_results([net_df, flurry_df]) + + assert combined.models == {"netmhcpan": "4.1b", "mhcflurry": "2.1.1"} + + def test_combine_rejects_different_identity_sets(): r1 = _simple_result("netmhcpan", peptide="SIINFEKLA") r2 = _simple_result("mhcflurry", peptide="ELAGIGILT") @@ -194,6 +213,24 @@ def test_combine_rejects_different_identity_sets(): combine_predictor_results([r1, r2]) +def test_combine_rejects_mismatched_source_context(): + r1 = _simple_result("netmhcpan") + r1 = TopiaryResult( + pd.concat( + [ + r1.df, + r1.df.assign(source_sequence_name="pep1-copy"), + ], + ignore_index=True, + ), + models=r1.models, + ) + r2 = _simple_result("mhcflurry") + + with pytest.raises(ValueError, match="source_sequence_name"): + combine_predictor_results([r1, r2]) + + def test_combine_treats_null_identity_keys_as_equal(): r1 = _simple_result("netmhcpan", allele=pd.NA) r2 = _simple_result("mhcflurry", allele=pd.NA) diff --git a/topiary/io.py b/topiary/io.py index a021b5b..0812489 100644 --- a/topiary/io.py +++ b/topiary/io.py @@ -219,6 +219,10 @@ def _models_from_dataframe(df): """Extract model metadata from the DataFrame contents before attrs.""" row_models = _models_from_long_rows(df) if row_models is not None: + attr_models = _models_from_attrs(df) + for model, version in row_models.items(): + if not version and model in attr_models: + row_models[model] = attr_models[model] return row_models return _models_from_attrs(df) diff --git a/topiary/result.py b/topiary/result.py index 3e1caaf..09c2600 100644 --- a/topiary/result.py +++ b/topiary/result.py @@ -22,6 +22,13 @@ def __repr__(self): _MISSING_IDENTITY_VALUE = _MissingIdentityValue() +_SOURCE_CONTEXT_IDENTITY_COLUMNS = ( + "source_sequence_name", + "peptide_offset", + "peptide_length", +) + + class TopiaryResult: """A prediction DataFrame bundled with its provenance and pipeline state. @@ -461,7 +468,9 @@ def combine_predictor_results(results, on=("peptide", "allele")): Separate predictor outputs to combine. on : tuple of str Columns defining the strict identity set. Defaults to - ``("peptide", "allele")``. + ``("peptide", "allele")``. Source context columns such as + ``source_sequence_name`` and ``peptide_offset`` are also checked + when present so repeated peptide/allele rows remain distinct. Returns ------- @@ -480,11 +489,12 @@ def combine_predictor_results(results, on=("peptide", "allele")): _validate_predictor_result(result, i, on) _validate_unique_prediction_methods(results) - _validate_same_identity_keys(results, on) + identity_columns = _identity_columns(results, on) + _validate_same_identity_keys(results, identity_columns) results = [_drop_non_identity_source(result, on) for result in results] combined = concat(results) - combined.models = _models_from_dataframe(combined.df) + combined.models = _models_from_observed_rows(combined.df, combined.models) if "kind_support" in combined.extra: extra = OrderedDict(combined.extra) extra.pop("kind_support", None) @@ -552,11 +562,65 @@ def _drop_non_identity_source(result, on): ) -def _identity_keys(result, on): +def _models_from_observed_rows(df, fallback_models): + models = OrderedDict() + fallback = OrderedDict( + (str(model).strip(), _model_version_str(version)) + for model, version in fallback_models.items() + if str(model).strip() + ) + for method, rows in ( + df.dropna(subset=["prediction_method_name"]) + .groupby("prediction_method_name", sort=False) + ): + method_str = str(method).strip() + if not method_str: + continue + version = _version_from_rows(rows) + if not version: + version = fallback.get(method_str, "") + models[method_str] = version + return models + + +def _version_from_rows(rows): + if "predictor_version" not in rows.columns: + return "" + for version in rows["predictor_version"]: + version_str = _model_version_str(version) + if version_str: + return version_str + return "" + + +def _model_version_str(value): + try: + if pd.isna(value): + return "" + except (TypeError, ValueError): + pass + return str(value).strip() + + +def _identity_columns(results, on): + columns = list(on) + for column in _SOURCE_CONTEXT_IDENTITY_COLUMNS: + if column not in columns and any(column in r.df.columns for r in results): + columns.append(column) + return tuple(columns) + + +def _identity_keys(result, columns): + identity_df = pd.DataFrame(index=result.df.index) + for column in columns: + if column in result.df.columns: + identity_df[column] = result.df[column] + else: + identity_df[column] = pd.NA return { tuple(_normalize_identity_value(value) for value in key) for key in ( - result.df.loc[:, list(on)] + identity_df .drop_duplicates() .itertuples(index=False, name=None) ) From 448c3670fcf14c4f59659e03356dc4773b3e1b4b Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Sat, 16 May 2026 19:20:22 -0400 Subject: [PATCH 07/17] Preserve model versions in predictor outputs --- tests/test_combine_predictor_results.py | 60 +++++++++++++++------ tests/test_io.py | 63 ++++++++++++++++++++++ topiary/io.py | 70 ++++++++++++++++++------- topiary/result.py | 17 ++---- 4 files changed, 163 insertions(+), 47 deletions(-) diff --git a/tests/test_combine_predictor_results.py b/tests/test_combine_predictor_results.py index cb2f10c..891f20a 100644 --- a/tests/test_combine_predictor_results.py +++ b/tests/test_combine_predictor_results.py @@ -91,6 +91,41 @@ def _simple_result(method="netmhcpan", peptide="SIINFEKLA", allele="HLA-A*02:01" ) +_CONTEXT_MISMATCH_VALUES = { + "source_sequence_name": ("pep1", "pep1-copy"), + "peptide_offset": (0, 1), + "peptide_length": (9, 10), + "n_flank": ("AAA", "BBB"), + "c_flank": ("CCC", "DDD"), +} + + +def _mismatched_context_results(column, right_has_column=True): + left, right = _CONTEXT_MISMATCH_VALUES[column] + r1 = _simple_result("netmhcpan") + r2 = _simple_result("mhcflurry") + r1.df[column] = left + if right_has_column: + r2.df[column] = right + elif column in r2.df.columns: + r2.df = r2.df.drop(columns=[column]) + return r1, r2 + + +def _input_pair(tmp_path, input_type, r1, r2): + if input_type == "dataframe": + return [r1.df, r2.df] + if input_type == "result": + return [r1, r2] + if input_type == "roundtrip": + r1_path = tmp_path / "netmhcpan.tsv" + r2_path = tmp_path / "mhcflurry.csv" + r1.to_tsv(r1_path) + r2.to_csv(r2_path) + return [read_tsv(r1_path), read_csv(r2_path)] + raise ValueError(f"unknown input type: {input_type}") + + def test_combine_separate_predictor_runs_matches_combined_run(): peptides = {"pep1": "SIINFEKLA", "pep2": "ELAGIGILT"} alleles = ["HLA-A*02:01", "HLA-B*07:02"] @@ -213,22 +248,17 @@ def test_combine_rejects_different_identity_sets(): combine_predictor_results([r1, r2]) -def test_combine_rejects_mismatched_source_context(): - r1 = _simple_result("netmhcpan") - r1 = TopiaryResult( - pd.concat( - [ - r1.df, - r1.df.assign(source_sequence_name="pep1-copy"), - ], - ignore_index=True, - ), - models=r1.models, - ) - r2 = _simple_result("mhcflurry") +@pytest.mark.parametrize("column", sorted(_CONTEXT_MISMATCH_VALUES)) +@pytest.mark.parametrize("input_type", ["dataframe", "result", "roundtrip"]) +@pytest.mark.parametrize("right_has_column", [True, False]) +def test_combine_rejects_mismatched_context_columns( + tmp_path, column, input_type, right_has_column, +): + r1, r2 = _mismatched_context_results(column, right_has_column) + inputs = _input_pair(tmp_path, input_type, r1, r2) - with pytest.raises(ValueError, match="source_sequence_name"): - combine_predictor_results([r1, r2]) + with pytest.raises(ValueError, match=column): + combine_predictor_results(inputs) def test_combine_treats_null_identity_keys_as_equal(): diff --git a/tests/test_io.py b/tests/test_io.py index 8e9c63d..fcf4c8b 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -204,6 +204,19 @@ def _sample_long_df(): ]) +def _sample_long_df_with_version_state(version_state): + df = _sample_long_df().iloc[[0]].copy() + if version_state == "missing": + return df.drop(columns=["predictor_version"]) + if version_state == "blank": + df["predictor_version"] = "" + return df + if version_state == "na": + df["predictor_version"] = pd.NA + return df + raise ValueError(f"unknown version state: {version_state}") + + # --------------------------------------------------------------------------- # Read/write round-trip tests # --------------------------------------------------------------------------- @@ -287,6 +300,56 @@ def test_result_metadata_filtered_to_observed_long_rows(self, tmp_path): assert meta.models == {"netmhcpan": "4.1b"} + @pytest.mark.parametrize("version_state", ["missing", "blank", "na"]) + @pytest.mark.parametrize( + "writer,reader,suffix", + [(to_tsv, read_tsv, "tsv"), (to_csv, read_csv, "csv")], + ) + def test_dataframe_writer_fills_blank_row_versions_from_attrs( + self, tmp_path, version_state, writer, reader, suffix, + ): + df = _sample_long_df_with_version_state(version_state) + df.attrs["topiary_models"] = { + "netmhcpan": "4.1b", + "old_model": "0.1", + } + + path = tmp_path / f"attrs.{suffix}" + writer(df, path) + + meta = reader(path).metadata + assert meta.models == {"netmhcpan": "4.1b"} + + @pytest.mark.parametrize("version_state", ["missing", "blank", "na"]) + @pytest.mark.parametrize( + "writer,method_name,reader,suffix", + [ + (to_tsv, "to_tsv", read_tsv, "tsv"), + (to_csv, "to_csv", read_csv, "csv"), + ], + ) + @pytest.mark.parametrize("call_style", ["function", "method"]) + def test_result_writer_fills_blank_row_versions_from_metadata( + self, tmp_path, version_state, writer, method_name, reader, suffix, + call_style, + ): + from topiary import TopiaryResult + + df = _sample_long_df_with_version_state(version_state) + result = TopiaryResult( + df, + models={"netmhcpan": "4.1b", "old_model": "0.1"}, + ) + + path = tmp_path / f"result.{suffix}" + if call_style == "function": + writer(result, path) + else: + getattr(result, method_name)(path) + + meta = reader(path).metadata + assert meta.models == {"netmhcpan": "4.1b"} + class TestReadWriteCSV: def test_csv_roundtrip(self, tmp_path): diff --git a/topiary/io.py b/topiary/io.py index 0812489..9629326 100644 --- a/topiary/io.py +++ b/topiary/io.py @@ -155,27 +155,40 @@ def _format_extra_value(value): def _models_from_long_rows(df): """Extract observed model versions from long-form rows, if possible.""" - if ( - "prediction_method_name" not in df.columns - or "predictor_version" not in df.columns - ): + if "prediction_method_name" not in df.columns: return None models = OrderedDict() - for method, version in ( + for method, rows in ( df.dropna(subset=["prediction_method_name"]) - .groupby("prediction_method_name", sort=False)["predictor_version"] - .first() - .items() + .groupby("prediction_method_name", sort=False) ): method_str = str(method).strip() if not method_str: continue - version_str = str(version).strip() if pd.notna(version) else "" - models[method_str] = version_str + models[method_str] = _version_from_rows(rows) return models +def _version_from_rows(rows): + if "predictor_version" not in rows.columns: + return "" + for version in rows["predictor_version"]: + version_str = _model_version_str(version) + if version_str: + return version_str + return "" + + +def _model_version_str(value): + try: + if pd.isna(value): + return "" + except (TypeError, ValueError): + pass + return str(value).strip() + + def _observed_model_names(df): """Return model names visible in prediction columns, or None if unknown.""" if "prediction_method_name" in df.columns: @@ -219,14 +232,31 @@ def _models_from_dataframe(df): """Extract model metadata from the DataFrame contents before attrs.""" row_models = _models_from_long_rows(df) if row_models is not None: - attr_models = _models_from_attrs(df) - for model, version in row_models.items(): - if not version and model in attr_models: - row_models[model] = attr_models[model] - return row_models + return _fill_missing_model_versions(row_models, _models_from_attrs(df)) return _models_from_attrs(df) +def _fill_missing_model_versions(models, *fallbacks): + models = OrderedDict(models) + fallback_models = [ + OrderedDict( + (str(model).strip(), _model_version_str(version)) + for model, version in fallback.items() + if str(model).strip() + ) + for fallback in fallbacks + if fallback + ] + for model, version in models.items(): + if version: + continue + for fallback in fallback_models: + if fallback.get(model): + models[model] = fallback[model] + break + return models + + # -- Read ------------------------------------------------------------------ @@ -318,13 +348,13 @@ def _write_delimited(df, path, sep, metadata, index): metadata.form = detect_form(df) row_models = _models_from_long_rows(df) + df_models = _models_from_dataframe(df) if metadata_from_result and row_models is not None: - metadata.models = row_models + metadata.models = _fill_missing_model_versions( + df_models, metadata.models, + ) elif not metadata.models: - if row_models is not None: - metadata.models.update(row_models) - else: - metadata.models.update(_models_from_attrs(df)) + metadata.models.update(df_models) comment_block = _format_comment_block(metadata) diff --git a/topiary/result.py b/topiary/result.py index 09c2600..424739c 100644 --- a/topiary/result.py +++ b/topiary/result.py @@ -10,7 +10,7 @@ import pandas as pd -from .io import Metadata, _models_from_dataframe +from .io import Metadata, _model_version_str, _models_from_dataframe from .wide import detect_form @@ -26,6 +26,8 @@ def __repr__(self): "source_sequence_name", "peptide_offset", "peptide_length", + "n_flank", + "c_flank", ) @@ -279,11 +281,11 @@ def sort_by(self, expr): def to_tsv(self, path): from .io import to_tsv as _to_tsv - _to_tsv(self.df, path, metadata=self.metadata) + _to_tsv(self, path) def to_csv(self, path): from .io import to_csv as _to_csv - _to_csv(self.df, path, metadata=self.metadata) + _to_csv(self, path) # -- Accessors / helpers ---------------------------------------------- @@ -593,15 +595,6 @@ def _version_from_rows(rows): return "" -def _model_version_str(value): - try: - if pd.isna(value): - return "" - except (TypeError, ValueError): - pass - return str(value).strip() - - def _identity_columns(results, on): columns = list(on) for column in _SOURCE_CONTEXT_IDENTITY_COLUMNS: From d14478a4008d0aab5e8f967241b8bc316b995cb5 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Sat, 16 May 2026 21:23:21 -0400 Subject: [PATCH 08/17] Fill mixed predictor model versions --- tests/test_predictor_internals.py | 62 ++++++++++++++++++++++++++++++ topiary/predictor.py | 63 ++++++++++++++++++++++--------- 2 files changed, 108 insertions(+), 17 deletions(-) diff --git a/tests/test_predictor_internals.py b/tests/test_predictor_internals.py index 7287053..983efef 100644 --- a/tests/test_predictor_internals.py +++ b/tests/test_predictor_internals.py @@ -159,6 +159,39 @@ def predict_proteins_dataframe(self, name_to_sequence_dict): return pd.concat([affinity_df, presentation_df], ignore_index=True) +class _ToyVersionModel: + default_peptide_lengths = [9] + supported_kinds = ("pMHC_affinity",) + + def __init__(self, name, version, row_version): + self.prediction_method_name = name + self.predictor_version = version + self.row_version = row_version + + def predict_dataframe(self, peptides): + rows = [] + for peptide in peptides: + rows.append({ + "peptide": peptide, + "allele": "HLA-A*02:01", + "kind": "pMHC_affinity", + "score": 0.5, + "value": 100.0, + "percentile_rank": 1.0, + "predictor_name": self.prediction_method_name, + }) + df = pd.DataFrame(rows) + if self.row_version == "missing": + return df + if self.row_version == "blank": + df["predictor_version"] = "" + elif self.row_version == "na": + df["predictor_version"] = pd.NA + else: + df["predictor_version"] = self.row_version + return df + + def test_end_to_end_presentation_value_populated(): # Public-API check: presentation rows produced through # predict_from_named_sequences must report value == score (not NaN), @@ -177,6 +210,35 @@ def test_end_to_end_presentation_value_populated(): assert (aff["value"] == aff["affinity"]).all() +@pytest.mark.parametrize("missing_state", ["missing", "blank", "na"]) +def test_result_attrs_fill_versions_per_missing_method(tmp_path, missing_state): + from topiary import TopiaryResult, read_tsv + + predictor = TopiaryPredictor(models=[ + _ToyVersionModel("with_rows", "1.0", "1.0"), + _ToyVersionModel("from_model", "2.0", missing_state), + ]) + + df = predictor.predict_from_named_peptides({"pep": "SIINFEKLA"}) + + assert df.attrs["topiary_models"] == { + "with_rows": "1.0", + "from_model": "2.0", + } + assert TopiaryResult(df).models == { + "with_rows": "1.0", + "from_model": "2.0", + } + + path = tmp_path / "predictions.tsv" + TopiaryResult(df).to_tsv(path) + + assert read_tsv(path).models == { + "with_rows": "1.0", + "from_model": "2.0", + } + + # --------------------------------------------------------------------------- # _attach_expression_data: additional edge cases # --------------------------------------------------------------------------- diff --git a/topiary/predictor.py b/topiary/predictor.py index a42d1ec..07ec9d6 100644 --- a/topiary/predictor.py +++ b/topiary/predictor.py @@ -28,6 +28,7 @@ apply_sort, parse, ) +from .io import _model_version_str from .protein_fragment import ProteinFragment from .sequence_helpers import ( check_padding_around_mutation, @@ -97,6 +98,25 @@ def _unique_model_keys(models): return keys +def _model_metadata_name(model): + return ( + getattr(model, "prediction_method_name", None) + or getattr(model, "predictor_name", None) + or getattr(model, "name", None) + or type(model).__name__ + ) + + +def _model_metadata_versions(models): + versions = {} + for model in models: + name = _model_metadata_name(model) + version = _model_version_str(getattr(model, "predictor_version", None)) + if name and version: + versions[str(name)] = version + return versions + + def _build_model_lookup(): """Build a lowercase name → mhctools predictor class mapping.""" import inspect @@ -677,26 +697,35 @@ def _strip_internal_columns(self, df): def _attach_result_attrs(self, df): """Attach lightweight metadata to public DataFrame outputs.""" model_versions = {} + observed_methods = [] + if "prediction_method_name" in df.columns: + observed_methods = [ + str(method).strip() + for method in df["prediction_method_name"].dropna().unique() + if str(method).strip() + ] if "prediction_method_name" in df.columns and "predictor_version" in df.columns: - for method, version in ( + for method, rows in ( df.dropna(subset=["prediction_method_name"]) - .groupby("prediction_method_name")["predictor_version"] - .first() - .items() + .groupby("prediction_method_name", sort=False) ): - if pd.notna(version) and str(version): - model_versions[str(method)] = str(version) - if not model_versions: - for model in self.models: - name = ( - getattr(model, "prediction_method_name", None) - or getattr(model, "predictor_name", None) - or getattr(model, "name", None) - or type(model).__name__ - ) - version = getattr(model, "predictor_version", None) - if name and version: - model_versions[str(name)] = str(version) + method_str = str(method).strip() + if not method_str: + continue + for version in rows["predictor_version"]: + version_str = _model_version_str(version) + if version_str: + model_versions[method_str] = version_str + break + + fallback_versions = _model_metadata_versions(self.models) + if observed_methods: + for method in observed_methods: + if not model_versions.get(method) and fallback_versions.get(method): + model_versions[method] = fallback_versions[method] + elif not model_versions: + model_versions.update(fallback_versions) + if model_versions: df.attrs["topiary_models"] = model_versions return df From 44e7158390f7963eb66063fdae7402aadc1e05c4 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Sun, 17 May 2026 10:33:05 -0400 Subject: [PATCH 09/17] Support split predictor grid combines --- CHANGELOG.md | 14 +- docs/ranking.md | 33 +++- tests/test_combine_predictor_results.py | 200 +++++++++++++++++++++++- topiary/result.py | 143 ++++++++++++----- 4 files changed, 338 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b4befc..98bab23 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,14 @@ **Combine separate predictor runs (#170):** `topiary.combine_predictor_results([a, b, ...])` stacks separate -single-allele predictor outputs into the same long-form shape produced -by running those predictors together. It accepts `TopiaryResult` or -fresh `TopiaryPredictor` DataFrame outputs, strictly validates that all -inputs cover the same `(peptide, allele)` identity set, rejects -duplicate `prediction_method_name` values across inputs, and merges -model metadata when present. +predictor outputs into the same long-form shape produced by running +those predictors together. It accepts `TopiaryResult` or fresh +`TopiaryPredictor` DataFrame outputs, supports both split-by-predictor +and split-by-allele/peptide-length runs, rejects duplicate +`(prediction_method_name, kind, identity)` predictions, and by default +requires every emitted `(prediction_method_name, kind)` group to cover +the same identity grid. Use `coverage="partial"` only for deliberate +sparse unions. Fresh `TopiaryPredictor` DataFrames now carry lightweight `DataFrame.attrs` model-version metadata (`topiary_models`) so this diff --git a/docs/ranking.md b/docs/ranking.md index 06e6b88..edefbd7 100644 --- a/docs/ranking.md +++ b/docs/ranking.md @@ -390,11 +390,36 @@ mhcflurry_rows = TopiaryPredictor( combined = combine_predictor_results([netmhcpan_rows, mhcflurry_rows]) ``` -The helper is intentionally strict. Every input must cover the same -`(peptide, allele)` keys, and each `prediction_method_name` may appear in only -one input. The combined result preserves the original rows: use each row's +You can also shard the same predictor over allele or peptide-length batches and +combine the shards: + +```python +shards = [] +for allele in ["HLA-A*02:01", "HLA-B*07:02"]: + shards.append( + TopiaryPredictor( + models=NetMHCpan, + alleles=[allele], + ).predict_from_named_peptides(peptides) + ) + +combined = combine_predictor_results(shards) +``` + +The helper is intentionally strict. It rejects duplicate +`(prediction_method_name, kind, identity)` rows, and by default requires every +emitted `(prediction_method_name, kind)` group to cover the same peptide/allele +identity grid. This catches incomplete split runs before `to_wide()` can +produce half-populated rows. If you intentionally want a sparse union, pass +`coverage="partial"`; duplicate predictions are still rejected. + +The combined result preserves the original rows: use each row's `prediction_method_name`, `predictor_version`, `kind`, and value/rank columns -to inspect which predictor produced which quantity. +to inspect which predictor produced which quantity. Allele aggregation remains +part of the ranking DSL: for example, +`Affinity["netmhcpan"].best_value_allele` and +`Presentation["netmhcpan"].best_score_allele` report the allele associated +with the best BA or EL value across the combined allele grid. ## Putting it together diff --git a/tests/test_combine_predictor_results.py b/tests/test_combine_predictor_results.py index 891f20a..af473d7 100644 --- a/tests/test_combine_predictor_results.py +++ b/tests/test_combine_predictor_results.py @@ -4,6 +4,9 @@ import pytest from topiary import ( + Affinity, + EvalContext, + Presentation, TopiaryPredictor, TopiaryResult, combine_predictor_results, @@ -42,6 +45,60 @@ def predict_dataframe(self, peptides): return pd.DataFrame(rows) +class ToyGridPredictor: + default_peptide_lengths = [9, 10] + supported_kinds = ("pMHC_affinity", "pMHC_presentation") + + def __init__( + self, name, version, alleles, offset, peptide_lengths=None, + kinds=None, + ): + self.prediction_method_name = name + self.predictor_version = version + self.alleles = alleles + self.offset = offset + self.peptide_lengths = set(peptide_lengths) if peptide_lengths else None + self.kinds = tuple(kinds) if kinds else self.supported_kinds + + def predict_dataframe(self, peptides): + rows = [] + for peptide in peptides: + peptide_length = len(peptide) + if self.peptide_lengths and peptide_length not in self.peptide_lengths: + continue + for allele_i, allele in enumerate(self.alleles): + allele_rank = { + "HLA-A*02:01": 0, + "HLA-B*07:02": 1, + }.get(allele, allele_i) + base = self.offset + peptide_length * 10 + allele_rank + if "pMHC_affinity" in self.kinds: + affinity = 100.0 + base + allele_rank * 100.0 + rows.append({ + "peptide": peptide, + "allele": allele, + "kind": "pMHC_affinity", + "value": affinity, + "score": 1.0 / affinity, + "percentile_rank": affinity / 100.0, + "predictor_name": self.prediction_method_name, + "predictor_version": self.predictor_version, + }) + if "pMHC_presentation" in self.kinds: + score = 0.1 + allele_rank * 0.5 + peptide_length / 1000.0 + rows.append({ + "peptide": peptide, + "allele": allele, + "kind": "pMHC_presentation", + "value": score, + "score": score, + "percentile_rank": 100.0 - score, + "predictor_name": self.prediction_method_name, + "predictor_version": self.predictor_version, + }) + return pd.DataFrame(rows) + + def _sort_predictions(df): cols = [ "source_sequence_name", "peptide", "allele", "kind", @@ -126,6 +183,14 @@ def _input_pair(tmp_path, input_type, r1, r2): raise ValueError(f"unknown input type: {input_type}") +def _grid_peptides(): + return {"pep9": "SIINFEKLA", "pep10": "SIINFEKLAA"} + + +def _grid_alleles(): + return ["HLA-A*02:01", "HLA-B*07:02"] + + def test_combine_separate_predictor_runs_matches_combined_run(): peptides = {"pep1": "SIINFEKLA", "pep2": "ELAGIGILT"} alleles = ["HLA-A*02:01", "HLA-B*07:02"] @@ -152,6 +217,103 @@ def test_combine_separate_predictor_runs_matches_combined_run(): assert "kind_support" not in combined.extra +def test_combine_same_method_split_by_allele_and_length_matches_direct_run(): + peptides = _grid_peptides() + alleles = _grid_alleles() + direct = TopiaryPredictor( + models=ToyGridPredictor("netmhcpan", "4.1b", alleles, offset=10) + ).predict_from_named_peptides(peptides) + split_results = [] + for allele in alleles: + for peptide_length in [8, 9, 10]: + split_results.append( + TopiaryPredictor( + models=ToyGridPredictor( + "netmhcpan", "4.1b", [allele], offset=10, + peptide_lengths=[peptide_length], + ) + ).predict_from_named_peptides(peptides) + ) + + combined = combine_predictor_results(split_results) + + pd.testing.assert_frame_equal( + _sort_predictions(combined.df), + _sort_predictions(direct), + ) + pd.testing.assert_frame_equal( + _sort_wide(to_wide(combined.df)), + _sort_wide(to_wide(direct)), + ) + assert combined.models == {"netmhcpan": "4.1b"} + + +def test_combine_multi_method_inputs_split_by_allele_match_direct_run(): + peptides = _grid_peptides() + alleles = _grid_alleles() + direct_models = [ + ToyGridPredictor("netmhcpan", "4.1b", alleles, offset=10), + ToyGridPredictor("mhcflurry", "2.1.1", alleles, offset=20), + ToyGridPredictor( + "pepsickle", "0.2", alleles, offset=30, + kinds=["pMHC_presentation"], + ), + ] + direct = TopiaryPredictor(models=direct_models).predict_from_named_peptides( + peptides + ) + split_results = [] + for allele in alleles: + split_results.append( + TopiaryPredictor(models=[ + ToyGridPredictor("netmhcpan", "4.1b", [allele], offset=10), + ToyGridPredictor("mhcflurry", "2.1.1", [allele], offset=20), + ToyGridPredictor( + "pepsickle", "0.2", [allele], offset=30, + kinds=["pMHC_presentation"], + ), + ]).predict_from_named_peptides(peptides) + ) + + combined = combine_predictor_results(split_results) + + pd.testing.assert_frame_equal( + _sort_predictions(combined.df), + _sort_predictions(direct), + ) + assert combined.models == { + "netmhcpan": "4.1b", + "mhcflurry": "2.1.1", + "pepsickle": "0.2", + } + + +def test_combined_split_grid_supports_best_ba_and_el_allele_aggregation(): + peptides = _grid_peptides() + alleles = _grid_alleles() + split_results = [ + TopiaryPredictor( + models=ToyGridPredictor( + "netmhcpan", "4.1b", [allele], offset=10, + peptide_lengths=[peptide_length], + ) + ).predict_from_named_peptides(peptides) + for allele in alleles + for peptide_length in [8, 9, 10] + ] + combined = combine_predictor_results(split_results) + ctx = EvalContext(combined.df) + + best_ba_allele = Affinity["netmhcpan"].best_value_allele.eval(ctx) + best_el_allele = Presentation["netmhcpan"].best_score_allele.eval(ctx) + + for source_name, peptide in peptides.items(): + for allele in alleles: + key = (source_name, peptide, 0, allele) + assert best_ba_allele.loc[key] == "HLA-A*02:01" + assert best_el_allele.loc[key] == "HLA-B*07:02" + + def test_combine_roundtripped_topiary_results(tmp_path): peptides = {"pep1": "SIINFEKLA", "pep2": "ELAGIGILT"} alleles = ["HLA-A*02:01", "HLA-B*07:02"] @@ -244,10 +406,44 @@ def test_combine_rejects_different_identity_sets(): r1 = _simple_result("netmhcpan", peptide="SIINFEKLA") r2 = _simple_result("mhcflurry", peptide="ELAGIGILT") - with pytest.raises(ValueError, match="same .* keys"): + with pytest.raises(ValueError, match="coverage='complete'"): combine_predictor_results([r1, r2]) +def test_combine_rejects_incomplete_method_coverage_within_one_input(): + method_a = pd.concat( + [ + _simple_result("netmhcpan", peptide="SIINFEKLA").df, + ], + ignore_index=True, + ) + method_b = pd.concat( + [ + _simple_result("mhcflurry", peptide="SIINFEKLA").df, + _simple_result("mhcflurry", peptide="ELAGIGILT").df, + ], + ignore_index=True, + ) + result = TopiaryResult(pd.concat([method_a, method_b], ignore_index=True)) + + with pytest.raises(ValueError, match="coverage='complete'"): + combine_predictor_results([result]) + + +def test_combine_partial_coverage_allows_sparse_union(): + r1 = _simple_result("netmhcpan", peptide="SIINFEKLA") + r2 = _simple_result("mhcflurry", peptide="ELAGIGILT") + + combined = combine_predictor_results([r1, r2], coverage="partial") + + assert len(combined) == 2 + + +def test_combine_rejects_unknown_coverage_mode(): + with pytest.raises(ValueError, match="coverage"): + combine_predictor_results([_simple_result("netmhcpan")], coverage="loose") + + @pytest.mark.parametrize("column", sorted(_CONTEXT_MISMATCH_VALUES)) @pytest.mark.parametrize("input_type", ["dataframe", "result", "roundtrip"]) @pytest.mark.parametrize("right_has_column", [True, False]) @@ -274,7 +470,7 @@ def test_combine_rejects_duplicate_prediction_methods(): r1 = _simple_result("netmhcpan") r2 = _simple_result("netmhcpan") - with pytest.raises(ValueError, match="duplicate prediction method"): + with pytest.raises(ValueError, match="duplicate predictions"): combine_predictor_results([r1, r2]) diff --git a/topiary/result.py b/topiary/result.py index 424739c..a820dfa 100644 --- a/topiary/result.py +++ b/topiary/result.py @@ -455,14 +455,15 @@ def concat(results): ) -def combine_predictor_results(results, on=("peptide", "allele")): +def combine_predictor_results(results, on=("peptide", "allele"), coverage="complete"): """Combine separate predictor outputs into one predictor-equivalent result. - This is stricter than :func:`concat`: every input must cover the same - identity key set, and no prediction method may appear in more than one - input. It is intended for the common single-allele case where, for example, - NetMHCpan and MHCflurry are run separately and then stacked into the same - long-form shape produced by running both models together. + This is stricter than :func:`concat`: duplicate predictions are rejected, + and by default every emitted prediction method/kind must cover the same + identity key set. It supports both common split patterns: + + - different predictors run separately on the same peptides; + - the same predictor run separately over disjoint allele/length shards. Parameters ---------- @@ -473,6 +474,13 @@ def combine_predictor_results(results, on=("peptide", "allele")): ``("peptide", "allele")``. Source context columns such as ``source_sequence_name`` and ``peptide_offset`` are also checked when present so repeated peptide/allele rows remain distinct. + coverage : {"complete", "partial"} or bool + ``"complete"`` (default) requires each emitted + ``(prediction_method_name, kind)`` group to cover the same identity + key set, matching a normal multi-predictor run. ``"partial"`` + allows sparse unions but still rejects duplicate predictions. + ``True`` and ``False`` are accepted as aliases for ``"complete"`` + and ``"partial"``. Returns ------- @@ -480,22 +488,24 @@ def combine_predictor_results(results, on=("peptide", "allele")): Combined long-form result with merged model metadata. """ results = [_as_topiary_result(r) for r in results] + results = [r for r in results if not r.df.empty] if not results: return TopiaryResult(pd.DataFrame()) if isinstance(on, str): on = (on,) on = tuple(on) + coverage = _normalize_coverage_mode(coverage) for i, result in enumerate(results): _validate_predictor_result(result, i, on) - _validate_unique_prediction_methods(results) - identity_columns = _identity_columns(results, on) - _validate_same_identity_keys(results, identity_columns) - results = [_drop_non_identity_source(result, on) for result in results] + identity_columns = _identity_columns(results, on) combined = concat(results) + _validate_no_duplicate_predictions(combined.df, identity_columns) + if coverage == "complete": + _validate_complete_prediction_coverage(combined.df, identity_columns) combined.models = _models_from_observed_rows(combined.df, combined.models) if "kind_support" in combined.extra: extra = OrderedDict(combined.extra) @@ -542,17 +552,17 @@ def _prediction_methods(result): } -def _validate_unique_prediction_methods(results): - seen = {} - for index, result in enumerate(results): - for method in sorted(_prediction_methods(result)): - if method in seen: - raise ValueError( - "combine_predictor_results cannot combine duplicate " - f"prediction method {method!r}; found in results " - f"{seen[method]} and {index}" - ) - seen[method] = index +def _normalize_coverage_mode(coverage): + if coverage is True: + return "complete" + if coverage is False: + return "partial" + if coverage not in {"complete", "partial"}: + raise ValueError( + "combine_predictor_results coverage must be 'complete' or " + f"'partial', got {coverage!r}" + ) + return coverage def _drop_non_identity_source(result, on): @@ -603,20 +613,20 @@ def _identity_columns(results, on): return tuple(columns) -def _identity_keys(result, columns): - identity_df = pd.DataFrame(index=result.df.index) +def _identity_frame(df, columns): + identity_df = pd.DataFrame(index=df.index) for column in columns: - if column in result.df.columns: - identity_df[column] = result.df[column] + if column in df.columns: + identity_df[column] = df[column] else: identity_df[column] = pd.NA + return identity_df + + +def _key_set(df): return { tuple(_normalize_identity_value(value) for value in key) - for key in ( - identity_df - .drop_duplicates() - .itertuples(index=False, name=None) - ) + for key in df.drop_duplicates().itertuples(index=False, name=None) } @@ -629,24 +639,77 @@ def _normalize_identity_value(value): return value -def _validate_same_identity_keys(results, on): - baseline = _identity_keys(results[0], on) - for index, result in enumerate(results[1:], start=1): - current = _identity_keys(result, on) - missing = baseline - current - extra = current - baseline +def _prediction_key_frame(df, identity_columns): + return pd.concat( + [ + _identity_frame(df, ("prediction_method_name", "kind")), + _identity_frame(df, identity_columns), + ], + axis=1, + ) + + +def _validate_no_duplicate_predictions(df, identity_columns): + key_df = _prediction_key_frame(df, identity_columns) + seen = {} + duplicates = [] + for row_index, key in zip( + key_df.index, key_df.itertuples(index=False, name=None) + ): + normalized = tuple(_normalize_identity_value(value) for value in key) + if normalized in seen: + duplicates.append(normalized) + else: + seen[normalized] = row_index + if duplicates: + raise ValueError( + "combine_predictor_results found duplicate predictions for " + "(prediction_method_name, kind, identity) keys: " + f"{_format_key_examples(set(duplicates))}" + ) + + +def _prediction_group_key_sets(df, identity_columns): + key_df = _prediction_key_frame(df, identity_columns) + key_df = key_df.rename( + columns={ + "prediction_method_name": "_prediction_method_name", + "kind": "_kind", + } + ) + groups = OrderedDict() + for (method, kind), group in key_df.groupby( + ["_prediction_method_name", "_kind"], dropna=False, sort=False + ): + method_key = _normalize_identity_value(method) + kind_key = _normalize_identity_value(kind) + groups[(method_key, kind_key)] = _key_set(group.loc[:, list(identity_columns)]) + return groups + + +def _validate_complete_prediction_coverage(df, identity_columns): + groups = _prediction_group_key_sets(df, identity_columns) + if not groups: + return + + baseline_group, baseline_keys = next(iter(groups.items())) + for group, keys in list(groups.items())[1:]: + missing = baseline_keys - keys + extra = keys - baseline_keys if missing or extra: message = [ - "combine_predictor_results requires every input to cover the " - f"same {on!r} keys; result {index} differs from result 0." + "combine_predictor_results coverage='complete' requires every " + "(prediction_method_name, kind) group to cover the same " + f"{identity_columns!r} keys; group {group!r} differs from " + f"group {baseline_group!r}." ] if missing: message.append( - f"Missing from result {index}: {_format_key_examples(missing)}" + f"Missing from group {group!r}: {_format_key_examples(missing)}" ) if extra: message.append( - f"Extra in result {index}: {_format_key_examples(extra)}" + f"Extra in group {group!r}: {_format_key_examples(extra)}" ) raise ValueError(" ".join(message)) From e22da05729ec986ae2fb692a5f9df0696b807582 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Sun, 17 May 2026 18:56:01 -0400 Subject: [PATCH 10/17] Add predictor run provenance --- CHANGELOG.md | 15 +++ docs/ranking.md | 50 ++++++-- tests/test_combine_predictor_results.py | 163 +++++++++++++++++++++++- topiary/predictor.py | 24 +++- topiary/wide.py | 3 +- 5 files changed, 239 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 98bab23..4eecd40 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,21 @@ produced which quantities: `prediction_method_name`, `predictor_version`, `kind`, and the value/rank columns are not duplicated into separate `kind_support` metadata. +`TopiaryPredictor(name=...)` now optionally records per-run provenance +in a `prediction_run_name` column. This is intended for split predictor +grids such as one NetMHCpan run per allele/peptide length: the logical +method remains `prediction_method_name="netmhcpan"`, while +`prediction_run_name` records the shard. `combine_predictor_results` +and `to_wide()` treat the run name as provenance, not as a separate +prediction identity, so disjoint shards combine cleanly and overlapping +shards still fail as duplicate predictions. + +The combine docs now spell out the recommended allele-grid strategy: +split NetMHCpan-style per-allele predictors can be combined under +`coverage="complete"`, while intentionally sparse grids such as +MHCflurry haplotype-mode presentation should use `coverage="partial"` +and the ranking DSL's `best_*_allele` accessors for allele attribution. + ## 5.16.1 **pirlygenes 5.1.0 integration:** diff --git a/docs/ranking.md b/docs/ranking.md index edefbd7..f4f79b5 100644 --- a/docs/ranking.md +++ b/docs/ranking.md @@ -391,21 +391,38 @@ combined = combine_predictor_results([netmhcpan_rows, mhcflurry_rows]) ``` You can also shard the same predictor over allele or peptide-length batches and -combine the shards: +combine the shards. Use `TopiaryPredictor(name=...)` when you want to keep +track of which batch produced each row: ```python shards = [] for allele in ["HLA-A*02:01", "HLA-B*07:02"]: - shards.append( - TopiaryPredictor( - models=NetMHCpan, - alleles=[allele], - ).predict_from_named_peptides(peptides) - ) + for length in [8, 9, 10, 11]: + length_peptides = { + name: peptide + for name, peptide in peptides.items() + if len(peptide) == length + } + shards.append( + TopiaryPredictor( + models=NetMHCpan, + alleles=[allele], + name=f"netmhcpan_{allele}_len{length}", + ).predict_from_named_peptides(length_peptides) + ) combined = combine_predictor_results(shards) ``` +`prediction_method_name` is still the logical predictor name (`netmhcpan` in +the example above). The optional `prediction_run_name` column is only +provenance for a particular run or shard. That distinction lets distinct +NetMHCpan allele/length shards combine into one logical NetMHCpan result, +while overlapping shards with the same `(prediction_method_name, kind, +peptide, allele, source context)` still fail as duplicates. `to_wide()` drops +`prediction_run_name` from the grouping keys, so a named split run has the same +wide shape as a single unsplit run. + The helper is intentionally strict. It rejects duplicate `(prediction_method_name, kind, identity)` rows, and by default requires every emitted `(prediction_method_name, kind)` group to cover the same peptide/allele @@ -415,11 +432,24 @@ produce half-populated rows. If you intentionally want a sparse union, pass The combined result preserves the original rows: use each row's `prediction_method_name`, `predictor_version`, `kind`, and value/rank columns -to inspect which predictor produced which quantity. Allele aggregation remains -part of the ranking DSL: for example, +to inspect which predictor produced which quantity. Use +`prediction_run_name` only to audit the batch that produced a row, not as a DSL +selector. + +Allele aggregation remains part of the ranking DSL: for example, `Affinity["netmhcpan"].best_value_allele` and `Presentation["netmhcpan"].best_score_allele` report the allele associated -with the best BA or EL value across the combined allele grid. +with the best BA or EL value across the combined allele grid. For predictors +that emit one row per allele, such as NetMHCpan or MHCflurry in single-allele +mode, this is the best per-allele row after all shards are combined. For +MHCflurry presentation in haplotype mode, MHCflurry itself sees the allele set +together and may emit one deconvolved best-allele row; combining independent +single-allele MHCflurry shards is therefore not the same calculation as a +direct haplotype-mode MHCflurry run. If you intentionally combine haplotype +presentation rows with per-allele rows, use `coverage="partial"` because those +kinds have different identity grids by construction. Processing-only +quantities that do not depend on allele should be read directly rather than +through `best_*`. ## Putting it together diff --git a/tests/test_combine_predictor_results.py b/tests/test_combine_predictor_results.py index af473d7..5ddc0cc 100644 --- a/tests/test_combine_predictor_results.py +++ b/tests/test_combine_predictor_results.py @@ -99,6 +99,45 @@ def predict_dataframe(self, peptides): return pd.DataFrame(rows) +class ToyHaplotypeMHCflurryPredictor: + default_peptide_lengths = [9, 10] + supported_kinds = ("pMHC_affinity", "pMHC_presentation") + + def __init__(self, alleles): + self.prediction_method_name = "mhcflurry" + self.predictor_version = "2.1.1" + self.alleles = alleles + + def predict_dataframe(self, peptides): + rows = [] + best_allele = self.alleles[-1] + for peptide in peptides: + peptide_length = len(peptide) + for allele_i, allele in enumerate(self.alleles): + affinity = 50.0 + peptide_length * 10 + allele_i * 100 + rows.append({ + "peptide": peptide, + "allele": allele, + "kind": "pMHC_affinity", + "value": affinity, + "score": 1.0 / affinity, + "percentile_rank": affinity / 100.0, + "predictor_name": self.prediction_method_name, + "predictor_version": self.predictor_version, + }) + rows.append({ + "peptide": peptide, + "allele": best_allele, + "kind": "pMHC_presentation", + "value": 0.9, + "score": 0.9, + "percentile_rank": 0.5, + "predictor_name": self.prediction_method_name, + "predictor_version": self.predictor_version, + }) + return pd.DataFrame(rows) + + def _sort_predictions(df): cols = [ "source_sequence_name", "peptide", "allele", "kind", @@ -123,6 +162,10 @@ def _sort_wide(df): ) +def _without_run_name(df): + return df.drop(columns=["prediction_run_name"], errors="ignore") + + def _simple_result(method="netmhcpan", peptide="SIINFEKLA", allele="HLA-A*02:01"): df = pd.DataFrame([{ "source_sequence_name": "pep1", @@ -231,14 +274,15 @@ def test_combine_same_method_split_by_allele_and_length_matches_direct_run(): models=ToyGridPredictor( "netmhcpan", "4.1b", [allele], offset=10, peptide_lengths=[peptide_length], - ) + ), + name=f"netmhcpan_{allele}_len{peptide_length}", ).predict_from_named_peptides(peptides) ) combined = combine_predictor_results(split_results) pd.testing.assert_frame_equal( - _sort_predictions(combined.df), + _sort_predictions(_without_run_name(combined.df)), _sort_predictions(direct), ) pd.testing.assert_frame_equal( @@ -246,6 +290,11 @@ def test_combine_same_method_split_by_allele_and_length_matches_direct_run(): _sort_wide(to_wide(direct)), ) assert combined.models == {"netmhcpan": "4.1b"} + assert set(combined.df["prediction_run_name"]) == { + f"netmhcpan_{allele}_len{peptide_length}" + for allele in alleles + for peptide_length in [9, 10] + } def test_combine_multi_method_inputs_split_by_allele_match_direct_run(): @@ -296,7 +345,8 @@ def test_combined_split_grid_supports_best_ba_and_el_allele_aggregation(): models=ToyGridPredictor( "netmhcpan", "4.1b", [allele], offset=10, peptide_lengths=[peptide_length], - ) + ), + name=f"netmhcpan_{allele}_len{peptide_length}", ).predict_from_named_peptides(peptides) for allele in alleles for peptide_length in [8, 9, 10] @@ -314,6 +364,113 @@ def test_combined_split_grid_supports_best_ba_and_el_allele_aggregation(): assert best_el_allele.loc[key] == "HLA-B*07:02" +def test_combine_haplotype_style_presentation_uses_partial_coverage(): + peptides = _grid_peptides() + alleles = _grid_alleles() + netmhcpan_rows = TopiaryPredictor( + models=ToyGridPredictor("netmhcpan", "4.1b", alleles, offset=10), + name="netmhcpan_all", + ).predict_from_named_peptides(peptides) + mhcflurry_rows = TopiaryPredictor( + models=ToyHaplotypeMHCflurryPredictor(alleles), + name="mhcflurry_haplotype", + ).predict_from_named_peptides(peptides) + + with pytest.raises(ValueError, match="coverage='complete'"): + combine_predictor_results([netmhcpan_rows, mhcflurry_rows]) + + combined = combine_predictor_results( + [netmhcpan_rows, mhcflurry_rows], + coverage="partial", + ) + ctx = EvalContext(combined.df) + best_flurry_allele = Presentation["mhcflurry"].best_score_allele.eval(ctx) + + for source_name, peptide in peptides.items(): + for allele in alleles: + key = (source_name, peptide, 0, allele) + assert best_flurry_allele.loc[key] == "HLA-B*07:02" + + wide = to_wide(combined.df) + assert len(wide) == len(peptides) * len(alleles) + assert not wide["mhcflurry_affinity_value"].isna().any() + assert wide.loc[ + wide["allele"] == "HLA-A*02:01", + "mhcflurry_presentation_score", + ].isna().all() + assert not wide.loc[ + wide["allele"] == "HLA-B*07:02", + "mhcflurry_presentation_score", + ].isna().any() + + +def test_topiary_predictor_name_adds_run_provenance(tmp_path): + peptides = {"pep1": "SIINFEKLA"} + run_name = "netmhcpan_A0201_len9" + predictor = TopiaryPredictor( + models=ToyGridPredictor( + "netmhcpan", "4.1b", ["HLA-A*02:01"], offset=10, + peptide_lengths=[9], + ), + name=run_name, + ) + + df = predictor.predict_from_named_peptides(peptides) + + assert set(df["prediction_run_name"]) == {run_name} + assert set(df["prediction_method_name"]) == {"netmhcpan"} + + path = tmp_path / "named-run.tsv" + TopiaryResult(df).to_tsv(path) + roundtripped = read_tsv(path) + + assert set(roundtripped.df["prediction_run_name"]) == {run_name} + assert roundtripped.models == {"netmhcpan": "4.1b"} + + +def test_prediction_run_name_does_not_split_wide_rows(): + peptides = _grid_peptides() + alleles = _grid_alleles() + split_results = [ + TopiaryPredictor( + models=ToyGridPredictor( + "netmhcpan", "4.1b", [allele], offset=10, + peptide_lengths=[len(peptide)], + ), + name=f"netmhcpan_{allele}_{source_name}", + ).predict_from_named_peptides({source_name: peptide}) + for source_name, peptide in peptides.items() + for allele in alleles + ] + + combined = combine_predictor_results(split_results) + wide = to_wide(combined.df) + + assert "prediction_run_name" not in wide.columns + assert len(wide) == len(peptides) * len(alleles) + assert not wide["netmhcpan_affinity_value"].isna().any() + assert not wide["netmhcpan_presentation_score"].isna().any() + + +def test_combine_rejects_overlapping_named_shards(): + peptides = {"pep1": "SIINFEKLA"} + shard_a = TopiaryPredictor( + models=ToyGridPredictor( + "netmhcpan", "4.1b", ["HLA-A*02:01"], offset=10, + ), + name="netmhcpan_A0201_first", + ).predict_from_named_peptides(peptides) + shard_b = TopiaryPredictor( + models=ToyGridPredictor( + "netmhcpan", "4.1b", ["HLA-A*02:01"], offset=20, + ), + name="netmhcpan_A0201_second", + ).predict_from_named_peptides(peptides) + + with pytest.raises(ValueError, match="duplicate predictions"): + combine_predictor_results([shard_a, shard_b]) + + def test_combine_roundtripped_topiary_results(tmp_path): peptides = {"pep1": "SIINFEKLA", "pep2": "ELAGIGILT"} alleles = ["HLA-A*02:01", "HLA-B*07:02"] diff --git a/topiary/predictor.py b/topiary/predictor.py index 07ec9d6..f8c2c25 100644 --- a/topiary/predictor.py +++ b/topiary/predictor.py @@ -434,6 +434,7 @@ def __init__( mhc_models=None, self_proteome=None, predict_wt=False, + name=None, ): """ Parameters @@ -497,6 +498,12 @@ def __init__( ``wt_peptide`` and attach ``wt_*`` prediction columns before filter/sort expressions are evaluated. Rows without a length-compatible wildtype peptide keep NaN ``wt_*`` values. + + name : str, optional + Human-readable name for this predictor run. When provided, + public prediction outputs include ``prediction_run_name`` with + this value. This is run/shard provenance only; logical model + identity remains ``prediction_method_name``. """ # --- model setup --- raw_models = models or mhc_models or (mhc_model and [mhc_model]) @@ -539,6 +546,11 @@ def __init__( self.raise_on_error = raise_on_error self.self_proteome = self_proteome self.predict_wt = predict_wt + if name is None: + self.name = None + else: + name = str(name).strip() + self.name = name or None @property def mhc_model(self): @@ -596,7 +608,9 @@ def predict_from_named_sequences(self, name_to_sequence_dict): pandas.DataFrame with columns: source_sequence_name, peptide, peptide_offset, peptide_length, allele, kind, score, value, affinity, percentile_rank, - prediction_method_name, predictor_version, n_flank, c_flank + prediction_method_name, predictor_version, n_flank, c_flank. + If ``name`` was provided at construction time, also includes + ``prediction_run_name``. """ df = self._predict_raw(name_to_sequence_dict) return self._attach_result_attrs( @@ -615,7 +629,9 @@ def predict_from_named_peptides(self, name_to_peptide_dict): pandas.DataFrame with columns: source_sequence_name, peptide, peptide_offset, peptide_length, allele, kind, score, value, affinity, percentile_rank, - prediction_method_name, predictor_version, n_flank, c_flank + prediction_method_name, predictor_version, n_flank, c_flank. + If ``name`` was provided at construction time, also includes + ``prediction_run_name``. """ df = self._predict_raw_peptides(name_to_peptide_dict) return self._attach_result_attrs( @@ -696,6 +712,10 @@ def _strip_internal_columns(self, df): def _attach_result_attrs(self, df): """Attach lightweight metadata to public DataFrame outputs.""" + if self.name is not None: + df = df.copy() + df["prediction_run_name"] = self.name + model_versions = {} observed_methods = [] if "prediction_method_name" in df.columns: diff --git a/topiary/wide.py b/topiary/wide.py index d9bea9b..5176daa 100644 --- a/topiary/wide.py +++ b/topiary/wide.py @@ -16,7 +16,8 @@ # Columns that are prediction-specific and get pivoted in wide form. PREDICTION_COLUMNS = frozenset({ "kind", "score", "value", "percentile_rank", - "prediction_method_name", "predictor_version", "affinity", + "prediction_method_name", "predictor_version", "prediction_run_name", + "affinity", }) # Wide-form field suffixes. From 41ace85c6e0efda00e2dd27c9cfa0b057262c86a Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Sun, 17 May 2026 22:37:02 -0400 Subject: [PATCH 11/17] Include sample name in combine identity --- CHANGELOG.md | 4 ++++ docs/ranking.md | 6 +++--- tests/test_combine_predictor_results.py | 16 ++++++++++++++++ topiary/result.py | 6 ++++-- 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4eecd40..82e0e22 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,10 @@ and `to_wide()` treat the run name as provenance, not as a separate prediction identity, so disjoint shards combine cleanly and overlapping shards still fail as duplicate predictions. +`combine_predictor_results` also treats `sample_name` as part of the +implicit row identity when present, matching `to_wide()` grouping for +multi-sample predictor outputs. + The combine docs now spell out the recommended allele-grid strategy: split NetMHCpan-style per-allele predictors can be combined under `coverage="complete"`, while intentionally sparse grids such as diff --git a/docs/ranking.md b/docs/ranking.md index f4f79b5..7499174 100644 --- a/docs/ranking.md +++ b/docs/ranking.md @@ -419,9 +419,9 @@ the example above). The optional `prediction_run_name` column is only provenance for a particular run or shard. That distinction lets distinct NetMHCpan allele/length shards combine into one logical NetMHCpan result, while overlapping shards with the same `(prediction_method_name, kind, -peptide, allele, source context)` still fail as duplicates. `to_wide()` drops -`prediction_run_name` from the grouping keys, so a named split run has the same -wide shape as a single unsplit run. +peptide, allele, sample/source context)` still fail as duplicates. +`to_wide()` drops `prediction_run_name` from the grouping keys, so a named +split run has the same wide shape as a single unsplit run. The helper is intentionally strict. It rejects duplicate `(prediction_method_name, kind, identity)` rows, and by default requires every diff --git a/tests/test_combine_predictor_results.py b/tests/test_combine_predictor_results.py index 5ddc0cc..916ae4c 100644 --- a/tests/test_combine_predictor_results.py +++ b/tests/test_combine_predictor_results.py @@ -192,6 +192,7 @@ def _simple_result(method="netmhcpan", peptide="SIINFEKLA", allele="HLA-A*02:01" _CONTEXT_MISMATCH_VALUES = { + "sample_name": ("sample-a", "sample-b"), "source_sequence_name": ("pep1", "pep1-copy"), "peptide_offset": (0, 1), "peptide_length": (9, 10), @@ -631,6 +632,21 @@ def test_combine_rejects_duplicate_prediction_methods(): combine_predictor_results([r1, r2]) +def test_combine_allows_same_method_across_samples(): + sample_a = _simple_result("netmhcpan") + sample_a.df["sample_name"] = "sample-a" + sample_b = _simple_result("netmhcpan") + sample_b.df["sample_name"] = "sample-b" + + combined = combine_predictor_results([sample_a, sample_b]) + wide = to_wide(combined.df) + + assert len(combined) == 2 + assert len(wide) == 2 + assert set(wide["sample_name"]) == {"sample-a", "sample-b"} + assert not wide["netmhcpan_affinity_value"].isna().any() + + def test_combine_ignores_legacy_kind_support_metadata(): r1 = _simple_result("netmhcpan") r1.extra["kind_support"] = "not a mapping" diff --git a/topiary/result.py b/topiary/result.py index a820dfa..1452b27 100644 --- a/topiary/result.py +++ b/topiary/result.py @@ -23,6 +23,7 @@ def __repr__(self): _SOURCE_CONTEXT_IDENTITY_COLUMNS = ( + "sample_name", "source_sequence_name", "peptide_offset", "peptide_length", @@ -472,8 +473,9 @@ def combine_predictor_results(results, on=("peptide", "allele"), coverage="compl on : tuple of str Columns defining the strict identity set. Defaults to ``("peptide", "allele")``. Source context columns such as - ``source_sequence_name`` and ``peptide_offset`` are also checked - when present so repeated peptide/allele rows remain distinct. + ``sample_name``, ``source_sequence_name``, and ``peptide_offset`` + are also checked when present so repeated peptide/allele rows + remain distinct. coverage : {"complete", "partial"} or bool ``"complete"`` (default) requires each emitted ``(prediction_method_name, kind)`` group to cover the same identity From 7c39d0608a8321c484ade4d8bfbc5bcbc4cb8b5e Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Mon, 18 May 2026 09:25:30 -0400 Subject: [PATCH 12/17] Make TopiaryResult manage long and wide views --- CHANGELOG.md | 6 + docs/api.md | 13 ++ docs/ranking.md | 7 + tests/test_combine_predictor_results.py | 34 ++++ tests/test_io_lens.py | 8 + tests/test_result.py | 99 +++++++++-- topiary/result.py | 215 +++++++++++++++++++++--- topiary/wide.py | 6 + 8 files changed, 357 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 82e0e22..b269866 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,12 @@ split NetMHCpan-style per-allele predictors can be combined under MHCflurry haplotype-mode presentation should use `coverage="partial"` and the ranking DSL's `best_*_allele` accessors for allele attribution. +`TopiaryResult` now treats long/wide representation as an internal, +cached view concern. Results expose `long_df` and `wide_df` on demand, +`to_long()` / `to_wide()` return results with that active form, and +`topiary.concat()` normalizes mixed-form TopiaryResults internally +rather than requiring callers to pre-convert them. + ## 5.16.1 **pirlygenes 5.1.0 integration:** diff --git a/docs/api.md b/docs/api.md index 6d32234..8dc4fa3 100644 --- a/docs/api.md +++ b/docs/api.md @@ -26,6 +26,19 @@ | `predict_from_variants(variants)` | VariantCollection | Variant pipeline (builds `ProteinFragment`s internally and delegates). | | `predict_from_mutation_effects(effects)` | EffectCollection | Same as `predict_from_variants` but starting from pre-computed effects. | +## TopiaryResult + +`TopiaryResult` is the semantic result object for Topiary prediction tables. +It can ingest either long or wide prediction tables, keeps the active `df` +view for pandas compatibility, and materializes cached `long_df` and `wide_df` +views on demand. Use `result.to_long()` / `result.to_wide()` when you want a +new `TopiaryResult` whose active `df` is that form. + +Topiary merge APIs operate on this semantic object. `topiary.concat()` accepts +mixed long/wide `TopiaryResult` inputs and normalizes internally; bare +DataFrames are still accepted by predictor-combine APIs for compatibility, but +are coerced into `TopiaryResult` before validation. + ## CachedPredictor Drop-in replacement for a live predictor that serves scores from a diff --git a/docs/ranking.md b/docs/ranking.md index 7499174..6af09b2 100644 --- a/docs/ranking.md +++ b/docs/ranking.md @@ -390,6 +390,13 @@ mhcflurry_rows = TopiaryPredictor( combined = combine_predictor_results([netmhcpan_rows, mhcflurry_rows]) ``` +`TopiaryResult` owns the long/wide representation. Loaders may naturally +produce wide results (for example LENS) or long results (for example pVACseq +and fresh predictor outputs), but callers can use `result.long_df`, +`result.wide_df`, `result.to_long()`, or `result.to_wide()` on demand. Topiary +merge functions normalize those forms internally instead of making callers +choose a representation before combining results. + You can also shard the same predictor over allele or peptide-length batches and combine the shards. Use `TopiaryPredictor(name=...)` when you want to keep track of which batch produced each row: diff --git a/tests/test_combine_predictor_results.py b/tests/test_combine_predictor_results.py index 916ae4c..d387200 100644 --- a/tests/test_combine_predictor_results.py +++ b/tests/test_combine_predictor_results.py @@ -512,6 +512,40 @@ def test_combine_roundtripped_topiary_results(tmp_path): assert "kind_support" not in combined.extra +def test_combine_accepts_wide_topiary_results(): + peptides = {"pep1": "SIINFEKLA", "pep2": "ELAGIGILT"} + alleles = ["HLA-A*02:01", "HLA-B*07:02"] + netmhcpan = ToyAffinityPredictor("netmhcpan", "4.1b", alleles, offset=100) + mhcflurry = ToyAffinityPredictor("mhcflurry", "2.1.1", alleles, offset=200) + + direct = TopiaryPredictor( + models=[netmhcpan, mhcflurry] + ).predict_from_named_peptides(peptides) + net_wide = TopiaryResult( + TopiaryPredictor(models=netmhcpan).predict_from_named_peptides(peptides) + ).to_wide() + flurry_wide = TopiaryResult( + TopiaryPredictor(models=mhcflurry).predict_from_named_peptides(peptides) + ).to_wide() + + combined = combine_predictor_results([net_wide, flurry_wide]) + + assert combined.form == "long" + pd.testing.assert_frame_equal( + _sort_predictions(combined.df), + _sort_predictions(direct), + ) + + +def test_combine_ignores_empty_bare_dataframes(): + result = _simple_result("netmhcpan") + + combined = combine_predictor_results([pd.DataFrame(), result]) + + assert len(combined) == 1 + assert combined.models == {"netmhcpan": "1.0"} + + def test_combine_recomputes_models_from_combined_rows(): peptides = {"pep1": "SIINFEKLA"} alleles = ["HLA-A*02:01"] diff --git a/tests/test_io_lens.py b/tests/test_io_lens.py index 2f5288a..bdb0864 100644 --- a/tests/test_io_lens.py +++ b/tests/test_io_lens.py @@ -112,6 +112,14 @@ def test_form_is_wide(self): def test_returns_topiary_result(self): assert isinstance(read_lens(V1_4), TopiaryResult) + def test_exposes_long_and_wide_forms(self): + r = read_lens(V1_4) + + assert r.df is r.wide_df + assert "netmhcpan_affinity_value" in r.wide_df.columns + assert "kind" in r.long_df.columns + assert "prediction_method_name" in r.long_df.columns + # --------------------------------------------------------------------------- # Allele normalization (mhcgnomes) diff --git a/tests/test_result.py b/tests/test_result.py index 9dc3081..0648f38 100644 --- a/tests/test_result.py +++ b/tests/test_result.py @@ -3,7 +3,7 @@ import pandas as pd import pytest -from topiary import TopiaryResult, concat, read_tsv, to_tsv +from topiary import TopiaryResult, concat, from_wide, read_tsv, to_tsv, to_wide from topiary.io import Metadata @@ -153,6 +153,71 @@ def test_form_conversion_preserves_metadata(self): assert wide.models == {"netmhcpan": "4.1b"} assert wide.sources == ["patient01.tsv"] + def test_result_exposes_cached_long_and_wide_views(self): + r = TopiaryResult(_sample_long_df()) + wide_df = r.wide_df + + assert r.form == "long" + assert r.long_df is r.df + assert "netmhcpan_affinity_value" in wide_df.columns + assert r.wide_df is wide_df + + wide = r.to_wide() + assert wide.form == "wide" + assert wide.df is wide.wide_df + assert "kind" in wide.long_df.columns + + def test_wide_view_recomputes_after_active_long_mutation(self): + r = TopiaryResult(_sample_long_df()) + wide_before = r.wide_df + + r.df.loc[0, "value"] = 999.0 + r.df.loc[0, "affinity"] = 999.0 + wide_after = r.wide_df + + assert wide_after is not wide_before + row = wide_after[wide_after["peptide"] == "SIINFEKL"].iloc[0] + assert row["netmhcpan_affinity_value"] == 999.0 + + def test_long_view_recomputes_after_active_wide_mutation(self): + r = TopiaryResult(_sample_long_df()).to_wide() + long_before = r.long_df + + r.df.loc[ + r.df["peptide"] == "SIINFEKL", + "netmhcpan_affinity_value", + ] = 999.0 + long_after = r.long_df + + assert long_after is not long_before + row = long_after[long_after["peptide"] == "SIINFEKL"].iloc[0] + assert row["value"] == 999.0 + + def test_filter_on_wide_result_materializes_long_form(self): + wide = TopiaryResult(_multi_row_df()).to_wide() + + filtered = wide.filter_by("affinity <= 500") + + assert filtered.form == "long" + assert "kind" in filtered.columns + assert len(filtered) < len(wide.long_df) + + def test_top_level_converters_accept_result(self): + r = TopiaryResult(_sample_long_df()) + wide_df = to_wide(r) + long_df = from_wide(r.to_wide()) + + assert "netmhcpan_affinity_value" in wide_df.columns + assert "kind" in long_df.columns + + def test_form_assignment_switches_active_view_for_compatibility(self): + r = TopiaryResult(_sample_long_df()) + + r.form = "wide" + assert "netmhcpan_affinity_value" in r.df.columns + r.form = "long" + assert "kind" in r.df.columns + # --------------------------------------------------------------------------- # Serialization @@ -284,16 +349,26 @@ def test_concat_version_conflict_warns(self): with pytest.warns(UserWarning, match="conflicting versions"): concat([r1, r2]) - def test_concat_mixed_forms_raises(self): + def test_concat_mixed_forms_materializes_long_result(self): r_long = self._make_r(100.0, "p1") - r_wide = TopiaryResult( - pd.DataFrame({ - "peptide": ["A"], - "netmhcpan_affinity_value": [100.0], - }), - ) - with pytest.raises(ValueError, match="different forms"): - concat([r_long, r_wide]) + r_wide = self._make_r(200.0, "p2").to_wide() + + combined = concat([r_long, r_wide]) + + assert combined.form == "long" + assert "kind" in combined.columns + assert len(combined) == 2 + assert combined.sources == ["p1", "p2"] + + def test_concat_all_wide_preserves_wide_active_form(self): + r1 = self._make_r(100.0, "p1").to_wide() + r2 = self._make_r(200.0, "p2").to_wide() + + combined = concat([r1, r2]) + + assert combined.form == "wide" + assert "netmhcpan_affinity_value" in combined.columns + assert "kind" not in combined.columns def test_concat_empty_list(self): combined = concat([]) @@ -305,6 +380,10 @@ def test_concat_single(self): combined = concat([r]) assert len(combined) == 1 + def test_concat_requires_topiary_results(self): + with pytest.raises(TypeError, match="TopiaryResult"): + concat([self._make_r(100.0, "p1").df]) + def test_concat_then_write_roundtrip(self, tmp_path): """Concat + write + read preserves sources in comment block.""" r1 = self._make_r(100.0, "patient01") diff --git a/topiary/result.py b/topiary/result.py index 1452b27..ed6c7cc 100644 --- a/topiary/result.py +++ b/topiary/result.py @@ -32,6 +32,25 @@ def __repr__(self): ) +def _dataframe_fingerprint(df): + """Return a cheap-ish fingerprint for detecting active-frame mutation.""" + columns = tuple(str(column) for column in df.columns) + dtypes = tuple(str(dtype) for dtype in df.dtypes) + try: + value_hash = int(pd.util.hash_pandas_object(df, index=True).sum()) + except (TypeError, ValueError): + # Some object columns may carry unhashable values. Fall back to a + # stringified hash so converted views are still invalidated for common + # in-place edits without making all DataFrame access defensive copies. + try: + value_hash = int( + pd.util.hash_pandas_object(df.astype(str), index=True).sum() + ) + except (TypeError, ValueError): + value_hash = None + return (id(df), df.shape, columns, dtypes, value_hash) + + class TopiaryResult: """A prediction DataFrame bundled with its provenance and pipeline state. @@ -49,7 +68,10 @@ class TopiaryResult: Parameters ---------- df : pandas.DataFrame - The underlying prediction data. + The active prediction data view. TopiaryResult keeps lazy long and + wide views internally when conversion is possible, so Topiary-level + operations can normalize representation without requiring callers to + choose a form up front. topiary_version : str, optional form : str, optional "long" or "wide". Auto-detected from columns if not provided. @@ -84,6 +106,8 @@ def __init__( sort_by_str=None, sort_by_ast=None, extra=None, + _long_df=None, + _wide_df=None, ): # Compat: accept a Metadata positionally and unpack its fields. if metadata is not None: @@ -98,9 +122,24 @@ def __init__( if models is None and hasattr(df, "attrs"): models = _models_from_dataframe(df) - self.df = df self.topiary_version = topiary_version - self.form = form or detect_form(df) + self._active_form = form or detect_form(df) + self._unknown_df = None + self._long_df = _long_df + self._wide_df = _wide_df + if self._active_form == "long": + self._long_df = df + elif self._active_form == "wide": + self._wide_df = df + else: + self._unknown_df = df + self._long_source_fingerprint = None + self._wide_source_fingerprint = None + active_fingerprint = _dataframe_fingerprint(df) + if self._active_form == "long" and self._wide_df is not None: + self._wide_source_fingerprint = active_fingerprint + if self._active_form == "wide" and self._long_df is not None: + self._long_source_fingerprint = active_fingerprint self.models = OrderedDict(models) if models else OrderedDict() self.sources = list(sources) if sources else [] self.filter_by_str = filter_by_str @@ -111,6 +150,63 @@ def __init__( # -- DataFrame delegation --------------------------------------------- + @property + def form(self): + """Active DataFrame form, kept for backward compatibility.""" + return self._active_form + + @form.setter + def form(self, value): + value = value or "unknown" + if value == self._active_form: + return + if value == "long": + if self._long_df is None: + if self._wide_df is None: + raise ValueError("Cannot set TopiaryResult form to 'long'") + from .wide import from_wide + self._long_df = from_wide(self._wide_df, metadata=self.metadata) + self._long_source_fingerprint = _dataframe_fingerprint(self._wide_df) + self._active_form = "long" + return + if value == "wide": + if self._wide_df is None: + if self._long_df is None: + raise ValueError("Cannot set TopiaryResult form to 'wide'") + from .wide import to_wide as _to_wide + self._wide_df = _to_wide(self._long_df) + self._wide_source_fingerprint = _dataframe_fingerprint(self._long_df) + self._active_form = "wide" + return + if value == "unknown": + if self._unknown_df is None: + raise ValueError("Cannot set TopiaryResult form to 'unknown'") + self._active_form = "unknown" + return + raise ValueError(f"Unknown TopiaryResult form: {value!r}") + + @property + def df(self): + """Active DataFrame view for backward-compatible pandas access.""" + if self._active_form == "long": + return self._long_df + if self._active_form == "wide": + return self._wide_df + return self._unknown_df + + @df.setter + def df(self, value): + """Replace the active DataFrame and invalidate converted views.""" + detected = detect_form(value) + self._active_form = detected + self._unknown_df = None + self._long_df = value if detected == "long" else None + self._wide_df = value if detected == "wide" else None + if detected == "unknown": + self._unknown_df = value + self._long_source_fingerprint = None + self._wide_source_fingerprint = None + def __len__(self): return len(self.df) @@ -158,18 +254,74 @@ def itertuples(self, *args, **kwargs): # -- Form conversion -------------------------------------------------- def to_wide(self): - from .wide import to_wide as _to_wide - wide_df = _to_wide(self.df) + wide_df = self.wide_df kwargs = self._field_kwargs() kwargs["form"] = "wide" - return TopiaryResult(wide_df, **kwargs) + return TopiaryResult( + wide_df, + **kwargs, + _long_df=self._long_df, + _wide_df=wide_df, + ) def to_long(self): - from .wide import from_wide - long_df = from_wide(self.df, metadata=self.metadata) + long_df = self.long_df kwargs = self._field_kwargs() kwargs["form"] = "long" - return TopiaryResult(long_df, **kwargs) + return TopiaryResult( + long_df, + **kwargs, + _long_df=long_df, + _wide_df=self._wide_df, + ) + + @property + def long_df(self): + """Long-form DataFrame view, computed lazily when needed.""" + if ( + self._active_form == "wide" + and self._long_df is not None + and self._long_source_fingerprint != _dataframe_fingerprint(self.df) + ): + self._long_df = None + self._long_source_fingerprint = None + if self._long_df is None: + if self._active_form == "long": + self._long_df = self.df + elif self._active_form == "wide": + from .wide import from_wide + self._long_df = from_wide(self.df, metadata=self.metadata) + self._long_source_fingerprint = _dataframe_fingerprint(self.df) + else: + raise ValueError( + f"Cannot convert TopiaryResult with form {self.form!r} " + "to long form" + ) + return self._long_df + + @property + def wide_df(self): + """Wide-form DataFrame view, computed lazily when needed.""" + if ( + self._active_form == "long" + and self._wide_df is not None + and self._wide_source_fingerprint != _dataframe_fingerprint(self.df) + ): + self._wide_df = None + self._wide_source_fingerprint = None + if self._wide_df is None: + if self._active_form == "wide": + self._wide_df = self.df + elif self._active_form == "long": + from .wide import to_wide as _to_wide + self._wide_df = _to_wide(self.df) + self._wide_source_fingerprint = _dataframe_fingerprint(self.df) + else: + raise ValueError( + f"Cannot convert TopiaryResult with form {self.form!r} " + "to wide form" + ) + return self._wide_df # -- DSL operations --------------------------------------------------- @@ -205,10 +357,11 @@ def filter_by(self, expr): f"got {type(expr).__name__}" ) - if self.df.empty: - filtered_df = self.df + df = self.long_df + if df.empty: + filtered_df = df else: - filtered_df = apply_filter(self.df, new_ast) + filtered_df = apply_filter(df, new_ast) if self.filter_by_ast is not None: combined_ast = self.filter_by_ast & new_ast @@ -218,6 +371,7 @@ def filter_by(self, expr): combined_str = new_str kwargs = self._field_kwargs() + kwargs["form"] = "long" kwargs["filter_by_str"] = combined_str kwargs["filter_by_ast"] = combined_ast return TopiaryResult(filtered_df, **kwargs) @@ -268,12 +422,14 @@ def sort_by(self, expr): f"got {type(expr).__name__}" ) - if self.df.empty: - sorted_df = self.df + df = self.long_df + if df.empty: + sorted_df = df else: - sorted_df = apply_sort(self.df, sort_nodes) + sorted_df = apply_sort(df, sort_nodes) kwargs = self._field_kwargs() + kwargs["form"] = "long" kwargs["sort_by_str"] = new_str kwargs["sort_by_ast"] = new_ast return TopiaryResult(sorted_df, **kwargs) @@ -356,24 +512,33 @@ def concat(results): Parameters ---------- results : list of TopiaryResult - All must be in the same form (long or wide). + Results to concatenate. Long and wide results may be mixed; mixed + inputs are normalized to long form. Returns ------- TopiaryResult DataFrames concatenated; metadata merged (sources concatenated, models union with warning on version conflicts; filter_by / sort_by - preserved only if all inputs agree). + preserved only if all inputs agree). The active output form is the + shared input form when all inputs match, otherwise long. """ if not results: return TopiaryResult(pd.DataFrame()) + for result in results: + if not isinstance(result, TopiaryResult): + raise TypeError( + "topiary.concat expects TopiaryResult inputs; use " + "TopiaryResult(df) to attach Topiary semantics before " + f"concatenating, got {type(result).__name__}" + ) forms = {r.form for r in results} - if len(forms) > 1: + if "unknown" in forms: raise ValueError( - f"Cannot concat TopiaryResults in different forms: {forms}" + f"Cannot concat TopiaryResults with unknown form: {forms}" ) - form = results[0].form + form = results[0].form if len(forms) == 1 else "long" # Merge models with conflict detection. merged_models = OrderedDict() @@ -440,7 +605,14 @@ def concat(results): stacklevel=2, ) - df = pd.concat([r.df for r in results], ignore_index=True) + if form == "long": + frames = [r.long_df for r in results] + elif form == "wide": + frames = [r.wide_df for r in results] + else: + raise ValueError(f"Cannot concat TopiaryResults with form {form!r}") + + df = pd.concat(frames, ignore_index=True) return TopiaryResult( df, @@ -493,6 +665,7 @@ def combine_predictor_results(results, on=("peptide", "allele"), coverage="compl results = [r for r in results if not r.df.empty] if not results: return TopiaryResult(pd.DataFrame()) + results = [r.to_long() for r in results] if isinstance(on, str): on = (on,) diff --git a/topiary/wide.py b/topiary/wide.py index 5176daa..456d8b8 100644 --- a/topiary/wide.py +++ b/topiary/wide.py @@ -108,6 +108,9 @@ def to_wide(df): Wide-form DataFrame where prediction columns become ``{model}_{kind}_{field}`` columns. """ + if hasattr(df, "wide_df") and hasattr(df, "metadata") and hasattr(df, "df"): + return df.wide_df + if "kind" not in df.columns: raise ValueError( "DataFrame is not in long form: missing 'kind' column" @@ -228,6 +231,9 @@ def from_wide(df, metadata=None): ``percentile_rank``, ``prediction_method_name``, and ``predictor_version`` columns. """ + if hasattr(df, "long_df") and hasattr(df, "metadata") and hasattr(df, "df"): + return df.long_df + # Classify columns. pred_mapping = {} # (model_key, kind_short) → {field: col_name} group_cols = [] From 040599c2ef7051a17331bd487dff53e5d0889b9d Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Mon, 18 May 2026 09:38:10 -0400 Subject: [PATCH 13/17] Derive TopiaryResult form from cached views --- CHANGELOG.md | 6 ++-- docs/api.md | 8 +++-- topiary/result.py | 84 +++++++++++++++++++++-------------------------- 3 files changed, 46 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b269866..ab647ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,9 +43,9 @@ and the ranking DSL's `best_*_allele` accessors for allele attribution. `TopiaryResult` now treats long/wide representation as an internal, cached view concern. Results expose `long_df` and `wide_df` on demand, -`to_long()` / `to_wide()` return results with that active form, and -`topiary.concat()` normalizes mixed-form TopiaryResults internally -rather than requiring callers to pre-convert them. +`to_long()` / `to_wide()` return results with that active compatibility +`df` view, and `topiary.concat()` normalizes mixed-form TopiaryResults +internally rather than requiring callers to pre-convert them. ## 5.16.1 diff --git a/docs/api.md b/docs/api.md index 8dc4fa3..77ccd48 100644 --- a/docs/api.md +++ b/docs/api.md @@ -30,9 +30,11 @@ `TopiaryResult` is the semantic result object for Topiary prediction tables. It can ingest either long or wide prediction tables, keeps the active `df` -view for pandas compatibility, and materializes cached `long_df` and `wide_df` -views on demand. Use `result.to_long()` / `result.to_wide()` when you want a -new `TopiaryResult` whose active `df` is that form. +view only for pandas compatibility, and materializes cached `long_df` and +`wide_df` views on demand. The public `form` value describes that compatibility +`df` view; it is derived from the stored views rather than acting as separate +result state. Use `result.to_long()` / `result.to_wide()` when you want a new +`TopiaryResult` whose active `df` is that form. Topiary merge APIs operate on this semantic object. `topiary.concat()` accepts mixed long/wide `TopiaryResult` inputs and normalizes internally; bare diff --git a/topiary/result.py b/topiary/result.py index ed6c7cc..02b3682 100644 --- a/topiary/result.py +++ b/topiary/result.py @@ -123,23 +123,25 @@ def __init__( models = _models_from_dataframe(df) self.topiary_version = topiary_version - self._active_form = form or detect_form(df) + input_form = form or detect_form(df) + if input_form not in {"long", "wide", "unknown"}: + raise ValueError(f"Unknown TopiaryResult form: {input_form!r}") + self._df = df self._unknown_df = None self._long_df = _long_df self._wide_df = _wide_df - if self._active_form == "long": + if input_form == "long": self._long_df = df - elif self._active_form == "wide": + elif input_form == "wide": self._wide_df = df else: self._unknown_df = df self._long_source_fingerprint = None self._wide_source_fingerprint = None - active_fingerprint = _dataframe_fingerprint(df) - if self._active_form == "long" and self._wide_df is not None: - self._wide_source_fingerprint = active_fingerprint - if self._active_form == "wide" and self._long_df is not None: - self._long_source_fingerprint = active_fingerprint + if self._df is self._long_df and self._wide_df is not None: + self._wide_source_fingerprint = _dataframe_fingerprint(self._long_df) + if self._df is self._wide_df and self._long_df is not None: + self._long_source_fingerprint = _dataframe_fingerprint(self._wide_df) self.models = OrderedDict(models) if models else OrderedDict() self.sources = list(sources) if sources else [] self.filter_by_str = filter_by_str @@ -152,53 +154,47 @@ def __init__( @property def form(self): - """Active DataFrame form, kept for backward compatibility.""" - return self._active_form + """Form of the active ``.df`` view, kept for backward compatibility.""" + if self._df is self._long_df: + return "long" + if self._df is self._wide_df: + return "wide" + if self._df is self._unknown_df: + return "unknown" + return detect_form(self._df) @form.setter def form(self, value): value = value or "unknown" - if value == self._active_form: + if value == self.form: return if value == "long": - if self._long_df is None: - if self._wide_df is None: - raise ValueError("Cannot set TopiaryResult form to 'long'") - from .wide import from_wide - self._long_df = from_wide(self._wide_df, metadata=self.metadata) - self._long_source_fingerprint = _dataframe_fingerprint(self._wide_df) - self._active_form = "long" + self._df = self.long_df + if self._wide_df is not None: + self._wide_source_fingerprint = _dataframe_fingerprint(self._long_df) return if value == "wide": - if self._wide_df is None: - if self._long_df is None: - raise ValueError("Cannot set TopiaryResult form to 'wide'") - from .wide import to_wide as _to_wide - self._wide_df = _to_wide(self._long_df) - self._wide_source_fingerprint = _dataframe_fingerprint(self._long_df) - self._active_form = "wide" + self._df = self.wide_df + if self._long_df is not None: + self._long_source_fingerprint = _dataframe_fingerprint(self._wide_df) return if value == "unknown": if self._unknown_df is None: raise ValueError("Cannot set TopiaryResult form to 'unknown'") - self._active_form = "unknown" + self._df = self._unknown_df return raise ValueError(f"Unknown TopiaryResult form: {value!r}") @property def df(self): """Active DataFrame view for backward-compatible pandas access.""" - if self._active_form == "long": - return self._long_df - if self._active_form == "wide": - return self._wide_df - return self._unknown_df + return self._df @df.setter def df(self, value): """Replace the active DataFrame and invalidate converted views.""" detected = detect_form(value) - self._active_form = detected + self._df = value self._unknown_df = None self._long_df = value if detected == "long" else None self._wide_df = value if detected == "wide" else None @@ -279,19 +275,17 @@ def to_long(self): def long_df(self): """Long-form DataFrame view, computed lazily when needed.""" if ( - self._active_form == "wide" + self._df is self._wide_df and self._long_df is not None - and self._long_source_fingerprint != _dataframe_fingerprint(self.df) + and self._long_source_fingerprint != _dataframe_fingerprint(self._wide_df) ): self._long_df = None self._long_source_fingerprint = None if self._long_df is None: - if self._active_form == "long": - self._long_df = self.df - elif self._active_form == "wide": + if self._wide_df is not None: from .wide import from_wide - self._long_df = from_wide(self.df, metadata=self.metadata) - self._long_source_fingerprint = _dataframe_fingerprint(self.df) + self._long_df = from_wide(self._wide_df, metadata=self.metadata) + self._long_source_fingerprint = _dataframe_fingerprint(self._wide_df) else: raise ValueError( f"Cannot convert TopiaryResult with form {self.form!r} " @@ -303,19 +297,17 @@ def long_df(self): def wide_df(self): """Wide-form DataFrame view, computed lazily when needed.""" if ( - self._active_form == "long" + self._df is self._long_df and self._wide_df is not None - and self._wide_source_fingerprint != _dataframe_fingerprint(self.df) + and self._wide_source_fingerprint != _dataframe_fingerprint(self._long_df) ): self._wide_df = None self._wide_source_fingerprint = None if self._wide_df is None: - if self._active_form == "wide": - self._wide_df = self.df - elif self._active_form == "long": + if self._long_df is not None: from .wide import to_wide as _to_wide - self._wide_df = _to_wide(self.df) - self._wide_source_fingerprint = _dataframe_fingerprint(self.df) + self._wide_df = _to_wide(self._long_df) + self._wide_source_fingerprint = _dataframe_fingerprint(self._long_df) else: raise ValueError( f"Cannot convert TopiaryResult with form {self.form!r} " From 13d953025b4beef17f5092751dc91eb68be8ea89 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Mon, 18 May 2026 10:24:27 -0400 Subject: [PATCH 14/17] Avoid result cache aliasing --- tests/test_result.py | 31 +++++++++++++++++++++++++++++++ tests/test_wide.py | 27 +++++++++++++++++++++++++++ topiary/result.py | 10 ++++++---- topiary/wide.py | 41 +++++++++++++++++++++++++++++++++++------ 4 files changed, 99 insertions(+), 10 deletions(-) diff --git a/tests/test_result.py b/tests/test_result.py index 0648f38..2e1dc22 100644 --- a/tests/test_result.py +++ b/tests/test_result.py @@ -167,6 +167,37 @@ def test_result_exposes_cached_long_and_wide_views(self): assert wide.df is wide.wide_df assert "kind" in wide.long_df.columns + def test_to_wide_result_does_not_alias_source_cache(self): + r = TopiaryResult(_sample_long_df()) + source_wide = r.wide_df + + wide = r.to_wide() + wide.df.loc[ + wide.df["peptide"] == "SIINFEKL", + "netmhcpan_affinity_value", + ] = 999.0 + + assert wide.df is not source_wide + source_row = r.wide_df[r.wide_df["peptide"] == "SIINFEKL"].iloc[0] + converted_row = wide.wide_df[wide.wide_df["peptide"] == "SIINFEKL"].iloc[0] + assert source_row["netmhcpan_affinity_value"] == 120.0 + assert converted_row["netmhcpan_affinity_value"] == 999.0 + + def test_to_long_result_does_not_alias_source_cache(self): + wide_source = TopiaryResult(_sample_long_df()).to_wide() + source_long = wide_source.long_df + + long = wide_source.to_long() + long.df.loc[long.df["peptide"] == "SIINFEKL", "value"] = 999.0 + + assert long.df is not source_long + source_row = wide_source.long_df[ + wide_source.long_df["peptide"] == "SIINFEKL" + ].iloc[0] + converted_row = long.long_df[long.long_df["peptide"] == "SIINFEKL"].iloc[0] + assert source_row["value"] == 120.0 + assert converted_row["value"] == 999.0 + def test_wide_view_recomputes_after_active_long_mutation(self): r = TopiaryResult(_sample_long_df()) wide_before = r.wide_df diff --git a/tests/test_wide.py b/tests/test_wide.py index 0e5cb1d..a063cd9 100644 --- a/tests/test_wide.py +++ b/tests/test_wide.py @@ -309,6 +309,33 @@ def test_model_versions_in_attrs(self): assert models.get("netmhcpan") == "4.1b" assert models.get("mhcflurry") == "2.1.1" + def test_model_versions_fill_blank_rows_from_attrs(self): + df = _long_df_multi_model() + df.loc[ + df["prediction_method_name"] == "mhcflurry", + "predictor_version", + ] = "" + df.attrs["topiary_models"] = { + "netmhcpan": "4.1b", + "mhcflurry": "2.1.1", + "stale": "0.0", + } + + wide = to_wide(df) + + assert wide.attrs["topiary_models"] == { + "netmhcpan": "4.1b", + "mhcflurry": "2.1.1", + } + long = from_wide(wide) + mhcflurry_versions = set( + long.loc[ + long["prediction_method_name"] == "mhcflurry", + "predictor_version", + ] + ) + assert mhcflurry_versions == {"2.1.1"} + def test_flanks_preserved(self): df = _long_df_single_model() wide = to_wide(df) diff --git a/topiary/result.py b/topiary/result.py index 02b3682..758217f 100644 --- a/topiary/result.py +++ b/topiary/result.py @@ -250,25 +250,27 @@ def itertuples(self, *args, **kwargs): # -- Form conversion -------------------------------------------------- def to_wide(self): - wide_df = self.wide_df + wide_df = self.wide_df.copy() + long_df = self.long_df.copy() if self._long_df is not None else None kwargs = self._field_kwargs() kwargs["form"] = "wide" return TopiaryResult( wide_df, **kwargs, - _long_df=self._long_df, + _long_df=long_df, _wide_df=wide_df, ) def to_long(self): - long_df = self.long_df + long_df = self.long_df.copy() + wide_df = self.wide_df.copy() if self._wide_df is not None else None kwargs = self._field_kwargs() kwargs["form"] = "long" return TopiaryResult( long_df, **kwargs, _long_df=long_df, - _wide_df=self._wide_df, + _wide_df=wide_df, ) @property diff --git a/topiary/wide.py b/topiary/wide.py index 456d8b8..9542681 100644 --- a/topiary/wide.py +++ b/topiary/wide.py @@ -51,6 +51,15 @@ def _kind_short_to_canonical(short_name): return _kind_name(kind) +def _version_str(value): + try: + if pd.isna(value): + return "" + except (TypeError, ValueError): + pass + return str(value).strip() + + def _parse_wide_column(col_name): """Parse a wide-form column name into (model_key, kind_short, field). @@ -157,15 +166,35 @@ def to_wide(df): # Build model→version metadata for .attrs. model_versions = {} + observed_methods = set() + if "prediction_method_name" in df.columns: + observed_methods = { + str(method).strip() + for method in df["prediction_method_name"].dropna().unique() + if str(method).strip() + } if "prediction_method_name" in df.columns and "predictor_version" in df.columns: - for method, version in ( + for method, rows in ( df.dropna(subset=["prediction_method_name"]) - .groupby("prediction_method_name")["predictor_version"] - .first() - .items() + .groupby("prediction_method_name", sort=False) ): - if pd.notna(version) and str(version): - model_versions[str(method)] = str(version) + method_str = str(method).strip() + if not method_str: + continue + for version in rows["predictor_version"]: + version_str = _version_str(version) + if version_str: + model_versions[method_str] = version_str + break + attr_models = getattr(df, "attrs", {}).get("topiary_models", {}) + if observed_methods and hasattr(attr_models, "items"): + for method, version in attr_models.items(): + method_str = str(method).strip() + if not method_str or method_str not in observed_methods: + continue + version_str = _version_str(version) + if version_str and not model_versions.get(method_str): + model_versions[method_str] = version_str # Melt each long field into wide column entries. records = [] From f4b44cb2d19e1f0764945ede52abfc0692064646 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Mon, 18 May 2026 11:36:32 -0400 Subject: [PATCH 15/17] Align sample identity with ranking groups --- tests/test_combine_predictor_results.py | 30 +++++++++++++++ tests/test_io.py | 40 ++++++++++++++++++++ tests/test_io_pvacseq.py | 3 ++ tests/test_wide.py | 10 +++++ topiary/io.py | 2 +- topiary/ranking/nodes.py | 50 ++++++++++++++++++------- topiary/wide.py | 25 +++++++++---- 7 files changed, 138 insertions(+), 22 deletions(-) diff --git a/tests/test_combine_predictor_results.py b/tests/test_combine_predictor_results.py index d387200..5ca83c1 100644 --- a/tests/test_combine_predictor_results.py +++ b/tests/test_combine_predictor_results.py @@ -671,16 +671,46 @@ def test_combine_allows_same_method_across_samples(): sample_a.df["sample_name"] = "sample-a" sample_b = _simple_result("netmhcpan") sample_b.df["sample_name"] = "sample-b" + sample_b.df["value"] = 1000.0 + sample_b.df["affinity"] = 1000.0 combined = combine_predictor_results([sample_a, sample_b]) + ctx = EvalContext(combined.df) + scores = Affinity["netmhcpan"].value.eval(ctx) + filtered = combined.filter_by("affinity <= 500") wide = to_wide(combined.df) + sample_a_key = ( + "sample-a", "pep1", "SIINFEKLA", 0, "HLA-A*02:01", + ) + sample_b_key = ( + "sample-b", "pep1", "SIINFEKLA", 0, "HLA-A*02:01", + ) + assert len(combined) == 2 + assert ctx.group_keys == [ + "sample_name", "source_sequence_name", "peptide", + "peptide_offset", "allele", + ] + assert scores.loc[sample_a_key] == 100.0 + assert scores.loc[sample_b_key] == 1000.0 + assert filtered.df["sample_name"].tolist() == ["sample-a"] assert len(wide) == 2 assert set(wide["sample_name"]) == {"sample-a", "sample-b"} assert not wide["netmhcpan_affinity_value"].isna().any() +def test_null_sample_name_is_not_a_ranking_group_key(): + result = _simple_result("netmhcpan") + result.df["sample_name"] = pd.NA + + ctx = EvalContext(result.df) + filtered = result.filter_by("affinity <= 500") + + assert "sample_name" not in ctx.group_keys + assert len(filtered.df) == 1 + + def test_combine_ignores_legacy_kind_support_metadata(): r1 = _simple_result("netmhcpan") r1.extra["kind_support"] = "not a mapping" diff --git a/tests/test_io.py b/tests/test_io.py index fcf4c8b..03b91b5 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -230,6 +230,8 @@ def test_long_form_roundtrip(self, tmp_path): result = read_tsv(path) df2, meta = result.df, result.metadata assert meta.form == "long" + assert result.df is result.long_df + assert "netmhcpan_affinity_value" in result.wide_df.columns assert meta.topiary_version is not None assert meta.models.get("netmhcpan") == "4.1b" assert len(df2) == len(df) @@ -244,6 +246,8 @@ def test_wide_form_roundtrip(self, tmp_path): result = read_tsv(path) df2, meta = result.df, result.metadata assert meta.form == "wide" + assert result.df is result.wide_df + assert "kind" in result.long_df.columns assert "netmhcpan_affinity_value" in df2.columns assert len(df2) == len(wide) @@ -350,6 +354,40 @@ def test_result_writer_fills_blank_row_versions_from_metadata( meta = reader(path).metadata assert meta.models == {"netmhcpan": "4.1b"} + @pytest.mark.parametrize( + "writer,method_name,reader,suffix", + [ + (to_tsv, "to_tsv", read_tsv, "tsv"), + (to_csv, "to_csv", read_csv, "csv"), + ], + ) + @pytest.mark.parametrize("call_style", ["function", "method"]) + def test_result_writer_preserves_empty_result_model_metadata( + self, tmp_path, writer, method_name, reader, suffix, call_style, + ): + from topiary import TopiaryResult + + df = _sample_long_df().iloc[0:0].copy() + result = TopiaryResult( + df, + models={"netmhcpan": "4.1b"}, + sources=["empty-run"], + ) + + path = tmp_path / f"empty_result.{suffix}" + if call_style == "function": + writer(result, path) + else: + getattr(result, method_name)(path) + + reloaded = reader(path) + assert reloaded.form == "long" + assert reloaded.models == {"netmhcpan": "4.1b"} + assert reloaded.sources[:1] == ["empty-run"] + assert reloaded.long_df.empty + assert reloaded.wide_df.empty + assert "peptide" in reloaded.wide_df.columns + class TestReadWriteCSV: def test_csv_roundtrip(self, tmp_path): @@ -359,6 +397,8 @@ def test_csv_roundtrip(self, tmp_path): result = read_csv(path) df2, meta = result.df, result.metadata assert meta.form == "long" + assert result.df is result.long_df + assert "netmhcpan_affinity_value" in result.wide_df.columns assert len(df2) == len(df) assert df2.iloc[0]["value"] == pytest.approx(120.0) diff --git a/tests/test_io_pvacseq.py b/tests/test_io_pvacseq.py index e6b3cdf..581f070 100644 --- a/tests/test_io_pvacseq.py +++ b/tests/test_io_pvacseq.py @@ -63,6 +63,9 @@ def test_returns_topiary_result_in_long_form(self): r = read_pvacseq(MHC_I_AGG) assert isinstance(r, TopiaryResult) assert r.form == "long" + assert r.df is r.long_df + assert "pvacseq_affinity_value" in r.wide_df.columns + assert len(r.wide_df) == len(r.df) def test_one_row_per_variant(self): r = read_pvacseq(MHC_I_AGG) diff --git a/tests/test_wide.py b/tests/test_wide.py index a063cd9..05e8b26 100644 --- a/tests/test_wide.py +++ b/tests/test_wide.py @@ -297,6 +297,16 @@ def test_extra_columns_preserved(self): assert "gene_tpm" in wide.columns assert wide.iloc[0]["gene"] == "BRAF" + def test_null_context_columns_do_not_drop_wide_rows(self): + df = _long_df_single_model() + df["gene"] = [pd.NA, "BRAF"] + + wide = to_wide(df) + + assert len(wide) == len(df) + assert wide["gene"].isna().sum() == 1 + assert not wide["netmhcpan_affinity_value"].isna().any() + def test_missing_kind_raises(self): df = pd.DataFrame({"peptide": ["A"], "score": [0.5]}) with pytest.raises(ValueError, match="missing 'kind' column"): diff --git a/topiary/io.py b/topiary/io.py index 9629326..dbe3c09 100644 --- a/topiary/io.py +++ b/topiary/io.py @@ -167,7 +167,7 @@ def _models_from_long_rows(df): if not method_str: continue models[method_str] = _version_from_rows(rows) - return models + return models or None def _version_from_rows(rows): diff --git a/topiary/ranking/nodes.py b/topiary/ranking/nodes.py index 407634a..24be35c 100644 --- a/topiary/ranking/nodes.py +++ b/topiary/ranking/nodes.py @@ -111,6 +111,7 @@ def _build_kind_aliases(kind_source=Kind): # Group key detection and EvalContext # ============================================================================= +_SAMPLE_GROUP_KEY = "sample_name" _GROUP_KEYS = ["source_sequence_name", "peptide", "peptide_offset", "allele"] _GROUP_KEYS_VARIANT = ["variant", "peptide", "peptide_offset", "allele"] @@ -118,15 +119,26 @@ def _build_kind_aliases(kind_source=Kind): _GROUP_KEYS_FRAGMENT = ["fragment_id", "peptide", "peptide_offset", "allele"] +def _with_optional_sample_key(df, group_keys): + group_keys = list(group_keys) + if ( + _SAMPLE_GROUP_KEY in df.columns + and _SAMPLE_GROUP_KEY not in group_keys + and df[_SAMPLE_GROUP_KEY].notna().any() + ): + group_keys.insert(0, _SAMPLE_GROUP_KEY) + return group_keys + + def _pick_group_keys(df): # fragment_id is the most specific identity (from predict_from_fragments); # variant is for the legacy varcode pipeline; source_sequence_name is the # generic fallback. if "fragment_id" in df.columns: - return list(_GROUP_KEYS_FRAGMENT) + return _with_optional_sample_key(df, _GROUP_KEYS_FRAGMENT) if "variant" in df.columns: - return list(_GROUP_KEYS_VARIANT) - return list(_GROUP_KEYS) + return _with_optional_sample_key(df, _GROUP_KEYS_VARIANT) + return _with_optional_sample_key(df, _GROUP_KEYS) def _normalize_default_methods(mapping): @@ -537,7 +549,9 @@ def eval(self, ctx: EvalContext) -> pd.Series: else: msg += f" Available: {available}" raise ValueError(msg) - vals = ctx.df.groupby(ctx.group_keys, sort=False)[self.col_name].first() + vals = ctx.df.groupby( + ctx.group_keys, sort=False, dropna=False + )[self.col_name].first() vals = vals.reindex(ctx.group_index) try: return vals.astype(float) @@ -627,7 +641,9 @@ def eval(self, ctx: EvalContext) -> pd.Series: else: msg += f" Available: {available}" raise ValueError(msg) - vals = df.groupby(ctx.group_keys, sort=False)[self.col_name].first() + vals = df.groupby( + ctx.group_keys, sort=False, dropna=False + )[self.col_name].first() vals = vals.reindex(ctx.group_index) mask = vals.isin(self.values) if self.negate: @@ -711,7 +727,7 @@ def _filter_kind_method_version(ctx, kind, method, version): # Ambiguity: unqualified access with multiple methods in any group if effective_method is None and "prediction_method_name" in sub.columns: methods_per_group = sub.groupby( - ctx.group_keys, sort=False + ctx.group_keys, sort=False, dropna=False )["prediction_method_name"].nunique() if (methods_per_group > 1).any(): default = ctx.default_methods.get(_kind_value(kind)) @@ -786,7 +802,9 @@ def eval(self, ctx: EvalContext) -> pd.Series: if col_name not in sub.columns: return ctx.empty_series() - vals = sub.groupby(ctx.group_keys, sort=False)[col_name].first() + vals = sub.groupby( + ctx.group_keys, sort=False, dropna=False + )[col_name].first() vals = vals.reindex(ctx.group_index) return pd.to_numeric(vals, errors="coerce") @@ -952,7 +970,9 @@ def eval(self, ctx: EvalContext) -> pd.Series: __best_value=numeric[valid_mask], ) - groups = valid.groupby(peptide_keys, sort=False)["__best_value"] + groups = valid.groupby( + peptide_keys, sort=False, dropna=False + )["__best_value"] best_idx = groups.idxmax() if direction == "max" else groups.idxmin() target = "allele" if self.return_allele else "__best_value" @@ -1012,7 +1032,9 @@ def eval(self, ctx: EvalContext) -> pd.Series: col = self.scope + "peptide_length" if ctx.df.empty or col not in ctx.df.columns: return ctx.empty_series() - vals = ctx.df.groupby(ctx.group_keys, sort=False)[col].first() + vals = ctx.df.groupby( + ctx.group_keys, sort=False, dropna=False + )[col].first() return vals.reindex(ctx.group_index).astype(float) def __repr__(self): @@ -1036,7 +1058,9 @@ def eval(self, ctx: EvalContext) -> pd.Series: peptide_col = self.scope + "peptide" if self.scope else "peptide" if ctx.df.empty or peptide_col not in ctx.df.columns: return ctx.empty_series() - peptides = ctx.df.groupby(ctx.group_keys, sort=False)[peptide_col].first() + peptides = ctx.df.groupby( + ctx.group_keys, sort=False, dropna=False + )[peptide_col].first() peptides = peptides.reindex(ctx.group_index) chars = self.chars @@ -1079,7 +1103,9 @@ def eval(self, ctx: EvalContext) -> pd.Series: peptide_col = self.scope + "peptide" if self.scope else "peptide" if ctx.df.empty or peptide_col not in ctx.df.columns: return ctx.empty_series() - peptides = ctx.df.groupby(ctx.group_keys, sort=False)[peptide_col].first() + peptides = ctx.df.groupby( + ctx.group_keys, sort=False, dropna=False + )[peptide_col].first() peptides = peptides.reindex(ctx.group_index) valid = peptides.notna() & peptides.astype(str).str.len().gt(0) result = pd.Series(np.nan, index=ctx.group_index, dtype=float) @@ -2081,5 +2107,3 @@ def _resolve_field(name): else: msg += f" Available: {available}" raise ValueError(msg) - - diff --git a/topiary/wide.py b/topiary/wide.py index 9542681..d226665 100644 --- a/topiary/wide.py +++ b/topiary/wide.py @@ -212,12 +212,17 @@ def to_wide(df): return work[group_cols].drop_duplicates().reset_index(drop=True) melted = pd.concat(records, ignore_index=True) + if group_cols: + group_index = melted[group_cols].drop_duplicates().reset_index(drop=True) + group_index["_topiary_group_id"] = range(len(group_index)) + melted = melted.merge(group_index, on=group_cols, how="left") + else: + group_index = pd.DataFrame({"_topiary_group_id": [0]}) + melted["_topiary_group_id"] = 0 # Check for duplicates that would silently collapse in the pivot. - # Fill NaN with sentinel to avoid pandas groupby NaN-skipping. - dup_cols = group_cols + ["_wide_col"] - dup_df = melted[dup_cols].fillna("__nan__") - dup_check = dup_df.groupby(dup_cols).size() + dup_cols = ["_topiary_group_id", "_wide_col"] + dup_check = melted.groupby(dup_cols, dropna=False).size() n_dupes = (dup_check > 1).sum() if n_dupes > 0: warnings.warn( @@ -227,14 +232,18 @@ def to_wide(df): stacklevel=2, ) - # Pivot: group keys as index, wide column names as columns. - wide = melted.pivot_table( - index=group_cols, + wide_values = melted.pivot_table( + index="_topiary_group_id", columns="_wide_col", values="_wide_val", aggfunc="first", ).reset_index() - wide.columns.name = None + wide_values.columns.name = None + wide = ( + group_index + .merge(wide_values, on="_topiary_group_id", how="left") + .drop(columns=["_topiary_group_id"]) + ) if model_versions: wide.attrs["topiary_models"] = model_versions From db4ee09a97ce511594e38dd1db8f5e0ea039ab7f Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Mon, 18 May 2026 13:21:13 -0400 Subject: [PATCH 16/17] Clarify result merge APIs --- CHANGELOG.md | 17 +++-- docs/api.md | 18 +++-- docs/pvacseq.md | 6 +- docs/ranking.md | 8 +-- tests/test_combine_predictor_results.py | 14 +++- tests/test_result.py | 24 ++++++- topiary/__init__.py | 10 ++- topiary/result.py | 88 ++++++++++++++++++------- 8 files changed, 140 insertions(+), 45 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ab647ec..03d170e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ **Combine separate predictor runs (#170):** -`topiary.combine_predictor_results([a, b, ...])` stacks separate +`topiary.combine_predictions([a, b, ...])` stacks separate predictor outputs into the same long-form shape produced by running those predictors together. It accepts `TopiaryResult` or fresh `TopiaryPredictor` DataFrame outputs, supports both split-by-predictor @@ -26,12 +26,12 @@ produced which quantities: `prediction_method_name`, `predictor_version`, in a `prediction_run_name` column. This is intended for split predictor grids such as one NetMHCpan run per allele/peptide length: the logical method remains `prediction_method_name="netmhcpan"`, while -`prediction_run_name` records the shard. `combine_predictor_results` +`prediction_run_name` records the shard. `combine_predictions` and `to_wide()` treat the run name as provenance, not as a separate prediction identity, so disjoint shards combine cleanly and overlapping shards still fail as duplicate predictions. -`combine_predictor_results` also treats `sample_name` as part of the +`combine_predictions` also treats `sample_name` as part of the implicit row identity when present, matching `to_wide()` grouping for multi-sample predictor outputs. @@ -44,9 +44,16 @@ and the ranking DSL's `best_*_allele` accessors for allele attribution. `TopiaryResult` now treats long/wide representation as an internal, cached view concern. Results expose `long_df` and `wide_df` on demand, `to_long()` / `to_wide()` return results with that active compatibility -`df` view, and `topiary.concat()` normalizes mixed-form TopiaryResults +`df` view, and `topiary.append_results()` normalizes mixed-form TopiaryResults internally rather than requiring callers to pre-convert them. +Result merging now has user-facing names for the two distinct operations: +use `append_results` / `result.append(...)` when inputs are more rows +(files, samples, cohorts), and use `combine_predictions` / +`result.combine_predictions(...)` when inputs are complementary predictor +outputs for the same logical identity grid. The older `concat` and +`combine_predictor_results` names remain as compatibility aliases. + ## 5.16.1 **pirlygenes 5.1.0 integration:** @@ -96,7 +103,7 @@ selector won't find them — callers wanting per-algorithm DSL access should melt them out themselves or re-predict via `TopiaryPredictor`. Multiple files (MHC-I + MHC-II, or a mix of flavors) compose through -`topiary.concat([read_pvacseq(p1), read_pvacseq(p2)])`; no dedicated +`topiary.append_results([read_pvacseq(p1), read_pvacseq(p2)])`; no dedicated multi-file entry point is exposed. Loader-derived columns aligned with `TopiaryPredictor` output so diff --git a/docs/api.md b/docs/api.md index 77ccd48..92fee8d 100644 --- a/docs/api.md +++ b/docs/api.md @@ -36,10 +36,20 @@ view only for pandas compatibility, and materializes cached `long_df` and result state. Use `result.to_long()` / `result.to_wide()` when you want a new `TopiaryResult` whose active `df` is that form. -Topiary merge APIs operate on this semantic object. `topiary.concat()` accepts -mixed long/wide `TopiaryResult` inputs and normalizes internally; bare -DataFrames are still accepted by predictor-combine APIs for compatibility, but -are coerced into `TopiaryResult` before validation. +Topiary has two result-merging operations: + +| Operation | Meaning | Use when | +| --- | --- | --- | +| `topiary.append_results([a, b])` / `a.append(b)` | More result rows | Inputs are separate files, samples, cohorts, or independent result sets. | +| `topiary.combine_predictions([a, b])` / `a.combine_predictions(b)` | More predictions for the same logical identity grid | Inputs are separate predictors or predictor shards that should behave like one run. | + +Both operations accept mixed long/wide `TopiaryResult` inputs and normalize +internally. `combine_predictions` is stricter: it rejects duplicate +`(prediction_method_name, kind, identity)` predictions and validates prediction +coverage. Bare DataFrames are still accepted by `combine_predictions` for +compatibility with fresh `TopiaryPredictor` outputs, but are coerced into +`TopiaryResult` before validation. The older `concat` and +`combine_predictor_results` names remain compatibility aliases. ## CachedPredictor diff --git a/docs/pvacseq.md b/docs/pvacseq.md index 75bc14d..b229541 100644 --- a/docs/pvacseq.md +++ b/docs/pvacseq.md @@ -76,12 +76,12 @@ print(ranked.head()) ### MHC-I + MHC-II combined -`read_pvacseq()` doesn't expose a multi-file entry point — composition is just `topiary.concat`: +`read_pvacseq()` doesn't expose a multi-file entry point — composition is just `topiary.append_results`: ```python -from topiary import read_pvacseq, concat +from topiary import read_pvacseq, append_results -combined = concat([ +combined = append_results([ read_pvacseq("HCC1395.MHC_I.all_epitopes.aggregated.tsv"), read_pvacseq("HCC1395.MHC_II.all_epitopes.aggregated.tsv"), ]) diff --git a/docs/ranking.md b/docs/ranking.md index 6af09b2..41d5438 100644 --- a/docs/ranking.md +++ b/docs/ranking.md @@ -370,12 +370,12 @@ combined = TopiaryPredictor( ).predict_from_named_peptides(peptides) ``` -When predictors need to run separately, use `combine_predictor_results` to +When predictors need to run separately, use `combine_predictions` to stack the outputs back into the same long-form shape: ```python from mhctools import NetMHCpan, MHCflurry -from topiary import TopiaryPredictor, combine_predictor_results +from topiary import TopiaryPredictor, combine_predictions netmhcpan_rows = TopiaryPredictor( models=NetMHCpan, @@ -387,7 +387,7 @@ mhcflurry_rows = TopiaryPredictor( alleles=["HLA-A*02:01", "HLA-B*07:02"], ).predict_from_named_peptides(peptides) -combined = combine_predictor_results([netmhcpan_rows, mhcflurry_rows]) +combined = combine_predictions([netmhcpan_rows, mhcflurry_rows]) ``` `TopiaryResult` owns the long/wide representation. Loaders may naturally @@ -418,7 +418,7 @@ for allele in ["HLA-A*02:01", "HLA-B*07:02"]: ).predict_from_named_peptides(length_peptides) ) -combined = combine_predictor_results(shards) +combined = combine_predictions(shards) ``` `prediction_method_name` is still the logical predictor name (`netmhcpan` in diff --git a/tests/test_combine_predictor_results.py b/tests/test_combine_predictor_results.py index 5ca83c1..a7fc47e 100644 --- a/tests/test_combine_predictor_results.py +++ b/tests/test_combine_predictor_results.py @@ -9,6 +9,7 @@ Presentation, TopiaryPredictor, TopiaryResult, + combine_predictions, combine_predictor_results, read_csv, read_tsv, @@ -247,7 +248,7 @@ def test_combine_separate_predictor_runs_matches_combined_run(): net_only = TopiaryPredictor(models=netmhcpan).predict_from_named_peptides(peptides) flurry_only = TopiaryPredictor(models=mhcflurry).predict_from_named_peptides(peptides) - combined = combine_predictor_results([net_only, flurry_only]) + combined = combine_predictions([net_only, flurry_only]) pd.testing.assert_frame_equal( _sort_predictions(combined.df), @@ -700,6 +701,17 @@ def test_combine_allows_same_method_across_samples(): assert not wide["netmhcpan_affinity_value"].isna().any() +def test_result_combine_predictions_convenience(): + net_only = _simple_result("netmhcpan") + flurry_only = _simple_result("mhcflurry") + + combined = net_only.combine_predictions(flurry_only) + + assert set(combined.df["prediction_method_name"]) == { + "netmhcpan", "mhcflurry", + } + + def test_null_sample_name_is_not_a_ranking_group_key(): result = _simple_result("netmhcpan") result.df["sample_name"] = pd.NA diff --git a/tests/test_result.py b/tests/test_result.py index 2e1dc22..cdb968c 100644 --- a/tests/test_result.py +++ b/tests/test_result.py @@ -3,7 +3,15 @@ import pandas as pd import pytest -from topiary import TopiaryResult, concat, from_wide, read_tsv, to_tsv, to_wide +from topiary import ( + TopiaryResult, + append_results, + concat, + from_wide, + read_tsv, + to_tsv, + to_wide, +) from topiary.io import Metadata @@ -346,13 +354,23 @@ def _make_r(self, value, source_tag, model_version="4.1b"): def test_concat_basic(self): r1 = self._make_r(100.0, "patient01") r2 = self._make_r(200.0, "patient02") - combined = concat([r1, r2]) + combined = append_results([r1, r2]) assert len(combined) == 2 + def test_result_append_convenience(self): + r1 = self._make_r(100.0, "patient01") + r2 = self._make_r(200.0, "patient02") + r3 = self._make_r(300.0, "patient03") + + combined = r1.append([r2, r3]) + + assert len(combined) == 3 + assert combined.sources == ["patient01", "patient02", "patient03"] + def test_concat_sources_merged(self): r1 = self._make_r(100.0, "patient01") r2 = self._make_r(200.0, "patient02") - combined = concat([r1, r2]) + combined = append_results([r1, r2]) assert combined.sources == ["patient01", "patient02"] def test_concat_preserves_source_column(self): diff --git a/topiary/__init__.py b/topiary/__init__.py index 692c86f..e97a67c 100644 --- a/topiary/__init__.py +++ b/topiary/__init__.py @@ -68,7 +68,13 @@ melt_pvacseq_algorithms, read_pvacseq, ) -from .result import TopiaryResult, combine_predictor_results, concat +from .result import ( + TopiaryResult, + append_results, + combine_predictions, + combine_predictor_results, + concat, +) from .wide import detect_form, from_wide, to_wide __version__ = "5.16.2" @@ -149,6 +155,8 @@ "from_wide", "to_wide", "TopiaryResult", + "append_results", + "combine_predictions", "combine_predictor_results", "concat", ] diff --git a/topiary/result.py b/topiary/result.py index 758217f..72534e4 100644 --- a/topiary/result.py +++ b/topiary/result.py @@ -428,6 +428,26 @@ def sort_by(self, expr): kwargs["sort_by_ast"] = new_ast return TopiaryResult(sorted_df, **kwargs) + # -- Result merging ---------------------------------------------------- + + def append(self, *others): + """Append independent TopiaryResults as more result rows. + + This is the object-oriented form of :func:`append_results`. + """ + return append_results(_with_self(self, others)) + + def combine_predictions( + self, *others, on=("peptide", "allele"), coverage="complete", + ): + """Combine complementary predictor outputs for the same identity grid. + + This is the object-oriented form of :func:`combine_predictions`. + """ + return combine_predictions( + _with_self(self, others), on=on, coverage=coverage, + ) + # -- Serialization ----------------------------------------------------- def to_tsv(self, path): @@ -500,37 +520,45 @@ def _dsl_filter_to_string(node): return repr(node) -def concat(results): - """Concatenate TopiaryResults, preserving provenance. +def _with_self(result, others): + """Build a merge input list for instance methods.""" + if len(others) == 1 and isinstance(others[0], (list, tuple)): + return [result, *others[0]] + return [result, *others] + + +def append_results(results): + """Append independent TopiaryResults as more result rows. Parameters ---------- - results : list of TopiaryResult - Results to concatenate. Long and wide results may be mixed; mixed + results : iterable of TopiaryResult + Results to append. Long and wide results may be mixed; mixed inputs are normalized to long form. Returns ------- TopiaryResult - DataFrames concatenated; metadata merged (sources concatenated, + DataFrames appended; metadata merged (sources concatenated, models union with warning on version conflicts; filter_by / sort_by preserved only if all inputs agree). The active output form is the shared input form when all inputs match, otherwise long. """ + results = list(results) if not results: return TopiaryResult(pd.DataFrame()) for result in results: if not isinstance(result, TopiaryResult): raise TypeError( - "topiary.concat expects TopiaryResult inputs; use " + "topiary.append_results expects TopiaryResult inputs; use " "TopiaryResult(df) to attach Topiary semantics before " - f"concatenating, got {type(result).__name__}" + f"appending, got {type(result).__name__}" ) forms = {r.form for r in results} if "unknown" in forms: raise ValueError( - f"Cannot concat TopiaryResults with unknown form: {forms}" + f"Cannot append TopiaryResults with unknown form: {forms}" ) form = results[0].form if len(forms) == 1 else "long" @@ -573,7 +601,7 @@ def concat(results): if any(r.filter_by_str for r in results): present = sorted({r.filter_by_str for r in results if r.filter_by_str}) warnings.warn( - "Dropping filter_by metadata: inputs to concat() have " + "Dropping filter_by metadata: inputs to append_results() have " f"differing filter history (found: {present}). The rows are " "still filtered per their individual histories, but the " "combined result has no single filter expression that " @@ -592,7 +620,7 @@ def concat(results): if any(r.sort_by_str for r in results): present = sorted({r.sort_by_str for r in results if r.sort_by_str}) warnings.warn( - "Dropping sort_by metadata: inputs to concat() have " + "Dropping sort_by metadata: inputs to append_results() have " f"differing sort history (found: {present}). The concatenated " "rows are no longer in a consistent sort order.", UserWarning, @@ -604,7 +632,7 @@ def concat(results): elif form == "wide": frames = [r.wide_df for r in results] else: - raise ValueError(f"Cannot concat TopiaryResults with form {form!r}") + raise ValueError(f"Cannot append TopiaryResults with form {form!r}") df = pd.concat(frames, ignore_index=True) @@ -622,12 +650,17 @@ def concat(results): ) -def combine_predictor_results(results, on=("peptide", "allele"), coverage="complete"): - """Combine separate predictor outputs into one predictor-equivalent result. +def concat(results): + """Compatibility alias for :func:`append_results`.""" + return append_results(results) + + +def combine_predictions(results, on=("peptide", "allele"), coverage="complete"): + """Combine complementary predictor outputs into one prediction result. - This is stricter than :func:`concat`: duplicate predictions are rejected, - and by default every emitted prediction method/kind must cover the same - identity key set. It supports both common split patterns: + This is stricter than :func:`append_results`: duplicate predictions are + rejected, and by default every emitted prediction method/kind must cover + the same identity key set. It supports both common split patterns: - different predictors run separately on the same peptides; - the same predictor run separately over disjoint allele/length shards. @@ -671,7 +704,7 @@ def combine_predictor_results(results, on=("peptide", "allele"), coverage="compl results = [_drop_non_identity_source(result, on) for result in results] identity_columns = _identity_columns(results, on) - combined = concat(results) + combined = append_results(results) _validate_no_duplicate_predictions(combined.df, identity_columns) if coverage == "complete": _validate_complete_prediction_coverage(combined.df, identity_columns) @@ -683,13 +716,20 @@ def combine_predictor_results(results, on=("peptide", "allele"), coverage="compl return combined +def combine_predictor_results( + results, on=("peptide", "allele"), coverage="complete", +): + """Compatibility alias for :func:`combine_predictions`.""" + return combine_predictions(results, on=on, coverage=coverage) + + def _as_topiary_result(result): if isinstance(result, TopiaryResult): return result if isinstance(result, pd.DataFrame): return TopiaryResult(result) raise TypeError( - "combine_predictor_results expects TopiaryResult or pandas.DataFrame " + "combine_predictions expects TopiaryResult or pandas.DataFrame " f"inputs, got {type(result).__name__}" ) @@ -697,19 +737,19 @@ def _as_topiary_result(result): def _validate_predictor_result(result, index, on): if result.form != "long": raise ValueError( - "combine_predictor_results only supports long-form predictor " + "combine_predictions only supports long-form predictor " f"results; result {index} has form {result.form!r}" ) required = set(on) | {"kind", "prediction_method_name"} missing = sorted(c for c in required if c not in result.df.columns) if missing: raise ValueError( - f"combine_predictor_results result {index} is missing required " + f"combine_predictions result {index} is missing required " f"column(s): {missing}" ) if not _prediction_methods(result): raise ValueError( - f"combine_predictor_results result {index} has no " + f"combine_predictions result {index} has no " "prediction_method_name values" ) @@ -728,7 +768,7 @@ def _normalize_coverage_mode(coverage): return "partial" if coverage not in {"complete", "partial"}: raise ValueError( - "combine_predictor_results coverage must be 'complete' or " + "combine_predictions coverage must be 'complete' or " f"'partial', got {coverage!r}" ) return coverage @@ -832,7 +872,7 @@ def _validate_no_duplicate_predictions(df, identity_columns): seen[normalized] = row_index if duplicates: raise ValueError( - "combine_predictor_results found duplicate predictions for " + "combine_predictions found duplicate predictions for " "(prediction_method_name, kind, identity) keys: " f"{_format_key_examples(set(duplicates))}" ) @@ -867,7 +907,7 @@ def _validate_complete_prediction_coverage(df, identity_columns): extra = keys - baseline_keys if missing or extra: message = [ - "combine_predictor_results coverage='complete' requires every " + "combine_predictions coverage='complete' requires every " "(prediction_method_name, kind) group to cover the same " f"{identity_columns!r} keys; group {group!r} differs from " f"group {baseline_group!r}." From a83f59d9dd708d5566965ed6585207da1b32dc6c Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Mon, 18 May 2026 14:00:33 -0400 Subject: [PATCH 17/17] Clarify result merge API names --- CHANGELOG.md | 21 ++++--- docs/api.md | 33 +++++++---- docs/pvacseq.md | 12 ++-- docs/ranking.md | 3 +- tests/test_combine_predictor_results.py | 43 +++++++------- tests/test_io_pvacseq.py | 44 +++++++++----- tests/test_result.py | 77 ++++++++++++------------- topiary/__init__.py | 8 +-- topiary/io_pvacseq.py | 6 +- topiary/result.py | 50 ++++++---------- 10 files changed, 150 insertions(+), 147 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 03d170e..6fd75df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ **Combine separate predictor runs (#170):** -`topiary.combine_predictions([a, b, ...])` stacks separate +`topiary.combine_predictions([a, b, ...])` combines separate predictor outputs into the same long-form shape produced by running those predictors together. It accepts `TopiaryResult` or fresh `TopiaryPredictor` DataFrame outputs, supports both split-by-predictor @@ -43,16 +43,15 @@ and the ranking DSL's `best_*_allele` accessors for allele attribution. `TopiaryResult` now treats long/wide representation as an internal, cached view concern. Results expose `long_df` and `wide_df` on demand, -`to_long()` / `to_wide()` return results with that active compatibility -`df` view, and `topiary.append_results()` normalizes mixed-form TopiaryResults +`to_long()` / `to_wide()` return results with that active `df` view, +and `topiary.stack_results()` normalizes mixed-form TopiaryResults internally rather than requiring callers to pre-convert them. Result merging now has user-facing names for the two distinct operations: -use `append_results` / `result.append(...)` when inputs are more rows -(files, samples, cohorts), and use `combine_predictions` / +use `stack_results` / `result.stack_with(...)` when inputs are independent +result sets (files, samples, cohorts), and use `combine_predictions` / `result.combine_predictions(...)` when inputs are complementary predictor -outputs for the same logical identity grid. The older `concat` and -`combine_predictor_results` names remain as compatibility aliases. +outputs for the same logical identity grid. ## 5.16.1 @@ -103,7 +102,7 @@ selector won't find them — callers wanting per-algorithm DSL access should melt them out themselves or re-predict via `TopiaryPredictor`. Multiple files (MHC-I + MHC-II, or a mix of flavors) compose through -`topiary.append_results([read_pvacseq(p1), read_pvacseq(p2)])`; no dedicated +`topiary.stack_results([read_pvacseq(p1), read_pvacseq(p2)])`; no dedicated multi-file entry point is exposed. Loader-derived columns aligned with `TopiaryPredictor` output so @@ -111,7 +110,7 @@ downstream consumers (vaxrank, etc.) don't have to special-case the loader source: - `mhc_class` (`"I"` / `"II"`) — derived from the allele string; - lets concat-ed MHC-I + MHC-II results be filtered or split by class. + lets stacked MHC-I + MHC-II results be filtered or split by class. - `contains_mutant_residues` (boolean) — true iff the row's mutation position falls inside the candidate peptide; false for flanking-only peptides that pVACseq scored but where the mutation lies outside. @@ -119,7 +118,7 @@ loader source: 0-based half-open) — derived from pVACseq's 1-based Pos / Mutation Position. - `source` — per-row provenance label, matching `read_tsv` - convention; keeps multi-file concats distinguishable without rooting + convention; keeps multi-file stacks distinguishable without rooting through `Metadata.sources`. `Metadata.extra["kind_support"]` mirrors `TopiaryPredictor.kind_support` @@ -912,7 +911,7 @@ changes, just internal cleanup. `iterrows`, etc.) so most existing DataFrame-style code continues to work. Provides `to_wide()`, `to_long()`, `to_tsv()`, `to_csv()`, `filter_by()`, `sort_by()`. -- `topiary.concat([r1, r2, ...])` merges `TopiaryResult`s, unioning +- `topiary.stack_results([r1, r2, ...])` merges `TopiaryResult`s, unioning models (warns on version conflicts), concatenating sources, and preserving filter/sort history only if all inputs agree. - `read_tsv` / `read_csv` accept a `tag=` kwarg to label the source of diff --git a/docs/api.md b/docs/api.md index 92fee8d..4e4913f 100644 --- a/docs/api.md +++ b/docs/api.md @@ -30,26 +30,35 @@ `TopiaryResult` is the semantic result object for Topiary prediction tables. It can ingest either long or wide prediction tables, keeps the active `df` -view only for pandas compatibility, and materializes cached `long_df` and -`wide_df` views on demand. The public `form` value describes that compatibility -`df` view; it is derived from the stored views rather than acting as separate -result state. Use `result.to_long()` / `result.to_wide()` when you want a new -`TopiaryResult` whose active `df` is that form. +view for pandas-style access, and materializes cached `long_df` and `wide_df` +views on demand. The public `form` value describes the active `df` view; it is +derived from the stored views rather than acting as separate result state. Use +`result.to_long()` / `result.to_wide()` when you want a new `TopiaryResult` +whose active `df` is that form. Topiary has two result-merging operations: | Operation | Meaning | Use when | | --- | --- | --- | -| `topiary.append_results([a, b])` / `a.append(b)` | More result rows | Inputs are separate files, samples, cohorts, or independent result sets. | +| `topiary.stack_results([a, b])` / `a.stack_with(b)` | More result sets | Inputs are separate files, samples, cohorts, or independent result sets. | | `topiary.combine_predictions([a, b])` / `a.combine_predictions(b)` | More predictions for the same logical identity grid | Inputs are separate predictors or predictor shards that should behave like one run. | +`stack_results` is a row-union operation. The inputs do not need to describe +the same peptides, alleles, samples, sources, predictors, or score kinds; they +are just more Topiary result rows with merged provenance. + +`combine_predictions` is a prediction-union operation. The inputs are pieces +of one logical prediction table: separate predictors over the same candidates, +or one predictor split into disjoint allele/peptide-length shards. It rejects +duplicate `(prediction_method_name, kind, identity)` rows and, by default, +requires every emitted `(prediction_method_name, kind)` group to cover the +same identity grid. Use `coverage="partial"` only for intentionally sparse +prediction unions. + Both operations accept mixed long/wide `TopiaryResult` inputs and normalize -internally. `combine_predictions` is stricter: it rejects duplicate -`(prediction_method_name, kind, identity)` predictions and validates prediction -coverage. Bare DataFrames are still accepted by `combine_predictions` for -compatibility with fresh `TopiaryPredictor` outputs, but are coerced into -`TopiaryResult` before validation. The older `concat` and -`combine_predictor_results` names remain compatibility aliases. +internally. Fresh `TopiaryPredictor` DataFrame outputs may also be passed to +`combine_predictions`; they are first wrapped as `TopiaryResult` objects and +then validated with the same rules. ## CachedPredictor diff --git a/docs/pvacseq.md b/docs/pvacseq.md index b229541..12536af 100644 --- a/docs/pvacseq.md +++ b/docs/pvacseq.md @@ -52,10 +52,10 @@ Loader-derived columns aligned with `TopiaryPredictor` output so downstream code | Column | Type | What it carries | |--------|------|-----------------| -| `mhc_class` | `"I"` / `"II"` / `pd.NA` | Per-row class derived from the allele. Lets concat-ed multi-class results be filtered or split by class without re-parsing alleles. | +| `mhc_class` | `"I"` / `"II"` / `pd.NA` | Per-row class derived from the allele. Lets stacked multi-class results be filtered or split by class without re-parsing alleles. | | `contains_mutant_residues` | `boolean` (nullable) | True iff the row's mutation position falls inside the candidate peptide. False for flanking-only peptides where pVACseq scored a 9-mer adjacent to the mutation but the mutation lies outside. | | `mutation_start_in_peptide` / `mutation_end_in_peptide` | `Int64` | 0-based half-open mutation interval within the peptide. Derived from pVACseq's 1-based Pos (aggregated) or Mutation Position (all_epitopes). Single-residue semantics — multi-residue mutations collapse to a representative position. | -| `source` | `str` | Per-row provenance label, matching `read_tsv` convention so multi-file concats stay distinguishable without rooting through `Metadata.sources`. | +| `source` | `str` | Per-row provenance label, matching `read_tsv` convention so multi-file stacks stay distinguishable without rooting through `Metadata.sources`. | ### Annotation passthroughs @@ -76,12 +76,12 @@ print(ranked.head()) ### MHC-I + MHC-II combined -`read_pvacseq()` doesn't expose a multi-file entry point — composition is just `topiary.append_results`: +`read_pvacseq()` doesn't expose a multi-file entry point — composition is just `topiary.stack_results`: ```python -from topiary import read_pvacseq, append_results +from topiary import read_pvacseq, stack_results -combined = append_results([ +combined = stack_results([ read_pvacseq("HCC1395.MHC_I.all_epitopes.aggregated.tsv"), read_pvacseq("HCC1395.MHC_II.all_epitopes.aggregated.tsv"), ]) @@ -232,7 +232,7 @@ r.extra["kind_support"] apply_filter(r.df, my_filter, kind_support=r.extra["kind_support"]) ``` -`pvacseq_format` is `"aggregated"` or `"all_epitopes"` (or a comma-joined string after melting / concat-ing). +`pvacseq_format` is `"aggregated"` or `"all_epitopes"` (or a comma-joined string after melting / stacking). ## Caveats and known limitations diff --git a/docs/ranking.md b/docs/ranking.md index 41d5438..d8cfcb9 100644 --- a/docs/ranking.md +++ b/docs/ranking.md @@ -371,7 +371,8 @@ combined = TopiaryPredictor( ``` When predictors need to run separately, use `combine_predictions` to -stack the outputs back into the same long-form shape: +turn their complementary prediction rows back into the same long-form +shape: ```python from mhctools import NetMHCpan, MHCflurry diff --git a/tests/test_combine_predictor_results.py b/tests/test_combine_predictor_results.py index a7fc47e..7d50fa2 100644 --- a/tests/test_combine_predictor_results.py +++ b/tests/test_combine_predictor_results.py @@ -10,7 +10,6 @@ TopiaryPredictor, TopiaryResult, combine_predictions, - combine_predictor_results, read_csv, read_tsv, to_wide, @@ -281,7 +280,7 @@ def test_combine_same_method_split_by_allele_and_length_matches_direct_run(): ).predict_from_named_peptides(peptides) ) - combined = combine_predictor_results(split_results) + combined = combine_predictions(split_results) pd.testing.assert_frame_equal( _sort_predictions(_without_run_name(combined.df)), @@ -326,7 +325,7 @@ def test_combine_multi_method_inputs_split_by_allele_match_direct_run(): ]).predict_from_named_peptides(peptides) ) - combined = combine_predictor_results(split_results) + combined = combine_predictions(split_results) pd.testing.assert_frame_equal( _sort_predictions(combined.df), @@ -353,7 +352,7 @@ def test_combined_split_grid_supports_best_ba_and_el_allele_aggregation(): for allele in alleles for peptide_length in [8, 9, 10] ] - combined = combine_predictor_results(split_results) + combined = combine_predictions(split_results) ctx = EvalContext(combined.df) best_ba_allele = Affinity["netmhcpan"].best_value_allele.eval(ctx) @@ -379,9 +378,9 @@ def test_combine_haplotype_style_presentation_uses_partial_coverage(): ).predict_from_named_peptides(peptides) with pytest.raises(ValueError, match="coverage='complete'"): - combine_predictor_results([netmhcpan_rows, mhcflurry_rows]) + combine_predictions([netmhcpan_rows, mhcflurry_rows]) - combined = combine_predictor_results( + combined = combine_predictions( [netmhcpan_rows, mhcflurry_rows], coverage="partial", ) @@ -445,7 +444,7 @@ def test_prediction_run_name_does_not_split_wide_rows(): for allele in alleles ] - combined = combine_predictor_results(split_results) + combined = combine_predictions(split_results) wide = to_wide(combined.df) assert "prediction_run_name" not in wide.columns @@ -470,7 +469,7 @@ def test_combine_rejects_overlapping_named_shards(): ).predict_from_named_peptides(peptides) with pytest.raises(ValueError, match="duplicate predictions"): - combine_predictor_results([shard_a, shard_b]) + combine_predictions([shard_a, shard_b]) def test_combine_roundtripped_topiary_results(tmp_path): @@ -494,7 +493,7 @@ def test_combine_roundtripped_topiary_results(tmp_path): net_only.to_tsv(net_path) flurry_only.to_csv(flurry_path) - combined = combine_predictor_results([ + combined = combine_predictions([ read_tsv(net_path), read_csv(flurry_path), ]) @@ -529,7 +528,7 @@ def test_combine_accepts_wide_topiary_results(): TopiaryPredictor(models=mhcflurry).predict_from_named_peptides(peptides) ).to_wide() - combined = combine_predictor_results([net_wide, flurry_wide]) + combined = combine_predictions([net_wide, flurry_wide]) assert combined.form == "long" pd.testing.assert_frame_equal( @@ -541,7 +540,7 @@ def test_combine_accepts_wide_topiary_results(): def test_combine_ignores_empty_bare_dataframes(): result = _simple_result("netmhcpan") - combined = combine_predictor_results([pd.DataFrame(), result]) + combined = combine_predictions([pd.DataFrame(), result]) assert len(combined) == 1 assert combined.models == {"netmhcpan": "1.0"} @@ -570,7 +569,7 @@ def test_combine_recomputes_models_from_combined_rows(): models=stale_models, ) - combined = combine_predictor_results([net_only, flurry_only]) + combined = combine_predictions([net_only, flurry_only]) assert combined.models == {"netmhcpan": "4.1b", "mhcflurry": "2.1.1"} assert combined.metadata.models == combined.models @@ -590,7 +589,7 @@ def test_combine_fills_missing_row_versions_from_observed_metadata(): assert TopiaryResult(flurry_df).models == {"mhcflurry": "2.1.1"} - combined = combine_predictor_results([net_df, flurry_df]) + combined = combine_predictions([net_df, flurry_df]) assert combined.models == {"netmhcpan": "4.1b", "mhcflurry": "2.1.1"} @@ -600,7 +599,7 @@ def test_combine_rejects_different_identity_sets(): r2 = _simple_result("mhcflurry", peptide="ELAGIGILT") with pytest.raises(ValueError, match="coverage='complete'"): - combine_predictor_results([r1, r2]) + combine_predictions([r1, r2]) def test_combine_rejects_incomplete_method_coverage_within_one_input(): @@ -620,21 +619,21 @@ def test_combine_rejects_incomplete_method_coverage_within_one_input(): result = TopiaryResult(pd.concat([method_a, method_b], ignore_index=True)) with pytest.raises(ValueError, match="coverage='complete'"): - combine_predictor_results([result]) + combine_predictions([result]) def test_combine_partial_coverage_allows_sparse_union(): r1 = _simple_result("netmhcpan", peptide="SIINFEKLA") r2 = _simple_result("mhcflurry", peptide="ELAGIGILT") - combined = combine_predictor_results([r1, r2], coverage="partial") + combined = combine_predictions([r1, r2], coverage="partial") assert len(combined) == 2 def test_combine_rejects_unknown_coverage_mode(): with pytest.raises(ValueError, match="coverage"): - combine_predictor_results([_simple_result("netmhcpan")], coverage="loose") + combine_predictions([_simple_result("netmhcpan")], coverage="loose") @pytest.mark.parametrize("column", sorted(_CONTEXT_MISMATCH_VALUES)) @@ -647,14 +646,14 @@ def test_combine_rejects_mismatched_context_columns( inputs = _input_pair(tmp_path, input_type, r1, r2) with pytest.raises(ValueError, match=column): - combine_predictor_results(inputs) + combine_predictions(inputs) def test_combine_treats_null_identity_keys_as_equal(): r1 = _simple_result("netmhcpan", allele=pd.NA) r2 = _simple_result("mhcflurry", allele=pd.NA) - combined = combine_predictor_results([r1, r2]) + combined = combine_predictions([r1, r2]) assert len(combined) == 2 @@ -664,7 +663,7 @@ def test_combine_rejects_duplicate_prediction_methods(): r2 = _simple_result("netmhcpan") with pytest.raises(ValueError, match="duplicate predictions"): - combine_predictor_results([r1, r2]) + combine_predictions([r1, r2]) def test_combine_allows_same_method_across_samples(): @@ -675,7 +674,7 @@ def test_combine_allows_same_method_across_samples(): sample_b.df["value"] = 1000.0 sample_b.df["affinity"] = 1000.0 - combined = combine_predictor_results([sample_a, sample_b]) + combined = combine_predictions([sample_a, sample_b]) ctx = EvalContext(combined.df) scores = Affinity["netmhcpan"].value.eval(ctx) filtered = combined.filter_by("affinity <= 500") @@ -728,7 +727,7 @@ def test_combine_ignores_legacy_kind_support_metadata(): r1.extra["kind_support"] = "not a mapping" r2 = _simple_result("mhcflurry") - combined = combine_predictor_results([r1, r2]) + combined = combine_predictions([r1, r2]) assert "kind_support" not in combined.extra diff --git a/tests/test_io_pvacseq.py b/tests/test_io_pvacseq.py index 581f070..9478d94 100644 --- a/tests/test_io_pvacseq.py +++ b/tests/test_io_pvacseq.py @@ -13,11 +13,11 @@ TopiaryResult, apply_filter, apply_sort, - concat, detect_pvacseq_format, melt_pvacseq_algorithms, read_pvacseq, read_tsv, + stack_results, to_tsv, wt, ) @@ -170,8 +170,8 @@ def test_round_trip_through_topiary_tsv(self, tmp_path): ) def test_predictor_version_is_na(self): - # Pinned at pd.NA so concat across files doesn't trigger the - # "conflicting versions" warning in topiary.concat. + # Pinned at pd.NA so stacking files doesn't trigger the + # "conflicting versions" warning in topiary.stack_results. r = read_pvacseq(MHC_I_AGG) assert r.df["predictor_version"].isna().all() assert r.df["wt_predictor_version"].isna().all() @@ -303,26 +303,32 @@ def test_chr_coord_variant_fallback_when_index_missing(self, tmp_path): # --------------------------------------------------------------------------- -# Combining files via topiary.concat +# Stacking files via topiary.stack_results # --------------------------------------------------------------------------- -class TestConcatMultipleFiles: - def test_mhc_i_plus_mhc_ii_via_concat(self): - combined = concat([read_pvacseq(MHC_I_AGG), read_pvacseq(MHC_II_AGG)]) +class TestStackMultipleFiles: + def test_mhc_i_plus_mhc_ii_via_stack_results(self): + combined = stack_results([ + read_pvacseq(MHC_I_AGG), read_pvacseq(MHC_II_AGG), + ]) assert len(combined) == _data_row_count(MHC_I_AGG) + _data_row_count(MHC_II_AGG) alleles = set(combined.df["allele"].dropna()) assert any(a.startswith(("HLA-A", "HLA-B", "HLA-C")) for a in alleles) assert any("D" in a for a in alleles) - def test_concat_mixed_flavors(self): - combined = concat([read_pvacseq(MHC_I_ALL), read_pvacseq(MHC_II_AGG)]) + def test_stack_mixed_flavors(self): + combined = stack_results([ + read_pvacseq(MHC_I_ALL), read_pvacseq(MHC_II_AGG), + ]) assert len(combined) == _data_row_count(MHC_I_ALL) + _data_row_count(MHC_II_AGG) - def test_concat_preserves_per_row_source_and_mhc_class(self): + def test_stack_preserves_per_row_source_and_mhc_class(self): # Vaxrank wants to combine MHC-I + MHC-II in one ranking run and # split or filter by class afterward. - combined = concat([read_pvacseq(MHC_I_AGG), read_pvacseq(MHC_II_AGG)]) + combined = stack_results([ + read_pvacseq(MHC_I_AGG), read_pvacseq(MHC_II_AGG), + ]) # Two distinct provenance labels. assert combined.df["source"].nunique() == 2 # mhc_class lets downstream filter by class without parsing alleles. @@ -442,11 +448,13 @@ def test_mhc_ii_file_records_single_allele_class_ii(self): r = read_pvacseq(MHC_II_AGG) assert r.extra["kind_support"]["pvacseq"]["pMHC_affinity"]["mhc_class"] == "II" - def test_concat_summary_can_be_recomputed_post_concat(self): - # concat doesn't merge kind_support — callers can recompute + def test_stack_summary_can_be_recomputed_post_stack(self): + # stack_results doesn't merge kind_support — callers can recompute # from the combined allele column if needed. from topiary.io_pvacseq import _summarize_mhc_class - combined = concat([read_pvacseq(MHC_I_AGG), read_pvacseq(MHC_II_AGG)]) + combined = stack_results([ + read_pvacseq(MHC_I_AGG), read_pvacseq(MHC_II_AGG), + ]) assert _summarize_mhc_class(combined.df["allele"]) == "both" @@ -522,7 +530,9 @@ def test_vaxrank_shape_filter_expression(self): # epitope_io.py loader for topiary.read_pvacseq. Fully native # DSL: numeric + categorical clauses in one expression via the # IsIn nodes introduced for class-I/II filtering. - combined = concat([read_pvacseq(MHC_I_AGG), read_pvacseq(MHC_II_AGG)]) + combined = stack_results([ + read_pvacseq(MHC_I_AGG), read_pvacseq(MHC_II_AGG), + ]) strong = apply_filter( combined.df, (Affinity.value <= 500) @@ -537,7 +547,9 @@ def test_vaxrank_shape_filter_expression(self): def test_vaxrank_shape_filter_via_parsed_string(self): # Same filter shape via the string DSL — what a CLI flag would feed. from topiary import parse - combined = concat([read_pvacseq(MHC_I_AGG), read_pvacseq(MHC_II_AGG)]) + combined = stack_results([ + read_pvacseq(MHC_I_AGG), read_pvacseq(MHC_II_AGG), + ]) node = parse('affinity.value <= 500 & mhc_class == "I"') strong = apply_filter(combined.df, node) assert (strong["value"] <= 500).all() diff --git a/tests/test_result.py b/tests/test_result.py index cdb968c..afdab63 100644 --- a/tests/test_result.py +++ b/tests/test_result.py @@ -1,14 +1,13 @@ -"""Tests for topiary.result — TopiaryResult wrapper and concat().""" +"""Tests for topiary.result — TopiaryResult wrapper and stack_results().""" import pandas as pd import pytest from topiary import ( TopiaryResult, - append_results, - concat, from_wide, read_tsv, + stack_results, to_tsv, to_wide, ) @@ -249,7 +248,7 @@ def test_top_level_converters_accept_result(self): assert "netmhcpan_affinity_value" in wide_df.columns assert "kind" in long_df.columns - def test_form_assignment_switches_active_view_for_compatibility(self): + def test_form_assignment_switches_active_view(self): r = TopiaryResult(_sample_long_df()) r.form = "wide" @@ -289,7 +288,7 @@ def test_to_csv_from_result(self, tmp_path): assert len(r2) == 2 def test_to_tsv_accepts_bare_df(self, tmp_path): - """Backward compat: to_tsv still works on a bare DataFrame.""" + """Top-level writers still accept a bare DataFrame.""" df = _sample_long_df() path = tmp_path / "bare.tsv" to_tsv(df, path) @@ -330,11 +329,11 @@ def test_source_column_not_overwritten(self, tmp_path): # --------------------------------------------------------------------------- -# concat() +# stack_results() # --------------------------------------------------------------------------- -class TestConcat: +class TestStackResults: def _make_r(self, value, source_tag, model_version="4.1b"): df = pd.DataFrame([dict( peptide="SIINFEKL", allele="HLA-A*02:01", @@ -351,35 +350,35 @@ def _make_r(self, value, source_tag, model_version="4.1b"): ) return TopiaryResult(df, meta) - def test_concat_basic(self): + def test_stack_basic(self): r1 = self._make_r(100.0, "patient01") r2 = self._make_r(200.0, "patient02") - combined = append_results([r1, r2]) + combined = stack_results([r1, r2]) assert len(combined) == 2 - def test_result_append_convenience(self): + def test_result_stack_with_convenience(self): r1 = self._make_r(100.0, "patient01") r2 = self._make_r(200.0, "patient02") r3 = self._make_r(300.0, "patient03") - combined = r1.append([r2, r3]) + combined = r1.stack_with([r2, r3]) assert len(combined) == 3 assert combined.sources == ["patient01", "patient02", "patient03"] - def test_concat_sources_merged(self): + def test_stack_sources_merged(self): r1 = self._make_r(100.0, "patient01") r2 = self._make_r(200.0, "patient02") - combined = append_results([r1, r2]) + combined = stack_results([r1, r2]) assert combined.sources == ["patient01", "patient02"] - def test_concat_preserves_source_column(self): + def test_stack_preserves_source_column(self): r1 = self._make_r(100.0, "patient01") r2 = self._make_r(200.0, "patient02") - combined = concat([r1, r2]) + combined = stack_results([r1, r2]) assert set(combined["source"].unique()) == {"patient01", "patient02"} - def test_concat_models_union(self): + def test_stack_models_union(self): r1 = self._make_r(100.0, "p1", model_version="4.1b") r2 = pd.DataFrame([dict( peptide="X", allele="A", @@ -389,55 +388,55 @@ def test_concat_models_union(self): source="p2", )]) r2 = TopiaryResult(r2, Metadata(form="long", models={"mhcflurry": "2.1.1"}, sources=["p2"])) - combined = concat([r1, r2]) + combined = stack_results([r1, r2]) assert combined.models == {"netmhcpan": "4.1b", "mhcflurry": "2.1.1"} - def test_concat_version_conflict_warns(self): + def test_stack_version_conflict_warns(self): r1 = self._make_r(100.0, "p1", model_version="4.1b") r2 = self._make_r(200.0, "p2", model_version="4.2") with pytest.warns(UserWarning, match="conflicting versions"): - concat([r1, r2]) + stack_results([r1, r2]) - def test_concat_mixed_forms_materializes_long_result(self): + def test_stack_mixed_forms_materializes_long_result(self): r_long = self._make_r(100.0, "p1") r_wide = self._make_r(200.0, "p2").to_wide() - combined = concat([r_long, r_wide]) + combined = stack_results([r_long, r_wide]) assert combined.form == "long" assert "kind" in combined.columns assert len(combined) == 2 assert combined.sources == ["p1", "p2"] - def test_concat_all_wide_preserves_wide_active_form(self): + def test_stack_all_wide_preserves_wide_active_form(self): r1 = self._make_r(100.0, "p1").to_wide() r2 = self._make_r(200.0, "p2").to_wide() - combined = concat([r1, r2]) + combined = stack_results([r1, r2]) assert combined.form == "wide" assert "netmhcpan_affinity_value" in combined.columns assert "kind" not in combined.columns - def test_concat_empty_list(self): - combined = concat([]) + def test_stack_empty_list(self): + combined = stack_results([]) assert isinstance(combined, TopiaryResult) assert len(combined) == 0 - def test_concat_single(self): + def test_stack_single(self): r = self._make_r(100.0, "p1") - combined = concat([r]) + combined = stack_results([r]) assert len(combined) == 1 - def test_concat_requires_topiary_results(self): + def test_stack_requires_topiary_results(self): with pytest.raises(TypeError, match="TopiaryResult"): - concat([self._make_r(100.0, "p1").df]) + stack_results([self._make_r(100.0, "p1").df]) - def test_concat_then_write_roundtrip(self, tmp_path): - """Concat + write + read preserves sources in comment block.""" + def test_stack_then_write_roundtrip(self, tmp_path): + """Stack + write + read preserves sources in comment block.""" r1 = self._make_r(100.0, "patient01") r2 = self._make_r(200.0, "patient02") - combined = concat([r1, r2]) + combined = stack_results([r1, r2]) path = tmp_path / "merged.tsv" combined.to_tsv(path) @@ -653,11 +652,11 @@ def test_sort_then_filter(self): # --------------------------------------------------------------------------- -# concat warnings for dropped filter/sort history +# stack_results warnings for dropped filter/sort history # --------------------------------------------------------------------------- -class TestConcatHistoryDrop: +class TestStackHistoryDrop: def _make_r(self, value, source_tag, filter_str=None, sort_str=None): df = pd.DataFrame([dict( peptide="SIINFEKL", allele="HLA-A*02:01", @@ -679,7 +678,7 @@ def _make_r(self, value, source_tag, filter_str=None, sort_str=None): def test_matching_filters_preserved_silently(self, recwarn): r1 = self._make_r(100, "p1", filter_str="affinity <= 500") r2 = self._make_r(200, "p2", filter_str="affinity <= 500") - combined = concat([r1, r2]) + combined = stack_results([r1, r2]) assert combined.filter_by_str == "affinity <= 500" # No warning about filter/sort drop filter_warnings = [w for w in recwarn.list if "Dropping" in str(w.message)] @@ -689,27 +688,27 @@ def test_differing_filters_warn_and_drop(self): r1 = self._make_r(100, "p1", filter_str="affinity <= 500") r2 = self._make_r(200, "p2", filter_str="affinity <= 1000") with pytest.warns(UserWarning, match="Dropping filter_by metadata"): - combined = concat([r1, r2]) + combined = stack_results([r1, r2]) assert combined.filter_by_str is None def test_one_has_filter_one_doesnt_warns(self): r1 = self._make_r(100, "p1", filter_str="affinity <= 500") r2 = self._make_r(200, "p2", filter_str=None) with pytest.warns(UserWarning, match="Dropping filter_by metadata"): - combined = concat([r1, r2]) + combined = stack_results([r1, r2]) assert combined.filter_by_str is None def test_differing_sorts_warn_and_drop(self): r1 = self._make_r(100, "p1", sort_str="affinity.score") r2 = self._make_r(200, "p2", sort_str="presentation.score") with pytest.warns(UserWarning, match="Dropping sort_by metadata"): - combined = concat([r1, r2]) + combined = stack_results([r1, r2]) assert combined.sort_by_str is None def test_matching_sorts_preserved_silently(self, recwarn): r1 = self._make_r(100, "p1", sort_str="affinity.score") r2 = self._make_r(200, "p2", sort_str="affinity.score") - combined = concat([r1, r2]) + combined = stack_results([r1, r2]) assert combined.sort_by_str == "affinity.score" sort_warnings = [w for w in recwarn.list if "Dropping sort_by" in str(w.message)] assert not sort_warnings diff --git a/topiary/__init__.py b/topiary/__init__.py index e97a67c..0d469ad 100644 --- a/topiary/__init__.py +++ b/topiary/__init__.py @@ -70,10 +70,8 @@ ) from .result import ( TopiaryResult, - append_results, + stack_results, combine_predictions, - combine_predictor_results, - concat, ) from .wide import detect_form, from_wide, to_wide @@ -155,8 +153,6 @@ "from_wide", "to_wide", "TopiaryResult", - "append_results", + "stack_results", "combine_predictions", - "combine_predictor_results", - "concat", ] diff --git a/topiary/io_pvacseq.py b/topiary/io_pvacseq.py index 17de25a..0cc85d2 100644 --- a/topiary/io_pvacseq.py +++ b/topiary/io_pvacseq.py @@ -26,7 +26,7 @@ Position. Single-residue semantics; indels / frameshifts collapse to a representative position. - ``source`` — per-row provenance label (tag or ``pvacseq-{flavor}:{filename}``), - matching :func:`read_tsv` convention so multi-file concats stay + matching :func:`read_tsv` convention so multi-file stacks stay distinguishable. For missense aggregated rows, the WT peptide sequence is reconstructed @@ -492,7 +492,7 @@ def _finalize(parsed, *, source): """Add synthesized constants, mirrors, and column order in one allocation. *source* is stamped on every row to match topiary's read_tsv provenance - convention; downstream concat across MHC-I and MHC-II files stays + convention; downstream stacking across MHC-I and MHC-II files stays distinguishable without rooting through Metadata. """ # peptide_offset = 0: pVACseq doesn't ship the source-protein offset @@ -550,7 +550,7 @@ def read_pvacseq(path, *, tag=None) -> TopiaryResult: TopiaryResult Long-form DataFrame with one row per (peptide, allele) and ``Metadata.extra["pvacseq_format"]`` recording the file flavor. - Compose multiple files with :func:`topiary.concat`. + Compose multiple files with :func:`topiary.stack_results`. """ path = Path(path) # "X" is pVACseq's sentinel for "this algorithm didn't score this diff --git a/topiary/result.py b/topiary/result.py index 72534e4..714633b 100644 --- a/topiary/result.py +++ b/topiary/result.py @@ -154,7 +154,7 @@ def __init__( @property def form(self): - """Form of the active ``.df`` view, kept for backward compatibility.""" + """Form of the active ``.df`` view.""" if self._df is self._long_df: return "long" if self._df is self._wide_df: @@ -187,7 +187,7 @@ def form(self, value): @property def df(self): - """Active DataFrame view for backward-compatible pandas access.""" + """Active DataFrame view for pandas-style access.""" return self._df @df.setter @@ -430,12 +430,12 @@ def sort_by(self, expr): # -- Result merging ---------------------------------------------------- - def append(self, *others): - """Append independent TopiaryResults as more result rows. + def stack_with(self, *others): + """Stack independent TopiaryResults as more result rows. - This is the object-oriented form of :func:`append_results`. + This is the object-oriented form of :func:`stack_results`. """ - return append_results(_with_self(self, others)) + return stack_results(_with_self(self, others)) def combine_predictions( self, *others, on=("peptide", "allele"), coverage="complete", @@ -527,19 +527,19 @@ def _with_self(result, others): return [result, *others] -def append_results(results): - """Append independent TopiaryResults as more result rows. +def stack_results(results): + """Stack independent TopiaryResults as more result rows. Parameters ---------- results : iterable of TopiaryResult - Results to append. Long and wide results may be mixed; mixed + Results to stack. Long and wide results may be mixed; mixed inputs are normalized to long form. Returns ------- TopiaryResult - DataFrames appended; metadata merged (sources concatenated, + DataFrames stacked; metadata merged (sources combined, models union with warning on version conflicts; filter_by / sort_by preserved only if all inputs agree). The active output form is the shared input form when all inputs match, otherwise long. @@ -550,15 +550,15 @@ def append_results(results): for result in results: if not isinstance(result, TopiaryResult): raise TypeError( - "topiary.append_results expects TopiaryResult inputs; use " + "topiary.stack_results expects TopiaryResult inputs; use " "TopiaryResult(df) to attach Topiary semantics before " - f"appending, got {type(result).__name__}" + f"stacking, got {type(result).__name__}" ) forms = {r.form for r in results} if "unknown" in forms: raise ValueError( - f"Cannot append TopiaryResults with unknown form: {forms}" + f"Cannot stack TopiaryResults with unknown form: {forms}" ) form = results[0].form if len(forms) == 1 else "long" @@ -601,7 +601,7 @@ def append_results(results): if any(r.filter_by_str for r in results): present = sorted({r.filter_by_str for r in results if r.filter_by_str}) warnings.warn( - "Dropping filter_by metadata: inputs to append_results() have " + "Dropping filter_by metadata: inputs to stack_results() have " f"differing filter history (found: {present}). The rows are " "still filtered per their individual histories, but the " "combined result has no single filter expression that " @@ -620,8 +620,8 @@ def append_results(results): if any(r.sort_by_str for r in results): present = sorted({r.sort_by_str for r in results if r.sort_by_str}) warnings.warn( - "Dropping sort_by metadata: inputs to append_results() have " - f"differing sort history (found: {present}). The concatenated " + "Dropping sort_by metadata: inputs to stack_results() have " + f"differing sort history (found: {present}). The stacked " "rows are no longer in a consistent sort order.", UserWarning, stacklevel=2, @@ -632,7 +632,7 @@ def append_results(results): elif form == "wide": frames = [r.wide_df for r in results] else: - raise ValueError(f"Cannot append TopiaryResults with form {form!r}") + raise ValueError(f"Cannot stack TopiaryResults with form {form!r}") df = pd.concat(frames, ignore_index=True) @@ -650,15 +650,10 @@ def append_results(results): ) -def concat(results): - """Compatibility alias for :func:`append_results`.""" - return append_results(results) - - def combine_predictions(results, on=("peptide", "allele"), coverage="complete"): """Combine complementary predictor outputs into one prediction result. - This is stricter than :func:`append_results`: duplicate predictions are + This is stricter than :func:`stack_results`: duplicate predictions are rejected, and by default every emitted prediction method/kind must cover the same identity key set. It supports both common split patterns: @@ -704,7 +699,7 @@ def combine_predictions(results, on=("peptide", "allele"), coverage="complete"): results = [_drop_non_identity_source(result, on) for result in results] identity_columns = _identity_columns(results, on) - combined = append_results(results) + combined = stack_results(results) _validate_no_duplicate_predictions(combined.df, identity_columns) if coverage == "complete": _validate_complete_prediction_coverage(combined.df, identity_columns) @@ -716,13 +711,6 @@ def combine_predictions(results, on=("peptide", "allele"), coverage="complete"): return combined -def combine_predictor_results( - results, on=("peptide", "allele"), coverage="complete", -): - """Compatibility alias for :func:`combine_predictions`.""" - return combine_predictions(results, on=on, coverage=coverage) - - def _as_topiary_result(result): if isinstance(result, TopiaryResult): return result