From 6858f3e9d317b41bb37c5276f24cb2a673a16c4e Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Thu, 18 Jun 2026 13:31:55 -0400 Subject: [PATCH 1/2] Fix #394: in-frame stop-codon deletion with empty 3' UTR is a Deletion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An in-frame deletion that removes the stop codon of a transcript with no 3' UTR sequence (e.g. MAPK3-006 / ENST00000395199, whose three_prime_utr_sequence is "") raised: ValueError: If no amino acids added by StopLoss then it should be Silent translate_in_frame_mutation sets using_three_prime_utr=True whenever the mutation runs past the reference stop codon, even when there is no UTR sequence to translate into. predict_in_frame_coding_effect then took the StopLoss branch with an empty aa_alt, which the StopLoss constructor rejects. Only emit StopLoss when readthrough actually adds residues (n_aa_alt > 0); otherwise fall through to the existing n_aa_alt == 0 branch, which reports a C-terminal Deletion. This is the honest classification: with no UTR sequence we cannot predict an extended protein. The earlier #246 fix only covered transcripts with a non-empty 3' UTR. Tests: the reported MAPK3 variant exercises both sides from one input — ENST00000395199 (no UTR) -> Deletion, ENST00000403394 (804nt UTR) -> StopLoss readthrough — plus a splice-free unit test of the branch. Claude-Session: https://claude.ai/code/session_0149VWj5Rm1rYFf9azu4ry62 --- CHANGELOG.md | 14 ++ tests/test_stop_codon_classification_bugs.py | 125 +++++++++++++++++- .../effect_prediction_coding_in_frame.py | 18 ++- 3 files changed, 155 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 27b5d12..c85ae83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Change Log +## [Unreleased](https://github.com/openvax/varcode/tree/HEAD) + +**Fixed** +- In-frame deletions that remove the stop codon of a transcript with no + 3' UTR sequence (e.g. MAPK3-006 / `ENST00000395199`, whose + `three_prime_utr_sequence` is `""`) no longer raise + `ValueError: If no amino acids added by StopLoss then it should be Silent`. + With no readthrough sequence to translate into, the effect is now + classified as a C-terminal `Deletion` instead of an invalid `StopLoss` + with an empty `aa_alt`. The earlier + [#246](https://github.com/openvax/varcode/issues/246) fix only covered + transcripts with a non-empty 3' UTR + ([#394](https://github.com/openvax/varcode/issues/394)). + ## [v6.0.0](https://github.com/openvax/varcode/tree/v6.0.0) (2026-05-26) **Fixed** diff --git a/tests/test_stop_codon_classification_bugs.py b/tests/test_stop_codon_classification_bugs.py index 4f391b5..44fbe20 100644 --- a/tests/test_stop_codon_classification_bugs.py +++ b/tests/test_stop_codon_classification_bugs.py @@ -21,10 +21,17 @@ * https://github.com/openvax/varcode/issues/201 - Insertion before the stop codon that produces an identical protein sequence is reported as an Insertion rather than Silent. +* https://github.com/openvax/varcode/issues/394 - In-frame deletion that + removes the stop codon of a transcript with no 3' UTR sequence crashes + while constructing a StopLoss with an empty aa_alt; it should be a + Deletion. (The earlier #246 only fixed the non-empty-UTR variant.) """ from varcode import Variant -from varcode.effects import Silent, StopLoss +from varcode.effects import Silent, StopLoss, Deletion +from varcode.effects.effect_prediction_coding_in_frame import ( + predict_in_frame_coding_effect, +) # ----------------------------------------------------------------------- @@ -122,3 +129,119 @@ def test_201_synonymous_insertion_before_stop_is_silent(): "Expected Silent, got %s (%s)" % ( effect.__class__.__name__, getattr(effect, "short_description", effect)) + + +# ----------------------------------------------------------------------- +# Issue #394: in-frame deletion of the stop codon on a transcript with an +# empty 3' UTR. +# +# The reported variant is a large in-frame deletion in MAPK3 on GRCh37 +# (Ensembl 75) that removes the stop codon. It overlaps several MAPK3 +# transcripts, which gives us both sides of the fix from a single variant: +# +# * ENST00000395199 (MAPK3-006) has NO 3' UTR sequence, so there's +# nothing to translate into once the stop codon is gone — aa_alt ends +# up empty and StopLoss can't be constructed. Correct answer: a +# C-terminal Deletion. This is the case that used to raise +# "If no amino acids added by StopLoss then it should be Silent". +# * ENST00000403394 has an 804nt 3' UTR, so the same deletion reads +# through and adds residues — that one must stay a StopLoss, proving +# the fix didn't over-broaden the Deletion fallback. +# +# The reporter's start coordinate was off by one against the reference +# genome; the equivalent variant that matches GRCh37 ref starts at +# 30128151 (normalized to a pure 18nt deletion at 30128152). +# ----------------------------------------------------------------------- + + +def _mapk3_stop_deletion_variant(): + return Variant( + contig="16", + start=30128151, + ref="GGGATGCCTACGTGCCCCC", + alt="G", + genome="GRCh37", + ) + + +def _coding_effect(effect): + """Unwrap a SpliceOutcomeSet to the coding effect if splicing is + unchanged; otherwise return the effect itself.""" + return getattr(effect, "effect_if_splicing_unchanged", effect) + + +def test_394_stop_deletion_with_empty_3p_utr_is_deletion(): + variant = _mapk3_stop_deletion_variant() + transcript = variant.ensembl.transcript_by_id("ENST00000395199") + # precondition for the bug: this transcript has no 3' UTR sequence + assert transcript.three_prime_utr_sequence == "" + + # used to raise ValueError before the fix + effect = _coding_effect(variant.effect_on_transcript(transcript)) + assert effect.__class__ is Deletion, \ + "Expected Deletion, got %s (%s)" % ( + effect.__class__.__name__, + getattr(effect, "short_description", effect)) + assert effect.aa_ref == "GGT", \ + "Expected aa_ref='GGT', got %r" % effect.aa_ref + assert effect.aa_alt == "", \ + "Expected empty aa_alt, got %r" % effect.aa_alt + assert effect.short_description == "p.GGT354del", \ + "Expected p.GGT354del, got %r" % effect.short_description + + +def test_394_full_effects_call_does_not_raise(): + # The crux of the bug: predicting effects across *all* MAPK3 + # transcripts (including ENST00000395199) must not raise. + variant = _mapk3_stop_deletion_variant() + effects = variant.effects() + assert len(effects) > 0 + + +def test_394_same_deletion_with_nonempty_3p_utr_stays_stoploss(): + # Same variant, different transcript: a non-empty 3' UTR means the + # stop-loss reads through and adds residues, so this must remain a + # StopLoss (guards against over-broadening the Deletion fallback). + variant = _mapk3_stop_deletion_variant() + transcript = variant.ensembl.transcript_by_id("ENST00000403394") + assert len(transcript.three_prime_utr_sequence) > 0 + + effect = _coding_effect(variant.effect_on_transcript(transcript)) + assert effect.__class__ is StopLoss, \ + "Expected StopLoss, got %s (%s)" % ( + effect.__class__.__name__, + getattr(effect, "short_description", effect)) + assert len(effect.aa_alt) > 0, \ + "StopLoss should have added residues, got aa_alt=%r" % effect.aa_alt + + +def test_394_predict_in_frame_empty_3p_utr_is_deletion_unit(): + # Splice-free unit test of the exact branch in + # predict_in_frame_coding_effect: deleting the final residue codon plus + # the stop codon (in-frame) on a transcript with no 3' UTR. + variant = _mapk3_stop_deletion_variant() + transcript = variant.ensembl.transcript_by_id("ENST00000395199") + assert transcript.three_prime_utr_sequence == "" + + start = transcript.first_start_codon_spliced_offset + sequence_from_start_codon = str(transcript.sequence[start:]) + # length of the CDS including the stop codon + cds_plus_stop_len = (len(transcript.protein_sequence) + 1) * 3 + + # delete the last residue codon + stop codon (6nt, stays in-frame) + cds_offset = cds_plus_stop_len - 6 + trimmed_cdna_ref = sequence_from_start_codon[cds_offset:cds_offset + 6] + + effect = predict_in_frame_coding_effect( + variant=variant, + transcript=transcript, + trimmed_cdna_ref=trimmed_cdna_ref, + trimmed_cdna_alt="", + sequence_from_start_codon=sequence_from_start_codon, + cds_offset=cds_offset) + assert effect.__class__ is Deletion, \ + "Expected Deletion, got %s (%s)" % ( + effect.__class__.__name__, + getattr(effect, "short_description", effect)) + assert effect.aa_ref == "T" + assert effect.aa_alt == "" diff --git a/varcode/effects/effect_prediction_coding_in_frame.py b/varcode/effects/effect_prediction_coding_in_frame.py index ad263cd..81d3e43 100644 --- a/varcode/effects/effect_prediction_coding_in_frame.py +++ b/varcode/effects/effect_prediction_coding_in_frame.py @@ -283,7 +283,7 @@ def predict_in_frame_coding_effect( transcript=transcript, aa_pos=aa_mutation_start_offset - n_aa_shared, aa_ref=shared_prefix + shared_suffix) - elif using_three_prime_utr: + elif using_three_prime_utr and n_aa_alt > 0: # if non-silent mutation is at the end of the protein then # should be a stop-loss return StopLoss( @@ -292,6 +292,22 @@ def predict_in_frame_coding_effect( aa_ref=aa_ref, aa_alt=aa_alt) elif n_aa_alt == 0: + # A mutation can disrupt the original stop codon (so + # using_three_prime_utr is True) yet add no new amino acids when the + # transcript has no 3' UTR sequence to translate into — e.g. MAPK3-006 + # / ENST00000395199, whose three_prime_utr_sequence is "". With no + # readthrough sequence we can't predict an extended protein, so the + # honest classification is an in-frame deletion of the C-terminal + # residues rather than a StopLoss with an empty aa_alt (which the + # StopLoss constructor rejects). Closes #394; the earlier #246 fix + # only covered transcripts with a non-empty 3' UTR. + # + # This is a deliberate choice over a dedicated StopLossDeletion class: + # the readthrough peptide is unknowable here, so a Deletion carries + # all the predictable consequence. The trade-off is that it ranks at + # in-frame-deletion priority rather than stop-loss priority; in + # practice a sibling transcript with a real 3' UTR surfaces the + # StopLoss for top_priority_effect. See #394 discussion. return Deletion( variant, transcript, From e6451d80cc69fcc996e02e15eaafe070f6fbb2e8 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Thu, 18 Jun 2026 15:40:53 -0400 Subject: [PATCH 2/2] Also fix protein_diff classifier + bump to 6.0.1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default annotator (protein_diff -> classify_from_protein_diff) independently mislabeled the same empty-3'UTR stop deletion as a PrematureStop ("p.G355*") on a clean in-exon variant. That's wrong: the mutant CDS contains no stop codon at all — the protein simply runs off the end of the available sequence. A PrematureStop requires a stop codon to exist. Guard the PrematureStop branch so it only fires when the mutant transcript actually terminated at a stop codon (i.e. translation did not consume every available codon). When translation ran off the end, fall through to the C-terminal Deletion classification, matching the in-frame predictor. Genuine premature stops (a real stop codon in the mutant CDS) are unaffected. Now both annotators agree this scenario is a Deletion, honoring the #394 decision that an empty-UTR stop deletion is a C-terminal deletion rather than a stop-loss with no predictable readthrough. Tests: clean in-exon deletion via the default annotator -> Deletion; genuine premature-stop insertion (BRCA1) -> still PrematureStop. Bump version to 6.0.1 and date the CHANGELOG entry. Claude-Session: https://claude.ai/code/session_0149VWj5Rm1rYFf9azu4ry62 --- CHANGELOG.md | 9 +++- tests/test_stop_codon_classification_bugs.py | 52 ++++++++++++++++++++ varcode/effects/classify.py | 25 +++++++++- varcode/version.py | 2 +- 4 files changed, 84 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c85ae83..c525bb6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Change Log -## [Unreleased](https://github.com/openvax/varcode/tree/HEAD) +## [v6.0.1](https://github.com/openvax/varcode/tree/v6.0.1) (2026-06-18) **Fixed** - In-frame deletions that remove the stop codon of a transcript with no @@ -9,7 +9,12 @@ `ValueError: If no amino acids added by StopLoss then it should be Silent`. With no readthrough sequence to translate into, the effect is now classified as a C-terminal `Deletion` instead of an invalid `StopLoss` - with an empty `aa_alt`. The earlier + with an empty `aa_alt`. Both annotators agree on this: the in-frame + predictor (`FastEffectAnnotator`) no longer constructs the invalid + `StopLoss`, and the default protein-diff classifier + (`classify_from_protein_diff`) no longer mislabels the truncated protein + as a `PrematureStop` — there is no stop codon in the mutant CDS, so a + premature stop is incorrect. The earlier [#246](https://github.com/openvax/varcode/issues/246) fix only covered transcripts with a non-empty 3' UTR ([#394](https://github.com/openvax/varcode/issues/394)). diff --git a/tests/test_stop_codon_classification_bugs.py b/tests/test_stop_codon_classification_bugs.py index 44fbe20..2c7f34d 100644 --- a/tests/test_stop_codon_classification_bugs.py +++ b/tests/test_stop_codon_classification_bugs.py @@ -245,3 +245,55 @@ def test_394_predict_in_frame_empty_3p_utr_is_deletion_unit(): getattr(effect, "short_description", effect)) assert effect.aa_ref == "T" assert effect.aa_alt == "" + + +def test_394_default_annotator_agrees_on_deletion(): + # The reported variant spans the exon boundary, so the default + # (protein_diff) annotator falls back to the fast path. This variant + # is a clean in-frame deletion fully inside the last exon that still + # removes the stop codon of the empty-3'UTR transcript, so it exercises + # the protein_diff *slow path* (classify_from_protein_diff) directly. + # + # Without the classify.py guard this returned PrematureStop ("p.G355*"), + # which is wrong — there is no stop codon in the mutant CDS at all, the + # protein just runs out of sequence. Both annotators must agree it's a + # C-terminal Deletion, matching the #394 decision. + variant = Variant( + contig="16", + start=30128158, + ref="CTACGTGCCCCC", + alt="", + genome="GRCh37", + ) + transcript = variant.ensembl.transcript_by_id("ENST00000395199") + assert transcript.three_prime_utr_sequence == "" + + effect = _coding_effect(variant.effect_on_transcript(transcript)) + assert effect.__class__ is Deletion, \ + "Expected Deletion, got %s (%s)" % ( + effect.__class__.__name__, + getattr(effect, "short_description", effect)) + assert effect.short_description == "p.GGT354del", \ + "Expected p.GGT354del, got %r" % effect.short_description + + +def test_394_genuine_premature_stop_still_classified_as_premature_stop(): + # Regression guard for the classify.py change: an insertion that + # introduces a real stop codon (so the mutant CDS *does* contain a + # stop) must still be a PrematureStop, not rerouted to Deletion. + # BRCA1-001 / ENST00000357654, reverse strand; inserting "CTA" near + # the phase-0 start of exon 12 places a stop codon early in the CDS. + from varcode.effects import PrematureStop + variant = Variant( + contig="17", + start=43082575 - 6, + ref="", + alt="CTA", + genome="GRCh38", + ) + transcript = variant.ensembl.transcript_by_id("ENST00000357654") + effect = _coding_effect(variant.effect_on_transcript(transcript)) + assert effect.__class__ is PrematureStop, \ + "Expected PrematureStop, got %s (%s)" % ( + effect.__class__.__name__, + getattr(effect, "short_description", effect)) diff --git a/varcode/effects/classify.py b/varcode/effects/classify.py index 527093f..c99eb24 100644 --- a/varcode/effects/classify.py +++ b/varcode/effects/classify.py @@ -120,13 +120,36 @@ def classify_from_protein_diff( aa_mutation_start_offset=aa_offset, shifted_sequence=alt_delta) + # A mutant protein that is shorter than the reference and truncated + # at the tail is only a PrematureStop if the mutant CDS actually + # contains a stop codon. When an in-frame deletion removes the stop + # codon of a transcript with no 3' UTR to read into, translation + # simply runs off the end of the available sequence — there is no + # stop codon at all, so the shortened protein is a C-terminal + # Deletion (a stop-loss with no predictable readthrough), not a + # PrematureStop. Mirror the in-frame predictor, which reports a + # Deletion here. See #394. (When mutant_transcript is unavailable — + # e.g. the splice-outcome builder — we can't tell, so fall back to + # the historical PrematureStop classification.) + mutant_translation_ran_off_end = False + if (mutant_transcript is not None + and mutant_transcript.cdna_sequence is not None): + cds_start = min(transcript.start_codon_spliced_offsets) + n_mutant_codons = ( + len(mutant_transcript.cdna_sequence) - cds_start) // 3 + # translate() stops *before* the stop codon, so a protein that + # consumed every available codon never encountered one. + mutant_translation_ran_off_end = len(mut_protein) >= n_mutant_codons + # Premature stop: mutant protein shorter than reference and the # change is at the tail (the trimmed alt runs to the end of the # mutant protein). Use the single reference residue at the stop- # creation point as aa_ref (matching fast's convention, which # shows the codon that became a stop rather than the entire # truncated tail). - if len(mut_protein) < len(ref_protein) and aa_offset + n_alt == len(mut_protein): + if (len(mut_protein) < len(ref_protein) + and aa_offset + n_alt == len(mut_protein) + and not mutant_translation_ran_off_end): aa_ref = ref_protein[aa_offset] if aa_offset < len(ref_protein) else ref_delta # Whole-protein trimming can't distinguish a codon-aligned diff --git a/varcode/version.py b/varcode/version.py index 0f607a5..79a961b 100644 --- a/varcode/version.py +++ b/varcode/version.py @@ -1 +1 @@ -__version__ = "6.0.0" +__version__ = "6.0.1"