diff --git a/CHANGELOG.md b/CHANGELOG.md index 27b5d12..c525bb6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,24 @@ # Change Log +## [v6.0.1](https://github.com/openvax/varcode/tree/v6.0.1) (2026-06-18) + +**Fixed** +- In-frame deletions that remove the stop codon of a transcript with no + 3' UTR sequence (e.g. MAPK3-006 / `ENST00000395199`, whose + `three_prime_utr_sequence` is `""`) no longer raise + `ValueError: If no amino acids added by StopLoss then it should be Silent`. + With no readthrough sequence to translate into, the effect is now + classified as a C-terminal `Deletion` instead of an invalid `StopLoss` + with an empty `aa_alt`. Both annotators agree on this: the in-frame + predictor (`FastEffectAnnotator`) no longer constructs the invalid + `StopLoss`, and the default protein-diff classifier + (`classify_from_protein_diff`) no longer mislabels the truncated protein + as a `PrematureStop` — there is no stop codon in the mutant CDS, so a + premature stop is incorrect. The earlier + [#246](https://github.com/openvax/varcode/issues/246) fix only covered + transcripts with a non-empty 3' UTR + ([#394](https://github.com/openvax/varcode/issues/394)). + ## [v6.0.0](https://github.com/openvax/varcode/tree/v6.0.0) (2026-05-26) **Fixed** diff --git a/tests/test_stop_codon_classification_bugs.py b/tests/test_stop_codon_classification_bugs.py index 4f391b5..2c7f34d 100644 --- a/tests/test_stop_codon_classification_bugs.py +++ b/tests/test_stop_codon_classification_bugs.py @@ -21,10 +21,17 @@ * https://github.com/openvax/varcode/issues/201 - Insertion before the stop codon that produces an identical protein sequence is reported as an Insertion rather than Silent. +* https://github.com/openvax/varcode/issues/394 - In-frame deletion that + removes the stop codon of a transcript with no 3' UTR sequence crashes + while constructing a StopLoss with an empty aa_alt; it should be a + Deletion. (The earlier #246 only fixed the non-empty-UTR variant.) 
""" from varcode import Variant -from varcode.effects import Silent, StopLoss +from varcode.effects import Silent, StopLoss, Deletion +from varcode.effects.effect_prediction_coding_in_frame import ( + predict_in_frame_coding_effect, +) # ----------------------------------------------------------------------- @@ -122,3 +129,171 @@ def test_201_synonymous_insertion_before_stop_is_silent(): "Expected Silent, got %s (%s)" % ( effect.__class__.__name__, getattr(effect, "short_description", effect)) + + +# ----------------------------------------------------------------------- +# Issue #394: in-frame deletion of the stop codon on a transcript with an +# empty 3' UTR. +# +# The reported variant is a large in-frame deletion in MAPK3 on GRCh37 +# (Ensembl 75) that removes the stop codon. It overlaps several MAPK3 +# transcripts, which gives us both sides of the fix from a single variant: +# +# * ENST00000395199 (MAPK3-006) has NO 3' UTR sequence, so there's +# nothing to translate into once the stop codon is gone — aa_alt ends +# up empty and StopLoss can't be constructed. Correct answer: a +# C-terminal Deletion. This is the case that used to raise +# "If no amino acids added by StopLoss then it should be Silent". +# * ENST00000403394 has an 804nt 3' UTR, so the same deletion reads +# through and adds residues — that one must stay a StopLoss, proving +# the fix didn't over-broaden the Deletion fallback. +# +# The reporter's start coordinate was off by one against the reference +# genome; the equivalent variant that matches GRCh37 ref starts at +# 30128151 (normalized to a pure 18nt deletion at 30128152). 
+# ----------------------------------------------------------------------- + + +def _mapk3_stop_deletion_variant(): + return Variant( + contig="16", + start=30128151, + ref="GGGATGCCTACGTGCCCCC", + alt="G", + genome="GRCh37", + ) + + +def _coding_effect(effect): + """Unwrap a SpliceOutcomeSet to the coding effect if splicing is + unchanged; otherwise return the effect itself.""" + return getattr(effect, "effect_if_splicing_unchanged", effect) + + +def test_394_stop_deletion_with_empty_3p_utr_is_deletion(): + variant = _mapk3_stop_deletion_variant() + transcript = variant.ensembl.transcript_by_id("ENST00000395199") + # precondition for the bug: this transcript has no 3' UTR sequence + assert transcript.three_prime_utr_sequence == "" + + # used to raise ValueError before the fix + effect = _coding_effect(variant.effect_on_transcript(transcript)) + assert effect.__class__ is Deletion, \ + "Expected Deletion, got %s (%s)" % ( + effect.__class__.__name__, + getattr(effect, "short_description", effect)) + assert effect.aa_ref == "GGT", \ + "Expected aa_ref='GGT', got %r" % effect.aa_ref + assert effect.aa_alt == "", \ + "Expected empty aa_alt, got %r" % effect.aa_alt + assert effect.short_description == "p.GGT354del", \ + "Expected p.GGT354del, got %r" % effect.short_description + + +def test_394_full_effects_call_does_not_raise(): + # The crux of the bug: predicting effects across *all* MAPK3 + # transcripts (including ENST00000395199) must not raise. + variant = _mapk3_stop_deletion_variant() + effects = variant.effects() + assert len(effects) > 0 + + +def test_394_same_deletion_with_nonempty_3p_utr_stays_stoploss(): + # Same variant, different transcript: a non-empty 3' UTR means the + # stop-loss reads through and adds residues, so this must remain a + # StopLoss (guards against over-broadening the Deletion fallback). 
+ variant = _mapk3_stop_deletion_variant() + transcript = variant.ensembl.transcript_by_id("ENST00000403394") + assert len(transcript.three_prime_utr_sequence) > 0 + + effect = _coding_effect(variant.effect_on_transcript(transcript)) + assert effect.__class__ is StopLoss, \ + "Expected StopLoss, got %s (%s)" % ( + effect.__class__.__name__, + getattr(effect, "short_description", effect)) + assert len(effect.aa_alt) > 0, \ + "StopLoss should have added residues, got aa_alt=%r" % effect.aa_alt + + +def test_394_predict_in_frame_empty_3p_utr_is_deletion_unit(): + # Splice-free unit test of the exact branch in + # predict_in_frame_coding_effect: deleting the final residue codon plus + # the stop codon (in-frame) on a transcript with no 3' UTR. + variant = _mapk3_stop_deletion_variant() + transcript = variant.ensembl.transcript_by_id("ENST00000395199") + assert transcript.three_prime_utr_sequence == "" + + start = transcript.first_start_codon_spliced_offset + sequence_from_start_codon = str(transcript.sequence[start:]) + # length of the CDS including the stop codon + cds_plus_stop_len = (len(transcript.protein_sequence) + 1) * 3 + + # delete the last residue codon + stop codon (6nt, stays in-frame) + cds_offset = cds_plus_stop_len - 6 + trimmed_cdna_ref = sequence_from_start_codon[cds_offset:cds_offset + 6] + + effect = predict_in_frame_coding_effect( + variant=variant, + transcript=transcript, + trimmed_cdna_ref=trimmed_cdna_ref, + trimmed_cdna_alt="", + sequence_from_start_codon=sequence_from_start_codon, + cds_offset=cds_offset) + assert effect.__class__ is Deletion, \ + "Expected Deletion, got %s (%s)" % ( + effect.__class__.__name__, + getattr(effect, "short_description", effect)) + assert effect.aa_ref == "T" + assert effect.aa_alt == "" + + +def test_394_default_annotator_agrees_on_deletion(): + # The reported variant spans the exon boundary, so the default + # (protein_diff) annotator falls back to the fast path. 
The variant below + # is a clean in-frame deletion fully inside the last exon that still + # removes the stop codon of the empty-3'UTR transcript, so it exercises + # the protein_diff *slow path* (classify_from_protein_diff) directly. + # + # Without the classify.py guard this returned PrematureStop ("p.G355*"), + # which is wrong — there is no stop codon in the mutant CDS at all; the + # protein just runs out of sequence. Both annotators must agree it's a + # C-terminal Deletion, matching the #394 decision. + variant = Variant( + contig="16", + start=30128158, + ref="CTACGTGCCCCC", + alt="", + genome="GRCh37", + ) + transcript = variant.ensembl.transcript_by_id("ENST00000395199") + assert transcript.three_prime_utr_sequence == "" + + effect = _coding_effect(variant.effect_on_transcript(transcript)) + assert effect.__class__ is Deletion, \ + "Expected Deletion, got %s (%s)" % ( + effect.__class__.__name__, + getattr(effect, "short_description", effect)) + assert effect.short_description == "p.GGT354del", \ + "Expected p.GGT354del, got %r" % effect.short_description + + +def test_394_genuine_premature_stop_still_classified_as_premature_stop(): + # Regression guard for the classify.py change: an insertion that + # introduces a real stop codon (so the mutant CDS *does* contain a + # stop) must still be a PrematureStop, not rerouted to Deletion. + # BRCA1-001 / ENST00000357654, reverse strand; inserting "CTA" near + # the phase-0 start of exon 12 places a stop codon early in the CDS. 
+ from varcode.effects import PrematureStop + variant = Variant( + contig="17", + start=43082575 - 6, + ref="", + alt="CTA", + genome="GRCh38", + ) + transcript = variant.ensembl.transcript_by_id("ENST00000357654") + effect = _coding_effect(variant.effect_on_transcript(transcript)) + assert effect.__class__ is PrematureStop, \ + "Expected PrematureStop, got %s (%s)" % ( + effect.__class__.__name__, + getattr(effect, "short_description", effect)) diff --git a/varcode/effects/classify.py b/varcode/effects/classify.py index 527093f..c99eb24 100644 --- a/varcode/effects/classify.py +++ b/varcode/effects/classify.py @@ -120,13 +120,36 @@ def classify_from_protein_diff( aa_mutation_start_offset=aa_offset, shifted_sequence=alt_delta) + # A mutant protein that is shorter than the reference and truncated + # at the tail is only a PrematureStop if the mutant CDS actually + # contains a stop codon. When an in-frame deletion removes the stop + # codon of a transcript with no 3' UTR to read into, translation + # simply runs off the end of the available sequence — there is no + # stop codon at all, so the shortened protein is a C-terminal + # Deletion (a stop-loss with no predictable readthrough), not a + # PrematureStop. Mirror the in-frame predictor, which reports a + # Deletion here. See #394. (When mutant_transcript is unavailable — + # e.g. the splice-outcome builder — we can't tell, so fall back to + # the historical PrematureStop classification.) + mutant_translation_ran_off_end = False + if (mutant_transcript is not None + and mutant_transcript.cdna_sequence is not None): + cds_start = min(transcript.start_codon_spliced_offsets) + n_mutant_codons = ( + len(mutant_transcript.cdna_sequence) - cds_start) // 3 + # translate() stops *before* the stop codon, so a protein that + # consumed every available codon never encountered one. 
+ mutant_translation_ran_off_end = len(mut_protein) >= n_mutant_codons + # Premature stop: mutant protein shorter than reference and the # change is at the tail (the trimmed alt runs to the end of the # mutant protein). Use the single reference residue at the stop- # creation point as aa_ref (matching fast's convention, which # shows the codon that became a stop rather than the entire # truncated tail). - if len(mut_protein) < len(ref_protein) and aa_offset + n_alt == len(mut_protein): + if (len(mut_protein) < len(ref_protein) + and aa_offset + n_alt == len(mut_protein) + and not mutant_translation_ran_off_end): aa_ref = ref_protein[aa_offset] if aa_offset < len(ref_protein) else ref_delta # Whole-protein trimming can't distinguish a codon-aligned diff --git a/varcode/effects/effect_prediction_coding_in_frame.py b/varcode/effects/effect_prediction_coding_in_frame.py index ad263cd..81d3e43 100644 --- a/varcode/effects/effect_prediction_coding_in_frame.py +++ b/varcode/effects/effect_prediction_coding_in_frame.py @@ -283,7 +283,7 @@ def predict_in_frame_coding_effect( transcript=transcript, aa_pos=aa_mutation_start_offset - n_aa_shared, aa_ref=shared_prefix + shared_suffix) - elif using_three_prime_utr: + elif using_three_prime_utr and n_aa_alt > 0: # if non-silent mutation is at the end of the protein then # should be a stop-loss return StopLoss( @@ -292,6 +292,22 @@ def predict_in_frame_coding_effect( aa_ref=aa_ref, aa_alt=aa_alt) elif n_aa_alt == 0: + # A mutation can disrupt the original stop codon (so + # using_three_prime_utr is True) yet add no new amino acids when the + # transcript has no 3' UTR sequence to translate into — e.g. MAPK3-006 + # / ENST00000395199, whose three_prime_utr_sequence is "". With no + # readthrough sequence we can't predict an extended protein, so the + # honest classification is an in-frame deletion of the C-terminal + # residues rather than a StopLoss with an empty aa_alt (which the + # StopLoss constructor rejects). 
This is #394; the earlier #246 fix + # only covered transcripts with a non-empty 3' UTR. + # + # This is a deliberate choice over a dedicated StopLossDeletion class: + # the readthrough peptide is unknowable here, so a Deletion captures + # every predictable consequence. The trade-off is that it ranks at + # in-frame-deletion priority rather than stop-loss priority; in + # practice a sibling transcript with a real 3' UTR surfaces the + # StopLoss for top_priority_effect. See #394 discussion. return Deletion( variant, transcript, diff --git a/varcode/version.py b/varcode/version.py index 0f607a5..79a961b 100644 --- a/varcode/version.py +++ b/varcode/version.py @@ -1 +1 @@ -__version__ = "6.0.0" +__version__ = "6.0.1"