Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,24 @@
# Change Log

## [v6.0.1](https://github.com/openvax/varcode/tree/v6.0.1) (2026-06-18)

**Fixed**
- In-frame deletions that remove the stop codon of a transcript with no
3' UTR sequence (e.g. MAPK3-006 / `ENST00000395199`, whose
`three_prime_utr_sequence` is `""`) no longer raise
`ValueError: If no amino acids added by StopLoss then it should be Silent`.
With no readthrough sequence to translate into, the effect is now
classified as a C-terminal `Deletion` instead of an invalid `StopLoss`
with an empty `aa_alt`. Both annotators agree on this: the in-frame
predictor (`FastEffectAnnotator`) no longer constructs the invalid
`StopLoss`, and the default protein-diff classifier
(`classify_from_protein_diff`) no longer mislabels the truncated protein
as a `PrematureStop` — there is no stop codon in the mutant CDS, so a
premature stop is incorrect. The earlier
[#246](https://github.com/openvax/varcode/issues/246) fix only covered
transcripts with a non-empty 3' UTR
([#394](https://github.com/openvax/varcode/issues/394)).

## [v6.0.0](https://github.com/openvax/varcode/tree/v6.0.0) (2026-05-26)

**Fixed**
Expand Down
177 changes: 176 additions & 1 deletion tests/test_stop_codon_classification_bugs.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,17 @@
* https://github.com/openvax/varcode/issues/201 - Insertion before the stop
codon that produces an identical protein sequence is reported as an
Insertion rather than Silent.
* https://github.com/openvax/varcode/issues/394 - In-frame deletion that
removes the stop codon of a transcript with no 3' UTR sequence crashes
while constructing a StopLoss with an empty aa_alt; it should be a
Deletion. (The earlier #246 only fixed the non-empty-UTR variant.)
"""

from varcode import Variant
from varcode.effects import Silent, StopLoss
from varcode.effects import Silent, StopLoss, Deletion
from varcode.effects.effect_prediction_coding_in_frame import (
predict_in_frame_coding_effect,
)


# -----------------------------------------------------------------------
Expand Down Expand Up @@ -122,3 +129,171 @@ def test_201_synonymous_insertion_before_stop_is_silent():
"Expected Silent, got %s (%s)" % (
effect.__class__.__name__,
getattr(effect, "short_description", effect))


# -----------------------------------------------------------------------
# Issue #394: in-frame deletion of the stop codon on a transcript with an
# empty 3' UTR.
#
# The reported variant is a large in-frame deletion in MAPK3 on GRCh37
# (Ensembl 75) that removes the stop codon. It overlaps several MAPK3
# transcripts, which gives us both sides of the fix from a single variant:
#
# * ENST00000395199 (MAPK3-006) has NO 3' UTR sequence, so there's
# nothing to translate into once the stop codon is gone — aa_alt ends
# up empty and StopLoss can't be constructed. Correct answer: a
# C-terminal Deletion. This is the case that used to raise
# "If no amino acids added by StopLoss then it should be Silent".
# * ENST00000403394 has an 804nt 3' UTR, so the same deletion reads
# through and adds residues — that one must stay a StopLoss, proving
# the fix didn't over-broaden the Deletion fallback.
#
# The reporter's start coordinate was off by one against the reference
# genome; the equivalent variant that matches GRCh37 ref starts at
# 30128151 (normalized to a pure 18nt deletion at 30128152).
# -----------------------------------------------------------------------


def _mapk3_stop_deletion_variant():
    """Return the issue #394 MAPK3 deletion as a GRCh37 ``Variant``.

    The ref/alt pair shares a leading ``G`` anchor base, so the variant
    is an 18nt deletion once that shared base is trimmed.
    """
    variant_fields = {
        "contig": "16",
        "start": 30128151,
        "ref": "GGGATGCCTACGTGCCCCC",
        "alt": "G",
        "genome": "GRCh37",
    }
    return Variant(**variant_fields)


def _coding_effect(effect):
    """Return the underlying coding effect for *effect*.

    If *effect* exposes ``effect_if_splicing_unchanged`` (as a
    SpliceOutcomeSet wrapper does), that attribute's value is returned;
    any other object is handed back untouched.
    """
    try:
        return effect.effect_if_splicing_unchanged
    except AttributeError:
        # Not a wrapper: the object already is the coding effect.
        return effect


def test_394_stop_deletion_with_empty_3p_utr_is_deletion():
    """The #394 variant on the empty-3'UTR transcript must be a Deletion."""
    variant = _mapk3_stop_deletion_variant()
    empty_utr_transcript = variant.ensembl.transcript_by_id("ENST00000395199")
    # Precondition for the bug: no 3' UTR sequence on this transcript.
    assert empty_utr_transcript.three_prime_utr_sequence == ""

    # This call is the one that used to raise ValueError before the fix.
    raw_effect = variant.effect_on_transcript(empty_utr_transcript)
    effect = _coding_effect(raw_effect)
    assert type(effect) is Deletion, (
        "Expected Deletion, got %s (%s)" % (
            effect.__class__.__name__,
            getattr(effect, "short_description", effect)))

    # (attribute, expected value, failure-message template), checked in
    # the same order as the individual assertions they replace.
    expectations = (
        ("aa_ref", "GGT", "Expected aa_ref='GGT', got %r"),
        ("aa_alt", "", "Expected empty aa_alt, got %r"),
        ("short_description", "p.GGT354del", "Expected p.GGT354del, got %r"),
    )
    for attribute, expected, template in expectations:
        observed = getattr(effect, attribute)
        assert observed == expected, template % observed


def test_394_full_effects_call_does_not_raise():
    """Predicting effects over every overlapping transcript must succeed.

    This is the crux of #394: the call covers all MAPK3 transcripts,
    ENST00000395199 included, and must return a non-empty collection
    rather than raising.
    """
    all_effects = _mapk3_stop_deletion_variant().effects()
    assert len(all_effects) > 0


def test_394_same_deletion_with_nonempty_3p_utr_stays_stoploss():
    """With a non-empty 3' UTR the same deletion must remain a StopLoss.

    Guards against over-broadening the Deletion fallback: readthrough
    into the UTR adds residues, so aa_alt must be non-empty.
    """
    variant = _mapk3_stop_deletion_variant()
    utr_transcript = variant.ensembl.transcript_by_id("ENST00000403394")
    # Precondition: this transcript does have 3' UTR sequence.
    assert len(utr_transcript.three_prime_utr_sequence) > 0

    raw_effect = variant.effect_on_transcript(utr_transcript)
    effect = _coding_effect(raw_effect)
    assert type(effect) is StopLoss, (
        "Expected StopLoss, got %s (%s)" % (
            effect.__class__.__name__,
            getattr(effect, "short_description", effect)))
    assert len(effect.aa_alt) > 0, (
        "StopLoss should have added residues, got aa_alt=%r" % effect.aa_alt)


def test_394_predict_in_frame_empty_3p_utr_is_deletion_unit():
    """Splice-free unit test of predict_in_frame_coding_effect for #394.

    Deletes the final residue codon plus the stop codon (6nt, in-frame)
    on a transcript with no 3' UTR and expects a single-residue Deletion.
    """
    variant = _mapk3_stop_deletion_variant()
    transcript = variant.ensembl.transcript_by_id("ENST00000395199")
    assert transcript.three_prime_utr_sequence == ""

    # cDNA from the first base of the start codon onwards.
    start_codon_offset = transcript.first_start_codon_spliced_offset
    sequence_from_start_codon = str(transcript.sequence[start_codon_offset:])

    # One codon per residue plus one for the stop codon.
    n_codons_with_stop = len(transcript.protein_sequence) + 1
    cds_plus_stop_len = n_codons_with_stop * 3

    # Last residue codon + stop codon: 6nt, so the deletion stays in-frame.
    n_deleted = 6
    cds_offset = cds_plus_stop_len - n_deleted
    deleted_nucleotides = sequence_from_start_codon[
        cds_offset:cds_offset + n_deleted]

    effect = predict_in_frame_coding_effect(
        variant=variant,
        transcript=transcript,
        trimmed_cdna_ref=deleted_nucleotides,
        trimmed_cdna_alt="",
        sequence_from_start_codon=sequence_from_start_codon,
        cds_offset=cds_offset)
    assert type(effect) is Deletion, (
        "Expected Deletion, got %s (%s)" % (
            effect.__class__.__name__,
            getattr(effect, "short_description", effect)))
    assert effect.aa_ref == "T"
    assert effect.aa_alt == ""


def test_394_default_annotator_agrees_on_deletion():
    """The default (protein_diff) annotator must also report a Deletion.

    The reported #394 variant spans the exon boundary, so the default
    annotator falls back to the fast path for it. The variant used here
    is a clean in-frame deletion fully inside the last exon that still
    removes the stop codon of the empty-3'UTR transcript, so it drives
    the protein_diff slow path (classify_from_protein_diff) directly.

    Without the classify.py guard this came back as PrematureStop
    ("p.G355*"), which is wrong: the mutant CDS contains no stop codon at
    all, the protein simply runs out of sequence. Both annotators must
    agree on a C-terminal Deletion.
    """
    variant_fields = {
        "contig": "16",
        "start": 30128158,
        "ref": "CTACGTGCCCCC",
        "alt": "",
        "genome": "GRCh37",
    }
    variant = Variant(**variant_fields)
    empty_utr_transcript = variant.ensembl.transcript_by_id("ENST00000395199")
    assert empty_utr_transcript.three_prime_utr_sequence == ""

    raw_effect = variant.effect_on_transcript(empty_utr_transcript)
    effect = _coding_effect(raw_effect)
    assert type(effect) is Deletion, (
        "Expected Deletion, got %s (%s)" % (
            effect.__class__.__name__,
            getattr(effect, "short_description", effect)))
    assert effect.short_description == "p.GGT354del", (
        "Expected p.GGT354del, got %r" % effect.short_description)


def test_394_genuine_premature_stop_still_classified_as_premature_stop():
    """A real stop-creating insertion must still be a PrematureStop.

    Regression guard for the classify.py change: when the mutant CDS does
    contain a stop codon, the effect must not be rerouted to Deletion.
    Uses BRCA1-001 / ENST00000357654 (reverse strand); inserting "CTA"
    near the phase-0 start of exon 12 places a stop codon early in the
    CDS.
    """
    from varcode.effects import PrematureStop

    insertion_start = 43082575 - 6
    variant = Variant(
        contig="17",
        start=insertion_start,
        ref="",
        alt="CTA",
        genome="GRCh38",
    )
    brca1_transcript = variant.ensembl.transcript_by_id("ENST00000357654")
    raw_effect = variant.effect_on_transcript(brca1_transcript)
    effect = _coding_effect(raw_effect)
    assert type(effect) is PrematureStop, (
        "Expected PrematureStop, got %s (%s)" % (
            effect.__class__.__name__,
            getattr(effect, "short_description", effect)))
25 changes: 24 additions & 1 deletion varcode/effects/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,13 +120,36 @@ def classify_from_protein_diff(
aa_mutation_start_offset=aa_offset,
shifted_sequence=alt_delta)

# A mutant protein that is shorter than the reference and truncated
# at the tail is only a PrematureStop if the mutant CDS actually
# contains a stop codon. When an in-frame deletion removes the stop
# codon of a transcript with no 3' UTR to read into, translation
# simply runs off the end of the available sequence — there is no
# stop codon at all, so the shortened protein is a C-terminal
# Deletion (a stop-loss with no predictable readthrough), not a
# PrematureStop. Mirror the in-frame predictor, which reports a
# Deletion here. See #394. (When mutant_transcript is unavailable —
# e.g. the splice-outcome builder — we can't tell, so fall back to
# the historical PrematureStop classification.)
mutant_translation_ran_off_end = False
if (mutant_transcript is not None
and mutant_transcript.cdna_sequence is not None):
cds_start = min(transcript.start_codon_spliced_offsets)
n_mutant_codons = (
len(mutant_transcript.cdna_sequence) - cds_start) // 3
# translate() stops *before* the stop codon, so a protein that
# consumed every available codon never encountered one.
mutant_translation_ran_off_end = len(mut_protein) >= n_mutant_codons

# Premature stop: mutant protein shorter than reference and the
# change is at the tail (the trimmed alt runs to the end of the
# mutant protein). Use the single reference residue at the stop-
# creation point as aa_ref (matching fast's convention, which
# shows the codon that became a stop rather than the entire
# truncated tail).
if len(mut_protein) < len(ref_protein) and aa_offset + n_alt == len(mut_protein):
if (len(mut_protein) < len(ref_protein)
and aa_offset + n_alt == len(mut_protein)
and not mutant_translation_ran_off_end):
aa_ref = ref_protein[aa_offset] if aa_offset < len(ref_protein) else ref_delta

# Whole-protein trimming can't distinguish a codon-aligned
Expand Down
18 changes: 17 additions & 1 deletion varcode/effects/effect_prediction_coding_in_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ def predict_in_frame_coding_effect(
transcript=transcript,
aa_pos=aa_mutation_start_offset - n_aa_shared,
aa_ref=shared_prefix + shared_suffix)
elif using_three_prime_utr:
elif using_three_prime_utr and n_aa_alt > 0:
# if non-silent mutation is at the end of the protein then
# should be a stop-loss
return StopLoss(
Expand All @@ -292,6 +292,22 @@ def predict_in_frame_coding_effect(
aa_ref=aa_ref,
aa_alt=aa_alt)
elif n_aa_alt == 0:
# A mutation can disrupt the original stop codon (so
# using_three_prime_utr is True) yet add no new amino acids when the
# transcript has no 3' UTR sequence to translate into — e.g. MAPK3-006
# / ENST00000395199, whose three_prime_utr_sequence is "". With no
# readthrough sequence we can't predict an extended protein, so the
# honest classification is an in-frame deletion of the C-terminal
# residues rather than a StopLoss with an empty aa_alt (which the
# StopLoss constructor rejects). Closes #394; the earlier #246 fix
# only covered transcripts with a non-empty 3' UTR.
#
# This is a deliberate choice over a dedicated StopLossDeletion class:
# the readthrough peptide is unknowable here, so a Deletion carries
# all the predictable consequence. The trade-off is that it ranks at
# in-frame-deletion priority rather than stop-loss priority; in
# practice a sibling transcript with a real 3' UTR surfaces the
# StopLoss for top_priority_effect. See #394 discussion.
return Deletion(
variant,
transcript,
Expand Down
2 changes: 1 addition & 1 deletion varcode/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "6.0.0"
__version__ = "6.0.1"
Loading