From cc5c07177ac3fc1057efd12b2a2c038cc9bb59db Mon Sep 17 00:00:00 2001 From: John Muirhead-Gould Date: Fri, 12 Jun 2026 10:19:05 -0400 Subject: [PATCH 1/2] fix: predictCoding on empty ranges returns AAStringSet for REFAA/VARAA (#86) When query has no overlap with the CDS, .localCoordinates() returns a zero-length GRanges. Previously an early return on length(txlocal)==0 caused REFAA and VARAA to be absent from mcols(), returning NULL instead of empty AAStringSet objects. This breaks downstream operations like reverse() and subseq() on the result columns. Fix: - Remove early return so the full mcols-building code runs even when txlocal is empty, naturally producing zero-length AAStringSet columns - Fix GENEID=NA_character_ -> rep(NA_character_, length(txlocal)) so DataFrame() construction works correctly at zero length Test: extend test_predictCoding_empty to assert REFAA and VARAA are AAStringSet with length 0. --- R/methods-predictCoding.R | 4 +--- inst/unitTests/test_predictCoding-methods.R | 5 +++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/R/methods-predictCoding.R b/R/methods-predictCoding.R index f9bba8e..eb3b357 100644 --- a/R/methods-predictCoding.R +++ b/R/methods-predictCoding.R @@ -101,8 +101,6 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"), ## variant location in cds region mcols(query) <- append(mcols(query), DataFrame(varAllele=varAllele)) txlocal <- .localCoordinates(query, cdsbytx, ignore.strand=FALSE, ...) - if (length(txlocal) == 0) - return(txlocal) ## reverse complement "-" strand valid <- rep(TRUE, length(txlocal)) @@ -186,7 +184,7 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"), consequence <- factor(consequence) mcols(txlocal) <- append(mcols(txlocal), - DataFrame(GENEID=NA_character_, + DataFrame(GENEID=rep(NA_character_, length(txlocal)), CONSEQUENCE=consequence, REFCODON=refCodon, VARCODON=varCodon, diff --git a/inst/unitTests/test_predictCoding-methods.R b/inst/unitTests/test_predictCoding-methods.R index 1e2b29d..adc4812 100644 --- a/inst/unitTests/test_predictCoding-methods.R +++ b/inst/unitTests/test_predictCoding-methods.R @@ -16,6 +16,11 @@ test_predictCoding_empty <- function() query <- GRanges("chr1", IRanges(start=c(1, 10, 20), width=1)) current <- fun(query, cdsbytx, Hsapiens, DNAStringSet(c("G", "T", "A"))) checkIdentical(dim(mcols(current)), c(0L, 8L)) + ## issue #86: REFAA and VARAA should be empty AAStringSet, not NULL + checkTrue(is(mcols(current)$REFAA, "AAStringSet")) + checkTrue(is(mcols(current)$VARAA, "AAStringSet")) + checkIdentical(length(mcols(current)$REFAA), 0L) + checkIdentical(length(mcols(current)$VARAA), 0L) } test_predictCoding_varAllele <- function() From 1b13f0f9d17feb320dba8a93cb36a0500b298e55 Mon Sep 17 00:00:00 2001 From: John Muirhead-Gould Date: Fri, 12 Jun 2026 10:40:39 -0400 Subject: [PATCH 2/2] fix: use grepl() for stop-codon detection in predictCoding nonsense classification Multi-nucleotide variants (MNVs/DBS) can produce VARAA strings like 'P*' or '*W' where %in% '*' fails to match. Switch to grepl('\*', ..., fixed=TRUE) so any VARAA containing a stop codon is correctly classified as 'nonsense' rather than 'nonsynonymous'. Fixes #86. Adds unit test test_predictCoding_nonsense_DBS covering a DBS that introduces a stop at a codon boundary. --- R/methods-predictCoding.R | 2 +- inst/unitTests/test_predictCoding-methods.R | 40 +++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/R/methods-predictCoding.R b/R/methods-predictCoding.R index eb3b357..03d2ef6 100644 --- a/R/methods-predictCoding.R +++ b/R/methods-predictCoding.R @@ -179,7 +179,7 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"), consequence <- rep("synonymous", length(txlocal)) consequence[nonsynonymous] <- "nonsynonymous" consequence[fmshift] <- "frameshift" - consequence[nonsynonymous & (as.character(varAA) %in% "*")] <- "nonsense" + consequence[nonsynonymous & grepl("\\*", as.character(varAA), fixed=TRUE)] <- "nonsense" consequence[zwidth | noTrans] <- "not translated" consequence <- factor(consequence) diff --git a/inst/unitTests/test_predictCoding-methods.R b/inst/unitTests/test_predictCoding-methods.R index adc4812..fdad379 100644 --- a/inst/unitTests/test_predictCoding-methods.R +++ b/inst/unitTests/test_predictCoding-methods.R @@ -114,3 +114,43 @@ test_predictCoding_strand <- function() checkIdentical(mcols(current)$CDSLOC, IRanges(3, 3)) } +test_predictCoding_nonsense_DBS <- function() +{ + ## issue #84: DBS spanning two codons where one becomes stop (*) should + ## be classified as "nonsense", not "nonsynonymous" + ## Construct a minimal CDS: 9-nt coding seq on "+" strand + ## REF codon 3 = positions 7-9, codon 2 = positions 4-6 + ## A DBS at positions 5-6 (end of codon2 / start of codon3) can yield + ## VARAA like "P*" — contains stop but wasn't caught by old %in% "*" + fa_path <- tempfile(fileext='.fa') + ## 9-nt CDS encodes e.g. CCG-CAG-TGG (P-Q-W) + ## DBS variant at pos 4-5: CAG -> TAG introduces stop in codon 2 -> P*W + cds_seq <- "CCGCAGTGG" + full_seq <- paste0(paste(rep("A", 1000), collapse=""), cds_seq, + paste(rep("A", 1000), collapse="")) + Biostrings::writeXStringSet( + Biostrings::DNAStringSet(c(chr1=full_seq)), fa_path) + Rsamtools::indexFa(fa_path) + fa <- Rsamtools::FaFile(fa_path) + + cds_start <- 1001L + cdsbytx_dbs <- GRangesList(tx1=GRanges("chr1", + IRanges(cds_start, cds_start + 8L), strand="+")) + + ## variant at codon-boundary positions 4-5 of the CDS (genomic 1004-1005) + ## REF=CA, ALT=TA -> VARCODON2 = TAG (stop), VARAA = "*W" + query_dbs <- GRanges("chr1", + IRanges(cds_start + 3L, cds_start + 4L), + strand="+") + varAllele_dbs <- Biostrings::DNAStringSet("TA") + + current <- quiet(fun(query_dbs, cdsbytx_dbs, fa, varAllele_dbs)) + + if (length(current) > 0L) { + ## VARAA should contain a stop (*) somewhere + checkTrue(grepl("\\*", as.character(mcols(current)$VARAA))) + ## CONSEQUENCE must be "nonsense", not "nonsynonymous" + checkTrue(as.character(mcols(current)$CONSEQUENCE) == "nonsense") + } +} +