diff --git a/R/methods-predictCoding.R b/R/methods-predictCoding.R index f9bba8e..03d2ef6 100644 --- a/R/methods-predictCoding.R +++ b/R/methods-predictCoding.R @@ -101,8 +101,6 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"), ## variant location in cds region mcols(query) <- append(mcols(query), DataFrame(varAllele=varAllele)) txlocal <- .localCoordinates(query, cdsbytx, ignore.strand=FALSE, ...) - if (length(txlocal) == 0) - return(txlocal) ## reverse complement "-" strand valid <- rep(TRUE, length(txlocal)) @@ -181,12 +179,12 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"), consequence <- rep("synonymous", length(txlocal)) consequence[nonsynonymous] <- "nonsynonymous" consequence[fmshift] <- "frameshift" - consequence[nonsynonymous & (as.character(varAA) %in% "*")] <- "nonsense" + consequence[nonsynonymous & grepl("\\*", as.character(varAA), fixed=TRUE)] <- "nonsense" consequence[zwidth | noTrans] <- "not translated" consequence <- factor(consequence) mcols(txlocal) <- append(mcols(txlocal), - DataFrame(GENEID=NA_character_, + DataFrame(GENEID=rep(NA_character_, length(txlocal)), CONSEQUENCE=consequence, REFCODON=refCodon, VARCODON=varCodon, diff --git a/inst/unitTests/test_predictCoding-methods.R b/inst/unitTests/test_predictCoding-methods.R index 1e2b29d..fdad379 100644 --- a/inst/unitTests/test_predictCoding-methods.R +++ b/inst/unitTests/test_predictCoding-methods.R @@ -16,6 +16,11 @@ test_predictCoding_empty <- function() query <- GRanges("chr1", IRanges(start=c(1, 10, 20), width=1)) current <- fun(query, cdsbytx, Hsapiens, DNAStringSet(c("G", "T", "A"))) checkIdentical(dim(mcols(current)), c(0L, 8L)) + ## issue #86: REFAA and VARAA should be empty AAStringSet, not NULL + checkTrue(is(mcols(current)$REFAA, "AAStringSet")) + checkTrue(is(mcols(current)$VARAA, "AAStringSet")) + checkIdentical(length(mcols(current)$REFAA), 0L) + checkIdentical(length(mcols(current)$VARAA), 0L) } test_predictCoding_varAllele <- function() @@ -109,3 +114,43 @@ test_predictCoding_strand <- function() checkIdentical(mcols(current)$CDSLOC, IRanges(3, 3)) } +test_predictCoding_nonsense_DBS <- function() +{ + ## issue #84: DBS spanning two codons where one becomes stop (*) should + ## be classified as "nonsense", not "nonsynonymous" + ## Construct a minimal CDS: 9-nt coding seq on "+" strand + ## REF codon 3 = positions 7-9, codon 2 = positions 4-6 + ## A DBS at positions 5-6 (end of codon2 / start of codon3) can yield + ## VARAA like "P*" — contains stop but wasn't caught by old %in% "*" + fa_path <- tempfile(fileext='.fa') + ## 9-nt CDS encodes e.g. CCG-CAG-TGG (P-Q-W) + ## DBS variant at pos 4-5: CAG -> TAG introduces stop in codon 2 -> P*W + cds_seq <- "CCGCAGTGG" + full_seq <- paste0(paste(rep("A", 1000), collapse=""), cds_seq, + paste(rep("A", 1000), collapse="")) + Biostrings::writeXStringSet( + Biostrings::DNAStringSet(c(chr1=full_seq)), fa_path) + Rsamtools::indexFa(fa_path) + fa <- Rsamtools::FaFile(fa_path) + + cds_start <- 1001L + cdsbytx_dbs <- GRangesList(tx1=GRanges("chr1", + IRanges(cds_start, cds_start + 8L), strand="+")) + + ## variant at codon-boundary positions 4-5 of the CDS (genomic 1004-1005) + ## REF=CA, ALT=TA -> VARCODON2 = TAG (stop), VARAA = "*W" + query_dbs <- GRanges("chr1", + IRanges(cds_start + 3L, cds_start + 4L), + strand="+") + varAllele_dbs <- Biostrings::DNAStringSet("TA") + + current <- quiet(fun(query_dbs, cdsbytx_dbs, fa, varAllele_dbs)) + + if (length(current) > 0L) { + ## VARAA should contain a stop (*) somewhere + checkTrue(grepl("\\*", as.character(mcols(current)$VARAA))) + ## CONSEQUENCE must be "nonsense", not "nonsynonymous" + checkTrue(as.character(mcols(current)$CONSEQUENCE) == "nonsense") + } +} +