diff --git a/R/parse_files.R b/R/parse_files.R index f701a1cf1..9184d6657 100644 --- a/R/parse_files.R +++ b/R/parse_files.R @@ -61,8 +61,13 @@ ApeTime <- function(filepath, format = "double") { ExtractTaxa <- function(matrixLines, character_num = NULL, continuous = FALSE) { taxonLine.pattern <- "('([^']+)'|\"([^\"+])\"|(\\S+))\\s+(.+)$" + # Also recognise taxon-name-only lines (name without data on the same line, + # used e.g. in TNT files where data runs across multiple lines per taxon) + nameOnly.pattern <- "^[A-Za-z][^\\s]*$" taxonLines <- regexpr(taxonLine.pattern, matrixLines, perl = TRUE) > -1 + nameOnlyLines <- grepl(nameOnly.pattern, matrixLines, perl = TRUE) + taxonLines <- taxonLines | nameOnlyLines # If a line does not start with a taxon name, join it to the preceding line taxonLineNumber <- which(taxonLines) previousTaxon <- vapply(which(!taxonLines), function(x) { @@ -72,10 +77,14 @@ ExtractTaxa <- function(matrixLines, character_num = NULL, taxa <- sub(taxonLine.pattern, "\\2\\3\\4", matrixLines, perl = TRUE) taxa <- gsub(" ", "_", taxa, fixed=TRUE) + # Strip TNT @taxonomy classification suffixes (e.g. Name_@Family_Genus) + taxa <- sub("@\\S*$", "", taxa, perl = TRUE) + taxa <- sub("_+$", "", taxa, perl = TRUE) # remove trailing underscores taxa[!taxonLines] <- taxa[previousTaxon] uniqueTaxa <- unique(taxa) tokens <- sub(taxonLine.pattern, "\\5", matrixLines, perl = TRUE) + tokens[nameOnlyLines] <- "" # name-only lines carry no character data if (continuous) { tokens <- strsplit(tokens, "\\s+") lengths <- lengths(tokens) @@ -366,6 +375,13 @@ ReadTntCharacters <- function(filepath, character_num = NULL, closeComment <- multilineComments[seq_len(nmlc) * 2L] lines[openComment] <- gsub("'.*", "", lines[openComment]) lines[closeComment] <- gsub(".*'", "", lines[closeComment]) + if (nmlc > 0) { + for (i in seq_len(nmlc)) { + innerStart <- openComment[i] + 1L + innerEnd <- closeComment[i] - 1L + if (innerStart <= innerEnd) lines[innerStart:innerEnd] <- "" + } + } lines <- trimws(lines) lines <- lines[lines != ""] @@ -373,7 +389,7 @@ ReadTntCharacters <- function(filepath, character_num = NULL, semicolons <- grep(";", lines, fixed = TRUE) upperLines <- toupper(lines) - xread <- grep("^XREAD\\b", lines, ignore.case = TRUE, perl = TRUE) + xread <- grep("\\bXREAD\\b", lines, ignore.case = TRUE, perl = TRUE) if (length(xread) < 1) return(NULL) if (length(xread) > 1) { message("Multiple character blocks not yet supported;", @@ -381,6 +397,10 @@ ReadTntCharacters <- function(filepath, character_num = NULL, "Returning first block only.") xread <- xread[1] } + # If xread appears mid-line (e.g. after other TNT directives separated by ;), + # strip everything before the xread keyword so dimension parsing works. + lines[xread] <- sub("^.*\\bxread\\b", "xread", lines[xread], + ignore.case = TRUE, perl = TRUE) xreadEnd <- semicolons[semicolons > xread][1] if (lines[xreadEnd] == ";") { @@ -397,6 +417,8 @@ ReadTntCharacters <- function(filepath, character_num = NULL, attr(dimHit, "match.length")[3] - 1L)) matrixLines <- xreadLines[-seq_len(xDimLine)] + bareAmpLines <- grep("^&\\s*$", matrixLines, perl = TRUE) + if (length(bareAmpLines)) matrixLines <- matrixLines[-bareAmpLines] ctypeLines <- grep("^&\\[[\\w\\s]+\\]$", matrixLines, perl = TRUE) if (is.null(type)) { if (length(ctypeLines)) matrixLines <- matrixLines[-ctypeLines] diff --git a/inst/extdata/tests/tnt-amp-continuation.tnt b/inst/extdata/tests/tnt-amp-continuation.tnt new file mode 100644 index 000000000..862764158 --- /dev/null +++ b/inst/extdata/tests/tnt-amp-continuation.tnt @@ -0,0 +1,10 @@ +xread +4 3 +& +taxon_a 0 +& +taxon_a 001 +taxon_b 010 +taxon_c 111 +; +proc /; diff --git a/inst/extdata/tests/tnt-midline-xread.tnt b/inst/extdata/tests/tnt-midline-xread.tnt new file mode 100644 index 000000000..8324f8f02 --- /dev/null +++ b/inst/extdata/tests/tnt-midline-xread.tnt @@ -0,0 +1,6 @@ +piwe=; mxr 100 ; nstates 8 ; xread 4 3 +taxon_a 0001 +taxon_b 0110 +taxon_c 1111 +; +proc /; diff --git a/inst/extdata/tests/tnt-multiline-comment.tnt b/inst/extdata/tests/tnt-multiline-comment.tnt new file mode 100644 index 000000000..22b818b46 --- /dev/null +++ b/inst/extdata/tests/tnt-multiline-comment.tnt @@ -0,0 +1,13 @@ +xread +'Multi-line comment with semicolons +piwe =10 ; +xpiwe = ; +option_x (*0.80 < 5 /10 ; +' +4 4 +taxon_a 0001 +taxon_b 0110 +taxon_c 1010 +taxon_d 1101 +; +proc /; diff --git a/inst/extdata/tests/tnt-multiline-taxa.tnt b/inst/extdata/tests/tnt-multiline-taxa.tnt new file mode 100644 index 000000000..8d46d8c35 --- /dev/null +++ b/inst/extdata/tests/tnt-multiline-taxa.tnt @@ -0,0 +1,13 @@ +xread +4 3 +Hypochilus +0011 +0100 +Filistata +1100 +1011 +Thaida +0100 +0001 +; +proc /; diff --git a/inst/extdata/tests/tnt-taxon-taxonomy.tnt b/inst/extdata/tests/tnt-taxon-taxonomy.tnt new file mode 100644 index 000000000..621d9ae2b --- /dev/null +++ b/inst/extdata/tests/tnt-taxon-taxonomy.tnt @@ -0,0 +1,7 @@ +taxname +100 ; taxonomy=; xread +4 3 +taxon_a_@Family_Genus 0001 +taxon_b_@Family_Genus 0110 +taxon_c_@OtherFamily 1010 +; +proc /; diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R index 6fb4cba37..cfd0e2140 100644 --- a/tests/testthat/test-ReadTntTree.R +++ b/tests/testthat/test-ReadTntTree.R @@ -39,6 +39,42 @@ test_that("ReadTntCharacter()", { expect_equal(ReadTntAsPhyDat(testFile), expectedPhyDat) }) +test_that("ReadTntCharacters() multi-line comment", { + mlcFile <- TestFile("tnt-multiline-comment.tnt") + result <- ReadTntCharacters(mlcFile) + expect_equal(dim(result), c(4L, 4L)) + expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c", "taxon_d")) +}) + +test_that("ReadTntCharacters() bare & continuation", { + ampFile <- TestFile("tnt-amp-continuation.tnt") + result <- ReadTntCharacters(ampFile) + expect_equal(dim(result), c(3L, 4L)) + expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c")) + expect_equal(result["taxon_a", ], c("0", "0", "0", "1")) +}) + +test_that("ReadTntCharacters() taxon name on own line", { + mltFile <- TestFile("tnt-multiline-taxa.tnt") + result <- ReadTntCharacters(mltFile) + expect_equal(dim(result), c(3L, 8L)) + expect_equal(rownames(result), c("Hypochilus", "Filistata", "Thaida")) + expect_equal(result["Hypochilus", ], c("0", "0", "1", "1", "0", "1", "0", "0")) +}) + +test_that("ReadTntCharacters() xread mid-line", { + mxFile <- TestFile("tnt-midline-xread.tnt") + result <- ReadTntCharacters(mxFile) + expect_equal(dim(result), c(3L, 4L)) + expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c")) +}) + +test_that("ReadTntCharacters() strips @taxonomy from taxon names", { + ttFile <- TestFile("tnt-taxon-taxonomy.tnt") + result <- ReadTntCharacters(ttFile) + expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c")) +}) + test_that("TntTextToTree()", { expect_equal(TNTText2Tree("(A (B (C (D E ))));"), ape::read.tree(text = "(A, (B, (C, (D, E))));"))