Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion R/parse_files.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,13 @@ ApeTime <- function(filepath, format = "double") {
ExtractTaxa <- function(matrixLines, character_num = NULL,
continuous = FALSE) {
taxonLine.pattern <- "('([^']+)'|\"([^\"+])\"|(\\S+))\\s+(.+)$"
# Also recognise taxon-name-only lines (name without data on the same line,
# used e.g. in TNT files where data runs across multiple lines per taxon)
nameOnly.pattern <- "^[A-Za-z][^\\s]*$"

taxonLines <- regexpr(taxonLine.pattern, matrixLines, perl = TRUE) > -1
nameOnlyLines <- grepl(nameOnly.pattern, matrixLines, perl = TRUE)
taxonLines <- taxonLines | nameOnlyLines
# If a line does not start with a taxon name, join it to the preceding line
taxonLineNumber <- which(taxonLines)
previousTaxon <- vapply(which(!taxonLines), function(x) {
Expand All @@ -72,10 +77,14 @@ ExtractTaxa <- function(matrixLines, character_num = NULL,

taxa <- sub(taxonLine.pattern, "\\2\\3\\4", matrixLines, perl = TRUE)
taxa <- gsub(" ", "_", taxa, fixed=TRUE)
# Strip TNT @taxonomy classification suffixes (e.g. Name_@Family_Genus)
taxa <- sub("@\\S*$", "", taxa, perl = TRUE)
taxa <- sub("_+$", "", taxa, perl = TRUE) # remove trailing underscores
taxa[!taxonLines] <- taxa[previousTaxon]
uniqueTaxa <- unique(taxa)

tokens <- sub(taxonLine.pattern, "\\5", matrixLines, perl = TRUE)
tokens[nameOnlyLines] <- "" # name-only lines carry no character data
if (continuous) {
tokens <- strsplit(tokens, "\\s+")
lengths <- lengths(tokens)
Expand Down Expand Up @@ -366,21 +375,32 @@ ReadTntCharacters <- function(filepath, character_num = NULL,
closeComment <- multilineComments[seq_len(nmlc) * 2L]
lines[openComment] <- gsub("'.*", "", lines[openComment])
lines[closeComment] <- gsub(".*'", "", lines[closeComment])
if (nmlc > 0) {
for (i in seq_len(nmlc)) {
innerStart <- openComment[i] + 1L
innerEnd <- closeComment[i] - 1L
if (innerStart <= innerEnd) lines[innerStart:innerEnd] <- ""
}
}

lines <- trimws(lines)
lines <- lines[lines != ""]

semicolons <- grep(";", lines, fixed = TRUE)
upperLines <- toupper(lines)

xread <- grep("^XREAD\\b", lines, ignore.case = TRUE, perl = TRUE)
xread <- grep("\\bXREAD\\b", lines, ignore.case = TRUE, perl = TRUE)
if (length(xread) < 1) return(NULL)
if (length(xread) > 1) {
message("Multiple character blocks not yet supported;",
"contact 'TreeTools' maintainer to request.",
"Returning first block only.")
xread <- xread[1]
}
# If xread appears mid-line (e.g. after other TNT directives separated by ;),
# strip everything before the xread keyword so dimension parsing works.
lines[xread] <- sub("^.*\\bxread\\b", "xread", lines[xread],
ignore.case = TRUE, perl = TRUE)

xreadEnd <- semicolons[semicolons > xread][1]
if (lines[xreadEnd] == ";") {
Expand All @@ -397,6 +417,8 @@ ReadTntCharacters <- function(filepath, character_num = NULL,
attr(dimHit, "match.length")[3] - 1L))
matrixLines <- xreadLines[-seq_len(xDimLine)]

bareAmpLines <- grep("^&\\s*$", matrixLines, perl = TRUE)
if (length(bareAmpLines)) matrixLines <- matrixLines[-bareAmpLines]
ctypeLines <- grep("^&\\[[\\w\\s]+\\]$", matrixLines, perl = TRUE)
if (is.null(type)) {
if (length(ctypeLines)) matrixLines <- matrixLines[-ctypeLines]
Expand Down
10 changes: 10 additions & 0 deletions inst/extdata/tests/tnt-amp-continuation.tnt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
xread
4 3
&
taxon_a 0
&
taxon_a 001
taxon_b 010
taxon_c 111
;
proc /;
6 changes: 6 additions & 0 deletions inst/extdata/tests/tnt-midline-xread.tnt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
piwe=; mxr 100 ; nstates 8 ; xread 4 3
taxon_a 0001
taxon_b 0110
taxon_c 1111
;
proc /;
13 changes: 13 additions & 0 deletions inst/extdata/tests/tnt-multiline-comment.tnt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
xread
'Multi-line comment with semicolons
piwe =10 ;
xpiwe = ;
option_x (*0.80 < 5 /10 ;
'
4 4
taxon_a 0001
taxon_b 0110
taxon_c 1010
taxon_d 1101
;
proc /;
13 changes: 13 additions & 0 deletions inst/extdata/tests/tnt-multiline-taxa.tnt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
xread
4 3
Hypochilus
0011
0100
Filistata
1100
1011
Thaida
0100
0001
;
proc /;
7 changes: 7 additions & 0 deletions inst/extdata/tests/tnt-taxon-taxonomy.tnt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
taxname +100 ; taxonomy=; xread
4 3
taxon_a_@Family_Genus 0001
taxon_b_@Family_Genus 0110
taxon_c_@OtherFamily 1010
;
proc /;
36 changes: 36 additions & 0 deletions tests/testthat/test-ReadTntTree.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,42 @@ test_that("ReadTntCharacter()", {
expect_equal(ReadTntAsPhyDat(testFile), expectedPhyDat)
})

test_that("ReadTntCharacters() multi-line comment", {
mlcFile <- TestFile("tnt-multiline-comment.tnt")
result <- ReadTntCharacters(mlcFile)
expect_equal(dim(result), c(4L, 4L))
expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c", "taxon_d"))
})

test_that("ReadTntCharacters() bare & continuation", {
ampFile <- TestFile("tnt-amp-continuation.tnt")
result <- ReadTntCharacters(ampFile)
expect_equal(dim(result), c(3L, 4L))
expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c"))
expect_equal(result["taxon_a", ], c("0", "0", "0", "1"))
})

test_that("ReadTntCharacters() taxon name on own line", {
mltFile <- TestFile("tnt-multiline-taxa.tnt")
result <- ReadTntCharacters(mltFile)
expect_equal(dim(result), c(3L, 8L))
expect_equal(rownames(result), c("Hypochilus", "Filistata", "Thaida"))
expect_equal(result["Hypochilus", ], c("0", "0", "1", "1", "0", "1", "0", "0"))
})

test_that("ReadTntCharacters() xread mid-line", {
mxFile <- TestFile("tnt-midline-xread.tnt")
result <- ReadTntCharacters(mxFile)
expect_equal(dim(result), c(3L, 4L))
expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c"))
})

test_that("ReadTntCharacters() strips @taxonomy from taxon names", {
ttFile <- TestFile("tnt-taxon-taxonomy.tnt")
result <- ReadTntCharacters(ttFile)
expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c"))
})

test_that("TntTextToTree()", {
expect_equal(TNTText2Tree("(A (B (C (D E ))));"),
ape::read.tree(text = "(A, (B, (C, (D, E))));"))
Expand Down
Loading