From 25160c700f0d56dd98681af2c1c983fa277e3835 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Fri, 15 May 2026 09:35:31 +0100 Subject: [PATCH 1/5] Handle multi-line TNT comments in ReadTntCharacters Blank inner lines between open/close multi-line comment markers so that semicolons and text within comments no longer corrupt xreadEnd detection or appear as spurious matrix content (fixes dinosaurs- and wasps-style failures). Guarded with innerStart <= innerEnd to avoid descending seq on adjacent-line comments. Co-Authored-By: Claude Sonnet 4.6 --- R/parse_files.R | 7 +++++++ inst/extdata/tests/tnt-multiline-comment.tnt | 13 +++++++++++++ tests/testthat/test-ReadTntTree.R | 7 +++++++ 3 files changed, 27 insertions(+) create mode 100644 inst/extdata/tests/tnt-multiline-comment.tnt diff --git a/R/parse_files.R b/R/parse_files.R index f701a1cf1..823726bda 100644 --- a/R/parse_files.R +++ b/R/parse_files.R @@ -366,6 +366,13 @@ ReadTntCharacters <- function(filepath, character_num = NULL, closeComment <- multilineComments[seq_len(nmlc) * 2L] lines[openComment] <- gsub("'.*", "", lines[openComment]) lines[closeComment] <- gsub(".*'", "", lines[closeComment]) + if (nmlc > 0) { + for (i in seq_len(nmlc)) { + innerStart <- openComment[i] + 1L + innerEnd <- closeComment[i] - 1L + if (innerStart <= innerEnd) lines[innerStart:innerEnd] <- "" + } + } lines <- trimws(lines) lines <- lines[lines != ""] diff --git a/inst/extdata/tests/tnt-multiline-comment.tnt b/inst/extdata/tests/tnt-multiline-comment.tnt new file mode 100644 index 000000000..22b818b46 --- /dev/null +++ b/inst/extdata/tests/tnt-multiline-comment.tnt @@ -0,0 +1,13 @@ +xread +'Multi-line comment with semicolons +piwe =10 ; +xpiwe = ; +option_x (*0.80 < 5 /10 ; +' +4 4 +taxon_a 0001 +taxon_b 0110 +taxon_c 1010 +taxon_d 1101 +; +proc /; diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R index 6fb4cba37..734b9b655 100644 --- a/tests/testthat/test-ReadTntTree.R +++ b/tests/testthat/test-ReadTntTree.R @@ -39,6 +39,13 @@ test_that("ReadTntCharacter()", { expect_equal(ReadTntAsPhyDat(testFile), expectedPhyDat) }) +test_that("ReadTntCharacters() multi-line comment", { + mlcFile <- TestFile("tnt-multiline-comment.tnt") + result <- ReadTntCharacters(mlcFile) + expect_equal(dim(result), c(4L, 4L)) + expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c", "taxon_d")) +}) + test_that("TntTextToTree()", { expect_equal(TNTText2Tree("(A (B (C (D E ))));"), ape::read.tree(text = "(A, (B, (C, (D, E))));")) From ccc572c47259bb3e3b33f2d140f6dcdc6a0169cb Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Fri, 15 May 2026 09:37:14 +0100 Subject: [PATCH 2/5] Strip bare & continuation markers from TNT matrix lines Bare & lines (used as block separators in multi-segment TNT files like beetles.tnt) were not removed before ExtractTaxa, causing max(integer(0)) = -Inf and a vapply type error. Remove them unconditionally before ctypeLines processing. Co-Authored-By: Claude Sonnet 4.6 --- R/parse_files.R | 2 ++ inst/extdata/tests/tnt-amp-continuation.tnt | 10 ++++++++++ tests/testthat/test-ReadTntTree.R | 8 ++++++++ 3 files changed, 20 insertions(+) create mode 100644 inst/extdata/tests/tnt-amp-continuation.tnt diff --git a/R/parse_files.R b/R/parse_files.R index 823726bda..74db24832 100644 --- a/R/parse_files.R +++ b/R/parse_files.R @@ -404,6 +404,8 @@ ReadTntCharacters <- function(filepath, character_num = NULL, attr(dimHit, "match.length")[3] - 1L)) matrixLines <- xreadLines[-seq_len(xDimLine)] + bareAmpLines <- grep("^&\\s*$", matrixLines, perl = TRUE) + if (length(bareAmpLines)) matrixLines <- matrixLines[-bareAmpLines] ctypeLines <- grep("^&\\[[\\w\\s]+\\]$", matrixLines, perl = TRUE) if (is.null(type)) { if (length(ctypeLines)) matrixLines <- matrixLines[-ctypeLines] diff --git a/inst/extdata/tests/tnt-amp-continuation.tnt b/inst/extdata/tests/tnt-amp-continuation.tnt new file mode 100644 index 000000000..862764158 --- /dev/null +++ b/inst/extdata/tests/tnt-amp-continuation.tnt @@ -0,0 +1,10 @@ +xread +4 3 +& +taxon_a 0 +& +taxon_a 001 +taxon_b 010 +taxon_c 111 +; +proc /; diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R index 734b9b655..5eaf26a24 100644 --- a/tests/testthat/test-ReadTntTree.R +++ b/tests/testthat/test-ReadTntTree.R @@ -46,6 +46,14 @@ test_that("ReadTntCharacters() multi-line comment", { expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c", "taxon_d")) }) +test_that("ReadTntCharacters() bare & continuation", { + ampFile <- TestFile("tnt-amp-continuation.tnt") + result <- ReadTntCharacters(ampFile) + expect_equal(dim(result), c(3L, 4L)) + expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c")) + expect_equal(result["taxon_a", ], c("0", "0", "0", "1")) +}) + test_that("TntTextToTree()", { expect_equal(TNTText2Tree("(A (B (C (D E ))));"), ape::read.tree(text = "(A, (B, (C, (D, E))));")) From af1ccb0a9f5023597a6901ac6d391b8cf15d9f55 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Fri, 15 May 2026 09:39:33 +0100 Subject: [PATCH 3/5] Handle taxon-name-only lines in ExtractTaxa TNT files from e.g. characidae/dionychans place the taxon name alone on its own line, with character data on subsequent lines. Introduce a secondary nameOnly.pattern that recognises single-token lines starting with a letter as taxon starts, and explicitly set their token contribution to empty so data lines are correctly concatenated. Co-Authored-By: Claude Sonnet 4.6 --- R/parse_files.R | 6 ++++++ inst/extdata/tests/tnt-multiline-taxa.tnt | 13 +++++++++++++ tests/testthat/test-ReadTntTree.R | 8 ++++++++ 3 files changed, 27 insertions(+) create mode 100644 inst/extdata/tests/tnt-multiline-taxa.tnt diff --git a/R/parse_files.R b/R/parse_files.R index 74db24832..2c0bfe662 100644 --- a/R/parse_files.R +++ b/R/parse_files.R @@ -61,8 +61,13 @@ ApeTime <- function(filepath, format = "double") { ExtractTaxa <- function(matrixLines, character_num = NULL, continuous = FALSE) { taxonLine.pattern <- "('([^']+)'|\"([^\"+])\"|(\\S+))\\s+(.+)$" + # Also recognise taxon-name-only lines (name without data on the same line, + # used e.g. in TNT files where data runs across multiple lines per taxon) + nameOnly.pattern <- "^[A-Za-z][^\\s]*$" taxonLines <- regexpr(taxonLine.pattern, matrixLines, perl = TRUE) > -1 + nameOnlyLines <- grepl(nameOnly.pattern, matrixLines, perl = TRUE) + taxonLines <- taxonLines | nameOnlyLines # If a line does not start with a taxon name, join it to the preceding line taxonLineNumber <- which(taxonLines) previousTaxon <- vapply(which(!taxonLines), function(x) { @@ -76,6 +81,7 @@ ExtractTaxa <- function(matrixLines, character_num = NULL, uniqueTaxa <- unique(taxa) tokens <- sub(taxonLine.pattern, "\\5", matrixLines, perl = TRUE) + tokens[nameOnlyLines] <- "" # name-only lines carry no character data if (continuous) { tokens <- strsplit(tokens, "\\s+") lengths <- lengths(tokens) diff --git a/inst/extdata/tests/tnt-multiline-taxa.tnt b/inst/extdata/tests/tnt-multiline-taxa.tnt new file mode 100644 index 000000000..8d46d8c35 --- /dev/null +++ b/inst/extdata/tests/tnt-multiline-taxa.tnt @@ -0,0 +1,13 @@ +xread +4 3 +Hypochilus +0011 +0100 +Filistata +1100 +1011 +Thaida +0100 +0001 +; +proc /; diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R index 5eaf26a24..a95733f5b 100644 --- a/tests/testthat/test-ReadTntTree.R +++ b/tests/testthat/test-ReadTntTree.R @@ -54,6 +54,14 @@ test_that("ReadTntCharacters() bare & continuation", { expect_equal(result["taxon_a", ], c("0", "0", "0", "1")) }) +test_that("ReadTntCharacters() taxon name on own line", { + mltFile <- TestFile("tnt-multiline-taxa.tnt") + result <- ReadTntCharacters(mltFile) + expect_equal(dim(result), c(3L, 8L)) + expect_equal(rownames(result), c("Hypochilus", "Filistata", "Thaida")) + expect_equal(result["Hypochilus", ], c("0", "0", "1", "1", "0", "1", "0", "0")) +}) + test_that("TntTextToTree()", { expect_equal(TNTText2Tree("(A (B (C (D E ))));"), ape::read.tree(text = "(A, (B, (C, (D, E))));")) From 6fa925288c3ca36c140e7686d71fa105f4dd6064 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Fri, 15 May 2026 09:40:58 +0100 Subject: [PATCH 4/5] Support xread keyword appearing mid-line in TNT files TNT files from dromaeodat-style datasets emit all directives on a single semicolon-separated line (e.g. 'piwe=; mxr 100 ; ... ; xread 853 164'). Relax the grep anchor from ^XREAD to \bXREAD\b and strip the pre-xread portion of the line so dimension parsing and matrix extraction work correctly. Co-Authored-By: Claude Sonnet 4.6 --- R/parse_files.R | 6 +++++- inst/extdata/tests/tnt-midline-xread.tnt | 6 ++++++ tests/testthat/test-ReadTntTree.R | 7 +++++++ 3 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 inst/extdata/tests/tnt-midline-xread.tnt diff --git a/R/parse_files.R b/R/parse_files.R index 2c0bfe662..b78e4d622 100644 --- a/R/parse_files.R +++ b/R/parse_files.R @@ -386,7 +386,7 @@ ReadTntCharacters <- function(filepath, character_num = NULL, semicolons <- grep(";", lines, fixed = TRUE) upperLines <- toupper(lines) - xread <- grep("^XREAD\\b", lines, ignore.case = TRUE, perl = TRUE) + xread <- grep("\\bXREAD\\b", lines, ignore.case = TRUE, perl = TRUE) if (length(xread) < 1) return(NULL) if (length(xread) > 1) { message("Multiple character blocks not yet supported;", @@ -394,6 +394,10 @@ ReadTntCharacters <- function(filepath, character_num = NULL, "Returning first block only.") xread <- xread[1] } + # If xread appears mid-line (e.g. after other TNT directives separated by ;), + # strip everything before the xread keyword so dimension parsing works. + lines[xread] <- sub("^.*\\bxread\\b", "xread", lines[xread], + ignore.case = TRUE, perl = TRUE) xreadEnd <- semicolons[semicolons > xread][1] if (lines[xreadEnd] == ";") { diff --git a/inst/extdata/tests/tnt-midline-xread.tnt b/inst/extdata/tests/tnt-midline-xread.tnt new file mode 100644 index 000000000..8324f8f02 --- /dev/null +++ b/inst/extdata/tests/tnt-midline-xread.tnt @@ -0,0 +1,6 @@ +piwe=; mxr 100 ; nstates 8 ; xread 4 3 +taxon_a 0001 +taxon_b 0110 +taxon_c 1111 +; +proc /; diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R index a95733f5b..c6ffdd26b 100644 --- a/tests/testthat/test-ReadTntTree.R +++ b/tests/testthat/test-ReadTntTree.R @@ -62,6 +62,13 @@ test_that("ReadTntCharacters() taxon name on own line", { expect_equal(result["Hypochilus", ], c("0", "0", "1", "1", "0", "1", "0", "0")) }) +test_that("ReadTntCharacters() xread mid-line", { + mxFile <- TestFile("tnt-midline-xread.tnt") + result <- ReadTntCharacters(mxFile) + expect_equal(dim(result), c(3L, 4L)) + expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c")) +}) + test_that("TntTextToTree()", { expect_equal(TNTText2Tree("(A (B (C (D E ))));"), ape::read.tree(text = "(A, (B, (C, (D, E))));")) From adc0bfd4d3984766c6121a25772e2ffa6181c3aa Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Fri, 15 May 2026 09:42:30 +0100 Subject: [PATCH 5/5] Strip @taxonomy classification suffixes from TNT taxon names TNT files with 'taxonomy=;' append '@Family_Genus_...' to each taxon name. Strip the '@...' suffix (and any trailing underscores) in ExtractTaxa so that rownames reflect the plain taxon name only. Co-Authored-By: Claude Sonnet 4.6 --- R/parse_files.R | 3 +++ inst/extdata/tests/tnt-taxon-taxonomy.tnt | 7 +++++++ tests/testthat/test-ReadTntTree.R | 6 ++++++ 3 files changed, 16 insertions(+) create mode 100644 inst/extdata/tests/tnt-taxon-taxonomy.tnt diff --git a/R/parse_files.R b/R/parse_files.R index b78e4d622..9184d6657 100644 --- a/R/parse_files.R +++ b/R/parse_files.R @@ -77,6 +77,9 @@ ExtractTaxa <- function(matrixLines, character_num = NULL, taxa <- sub(taxonLine.pattern, "\\2\\3\\4", matrixLines, perl = TRUE) taxa <- gsub(" ", "_", taxa, fixed=TRUE) + # Strip TNT @taxonomy classification suffixes (e.g. Name_@Family_Genus) + taxa <- sub("@\\S*$", "", taxa, perl = TRUE) + taxa <- sub("_+$", "", taxa, perl = TRUE) # remove trailing underscores taxa[!taxonLines] <- taxa[previousTaxon] uniqueTaxa <- unique(taxa) diff --git a/inst/extdata/tests/tnt-taxon-taxonomy.tnt b/inst/extdata/tests/tnt-taxon-taxonomy.tnt new file mode 100644 index 000000000..621d9ae2b --- /dev/null +++ b/inst/extdata/tests/tnt-taxon-taxonomy.tnt @@ -0,0 +1,7 @@ +taxname +100 ; taxonomy=; xread +4 3 +taxon_a_@Family_Genus 0001 +taxon_b_@Family_Genus 0110 +taxon_c_@OtherFamily 1010 +; +proc /; diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R index c6ffdd26b..cfd0e2140 100644 --- a/tests/testthat/test-ReadTntTree.R +++ b/tests/testthat/test-ReadTntTree.R @@ -69,6 +69,12 @@ test_that("ReadTntCharacters() xread mid-line", { expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c")) }) +test_that("ReadTntCharacters() strips @taxonomy from taxon names", { + ttFile <- TestFile("tnt-taxon-taxonomy.tnt") + result <- ReadTntCharacters(ttFile) + expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c")) +}) + test_that("TntTextToTree()", { expect_equal(TNTText2Tree("(A (B (C (D E ))));"), ape::read.tree(text = "(A, (B, (C, (D, E))));"))