From 25160c700f0d56dd98681af2c1c983fa277e3835 Mon Sep 17 00:00:00 2001
From: R script <1695515+ms609@users.noreply.github.com>
Date: Fri, 15 May 2026 09:35:31 +0100
Subject: [PATCH 1/5] Handle multi-line TNT comments in ReadTntCharacters

Blank inner lines between open/close multi-line comment markers so that
semicolons and text within comments no longer corrupt xreadEnd detection
or appear as spurious matrix content (fixes dinosaurs- and wasps-style
failures). The blanking is guarded with innerStart <= innerEnd because,
when the two markers sit on adjacent lines, innerStart:innerEnd would
count downwards and blank the marker lines themselves.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 R/parse_files.R                              |  7 +++++++
 inst/extdata/tests/tnt-multiline-comment.tnt | 13 +++++++++++++
 tests/testthat/test-ReadTntTree.R            |  7 +++++++
 3 files changed, 27 insertions(+)
 create mode 100644 inst/extdata/tests/tnt-multiline-comment.tnt

diff --git a/R/parse_files.R b/R/parse_files.R
index f701a1cf1..823726bda 100644
--- a/R/parse_files.R
+++ b/R/parse_files.R
@@ -366,6 +366,13 @@ ReadTntCharacters <- function(filepath, character_num = NULL,
   closeComment <- multilineComments[seq_len(nmlc) * 2L]
   lines[openComment] <- gsub("'.*", "", lines[openComment])
   lines[closeComment] <- gsub(".*'", "", lines[closeComment])
+  # Blank every line strictly between each pair of comment markers; the marker
+  # lines themselves were already trimmed by the two gsub() calls above.
+  for (pair in seq_len(nmlc)) {
+    innerStart <- openComment[pair] + 1L
+    innerEnd <- closeComment[pair] - 1L
+    if (innerStart <= innerEnd) lines[seq.int(innerStart, innerEnd)] <- ""
+  }
 
   lines <- trimws(lines)
   lines <- lines[lines != ""]
diff --git a/inst/extdata/tests/tnt-multiline-comment.tnt b/inst/extdata/tests/tnt-multiline-comment.tnt
new file mode 100644
index 000000000..22b818b46
--- /dev/null
+++ b/inst/extdata/tests/tnt-multiline-comment.tnt
@@ -0,0 +1,13 @@
+xread
+'Multi-line comment with semicolons
+piwe =10 ;
+xpiwe = ;
+option_x (*0.80 < 5 /10 ;
+'
+4 4
+taxon_a 0001
+taxon_b 0110
+taxon_c 1010
+taxon_d 1101
+;
+proc /;
diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R
index 6fb4cba37..734b9b655 100644
--- a/tests/testthat/test-ReadTntTree.R
+++ b/tests/testthat/test-ReadTntTree.R
@@ -39,6 +39,13 @@ test_that("ReadTntCharacter()", {
   expect_equal(ReadTntAsPhyDat(testFile), expectedPhyDat)
 })
 
+test_that("ReadTntCharacters() multi-line comment", {
+  # Fixture wraps semicolon-bearing directives in a multi-line comment
+  result <- ReadTntCharacters(TestFile("tnt-multiline-comment.tnt"))
+  expect_equal(dim(result), c(4L, 4L))
+  expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c", "taxon_d"))
+})
+
 test_that("TntTextToTree()", {
   expect_equal(TNTText2Tree("(A (B (C (D E ))));"),
                ape::read.tree(text = "(A, (B, (C, (D, E))));"))

From ccc572c47259bb3e3b33f2d140f6dcdc6a0169cb Mon Sep 17 00:00:00 2001
From: R script <1695515+ms609@users.noreply.github.com>
Date: Fri, 15 May 2026 09:37:14 +0100
Subject: [PATCH 2/5] Strip bare & continuation markers from TNT matrix lines

Bare & lines (used as block separators in multi-segment TNT files
like beetles.tnt) were not removed before ExtractTaxa, causing
max(integer(0)) = -Inf and a vapply type error. Remove them
unconditionally before ctypeLines processing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 R/parse_files.R                             |  2 ++
 inst/extdata/tests/tnt-amp-continuation.tnt | 10 ++++++++++
 tests/testthat/test-ReadTntTree.R           |  8 ++++++++
 3 files changed, 20 insertions(+)
 create mode 100644 inst/extdata/tests/tnt-amp-continuation.tnt

diff --git a/R/parse_files.R b/R/parse_files.R
index 823726bda..74db24832 100644
--- a/R/parse_files.R
+++ b/R/parse_files.R
@@ -404,6 +404,8 @@ ReadTntCharacters <- function(filepath, character_num = NULL,
                               attr(dimHit, "match.length")[3] - 1L))
   matrixLines <- xreadLines[-seq_len(xDimLine)]
 
+  # Drop bare "&" block-separator lines, which hold no taxon or character data
+  matrixLines <- matrixLines[!grepl("^&\\s*$", matrixLines, perl = TRUE)]
   ctypeLines <- grep("^&\\[[\\w\\s]+\\]$", matrixLines, perl = TRUE)
   if (is.null(type)) {
     if (length(ctypeLines)) matrixLines <- matrixLines[-ctypeLines]
diff --git a/inst/extdata/tests/tnt-amp-continuation.tnt b/inst/extdata/tests/tnt-amp-continuation.tnt
new file mode 100644
index 000000000..862764158
--- /dev/null
+++ b/inst/extdata/tests/tnt-amp-continuation.tnt
@@ -0,0 +1,10 @@
+xread
+4 3
+&
+taxon_a 0
+&
+taxon_a 001
+taxon_b 010
+taxon_c 111
+;
+proc /;
diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R
index 734b9b655..5eaf26a24 100644
--- a/tests/testthat/test-ReadTntTree.R
+++ b/tests/testthat/test-ReadTntTree.R
@@ -46,6 +46,14 @@ test_that("ReadTntCharacters() multi-line comment", {
   expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c", "taxon_d"))
 })
 
+test_that("ReadTntCharacters() bare & continuation", {
+  # Fixture separates matrix blocks with bare `&` lines
+  result <- ReadTntCharacters(TestFile("tnt-amp-continuation.tnt"))
+  expect_equal(dim(result), c(3L, 4L))
+  expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c"))
+  expect_equal(result["taxon_a", ], c("0", "0", "0", "1"))
+})
+
 test_that("TntTextToTree()", {
   expect_equal(TNTText2Tree("(A (B (C (D E ))));"),
                ape::read.tree(text = "(A, (B, (C, (D, E))));"))

From af1ccb0a9f5023597a6901ac6d391b8cf15d9f55 Mon Sep 17 00:00:00 2001
From: R script <1695515+ms609@users.noreply.github.com>
Date: Fri, 15 May 2026 09:39:33 +0100
Subject: [PATCH 3/5] Handle taxon-name-only lines in ExtractTaxa

TNT files from e.g. characidae/dionychans place the taxon name alone on
its own line, with character data on subsequent lines. Introduce a
secondary nameOnly.pattern that recognises single-token lines starting
with a letter as taxon starts, and explicitly set their token contribution
to empty so data lines are correctly concatenated.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 R/parse_files.R                           |  6 ++++++
 inst/extdata/tests/tnt-multiline-taxa.tnt | 13 +++++++++++++
 tests/testthat/test-ReadTntTree.R         |  8 ++++++++
 3 files changed, 27 insertions(+)
 create mode 100644 inst/extdata/tests/tnt-multiline-taxa.tnt

diff --git a/R/parse_files.R b/R/parse_files.R
index 74db24832..2c0bfe662 100644
--- a/R/parse_files.R
+++ b/R/parse_files.R
@@ -61,8 +61,13 @@ ApeTime <- function(filepath, format = "double") {
 ExtractTaxa <- function(matrixLines, character_num = NULL,
                          continuous = FALSE) {
   taxonLine.pattern <- "('([^']+)'|\"([^\"+])\"|(\\S+))\\s+(.+)$"
+  # A lone token starting with a letter is a taxon name whose data follows on
+  # later lines. NOTE(review): wrapped all-letter data rows would match too.
+  nameOnly.pattern <- "^[A-Za-z][^\\s]*$"
 
   taxonLines <- regexpr(taxonLine.pattern, matrixLines, perl = TRUE) > -1
+  nameOnlyLines <- grepl(nameOnly.pattern, matrixLines, perl = TRUE)
+  taxonLines <- taxonLines | nameOnlyLines
   # If a line does not start with a taxon name, join it to the preceding line
   taxonLineNumber <- which(taxonLines)
   previousTaxon <- vapply(which(!taxonLines), function(x) {
@@ -76,6 +81,7 @@ ExtractTaxa <- function(matrixLines, character_num = NULL,
   uniqueTaxa <- unique(taxa)
 
   tokens <- sub(taxonLine.pattern, "\\5", matrixLines, perl = TRUE)
+  tokens[nameOnlyLines] <- ""  # sub() left the bare name here; it is not data
   if (continuous) {
     tokens <- strsplit(tokens, "\\s+")
     lengths <- lengths(tokens)
diff --git a/inst/extdata/tests/tnt-multiline-taxa.tnt b/inst/extdata/tests/tnt-multiline-taxa.tnt
new file mode 100644
index 000000000..8d46d8c35
--- /dev/null
+++ b/inst/extdata/tests/tnt-multiline-taxa.tnt
@@ -0,0 +1,13 @@
+xread
+8 3
+Hypochilus
+0011
+0100
+Filistata
+1100
+1011
+Thaida
+0100
+0001
+;
+proc /;
diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R
index 5eaf26a24..a95733f5b 100644
--- a/tests/testthat/test-ReadTntTree.R
+++ b/tests/testthat/test-ReadTntTree.R
@@ -54,6 +54,14 @@ test_that("ReadTntCharacters() bare & continuation", {
   expect_equal(result["taxon_a", ], c("0", "0", "0", "1"))
 })
 
+test_that("ReadTntCharacters() taxon name on own line", {
+  # Fixture puts each taxon name on its own line, with data on later lines
+  result <- ReadTntCharacters(TestFile("tnt-multiline-taxa.tnt"))
+  expect_equal(dim(result), c(3L, 8L))
+  expect_equal(rownames(result), c("Hypochilus", "Filistata", "Thaida"))
+  expect_equal(result["Hypochilus", ], c("0", "0", "1", "1", "0", "1", "0", "0"))
+})
+
 test_that("TntTextToTree()", {
   expect_equal(TNTText2Tree("(A (B (C (D E ))));"),
                ape::read.tree(text = "(A, (B, (C, (D, E))));"))

From 6fa925288c3ca36c140e7686d71fa105f4dd6064 Mon Sep 17 00:00:00 2001
From: R script <1695515+ms609@users.noreply.github.com>
Date: Fri, 15 May 2026 09:40:58 +0100
Subject: [PATCH 4/5] Support xread keyword appearing mid-line in TNT files

TNT files from dromaeodat-style datasets emit all directives on a single
semicolon-separated line (e.g. 'piwe=; mxr 100 ; ... ; xread 853 164').
Relax the grep anchor from ^XREAD to \bXREAD\b and strip the pre-xread
portion of the line so dimension parsing and matrix extraction work
correctly.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 R/parse_files.R                          | 6 +++++-
 inst/extdata/tests/tnt-midline-xread.tnt | 6 ++++++
 tests/testthat/test-ReadTntTree.R        | 7 +++++++
 3 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 inst/extdata/tests/tnt-midline-xread.tnt

diff --git a/R/parse_files.R b/R/parse_files.R
index 2c0bfe662..b78e4d622 100644
--- a/R/parse_files.R
+++ b/R/parse_files.R
@@ -386,7 +386,7 @@ ReadTntCharacters <- function(filepath, character_num = NULL,
   semicolons <- grep(";", lines, fixed = TRUE)
   upperLines <- toupper(lines)
 
-  xread <- grep("^XREAD\\b", lines, ignore.case = TRUE, perl = TRUE)
+  xread <- grep("\\bXREAD\\b", lines, ignore.case = TRUE, perl = TRUE)
   if (length(xread) < 1) return(NULL)
   if (length(xread) > 1) {
     message("Multiple character blocks not yet supported;",
@@ -394,6 +394,10 @@ ReadTntCharacters <- function(filepath, character_num = NULL,
             "Returning first block only.")
     xread <- xread[1]
   }
+  # xread may follow other ";"-separated directives on the same line; drop the
+  # text before the keyword. NOTE(review): upperLines still holds the old line.
+  lines[xread] <- sub("^.*\\bxread\\b", "xread", lines[xread],
+                       ignore.case = TRUE, perl = TRUE)
 
   xreadEnd <- semicolons[semicolons > xread][1]
   if (lines[xreadEnd] == ";") {
diff --git a/inst/extdata/tests/tnt-midline-xread.tnt b/inst/extdata/tests/tnt-midline-xread.tnt
new file mode 100644
index 000000000..8324f8f02
--- /dev/null
+++ b/inst/extdata/tests/tnt-midline-xread.tnt
@@ -0,0 +1,6 @@
+piwe=; mxr 100 ; nstates 8 ; xread 4 3
+taxon_a 0001
+taxon_b 0110
+taxon_c 1111
+;
+proc /;
diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R
index a95733f5b..c6ffdd26b 100644
--- a/tests/testthat/test-ReadTntTree.R
+++ b/tests/testthat/test-ReadTntTree.R
@@ -62,6 +62,13 @@ test_that("ReadTntCharacters() taxon name on own line", {
   expect_equal(result["Hypochilus", ], c("0", "0", "1", "1", "0", "1", "0", "0"))
 })
 
+test_that("ReadTntCharacters() xread mid-line", {
+  # Fixture places `xread` after other directives on the same line
+  result <- ReadTntCharacters(TestFile("tnt-midline-xread.tnt"))
+  expect_equal(dim(result), c(3L, 4L))
+  expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c"))
+})
+
 test_that("TntTextToTree()", {
   expect_equal(TNTText2Tree("(A (B (C (D E ))));"),
                ape::read.tree(text = "(A, (B, (C, (D, E))));"))

From adc0bfd4d3984766c6121a25772e2ffa6181c3aa Mon Sep 17 00:00:00 2001
From: R script <1695515+ms609@users.noreply.github.com>
Date: Fri, 15 May 2026 09:42:30 +0100
Subject: [PATCH 5/5] Strip @taxonomy classification suffixes from TNT taxon
 names

TNT files with 'taxonomy=;' append '@Family_Genus_...' to each taxon
name. Strip the '@...' suffix (and any trailing underscores) in
ExtractTaxa so that rownames reflect the plain taxon name only.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 R/parse_files.R                           | 3 +++
 inst/extdata/tests/tnt-taxon-taxonomy.tnt | 7 +++++++
 tests/testthat/test-ReadTntTree.R         | 6 ++++++
 3 files changed, 16 insertions(+)
 create mode 100644 inst/extdata/tests/tnt-taxon-taxonomy.tnt

diff --git a/R/parse_files.R b/R/parse_files.R
index b78e4d622..9184d6657 100644
--- a/R/parse_files.R
+++ b/R/parse_files.R
@@ -77,6 +77,9 @@ ExtractTaxa <- function(matrixLines, character_num = NULL,
 
   taxa <- sub(taxonLine.pattern, "\\2\\3\\4", matrixLines, perl = TRUE)
   taxa <- gsub(" ", "_", taxa, fixed=TRUE)
+  # Strip TNT @taxonomy classification suffixes (e.g. Name_@Family_Genus) along
+  # with the underscores joining them to the name; other names are left intact.
+  taxa <- sub("_*@\\S*$", "", taxa, perl = TRUE)
   taxa[!taxonLines] <- taxa[previousTaxon]
   uniqueTaxa <- unique(taxa)
 
diff --git a/inst/extdata/tests/tnt-taxon-taxonomy.tnt b/inst/extdata/tests/tnt-taxon-taxonomy.tnt
new file mode 100644
index 000000000..621d9ae2b
--- /dev/null
+++ b/inst/extdata/tests/tnt-taxon-taxonomy.tnt
@@ -0,0 +1,7 @@
+taxname +100 ; taxonomy=; xread
+4 3
+taxon_a_@Family_Genus 0001
+taxon_b_@Family_Genus 0110
+taxon_c_@OtherFamily 1010
+;
+proc /;
diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R
index c6ffdd26b..cfd0e2140 100644
--- a/tests/testthat/test-ReadTntTree.R
+++ b/tests/testthat/test-ReadTntTree.R
@@ -69,6 +69,12 @@ test_that("ReadTntCharacters() xread mid-line", {
   expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c"))
 })
 
+test_that("ReadTntCharacters() strips @taxonomy from taxon names", {
+  # Fixture taxon names carry `_@Family_Genus`-style classification suffixes
+  result <- ReadTntCharacters(TestFile("tnt-taxon-taxonomy.tnt"))
+  expect_equal(rownames(result), c("taxon_a", "taxon_b", "taxon_c"))
+})
+
 test_that("TntTextToTree()", {
   expect_equal(TNTText2Tree("(A (B (C (D E ))));"),
                ape::read.tree(text = "(A, (B, (C, (D, E))));"))