From ee8bec1c80cb0ba5af70c89eebf3ac3da365a492 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Fri, 15 May 2026 09:08:59 +0100 Subject: [PATCH 1/2] Add ReadXgroup() for TNT character-partition blocks Parses the xgroup block of a TNT file into a named integer vector mapping each character (1-indexed) to its partition; returns NULL when no xread block is present, all-NA when xread is present but xgroup is not. Joins the ReadCharacters family via @describeIn. Sourced from a prototype in AutoPart, where the function originally lived; moved upstream so AutoPart and other downstream packages can share a single implementation. Closes that duplication. Co-Authored-By: Claude Sonnet 4.6 --- DESCRIPTION | 4 +- NAMESPACE | 1 + NEWS.md | 7 ++ R/parse_files.R | 107 ++++++++++++++++++++++++++++++ inst/extdata/tests/tnt-xgroup.tnt | 16 +++++ man/ReadCharacters.Rd | 17 +++++ tests/testthat/test-ReadTntTree.R | 31 +++++++++ 7 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 inst/extdata/tests/tnt-xgroup.tnt diff --git a/DESCRIPTION b/DESCRIPTION index 8fdf42823..8a070ff28 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TreeTools Title: Create, Modify and Analyse Phylogenetic Trees -Version: 2.3.0 +Version: 2.3.0.9001 Authors@R: c( person("Martin R.", 'Smith', role = c("aut", "cre", "cph"), email = "martin.smith@durham.ac.uk", @@ -72,5 +72,5 @@ ByteCompile: true Encoding: UTF-8 Language: en-GB VignetteBuilder: knitr -RoxygenNote: 7.3.3 Roxygen: list(markdown = TRUE) +Config/roxygen2/version: 8.0.0 diff --git a/NAMESPACE b/NAMESPACE index 7f34d68f7..28708651e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -399,6 +399,7 @@ export(ReadTNTCharacters) export(ReadTntAsPhyDat) export(ReadTntCharacters) export(ReadTntTree) +export(ReadXgroup) export(Renumber) export(RenumberEdges) export(RenumberTips) diff --git a/NEWS.md b/NEWS.md index 778afd2e5..b23c47e58 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# TreeTools 2.3.0.9001 (development) # + +## New functions + +- `ReadXgroup()` parses the `xgroup` character-partition block of a TNT file, + returning a named integer vector mapping each character to its partition. + # TreeTools 2.3.0 (2026-04-22) # ## Performance diff --git a/R/parse_files.R b/R/parse_files.R index f701a1cf1..81b003ff3 100644 --- a/R/parse_files.R +++ b/R/parse_files.R @@ -462,6 +462,113 @@ ReadTntCharacters <- function(filepath, character_num = NULL, #' @export ReadTNTCharacters <- ReadTntCharacters + +#' @describeIn ReadCharacters Read the `xgroup` character-partition block +#' \insertCite{Goloboff2008}{TreeTools} of a TNT file, returning a named integer +#' vector that maps each character (1-indexed in R; TNT uses 0-indexing) to a +#' partition. +#' +#' Range notation `A.B` in TNT denotes characters A to B inclusive (0-indexed); +#' a trailing dot with no second index (`A.`) means character A to the last +#' character in the matrix. Characters not covered by any range receive `NA`. +#' Names of the returned vector are the partition labels supplied in +#' parentheses after the partition id, or the numeric id if no label is given. +#' +#' @return `ReadXgroup()` returns a named integer vector of length equal to the +#' number of characters in the matrix, or `NULL` if no `xread` block is found. +#' @export +ReadXgroup <- function(filepath, encoding = "UTF8") { + lines <- .UTFLines(filepath, encoding) + tntComment.pattern <- "'[^']*'" + lines <- gsub(tntComment.pattern, "", lines, perl = TRUE) + lines <- trimws(lines) + lines <- lines[lines != ""] + + nChar <- .TntNChar(lines) + if (is.null(nChar)) { + return(NULL) + } + + # Return: + .ParseXgroupLines(lines, nChar) +} + +#' @keywords internal +.TntNChar <- function(lines) { + xread <- grep("^xread\\b", lines, ignore.case = TRUE, perl = TRUE)[1] + if (is.na(xread)) { + return(NULL) + } + # The xread dimension line carries two integers: nChar nTaxa. + # Scan forward; ignore any line that begins with a comment quote. + for (i in seq.int(xread + 1L, length(lines))) { + m <- regmatches(lines[i], gregexpr("\\d+", lines[i]))[[1]] + if (length(m) == 2L) { + return(as.integer(m[1])) + } + if (length(m) > 0L && !startsWith(lines[i], "'")) { + break + } + } + NULL # nocov +} + +#' @keywords internal +.ParseXgroupLines <- function(lines, nChar) { + xgLines <- grep("^xgroup\\s*=", lines, ignore.case = TRUE, perl = TRUE) + if (length(xgLines) == 0L) { + return(rep(NA_integer_, nChar)) + } + + out <- rep(NA_integer_, nChar) + names(out) <- seq_len(nChar) + + for (idx in xgLines) { + parsed <- .ParseOneXgroup(lines[idx], nChar) + out[parsed[["chars"]]] <- parsed[["id"]] + names(out)[parsed[["chars"]]] <- parsed[["label"]] + } + + # Return: + out +} + +#' @keywords internal +.ParseOneXgroup <- function(line, nChar) { + idM <- regmatches(line, regexec("xgroup\\s*=\\s*(\\d+)", line, + ignore.case = TRUE, perl = TRUE))[[1]] + id <- as.integer(idM[2]) + + labelM <- regmatches(line, regexec("\\(([^)]+)\\)", line, perl = TRUE))[[1]] + label <- if (length(labelM) > 1L) trimws(labelM[2]) else as.character(id) + + # Range tokens follow the optional label; if no label, follow the partition id + afterLabel <- sub(".*\\)\\s*", "", line) + if (!nzchar(afterLabel)) { + afterLabel <- sub("xgroup\\s*=\\s*\\d+\\s*", "", line, + ignore.case = TRUE, perl = TRUE) + } + tokens <- regmatches(afterLabel, + gregexpr("\\d+\\.\\d*", afterLabel, perl = TRUE))[[1]] + + list(id = id, + label = label, + chars = unlist(lapply(tokens, .ExpandTntRange, nChar = nChar))) +} + +#' @keywords internal +.ExpandTntRange <- function(token, nChar) { + parts <- strsplit(token, ".", fixed = TRUE)[[1]] + # TNT is 0-indexed; convert to 1-indexed + from <- as.integer(parts[1]) + 1L + to <- if (length(parts) > 1L && nzchar(parts[2])) { + as.integer(parts[2]) + 1L + } else { + nChar + } + seq.int(from, to) +} + .UTFLines <- function(filepath, encoding) { if (!file.exists(filepath)) { stop("File '", filepath, "' not found.") diff --git a/inst/extdata/tests/tnt-xgroup.tnt b/inst/extdata/tests/tnt-xgroup.tnt new file mode 100644 index 000000000..669a60500 --- /dev/null +++ b/inst/extdata/tests/tnt-xgroup.tnt @@ -0,0 +1,16 @@ +nstates num 4; +xread +'Example matrix for AutoPart tests' +6 4 +TaxonA 0010 +TaxonB 0110 +TaxonC 1100 +TaxonD 1001 +TaxonE 0?01 +TaxonF 1-00 +; +ccode - 0.5; +xgroup =0 (ANTERIOR) 0.2 ; +xgroup =1 (POSTERIOR) 3. ; +; +proc/; diff --git a/man/ReadCharacters.Rd b/man/ReadCharacters.Rd index 82b444420..e182b9639 100644 --- a/man/ReadCharacters.Rd +++ b/man/ReadCharacters.Rd @@ -4,6 +4,7 @@ \alias{ReadCharacters} \alias{ReadTntCharacters} \alias{ReadTNTCharacters} +\alias{ReadXgroup} \alias{ReadNotes} \alias{ReadAsPhyDat} \alias{ReadTntAsPhyDat} @@ -27,6 +28,8 @@ ReadTNTCharacters( encoding = "UTF8" ) +ReadXgroup(filepath, encoding = "UTF8") + ReadNotes(filepath, encoding = "UTF8") ReadAsPhyDat(...) @@ -70,6 +73,9 @@ function call was unsuccessful. \code{ReadAsPhyDat()} and \code{ReadTntAsPhyDat()} return a \code{phyDat} object. +\code{ReadXgroup()} returns a named integer vector of length equal to the +number of characters in the matrix, or \code{NULL} if no \code{xread} block is found. + \code{ReadNotes()} returns a list in which each entry corresponds to a single character, and itself contains a list of with two elements: \enumerate{ @@ -105,6 +111,17 @@ the highlighted option in the "Encoding" menu) following the example below. } \section{Functions}{ \itemize{ +\item \code{ReadXgroup()}: Read the \code{xgroup} character-partition block +\insertCite{Goloboff2008}{TreeTools} of a TNT file, returning a named integer +vector that maps each character (1-indexed in R; TNT uses 0-indexing) to a +partition. + +Range notation \code{A.B} in TNT denotes characters A to B inclusive (0-indexed); +a trailing dot with no second index (\code{A.}) means character A to the last +character in the matrix. Characters not covered by any range receive \code{NA}. +Names of the returned vector are the partition labels supplied in +parentheses after the partition id, or the numeric id if no label is given. + \item \code{PhyDat()}: A convenient wrapper for \pkg{phangorn}'s \code{phyDat()}, which converts a \strong{list} of morphological characters into a \code{phyDat} object. diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R index 6fb4cba37..3817602a0 100644 --- a/tests/testthat/test-ReadTntTree.R +++ b/tests/testthat/test-ReadTntTree.R @@ -44,6 +44,37 @@ test_that("TntTextToTree()", { ape::read.tree(text = "(A, (B, (C, (D, E))));")) }) +test_that("ReadXgroup() parses partition blocks", { + xg <- ReadXgroup(TestFile("tnt-xgroup.tnt")) + # tnt-xgroup.tnt: 6 chars; xgroup =0 (ANTERIOR) 0.2 ; xgroup =1 (POSTERIOR) 3. + expect_true(is.integer(xg)) + expect_length(xg, 6L) + expect_equal(unname(xg), c(0L, 0L, 0L, 1L, 1L, 1L)) + expect_equal(unname(names(xg)), + c("ANTERIOR", "ANTERIOR", "ANTERIOR", + "POSTERIOR", "POSTERIOR", "POSTERIOR")) +}) + +test_that("ReadXgroup() returns NULL when no xread block", { + tmp <- tempfile(fileext = ".tnt") + on.exit(unlink(tmp), add = TRUE) + writeLines("proc/;", tmp) + expect_null(ReadXgroup(tmp)) +}) + +test_that("ReadXgroup() returns all-NA when xread present but no xgroup", { + xg <- ReadXgroup(TestFile("tnt-matrix.tnt")) + expect_true(is.integer(xg)) + expect_true(all(is.na(xg))) + expect_length(xg, 95L) # tnt-matrix.tnt: 95 characters +}) + +test_that(".ExpandTntRange() handles A.B, A. and A.A", { + expect_equal(TreeTools:::.ExpandTntRange("1.3", 10L), c(2L, 3L, 4L)) + expect_equal(TreeTools:::.ExpandTntRange("3.", 5L), c(4L, 5L)) + expect_equal(TreeTools:::.ExpandTntRange("2.2", 10L), 3L) +}) + test_that("ReadTntTree() NULL return", { expect_null(ReadTntTree(TestFile("ape-tree.nex"))) }) From aeb35c52e63c79e87665e7ccb7fc1da8dff51bd8 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Fri, 15 May 2026 13:49:31 +0100 Subject: [PATCH 2/2] Add coverage tests for unlabelled xgroup and malformed dimension Co-Authored-By: Claude Sonnet 4.6 --- tests/testthat/test-ReadTntTree.R | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R index 3817602a0..c3026705d 100644 --- a/tests/testthat/test-ReadTntTree.R +++ b/tests/testthat/test-ReadTntTree.R @@ -75,6 +75,27 @@ test_that(".ExpandTntRange() handles A.B, A. and A.A", { expect_equal(TreeTools:::.ExpandTntRange("2.2", 10L), 3L) }) +test_that("ReadXgroup() handles xgroup lines without parenthetical label", { + tmp <- tempfile(fileext = ".tnt") + on.exit(unlink(tmp), add = TRUE) + writeLines(c("xread", "6 4", + "TaxonA 010101", "TaxonB 010101", + "TaxonC 101010", "TaxonD 101010", ";", + "xgroup =0 0.2 ;", + "xgroup =1 3. ;", ";", "proc/;"), tmp) + xg <- ReadXgroup(tmp) + expect_length(xg, 6L) + expect_equal(unname(xg), c(0L, 0L, 0L, 1L, 1L, 1L)) + expect_equal(unname(names(xg)), c("0", "0", "0", "1", "1", "1")) +}) + +test_that(".TntNChar() returns NULL for malformed dimension line", { + tmp <- tempfile(fileext = ".tnt") + on.exit(unlink(tmp), add = TRUE) + writeLines(c("xread", "6", ";", "proc/;"), tmp) + expect_null(ReadXgroup(tmp)) +}) + test_that("ReadTntTree() NULL return", { expect_null(ReadTntTree(TestFile("ape-tree.nex"))) })