From ee8bec1c80cb0ba5af70c89eebf3ac3da365a492 Mon Sep 17 00:00:00 2001
From: R script <1695515+ms609@users.noreply.github.com>
Date: Fri, 15 May 2026 09:08:59 +0100
Subject: [PATCH 1/2] Add ReadXgroup() for TNT character-partition blocks

Parses the xgroup block of a TNT file into a named integer vector mapping
each character (1-indexed) to its partition; returns NULL when no xread
block is present, all-NA when xread is present but xgroup is not. Joins
the ReadCharacters family via @describeIn.

Sourced from a prototype in AutoPart, where the function originally lived;
moved upstream so AutoPart and other downstream packages can share a single
implementation. Closes that duplication.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 DESCRIPTION                       |   4 +-
 NAMESPACE                         |   1 +
 NEWS.md                           |   7 ++
 R/parse_files.R                   | 107 ++++++++++++++++++++++++++++++
 inst/extdata/tests/tnt-xgroup.tnt |  16 +++++
 man/ReadCharacters.Rd             |  17 +++++
 tests/testthat/test-ReadTntTree.R |  31 +++++++++
 7 files changed, 181 insertions(+), 2 deletions(-)
 create mode 100644 inst/extdata/tests/tnt-xgroup.tnt

diff --git a/DESCRIPTION b/DESCRIPTION
index 8fdf42823..8a070ff28 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: TreeTools
 Title: Create, Modify and Analyse Phylogenetic Trees
-Version: 2.3.0
+Version: 2.3.0.9001
 Authors@R: c(
   person("Martin R.", 'Smith', role = c("aut", "cre", "cph"), 
          email = "martin.smith@durham.ac.uk",
@@ -72,5 +72,5 @@ ByteCompile: true
 Encoding: UTF-8
 Language: en-GB
 VignetteBuilder: knitr
-RoxygenNote: 7.3.3
 Roxygen: list(markdown = TRUE)
+Config/roxygen2/version: 8.0.0
diff --git a/NAMESPACE b/NAMESPACE
index 7f34d68f7..28708651e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -399,6 +399,7 @@ export(ReadTNTCharacters)
 export(ReadTntAsPhyDat)
 export(ReadTntCharacters)
 export(ReadTntTree)
+export(ReadXgroup)
 export(Renumber)
 export(RenumberEdges)
 export(RenumberTips)
diff --git a/NEWS.md b/NEWS.md
index 778afd2e5..b23c47e58 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+# TreeTools 2.3.0.9001 (development) #
+
+## New functions
+
+- `ReadXgroup()` parses the `xgroup` character-partition block of a TNT file,
+  returning a named integer vector mapping each character to its partition.
+
 # TreeTools 2.3.0 (2026-04-22) #
 
 ## Performance
diff --git a/R/parse_files.R b/R/parse_files.R
index f701a1cf1..81b003ff3 100644
--- a/R/parse_files.R
+++ b/R/parse_files.R
@@ -462,6 +462,113 @@ ReadTntCharacters <- function(filepath, character_num = NULL,
 #' @export
 ReadTNTCharacters <- ReadTntCharacters
 
+
+#' @describeIn ReadCharacters Read the `xgroup` character-partition block
+#' \insertCite{Goloboff2008}{TreeTools} of a TNT file, returning a named integer
+#' vector that maps each character (1-indexed in R; TNT uses 0-indexing) to a
+#' partition.
+#'
+#' Range notation `A.B` in TNT denotes characters A to B inclusive (0-indexed);
+#' a trailing dot with no second index (`A.`) means character A to the last
+#' character in the matrix.  Characters not covered by any range receive `NA`.
+#' Names of the returned vector are the partition labels supplied in
+#' parentheses after the partition id, or the numeric id if no label is given.
+#'
+#' @return `ReadXgroup()` returns a named integer vector of length equal to the
+#'   number of characters in the matrix, or `NULL` if no `xread` block is found.
+#' @export
+ReadXgroup <- function(filepath, encoding = "UTF8") {
+  lines <- .UTFLines(filepath, encoding)
+  tntComment.pattern <- "'[^']*'"
+  lines <- gsub(tntComment.pattern, "", lines, perl = TRUE)
+  lines <- trimws(lines)
+  lines <- lines[lines != ""]
+
+  nChar <- .TntNChar(lines)
+  if (is.null(nChar)) {
+    return(NULL)
+  }
+
+  # Return:
+  .ParseXgroupLines(lines, nChar)
+}
+
+#' @keywords internal
+.TntNChar <- function(lines) {
+  xread <- grep("^xread\\b", lines, ignore.case = TRUE, perl = TRUE)[1]
+  if (is.na(xread)) {
+    return(NULL)
+  }
+  # The xread dimension line carries two integers: nChar nTaxa.
+  # Scan forward; ignore any line that begins with a comment quote.
+  for (i in seq.int(xread + 1L, length(lines))) {
+    m <- regmatches(lines[i], gregexpr("\\d+", lines[i]))[[1]]
+    if (length(m) == 2L) {
+      return(as.integer(m[1]))
+    }
+    if (length(m) > 0L && !startsWith(lines[i], "'")) {
+      break
+    }
+  }
+  NULL                                                                       # nocov
+}
+
+#' @keywords internal
+.ParseXgroupLines <- function(lines, nChar) {
+  xgLines <- grep("^xgroup\\s*=", lines, ignore.case = TRUE, perl = TRUE)
+  if (length(xgLines) == 0L) {
+    return(rep(NA_integer_, nChar))
+  }
+
+  out <- rep(NA_integer_, nChar)
+  names(out) <- seq_len(nChar)
+
+  for (idx in xgLines) {
+    parsed <- .ParseOneXgroup(lines[idx], nChar)
+    out[parsed[["chars"]]] <- parsed[["id"]]
+    names(out)[parsed[["chars"]]] <- parsed[["label"]]
+  }
+
+  # Return:
+  out
+}
+
+#' @keywords internal
+.ParseOneXgroup <- function(line, nChar) {
+  idM <- regmatches(line, regexec("xgroup\\s*=\\s*(\\d+)", line,
+                                  ignore.case = TRUE, perl = TRUE))[[1]]
+  id <- as.integer(idM[2])
+
+  labelM <- regmatches(line, regexec("\\(([^)]+)\\)", line, perl = TRUE))[[1]]
+  label <- if (length(labelM) > 1L) trimws(labelM[2]) else as.character(id)
+
+  # Range tokens follow the optional label; if no label, follow the partition id
+  afterLabel <- sub(".*\\)\\s*", "", line)
+  if (!nzchar(afterLabel)) {
+    afterLabel <- sub("xgroup\\s*=\\s*\\d+\\s*", "", line,
+                      ignore.case = TRUE, perl = TRUE)
+  }
+  tokens <- regmatches(afterLabel,
+                       gregexpr("\\d+\\.\\d*", afterLabel, perl = TRUE))[[1]]
+
+  list(id    = id,
+       label = label,
+       chars = unlist(lapply(tokens, .ExpandTntRange, nChar = nChar)))
+}
+
+#' @keywords internal
+.ExpandTntRange <- function(token, nChar) {
+  parts <- strsplit(token, ".", fixed = TRUE)[[1]]
+  # TNT is 0-indexed; convert to 1-indexed
+  from <- as.integer(parts[1]) + 1L
+  to <- if (length(parts) > 1L && nzchar(parts[2])) {
+    as.integer(parts[2]) + 1L
+  } else {
+    nChar
+  }
+  seq.int(from, to)
+}
+
 .UTFLines <- function(filepath, encoding) {
   if (!file.exists(filepath)) {
     stop("File '", filepath, "' not found.")
diff --git a/inst/extdata/tests/tnt-xgroup.tnt b/inst/extdata/tests/tnt-xgroup.tnt
new file mode 100644
index 000000000..669a60500
--- /dev/null
+++ b/inst/extdata/tests/tnt-xgroup.tnt
@@ -0,0 +1,16 @@
+nstates num 4;
+xread
+'Example matrix for AutoPart tests'
+6 4
+TaxonA 0010
+TaxonB 0110
+TaxonC 1100
+TaxonD 1001
+TaxonE 0?01
+TaxonF 1-00
+;
+ccode - 0.5;
+xgroup =0 (ANTERIOR) 0.2 ;
+xgroup =1 (POSTERIOR) 3. ;
+;
+proc/;
diff --git a/man/ReadCharacters.Rd b/man/ReadCharacters.Rd
index 82b444420..e182b9639 100644
--- a/man/ReadCharacters.Rd
+++ b/man/ReadCharacters.Rd
@@ -4,6 +4,7 @@
 \alias{ReadCharacters}
 \alias{ReadTntCharacters}
 \alias{ReadTNTCharacters}
+\alias{ReadXgroup}
 \alias{ReadNotes}
 \alias{ReadAsPhyDat}
 \alias{ReadTntAsPhyDat}
@@ -27,6 +28,8 @@ ReadTNTCharacters(
   encoding = "UTF8"
 )
 
+ReadXgroup(filepath, encoding = "UTF8")
+
 ReadNotes(filepath, encoding = "UTF8")
 
 ReadAsPhyDat(...)
@@ -70,6 +73,9 @@ function call was unsuccessful.
 
 \code{ReadAsPhyDat()} and \code{ReadTntAsPhyDat()} return a \code{phyDat} object.
 
+\code{ReadXgroup()} returns a named integer vector of length equal to the
+number of characters in the matrix, or \code{NULL} if no \code{xread} block is found.
+
 \code{ReadNotes()} returns a list in which each entry corresponds to a
 single character, and itself contains a list of with two elements:
 \enumerate{
@@ -105,6 +111,17 @@ the highlighted option in the "Encoding" menu) following the example below.
 }
 \section{Functions}{
 \itemize{
+\item \code{ReadXgroup()}: Read the \code{xgroup} character-partition block
+\insertCite{Goloboff2008}{TreeTools} of a TNT file, returning a named integer
+vector that maps each character (1-indexed in R; TNT uses 0-indexing) to a
+partition.
+
+Range notation \code{A.B} in TNT denotes characters A to B inclusive (0-indexed);
+a trailing dot with no second index (\code{A.}) means character A to the last
+character in the matrix.  Characters not covered by any range receive \code{NA}.
+Names of the returned vector are the partition labels supplied in
+parentheses after the partition id, or the numeric id if no label is given.
+
 \item \code{PhyDat()}: A convenient wrapper for \pkg{phangorn}'s
 \code{phyDat()}, which converts a \strong{list} of morphological characters into a
 \code{phyDat} object.
diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R
index 6fb4cba37..3817602a0 100644
--- a/tests/testthat/test-ReadTntTree.R
+++ b/tests/testthat/test-ReadTntTree.R
@@ -44,6 +44,37 @@ test_that("TntTextToTree()", {
                ape::read.tree(text = "(A, (B, (C, (D, E))));"))
 })
 
+test_that("ReadXgroup() parses partition blocks", {
+  xg <- ReadXgroup(TestFile("tnt-xgroup.tnt"))
+  # tnt-xgroup.tnt: 6 chars; xgroup =0 (ANTERIOR) 0.2 ; xgroup =1 (POSTERIOR) 3.
+  expect_true(is.integer(xg))
+  expect_length(xg, 6L)
+  expect_equal(unname(xg), c(0L, 0L, 0L, 1L, 1L, 1L))
+  expect_equal(unname(names(xg)),
+               c("ANTERIOR", "ANTERIOR", "ANTERIOR",
+                 "POSTERIOR", "POSTERIOR", "POSTERIOR"))
+})
+
+test_that("ReadXgroup() returns NULL when no xread block", {
+  tmp <- tempfile(fileext = ".tnt")
+  on.exit(unlink(tmp), add = TRUE)
+  writeLines("proc/;", tmp)
+  expect_null(ReadXgroup(tmp))
+})
+
+test_that("ReadXgroup() returns all-NA when xread present but no xgroup", {
+  xg <- ReadXgroup(TestFile("tnt-matrix.tnt"))
+  expect_true(is.integer(xg))
+  expect_true(all(is.na(xg)))
+  expect_length(xg, 95L)  # tnt-matrix.tnt: 95 characters
+})
+
+test_that(".ExpandTntRange() handles A.B, A. and A.A", {
+  expect_equal(TreeTools:::.ExpandTntRange("1.3", 10L), c(2L, 3L, 4L))
+  expect_equal(TreeTools:::.ExpandTntRange("3.",  5L), c(4L, 5L))
+  expect_equal(TreeTools:::.ExpandTntRange("2.2", 10L), 3L)
+})
+
 test_that("ReadTntTree() NULL return", {
   expect_null(ReadTntTree(TestFile("ape-tree.nex")))
 })

From aeb35c52e63c79e87665e7ccb7fc1da8dff51bd8 Mon Sep 17 00:00:00 2001
From: R script <1695515+ms609@users.noreply.github.com>
Date: Fri, 15 May 2026 13:49:31 +0100
Subject: [PATCH 2/2] Add coverage tests for unlabelled xgroup and malformed
 dimension

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tests/testthat/test-ReadTntTree.R | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R
index 3817602a0..c3026705d 100644
--- a/tests/testthat/test-ReadTntTree.R
+++ b/tests/testthat/test-ReadTntTree.R
@@ -75,6 +75,27 @@ test_that(".ExpandTntRange() handles A.B, A. and A.A", {
   expect_equal(TreeTools:::.ExpandTntRange("2.2", 10L), 3L)
 })
 
+test_that("ReadXgroup() handles xgroup lines without parenthetical label", {
+  tmp <- tempfile(fileext = ".tnt")
+  on.exit(unlink(tmp), add = TRUE)
+  writeLines(c("xread", "6 4",
+               "TaxonA 010101", "TaxonB 010101",
+               "TaxonC 101010", "TaxonD 101010", ";",
+               "xgroup =0 0.2 ;",
+               "xgroup =1 3. ;", ";", "proc/;"), tmp)
+  xg <- ReadXgroup(tmp)
+  expect_length(xg, 6L)
+  expect_equal(unname(xg), c(0L, 0L, 0L, 1L, 1L, 1L))
+  expect_equal(unname(names(xg)), c("0", "0", "0", "1", "1", "1"))
+})
+
+test_that(".TntNChar() returns NULL for malformed dimension line", {
+  tmp <- tempfile(fileext = ".tnt")
+  on.exit(unlink(tmp), add = TRUE)
+  writeLines(c("xread", "6", ";", "proc/;"), tmp)
+  expect_null(ReadXgroup(tmp))
+})
+
 test_that("ReadTntTree() NULL return", {
   expect_null(ReadTntTree(TestFile("ape-tree.nex")))
 })