ms609 · ms609 · May 15, 2026 · May 15, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: TreeTools
 Title: Create, Modify and Analyse Phylogenetic Trees
-Version: 2.3.0
+Version: 2.3.0.9001
 Authors@R: c(
   person("Martin R.", 'Smith', role = c("aut", "cre", "cph"), 
          email = "martin.smith@durham.ac.uk",
@@ -72,5 +72,5 @@ ByteCompile: true
 Encoding: UTF-8
 Language: en-GB
 VignetteBuilder: knitr
-RoxygenNote: 7.3.3
 Roxygen: list(markdown = TRUE)
+Config/roxygen2/version: 8.0.0
diff --git a/NAMESPACE b/NAMESPACE
@@ -399,6 +399,7 @@ export(ReadTNTCharacters)
 export(ReadTntAsPhyDat)
 export(ReadTntCharacters)
 export(ReadTntTree)
+export(ReadXgroup)
 export(Renumber)
 export(RenumberEdges)
 export(RenumberTips)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,10 @@
+# TreeTools 2.3.0.9001 (development) #
+
+## New functions
+
+- `ReadXgroup()` parses the `xgroup` character-partition block of a TNT file,
+  returning a named integer vector mapping each character to its partition.
+
 # TreeTools 2.3.0 (2026-04-22) #
 
 ## Performance

diff --git a/R/parse_files.R b/R/parse_files.R
@@ -462,6 +462,113 @@ ReadTntCharacters <- function(filepath, character_num = NULL,
 #' @export
 ReadTNTCharacters <- ReadTntCharacters
 
+
+#' @describeIn ReadCharacters Read the `xgroup` character-partition block
+#' \insertCite{Goloboff2008}{TreeTools} of a TNT file, returning a named integer
+#' vector that maps each character (1-indexed in R; TNT uses 0-indexing) to a
+#' partition.
+#'
+#' Range notation `A.B` in TNT denotes characters A to B inclusive (0-indexed);
+#' a trailing dot with no second index (`A.`) means character A to the last
+#' character in the matrix.  Characters not covered by any range receive `NA`.
+#' Names of the returned vector are the partition labels supplied in
+#' parentheses after the partition id, or the numeric id if no label is given.
+#'
+#' @return `ReadXgroup()` returns a named integer vector of length equal to the
+#'   number of characters in the matrix, or `NULL` if no `xread` block is found.
+#' @export
+ReadXgroup <- function(filepath, encoding = "UTF8") {
+  lines <- .UTFLines(filepath, encoding)
+  tntComment.pattern <- "'[^']*'"
+  lines <- gsub(tntComment.pattern, "", lines, perl = TRUE)
+  lines <- trimws(lines)
+  lines <- lines[lines != ""]
+
+  nChar <- .TntNChar(lines)
+  if (is.null(nChar)) {
+    return(NULL)
+  }
+
+  # Return:
+  .ParseXgroupLines(lines, nChar)
+}
+
+#' @keywords internal
+.TntNChar <- function(lines) {
+  xread <- grep("^xread\\b", lines, ignore.case = TRUE, perl = TRUE)[1]
+  if (is.na(xread)) {
+    return(NULL)
+  }
+  # The xread dimension line carries two integers: nChar nTaxa.
+  # Scan forward; ignore any line that begins with a comment quote.
+  for (i in seq.int(xread + 1L, length(lines))) {
+    m <- regmatches(lines[i], gregexpr("\\d+", lines[i]))[[1]]
+    if (length(m) == 2L) {
+      return(as.integer(m[1]))
+    }
+    if (length(m) > 0L && !startsWith(lines[i], "'")) {
+      break
+    }
+  }
+  NULL                                                                       # nocov
+}
+
+#' @keywords internal
+.ParseXgroupLines <- function(lines, nChar) {
+  xgLines <- grep("^xgroup\\s*=", lines, ignore.case = TRUE, perl = TRUE)
+  if (length(xgLines) == 0L) {
+    return(rep(NA_integer_, nChar))
+  }
+
+  out <- rep(NA_integer_, nChar)
+  names(out) <- seq_len(nChar)
+
+  for (idx in xgLines) {
+    parsed <- .ParseOneXgroup(lines[idx], nChar)
+    out[parsed[["chars"]]] <- parsed[["id"]]
+    names(out)[parsed[["chars"]]] <- parsed[["label"]]
+  }
+
+  # Return:
+  out
+}
+
+#' @keywords internal
+.ParseOneXgroup <- function(line, nChar) {
+  idM <- regmatches(line, regexec("xgroup\\s*=\\s*(\\d+)", line,
+                                  ignore.case = TRUE, perl = TRUE))[[1]]
+  id <- as.integer(idM[2])
+
+  labelM <- regmatches(line, regexec("\\(([^)]+)\\)", line, perl = TRUE))[[1]]
+  label <- if (length(labelM) > 1L) trimws(labelM[2]) else as.character(id)
+
+  # Range tokens follow the optional label; if no label, follow the partition id
+  afterLabel <- sub(".*\\)\\s*", "", line)
+  if (!nzchar(afterLabel)) {
+    afterLabel <- sub("xgroup\\s*=\\s*\\d+\\s*", "", line,
+                      ignore.case = TRUE, perl = TRUE)
+  }
+  tokens <- regmatches(afterLabel,
+                       gregexpr("\\d+\\.\\d*", afterLabel, perl = TRUE))[[1]]
+
+  list(id    = id,
+       label = label,
+       chars = unlist(lapply(tokens, .ExpandTntRange, nChar = nChar)))
+}
+
+#' @keywords internal
+.ExpandTntRange <- function(token, nChar) {
+  parts <- strsplit(token, ".", fixed = TRUE)[[1]]
+  # TNT is 0-indexed; convert to 1-indexed
+  from <- as.integer(parts[1]) + 1L
+  to <- if (length(parts) > 1L && nzchar(parts[2])) {
+    as.integer(parts[2]) + 1L
+  } else {
+    nChar
+  }
+  seq.int(from, to)
+}
+
 .UTFLines <- function(filepath, encoding) {
   if (!file.exists(filepath)) {
     stop("File '", filepath, "' not found.")

diff --git a/inst/extdata/tests/tnt-xgroup.tnt b/inst/extdata/tests/tnt-xgroup.tnt
@@ -0,0 +1,16 @@
+nstates num 4;
+xread
+'Example matrix for AutoPart tests'
+6 4
+TaxonA 0010
+TaxonB 0110
+TaxonC 1100
+TaxonD 1001
+TaxonE 0?01
+TaxonF 1-00
+;
+ccode - 0.5;
+xgroup =0 (ANTERIOR) 0.2 ;
+xgroup =1 (POSTERIOR) 3. ;
+;
+proc/;
diff --git a/man/ReadCharacters.Rd b/man/ReadCharacters.Rd
diff --git a/tests/testthat/test-ReadTntTree.R b/tests/testthat/test-ReadTntTree.R
@@ -44,6 +44,58 @@ test_that("TntTextToTree()", {
                ape::read.tree(text = "(A, (B, (C, (D, E))));"))
 })
 
+test_that("ReadXgroup() parses partition blocks", {
+  xg <- ReadXgroup(TestFile("tnt-xgroup.tnt"))
+  # tnt-xgroup.tnt: 6 chars; xgroup =0 (ANTERIOR) 0.2 ; xgroup =1 (POSTERIOR) 3.
+  expect_true(is.integer(xg))
+  expect_length(xg, 6L)
+  expect_equal(unname(xg), c(0L, 0L, 0L, 1L, 1L, 1L))
+  expect_equal(unname(names(xg)),
+               c("ANTERIOR", "ANTERIOR", "ANTERIOR",
+                 "POSTERIOR", "POSTERIOR", "POSTERIOR"))
+})
+
+test_that("ReadXgroup() returns NULL when no xread block", {
+  tmp <- tempfile(fileext = ".tnt")
+  on.exit(unlink(tmp), add = TRUE)
+  writeLines("proc/;", tmp)
+  expect_null(ReadXgroup(tmp))
+})
+
+test_that("ReadXgroup() returns all-NA when xread present but no xgroup", {
+  xg <- ReadXgroup(TestFile("tnt-matrix.tnt"))
+  expect_true(is.integer(xg))
+  expect_true(all(is.na(xg)))
+  expect_length(xg, 95L)  # tnt-matrix.tnt: 95 characters
+})
+
+test_that(".ExpandTntRange() handles A.B, A. and A.A", {
+  expect_equal(TreeTools:::.ExpandTntRange("1.3", 10L), c(2L, 3L, 4L))
+  expect_equal(TreeTools:::.ExpandTntRange("3.",  5L), c(4L, 5L))
+  expect_equal(TreeTools:::.ExpandTntRange("2.2", 10L), 3L)
+})
+
+test_that("ReadXgroup() handles xgroup lines without parenthetical label", {
+  tmp <- tempfile(fileext = ".tnt")
+  on.exit(unlink(tmp), add = TRUE)
+  writeLines(c("xread", "6 4",
+               "TaxonA 010101", "TaxonB 010101",
+               "TaxonC 101010", "TaxonD 101010", ";",
+               "xgroup =0 0.2 ;",
+               "xgroup =1 3. ;", ";", "proc/;"), tmp)
+  xg <- ReadXgroup(tmp)
+  expect_length(xg, 6L)
+  expect_equal(unname(xg), c(0L, 0L, 0L, 1L, 1L, 1L))
+  expect_equal(unname(names(xg)), c("0", "0", "0", "1", "1", "1"))
+})
+
+test_that(".TntNChar() returns NULL for malformed dimension line", {
+  tmp <- tempfile(fileext = ".tnt")
+  on.exit(unlink(tmp), add = TRUE)
+  writeLines(c("xread", "6", ";", "proc/;"), tmp)
+  expect_null(ReadXgroup(tmp))
+})
+
 test_that("ReadTntTree() NULL return", {
   expect_null(ReadTntTree(TestFile("ape-tree.nex")))
 })