ms609 · ms609 · May 18, 2026 · May 18, 2026 · May 18, 2026 · May 18, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: TreeTools
 Title: Create, Modify and Analyse Phylogenetic Trees
-Version: 2.3.0.9001
+Version: 2.3.0.9002
 Authors@R: c(
   person("Martin R.", 'Smith', role = c("aut", "cre", "cph"), 
          email = "martin.smith@durham.ac.uk",
@@ -63,6 +63,7 @@ Config/Needs/memcheck: pkgdown, testthat
 Config/Needs/metadata: codemeta
 Config/Needs/revdeps: revdepcheck
 Config/Needs/website: pkgdown
+Config/roxygen2/version: 8.0.0
 Config/testthat/parallel: false
 Config/testthat/edition: 3
 LinkingTo: Rcpp
@@ -73,4 +74,3 @@ Encoding: UTF-8
 Language: en-GB
 VignetteBuilder: knitr
 Roxygen: list(markdown = TRUE)
-Config/roxygen2/version: 8.0.0
diff --git a/NAMESPACE b/NAMESPACE
@@ -372,6 +372,7 @@ export(NewickTree)
 export(NeworderPhylo)
 export(NeworderPruningwise)
 export(NexusTokens)
+export(NexusTokensToInteger)
 export(NodeDepth)
 export(NodeNumbers)
 export(NodeOrder)

diff --git a/NEWS.md b/NEWS.md
@@ -1,18 +1,25 @@
-# TreeTools 2.3.0.9001 (development) #
+# TreeTools 2.3.0.9002 (development) #
 
 ## New features
 
+- `NexusTokensToInteger()` converts character data to integers,
+  mapping uncertain tokens to `NA`.
 - `ReadTntCharacters()` attaches an `xgroup` attribute (factor) when a TNT
-  `xgroup` partition block is present, replacing the standalone `ReadXgroup()`.
+  `xgroup` partition block is present, replacing the stand-alone `ReadXgroup()`.
+
+## Fixes
+
+- `NexusTokens()` once again handles polymorphism tokens with internal
+  whitespace (e.g. `(1 2)`, `{0 1}`).
+
 
 # TreeTools 2.3.0 (2026-04-22) #
 
 ## Bug fixes
 
 - `ReadTntCharacters()` now handles multi-line comments, bare `&` continuations,
   `@taxonomy` suffixes, name-only taxon lines, mid-line `xread`, smart-quote
-  names (Windows-1252), and packed multi-taxon lines; all 13 Goloboff (2019)
-  corpus files parse cleanly.
+  names (Windows-1252), and packed multi-taxon lines.
 
 ## Performance
 

diff --git a/R/parse_files.R b/R/parse_files.R
@@ -1011,6 +1011,75 @@ PhyDat <- function(dataset) {
   MatrixToPhyDat(mat)
 }
 
+#' Convert Nexus token matrix to integer
+#'
+#' `NexusTokensToInteger()` converts the character matrix returned by
+#' [`ReadCharacters()`] to an integer matrix, mapping polymorphic,
+#' ambiguous (`?`), and inapplicable (`-`) tokens to `NA_integer_` or to the
+#' first/last state listed in the polymorphism, depending on `polymorphism`.
+#'
+#' Only digit states `0`..`9` are recognised; non-digit symbols (and any
+#' token whose interior contains no digits) become `NA_integer_`.
+#' Polymorphism extraction (`polymorphism = "first"`/`"last"`) likewise
+#' considers digits only.
+#'
+#' If `tokens` is a `phyDat` object it is first converted via
+#' [`PhyDatToMatrix()`] with `ambigNA = TRUE, inappNA = TRUE`, so that
+#' fully-ambiguous and inapplicable rows become `NA_integer_` and only
+#' true partial polymorphisms are subject to the `polymorphism` rule.
+#'
+#' @param tokens Character matrix as returned by [`ReadCharacters()`], a
+#' character vector as returned by [`NexusTokens()`], or a `phyDat` object.
+#' @param polymorphism Character string specifying how to handle polymorphic
+#' tokens such as `"(01)"` or `"{12}"`:
+#' \describe{
+#'   \item{`"?"` (default)}{Treat as the NEXUS missing-data token: map to
+#'     `NA_integer_`.}
+#'   \item{`"first"`}{Use the first state digit inside the brackets.}
+#'   \item{`"last"`}{Use the last state digit inside the brackets.}
+#' }
+#' Tokens `"?"` and `"-"` always map to `NA_integer_` regardless of
+#' `polymorphism`.
+#'
+#' @return An integer matrix (or vector) with the same dimensions and
+#' `dimnames` as `tokens`.
+#'
+#' @examples
+#' tokens <- matrix(c("0", "(12)", "1", "?", "-"),
+#'                  nrow = 1,
+#'                  dimnames = list("Taxon_A", paste0("C", 1:5)))
+#' NexusTokensToInteger(tokens)
+#' NexusTokensToInteger(tokens, polymorphism = "first")
+#'
+#' @family phylogenetic matrix conversion functions
+#' @template MRS
+#' @export
+NexusTokensToInteger <- function(tokens,
+                                 polymorphism = c("?", "first", "last")) {
+  polymorphism <- match.arg(polymorphism)
+  if (inherits(tokens, "phyDat")) {
+    tokens <- PhyDatToMatrix(tokens, ambigNA = TRUE, inappNA = TRUE)
+  }
+  at <- attributes(tokens)
+
+  x <- as.character(tokens)
+  result <- suppressWarnings(as.integer(x))
+
+  ambig <- is.na(result) & !is.na(x) & x != "?" & x != "-"
+  if (polymorphism != "?" && any(ambig)) {
+    pattern <- if (polymorphism == "first") "\\d" else "\\d(?=[^\\d]*$)"
+    m <- regexpr(pattern, x[ambig], perl = TRUE)
+    matched <- regmatches(x[ambig], m)
+    digits <- rep(NA_character_, sum(ambig))
+    digits[m != -1L] <- matched
+    result[ambig] <- suppressWarnings(as.integer(digits))
+  }
+
+  attributes(result) <- at
+  result
+}
+
+
 #' Rightmost character of string
 #'
 #' `RightmostCharacter()` is a convenience function that returns the final

diff --git a/man/Decompose.Rd b/man/Decompose.Rd
diff --git a/man/MatrixToPhyDat.Rd b/man/MatrixToPhyDat.Rd
diff --git a/man/NexusTokensToInteger.Rd b/man/NexusTokensToInteger.Rd
diff --git a/man/PhyToString.Rd b/man/PhyToString.Rd
diff --git a/man/Reweight.Rd b/man/Reweight.Rd
diff --git a/tests/testthat/test-parsers.R b/tests/testthat/test-parsers.R
@@ -32,6 +32,134 @@ test_that("Nexus file can be parsed", {
   expect_equal(3L, unique(as.integer(read[3, ])))
 })
 
+test_that("ReadCharacters handles Cingulata-style polymorphism with internal whitespace", {
+  nexusContent <- "#NEXUS
+BEGIN CHARACTERS;
+  DIMENSIONS NCHAR=5;
+  FORMAT DATATYPE=STANDARD MISSING=? GAP=-;
+  MATRIX
+    Taxon_A  0(1 2)1?-
+    Taxon_B  1{0 1}0?-
+    Taxon_C  01010
+  ;"
+
+  tf <- tempfile(fileext = ".nex")
+  on.exit(unlink(tf))
+  writeLines(nexusContent, tf)
+  read <- ReadCharacters(tf)
+
+  expect_equal(dim(read), c(3L, 5L))
+  expect_equal(unname(read["Taxon_A", 2]), "(12)")
+  expect_equal(unname(read["Taxon_B", 2]), "{01}")
+  expect_equal(unname(read["Taxon_A", 4]), "?")
+  expect_equal(unname(read["Taxon_A", 5]), "-")
+  expect_equal(unname(read["Taxon_C", 1]), "0")
+
+  # Continuation lines: data for a taxon split across multiple lines
+  nexusMulti <- "#NEXUS
+BEGIN CHARACTERS;
+  DIMENSIONS NCHAR=6;
+  FORMAT DATATYPE=STANDARD MISSING=? GAP=-;
+  MATRIX
+    Taxon_A  0(1 2)1
+              ?-1
+    Taxon_B  1{0 1}0
+              ?-0
+  ;"
+
+  tf2 <- tempfile(fileext = ".nex")
+  on.exit(unlink(tf2), add = TRUE)
+  writeLines(nexusMulti, tf2)
+  readMulti <- ReadCharacters(tf2)
+
+  expect_equal(dim(readMulti), c(2L, 6L))
+  expect_equal(unname(readMulti["Taxon_A", 2]), "(12)")
+  expect_equal(unname(readMulti["Taxon_B", 2]), "{01}")
+  expect_equal(unname(readMulti["Taxon_A", 6]), "1")
+  expect_equal(unname(readMulti["Taxon_B", 6]), "0")
+
+  # Round-trip the ReadCharacters() matrix through NexusTokensToInteger().
+  intMat <- NexusTokensToInteger(read)
+  expect_equal(dim(intMat), c(3L, 5L))
+  expect_equal(unname(intMat["Taxon_A", 1]), 0L)
+  expect_equal(unname(intMat["Taxon_A", 2]), NA_integer_) # polymorphism -> ?
+  expect_equal(unname(intMat["Taxon_B", 2]), NA_integer_) # uncertainty -> ?
+  expect_equal(unname(intMat["Taxon_A", 4]), NA_integer_) # ?
+  expect_equal(unname(intMat["Taxon_A", 5]), NA_integer_) # -
+  expect_equal(unname(intMat["Taxon_C", 5]), 0L)
+  expect_equal(unname(NexusTokensToInteger(read, "first")["Taxon_A", 2]), 1L)
+  expect_equal(unname(NexusTokensToInteger(read, "last")["Taxon_A", 2]), 2L)
+})
+
+test_that("NexusTokensToInteger() converts token matrix to integer", {
+  tokens <- matrix(c("0", "(12)", "1", "?", "-"),
+                   nrow = 1L,
+                   dimnames = list("Tax", paste0("C", 1:5)))
+
+  # Default: polymorphisms and ambiguities become NA
+  result <- NexusTokensToInteger(tokens)
+  expect_equal(dim(result), c(1L, 5L))
+  expect_equal(dimnames(result), list("Tax", paste0("C", 1:5)))
+  expect_equal(unname(result["Tax", "C1"]), 0L)
+  expect_equal(unname(result["Tax", "C2"]), NA_integer_)
+  expect_equal(unname(result["Tax", "C3"]), 1L)
+  expect_equal(unname(result["Tax", "C4"]), NA_integer_)
+  expect_equal(unname(result["Tax", "C5"]), NA_integer_)
+
+  # polymorphism = "first": take first digit inside brackets
+  result_f <- NexusTokensToInteger(tokens, polymorphism = "first")
+  expect_equal(unname(result_f["Tax", "C2"]), 1L)
+  expect_equal(unname(result_f["Tax", "C4"]), NA_integer_)
+
+  # polymorphism = "last": take last digit inside brackets
+  result_l <- NexusTokensToInteger(tokens, polymorphism = "last")
+  expect_equal(unname(result_l["Tax", "C2"]), 2L)
+
+  # Braces form {01}
+  tokens2 <- matrix(c("{01}", "0"), nrow = 1L, dimnames = list("T1", c("C1", "C2")))
+  expect_equal(unname(NexusTokensToInteger(tokens2)["T1", "C1"]), NA_integer_)
+  expect_equal(unname(NexusTokensToInteger(tokens2, "first")["T1", "C1"]), 0L)
+  expect_equal(unname(NexusTokensToInteger(tokens2, "last")["T1", "C1"]), 1L)
+
+  # Named vector input (no dim attribute).
+  vec <- c(a = "0", b = "(12)", c = "?", d = "1")
+  vecOut <- NexusTokensToInteger(vec)
+  expect_null(dim(vecOut))
+  expect_equal(names(vecOut), c("a", "b", "c", "d"))
+  expect_equal(unname(vecOut), c(0L, NA_integer_, NA_integer_, 1L))
+  expect_equal(unname(NexusTokensToInteger(vec, "first")), c(0L, 1L, NA_integer_, 1L))
+  expect_equal(unname(NexusTokensToInteger(vec, "last")), c(0L, 2L, NA_integer_, 1L))
+
+  # No-digit polymorphism token: must not crash, must yield NA in every mode.
+  noDigit <- matrix(c("(AB)", "0"), nrow = 1L,
+                    dimnames = list("T", c("C1", "C2")))
+  expect_silent(NexusTokensToInteger(noDigit))
+  expect_equal(unname(NexusTokensToInteger(noDigit)["T", "C1"]), NA_integer_)
+  expect_equal(unname(NexusTokensToInteger(noDigit, "first")["T", "C1"]),
+               NA_integer_)
+  expect_equal(unname(NexusTokensToInteger(noDigit, "last")["T", "C1"]),
+               NA_integer_)
+  expect_equal(unname(NexusTokensToInteger(noDigit, "first")["T", "C2"]), 0L)
+
+  # state.labels and other matrix attributes round-trip through the result.
+  tokens3 <- matrix(c("0", "(12)", "1"), nrow = 1L,
+                    dimnames = list("Tax", c("C1", "C2", "C3")))
+  attr(tokens3, "state.labels") <- list(c("absent", "present"),
+                                        c("a", "b", "c"),
+                                        c("absent", "present"))
+  out3 <- NexusTokensToInteger(tokens3)
+  expect_equal(attr(out3, "state.labels"), attr(tokens3, "state.labels"))
+
+  # phyDat input routes through PhyDatToMatrix(ambigNA = TRUE, inappNA = TRUE).
+  phy <- MatrixToPhyDat(matrix(c("0", "(12)", "1", "?", "-"),
+                               nrow = 1L,
+                               dimnames = list("Tax", paste0("C", 1:5))))
+  viaPhy <- NexusTokensToInteger(phy)
+  viaMat <- NexusTokensToInteger(PhyDatToMatrix(phy, ambigNA = TRUE,
+                                                inappNA = TRUE))
+  expect_equal(unname(viaPhy), unname(viaMat))
+})
+
 test_that("NexusTokens() fails gracefully", {
   expect_error(NexusTokens("0123012301230123", integer(0)))
   expect_equal("Character number must be between 1 and 16.",