Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: TreeTools
Title: Create, Modify and Analyse Phylogenetic Trees
Version: 2.3.0.9001
Version: 2.3.0.9002
Authors@R: c(
person("Martin R.", 'Smith', role = c("aut", "cre", "cph"),
email = "martin.smith@durham.ac.uk",
Expand Down Expand Up @@ -63,6 +63,7 @@ Config/Needs/memcheck: pkgdown, testthat
Config/Needs/metadata: codemeta
Config/Needs/revdeps: revdepcheck
Config/Needs/website: pkgdown
Config/roxygen2/version: 8.0.0
Config/testthat/parallel: false
Config/testthat/edition: 3
LinkingTo: Rcpp
Expand All @@ -73,4 +74,3 @@ Encoding: UTF-8
Language: en-GB
VignetteBuilder: knitr
Roxygen: list(markdown = TRUE)
Config/roxygen2/version: 8.0.0
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,7 @@ export(NewickTree)
export(NeworderPhylo)
export(NeworderPruningwise)
export(NexusTokens)
export(NexusTokensToInteger)
export(NodeDepth)
export(NodeNumbers)
export(NodeOrder)
Expand Down
15 changes: 11 additions & 4 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,18 +1,25 @@
# TreeTools 2.3.0.9001 (development) #
# TreeTools 2.3.0.9002 (development) #

## New features

- `NexusTokensToInteger()` converts character data to integers,
mapping uncertain tokens to `NA`.
- `ReadTntCharacters()` attaches an `xgroup` attribute (factor) when a TNT
`xgroup` partition block is present, replacing the standalone `ReadXgroup()`.
`xgroup` partition block is present, replacing the stand-alone `ReadXgroup()`.

## Fixes

- `NexusTokens()` once again handles polymorphism tokens with internal
whitespace (e.g. `(1 2)`, `{0 1}`).


# TreeTools 2.3.0 (2026-04-22) #

## Bug fixes

- `ReadTntCharacters()` now handles multi-line comments, bare `&` continuations,
`@taxonomy` suffixes, name-only taxon lines, mid-line `xread`, smart-quote
names (Windows-1252), and packed multi-taxon lines; all 13 Goloboff (2019)
corpus files parse cleanly.
names (Windows-1252), and packed multi-taxon lines.

## Performance

Expand Down
69 changes: 69 additions & 0 deletions R/parse_files.R
Original file line number Diff line number Diff line change
Expand Up @@ -1011,6 +1011,75 @@ PhyDat <- function(dataset) {
MatrixToPhyDat(mat)
}

#' Convert Nexus token matrix to integer
#'
#' `NexusTokensToInteger()` converts the character matrix returned by
#' [`ReadCharacters()`] to an integer matrix, mapping polymorphic,
#' ambiguous (`?`), and inapplicable (`-`) tokens to `NA_integer_` or to the
#' first/last state listed in the polymorphism, depending on `polymorphism`.
#'
#' Only digit states `0`..`9` are recognised; non-digit symbols (and any
#' token whose interior contains no digits) become `NA_integer_`.
#' Polymorphism extraction (`polymorphism = "first"`/`"last"`) likewise
#' considers digits only.
#'
#' If `tokens` is a `phyDat` object it is first converted via
#' [`PhyDatToMatrix()`] with `ambigNA = TRUE, inappNA = TRUE`, so that
#' fully-ambiguous and inapplicable rows become `NA_integer_` and only
#' true partial polymorphisms are subject to the `polymorphism` rule.
#'
#' @param tokens Character matrix as returned by [`ReadCharacters()`], a
#' character vector as returned by [`NexusTokens()`], or a `phyDat` object.
#' @param polymorphism Character string specifying how to handle polymorphic
#' tokens such as `"(01)"` or `"{12}"`:
#' \describe{
#' \item{`"?"` (default)}{Treat as the NEXUS missing-data token: map to
#' `NA_integer_`.}
#' \item{`"first"`}{Use the first state digit inside the brackets.}
#' \item{`"last"`}{Use the last state digit inside the brackets.}
#' }
#' Tokens `"?"` and `"-"` always map to `NA_integer_` regardless of
#' `polymorphism`.
#'
#' @return An integer matrix (or vector) with the same dimensions and
#' `dimnames` as `tokens`.
#'
#' @examples
#' tokens <- matrix(c("0", "(12)", "1", "?", "-"),
#' nrow = 1,
#' dimnames = list("Taxon_A", paste0("C", 1:5)))
#' NexusTokensToInteger(tokens)
#' NexusTokensToInteger(tokens, polymorphism = "first")
#'
#' @family phylogenetic matrix conversion functions
#' @template MRS
#' @export
NexusTokensToInteger <- function(tokens,
polymorphism = c("?", "first", "last")) {
polymorphism <- match.arg(polymorphism)
if (inherits(tokens, "phyDat")) {
tokens <- PhyDatToMatrix(tokens, ambigNA = TRUE, inappNA = TRUE)
}
at <- attributes(tokens)

x <- as.character(tokens)
result <- suppressWarnings(as.integer(x))

ambig <- is.na(result) & !is.na(x) & x != "?" & x != "-"
if (polymorphism != "?" && any(ambig)) {
pattern <- if (polymorphism == "first") "\\d" else "\\d(?=[^\\d]*$)"
m <- regexpr(pattern, x[ambig], perl = TRUE)
matched <- regmatches(x[ambig], m)
digits <- rep(NA_character_, sum(ambig))
digits[m != -1L] <- matched
result[ambig] <- suppressWarnings(as.integer(digits))
}

attributes(result) <- at
result
}


#' Rightmost character of string
#'
#' `RightmostCharacter()` is a convenience function that returns the final
Expand Down
1 change: 1 addition & 0 deletions man/Decompose.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/MatrixToPhyDat.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

64 changes: 64 additions & 0 deletions man/NexusTokensToInteger.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/PhyToString.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/Reweight.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

128 changes: 128 additions & 0 deletions tests/testthat/test-parsers.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,134 @@ test_that("Nexus file can be parsed", {
expect_equal(3L, unique(as.integer(read[3, ])))
})

test_that("ReadCharacters handles Cingulata-style polymorphism with internal whitespace", {
nexusContent <- "#NEXUS
BEGIN CHARACTERS;
DIMENSIONS NCHAR=5;
FORMAT DATATYPE=STANDARD MISSING=? GAP=-;
MATRIX
Taxon_A 0(1 2)1?-
Taxon_B 1{0 1}0?-
Taxon_C 01010
;"

tf <- tempfile(fileext = ".nex")
on.exit(unlink(tf))
writeLines(nexusContent, tf)
read <- ReadCharacters(tf)

expect_equal(dim(read), c(3L, 5L))
expect_equal(unname(read["Taxon_A", 2]), "(12)")
expect_equal(unname(read["Taxon_B", 2]), "{01}")
expect_equal(unname(read["Taxon_A", 4]), "?")
expect_equal(unname(read["Taxon_A", 5]), "-")
expect_equal(unname(read["Taxon_C", 1]), "0")

# Continuation lines: data for a taxon split across multiple lines
nexusMulti <- "#NEXUS
BEGIN CHARACTERS;
DIMENSIONS NCHAR=6;
FORMAT DATATYPE=STANDARD MISSING=? GAP=-;
MATRIX
Taxon_A 0(1 2)1
?-1
Taxon_B 1{0 1}0
?-0
;"

tf2 <- tempfile(fileext = ".nex")
on.exit(unlink(tf2), add = TRUE)
writeLines(nexusMulti, tf2)
readMulti <- ReadCharacters(tf2)

expect_equal(dim(readMulti), c(2L, 6L))
expect_equal(unname(readMulti["Taxon_A", 2]), "(12)")
expect_equal(unname(readMulti["Taxon_B", 2]), "{01}")
expect_equal(unname(readMulti["Taxon_A", 6]), "1")
expect_equal(unname(readMulti["Taxon_B", 6]), "0")

# Round-trip the ReadCharacters() matrix through NexusTokensToInteger().
intMat <- NexusTokensToInteger(read)
expect_equal(dim(intMat), c(3L, 5L))
expect_equal(unname(intMat["Taxon_A", 1]), 0L)
expect_equal(unname(intMat["Taxon_A", 2]), NA_integer_) # polymorphism -> ?
expect_equal(unname(intMat["Taxon_B", 2]), NA_integer_) # uncertainty -> ?
expect_equal(unname(intMat["Taxon_A", 4]), NA_integer_) # ?
expect_equal(unname(intMat["Taxon_A", 5]), NA_integer_) # -
expect_equal(unname(intMat["Taxon_C", 5]), 0L)
expect_equal(unname(NexusTokensToInteger(read, "first")["Taxon_A", 2]), 1L)
expect_equal(unname(NexusTokensToInteger(read, "last")["Taxon_A", 2]), 2L)
})

test_that("NexusTokensToInteger() converts token matrix to integer", {
tokens <- matrix(c("0", "(12)", "1", "?", "-"),
nrow = 1L,
dimnames = list("Tax", paste0("C", 1:5)))

# Default: polymorphisms and ambiguities become NA
result <- NexusTokensToInteger(tokens)
expect_equal(dim(result), c(1L, 5L))
expect_equal(dimnames(result), list("Tax", paste0("C", 1:5)))
expect_equal(unname(result["Tax", "C1"]), 0L)
expect_equal(unname(result["Tax", "C2"]), NA_integer_)
expect_equal(unname(result["Tax", "C3"]), 1L)
expect_equal(unname(result["Tax", "C4"]), NA_integer_)
expect_equal(unname(result["Tax", "C5"]), NA_integer_)

# polymorphism = "first": take first digit inside brackets
result_f <- NexusTokensToInteger(tokens, polymorphism = "first")
expect_equal(unname(result_f["Tax", "C2"]), 1L)
expect_equal(unname(result_f["Tax", "C4"]), NA_integer_)

# polymorphism = "last": take last digit inside brackets
result_l <- NexusTokensToInteger(tokens, polymorphism = "last")
expect_equal(unname(result_l["Tax", "C2"]), 2L)

# Braces form {01}
tokens2 <- matrix(c("{01}", "0"), nrow = 1L, dimnames = list("T1", c("C1", "C2")))
expect_equal(unname(NexusTokensToInteger(tokens2)["T1", "C1"]), NA_integer_)
expect_equal(unname(NexusTokensToInteger(tokens2, "first")["T1", "C1"]), 0L)
expect_equal(unname(NexusTokensToInteger(tokens2, "last")["T1", "C1"]), 1L)

# Named vector input (no dim attribute).
vec <- c(a = "0", b = "(12)", c = "?", d = "1")
vecOut <- NexusTokensToInteger(vec)
expect_null(dim(vecOut))
expect_equal(names(vecOut), c("a", "b", "c", "d"))
expect_equal(unname(vecOut), c(0L, NA_integer_, NA_integer_, 1L))
expect_equal(unname(NexusTokensToInteger(vec, "first")), c(0L, 1L, NA_integer_, 1L))
expect_equal(unname(NexusTokensToInteger(vec, "last")), c(0L, 2L, NA_integer_, 1L))

# No-digit polymorphism token: must not crash, must yield NA in every mode.
noDigit <- matrix(c("(AB)", "0"), nrow = 1L,
dimnames = list("T", c("C1", "C2")))
expect_silent(NexusTokensToInteger(noDigit))
expect_equal(unname(NexusTokensToInteger(noDigit)["T", "C1"]), NA_integer_)
expect_equal(unname(NexusTokensToInteger(noDigit, "first")["T", "C1"]),
NA_integer_)
expect_equal(unname(NexusTokensToInteger(noDigit, "last")["T", "C1"]),
NA_integer_)
expect_equal(unname(NexusTokensToInteger(noDigit, "first")["T", "C2"]), 0L)

# state.labels and other matrix attributes round-trip through the result.
tokens3 <- matrix(c("0", "(12)", "1"), nrow = 1L,
dimnames = list("Tax", c("C1", "C2", "C3")))
attr(tokens3, "state.labels") <- list(c("absent", "present"),
c("a", "b", "c"),
c("absent", "present"))
out3 <- NexusTokensToInteger(tokens3)
expect_equal(attr(out3, "state.labels"), attr(tokens3, "state.labels"))

# phyDat input routes through PhyDatToMatrix(ambigNA = TRUE, inappNA = TRUE).
phy <- MatrixToPhyDat(matrix(c("0", "(12)", "1", "?", "-"),
nrow = 1L,
dimnames = list("Tax", paste0("C", 1:5))))
viaPhy <- NexusTokensToInteger(phy)
viaMat <- NexusTokensToInteger(PhyDatToMatrix(phy, ambigNA = TRUE,
inappNA = TRUE))
expect_equal(unname(viaPhy), unname(viaMat))
})

test_that("NexusTokens() fails gracefully", {
expect_error(NexusTokens("0123012301230123", integer(0)))
expect_equal("Character number must be between 1 and 16.",
Expand Down
Loading