diff --git a/DESCRIPTION b/DESCRIPTION index 8a070ff28..2ecffdba1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TreeTools Title: Create, Modify and Analyse Phylogenetic Trees -Version: 2.3.0.9001 +Version: 2.3.0.9002 Authors@R: c( person("Martin R.", 'Smith', role = c("aut", "cre", "cph"), email = "martin.smith@durham.ac.uk", @@ -63,6 +63,7 @@ Config/Needs/memcheck: pkgdown, testthat Config/Needs/metadata: codemeta Config/Needs/revdeps: revdepcheck Config/Needs/website: pkgdown +Config/roxygen2/version: 8.0.0 Config/testthat/parallel: false Config/testthat/edition: 3 LinkingTo: Rcpp @@ -73,4 +74,3 @@ Encoding: UTF-8 Language: en-GB VignetteBuilder: knitr Roxygen: list(markdown = TRUE) -Config/roxygen2/version: 8.0.0 diff --git a/NAMESPACE b/NAMESPACE index 7f34d68f7..e3501d5e7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -372,6 +372,7 @@ export(NewickTree) export(NeworderPhylo) export(NeworderPruningwise) export(NexusTokens) +export(NexusTokensToInteger) export(NodeDepth) export(NodeNumbers) export(NodeOrder) diff --git a/NEWS.md b/NEWS.md index 072e61091..cb2166e7e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,17 @@ -# TreeTools 2.3.0.9001 (development) # +# TreeTools 2.3.0.9002 (development) # ## New features +- `NexusTokensToInteger()` converts character data to integers, + mapping uncertain tokens to `NA`. - `ReadTntCharacters()` attaches an `xgroup` attribute (factor) when a TNT - `xgroup` partition block is present, replacing the standalone `ReadXgroup()`. + `xgroup` partition block is present, replacing the stand-alone `ReadXgroup()`. + +## Fixes + +- `NexusTokens()` once again handles polymorphism tokens with internal + whitespace (e.g. `(1 2)`, `{0 1}`). + # TreeTools 2.3.0 (2026-04-22) # @@ -11,8 +19,7 @@ - `ReadTntCharacters()` now handles multi-line comments, bare `&` continuations, `@taxonomy` suffixes, name-only taxon lines, mid-line `xread`, smart-quote - names (Windows-1252), and packed multi-taxon lines; all 13 Goloboff (2019) - corpus files parse cleanly. + names (Windows-1252), and packed multi-taxon lines. ## Performance diff --git a/R/parse_files.R b/R/parse_files.R index 7201460fd..978757cd6 100644 --- a/R/parse_files.R +++ b/R/parse_files.R @@ -1011,6 +1011,75 @@ PhyDat <- function(dataset) { MatrixToPhyDat(mat) } +#' Convert Nexus token matrix to integer +#' +#' `NexusTokensToInteger()` converts the character matrix returned by +#' [`ReadCharacters()`] to an integer matrix, mapping polymorphic, +#' ambiguous (`?`), and inapplicable (`-`) tokens to `NA_integer_` or to the +#' first/last state listed in the polymorphism, depending on `polymorphism`. +#' +#' Only digit states `0`..`9` are recognised; non-digit symbols (and any +#' token whose interior contains no digits) become `NA_integer_`. +#' Polymorphism extraction (`polymorphism = "first"`/`"last"`) likewise +#' considers digits only. +#' +#' If `tokens` is a `phyDat` object it is first converted via +#' [`PhyDatToMatrix()`] with `ambigNA = TRUE, inappNA = TRUE`, so that +#' fully-ambiguous and inapplicable rows become `NA_integer_` and only +#' true partial polymorphisms are subject to the `polymorphism` rule. +#' +#' @param tokens Character matrix as returned by [`ReadCharacters()`], a +#' character vector as returned by [`NexusTokens()`], or a `phyDat` object. +#' @param polymorphism Character string specifying how to handle polymorphic +#' tokens such as `"(01)"` or `"{12}"`: +#' \describe{ +#' \item{`"?"` (default)}{Treat as the NEXUS missing-data token: map to +#' `NA_integer_`.} +#' \item{`"first"`}{Use the first state digit inside the brackets.} +#' \item{`"last"`}{Use the last state digit inside the brackets.} +#' } +#' Tokens `"?"` and `"-"` always map to `NA_integer_` regardless of +#' `polymorphism`. +#' +#' @return An integer matrix (or vector) with the same dimensions and +#' `dimnames` as `tokens`. +#' +#' @examples +#' tokens <- matrix(c("0", "(12)", "1", "?", "-"), +#' nrow = 1, +#' dimnames = list("Taxon_A", paste0("C", 1:5))) +#' NexusTokensToInteger(tokens) +#' NexusTokensToInteger(tokens, polymorphism = "first") +#' +#' @family phylogenetic matrix conversion functions +#' @template MRS +#' @export +NexusTokensToInteger <- function(tokens, + polymorphism = c("?", "first", "last")) { + polymorphism <- match.arg(polymorphism) + if (inherits(tokens, "phyDat")) { + tokens <- PhyDatToMatrix(tokens, ambigNA = TRUE, inappNA = TRUE) + } + at <- attributes(tokens) + + x <- as.character(tokens) + result <- suppressWarnings(as.integer(x)) + + ambig <- is.na(result) & !is.na(x) & x != "?" & x != "-" + if (polymorphism != "?" && any(ambig)) { + pattern <- if (polymorphism == "first") "\\d" else "\\d(?=[^\\d]*$)" + m <- regexpr(pattern, x[ambig], perl = TRUE) + matched <- regmatches(x[ambig], m) + digits <- rep(NA_character_, sum(ambig)) + digits[m != -1L] <- matched + result[ambig] <- suppressWarnings(as.integer(digits)) + } + + attributes(result) <- at + result +} + + #' Rightmost character of string #' #' `RightmostCharacter()` is a convenience function that returns the final diff --git a/man/Decompose.Rd b/man/Decompose.Rd index 24a69d1f6..1461d624e 100644 --- a/man/Decompose.Rd +++ b/man/Decompose.Rd @@ -73,6 +73,7 @@ NumberOfChars(decomposed) # 116 characters in decomposed \seealso{ Other phylogenetic matrix conversion functions: \code{\link[=MatrixToPhyDat]{MatrixToPhyDat()}}, +\code{\link[=NexusTokensToInteger]{NexusTokensToInteger()}}, \code{\link[=Reweight]{Reweight()}}, \code{\link[=StringToPhyDat]{StringToPhyDat()}} } diff --git a/man/MatrixToPhyDat.Rd b/man/MatrixToPhyDat.Rd index ea3b4645d..b57517b05 100644 --- a/man/MatrixToPhyDat.Rd +++ b/man/MatrixToPhyDat.Rd @@ -68,6 +68,7 @@ head(PhyDatToMatrix(Lobo.phy)[, 91:93]) \seealso{ Other phylogenetic matrix conversion functions: \code{\link[=Decompose]{Decompose()}}, +\code{\link[=NexusTokensToInteger]{NexusTokensToInteger()}}, \code{\link[=Reweight]{Reweight()}}, \code{\link[=StringToPhyDat]{StringToPhyDat()}} } diff --git a/man/NexusTokensToInteger.Rd b/man/NexusTokensToInteger.Rd new file mode 100644 index 000000000..4ccbf16bf --- /dev/null +++ b/man/NexusTokensToInteger.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parse_files.R +\name{NexusTokensToInteger} +\alias{NexusTokensToInteger} +\title{Convert Nexus token matrix to integer} +\usage{ +NexusTokensToInteger(tokens, polymorphism = c("?", "first", "last")) +} +\arguments{ +\item{tokens}{Character matrix as returned by \code{\link[=ReadCharacters]{ReadCharacters()}}, a +character vector as returned by \code{\link[=NexusTokens]{NexusTokens()}}, or a \code{phyDat} object.} + +\item{polymorphism}{Character string specifying how to handle polymorphic +tokens such as \code{"(01)"} or \code{"{12}"}: +\describe{ +\item{\code{"?"} (default)}{Treat as the NEXUS missing-data token: map to +\code{NA_integer_}.} +\item{\code{"first"}}{Use the first state digit inside the brackets.} +\item{\code{"last"}}{Use the last state digit inside the brackets.} +} +Tokens \code{"?"} and \code{"-"} always map to \code{NA_integer_} regardless of +\code{polymorphism}.} +} +\value{ +An integer matrix (or vector) with the same dimensions and +\code{dimnames} as \code{tokens}. +} +\description{ +\code{NexusTokensToInteger()} converts the character matrix returned by +\code{\link[=ReadCharacters]{ReadCharacters()}} to an integer matrix, mapping polymorphic, +ambiguous (\verb{?}), and inapplicable (\code{-}) tokens to \code{NA_integer_} or to the +first/last state listed in the polymorphism, depending on \code{polymorphism}. +} +\details{ +Only digit states \code{0}..\code{9} are recognised; non-digit symbols (and any +token whose interior contains no digits) become \code{NA_integer_}. +Polymorphism extraction (\code{polymorphism = "first"}/\code{"last"}) likewise +considers digits only. + +If \code{tokens} is a \code{phyDat} object it is first converted via +\code{\link[=PhyDatToMatrix]{PhyDatToMatrix()}} with \verb{ambigNA = TRUE, inappNA = TRUE}, so that +fully-ambiguous and inapplicable rows become \code{NA_integer_} and only +true partial polymorphisms are subject to the \code{polymorphism} rule. +} +\examples{ +tokens <- matrix(c("0", "(12)", "1", "?", "-"), + nrow = 1, + dimnames = list("Taxon_A", paste0("C", 1:5))) +NexusTokensToInteger(tokens) +NexusTokensToInteger(tokens, polymorphism = "first") + +} +\seealso{ +Other phylogenetic matrix conversion functions: +\code{\link[=Decompose]{Decompose()}}, +\code{\link[=MatrixToPhyDat]{MatrixToPhyDat()}}, +\code{\link[=Reweight]{Reweight()}}, +\code{\link[=StringToPhyDat]{StringToPhyDat()}} +} +\author{ +\href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} +(\href{mailto:martin.smith@durham.ac.uk}{martin.smith@durham.ac.uk}) +} +\concept{phylogenetic matrix conversion functions} diff --git a/man/PhyToString.Rd b/man/PhyToString.Rd index ec58c9f6a..021b30e3d 100644 --- a/man/PhyToString.Rd +++ b/man/PhyToString.Rd @@ -97,6 +97,7 @@ PhyToString(phyDat, concatenate = FALSE) Other phylogenetic matrix conversion functions: \code{\link[=Decompose]{Decompose()}}, \code{\link[=MatrixToPhyDat]{MatrixToPhyDat()}}, +\code{\link[=NexusTokensToInteger]{NexusTokensToInteger()}}, \code{\link[=Reweight]{Reweight()}} } \author{ diff --git a/man/Reweight.Rd b/man/Reweight.Rd index 6bf37e24e..db3d6e894 100644 --- a/man/Reweight.Rd +++ b/man/Reweight.Rd @@ -69,6 +69,7 @@ Reweight(dat, c("3" = 0, "2" = 2)) Other phylogenetic matrix conversion functions: \code{\link[=Decompose]{Decompose()}}, \code{\link[=MatrixToPhyDat]{MatrixToPhyDat()}}, +\code{\link[=NexusTokensToInteger]{NexusTokensToInteger()}}, \code{\link[=StringToPhyDat]{StringToPhyDat()}} } \author{ diff --git a/tests/testthat/test-parsers.R b/tests/testthat/test-parsers.R index d59a98f2b..71b6fdbe7 100644 --- a/tests/testthat/test-parsers.R +++ b/tests/testthat/test-parsers.R @@ -32,6 +32,134 @@ test_that("Nexus file can be parsed", { expect_equal(3L, unique(as.integer(read[3, ]))) }) +test_that("ReadCharacters handles Cingulata-style polymorphism with internal whitespace", { + nexusContent <- "#NEXUS +BEGIN CHARACTERS; + DIMENSIONS NCHAR=5; + FORMAT DATATYPE=STANDARD MISSING=? GAP=-; + MATRIX + Taxon_A 0(1 2)1?- + Taxon_B 1{0 1}0?- + Taxon_C 01010 + ;" + + tf <- tempfile(fileext = ".nex") + on.exit(unlink(tf)) + writeLines(nexusContent, tf) + read <- ReadCharacters(tf) + + expect_equal(dim(read), c(3L, 5L)) + expect_equal(unname(read["Taxon_A", 2]), "(12)") + expect_equal(unname(read["Taxon_B", 2]), "{01}") + expect_equal(unname(read["Taxon_A", 4]), "?") + expect_equal(unname(read["Taxon_A", 5]), "-") + expect_equal(unname(read["Taxon_C", 1]), "0") + + # Continuation lines: data for a taxon split across multiple lines + nexusMulti <- "#NEXUS +BEGIN CHARACTERS; + DIMENSIONS NCHAR=6; + FORMAT DATATYPE=STANDARD MISSING=? GAP=-; + MATRIX + Taxon_A 0(1 2)1 + ?-1 + Taxon_B 1{0 1}0 + ?-0 + ;" + + tf2 <- tempfile(fileext = ".nex") + on.exit(unlink(tf2), add = TRUE) + writeLines(nexusMulti, tf2) + readMulti <- ReadCharacters(tf2) + + expect_equal(dim(readMulti), c(2L, 6L)) + expect_equal(unname(readMulti["Taxon_A", 2]), "(12)") + expect_equal(unname(readMulti["Taxon_B", 2]), "{01}") + expect_equal(unname(readMulti["Taxon_A", 6]), "1") + expect_equal(unname(readMulti["Taxon_B", 6]), "0") + + # Round-trip the ReadCharacters() matrix through NexusTokensToInteger(). + intMat <- NexusTokensToInteger(read) + expect_equal(dim(intMat), c(3L, 5L)) + expect_equal(unname(intMat["Taxon_A", 1]), 0L) + expect_equal(unname(intMat["Taxon_A", 2]), NA_integer_) # polymorphism -> ? + expect_equal(unname(intMat["Taxon_B", 2]), NA_integer_) # uncertainty -> ? + expect_equal(unname(intMat["Taxon_A", 4]), NA_integer_) # ? + expect_equal(unname(intMat["Taxon_A", 5]), NA_integer_) # - + expect_equal(unname(intMat["Taxon_C", 5]), 0L) + expect_equal(unname(NexusTokensToInteger(read, "first")["Taxon_A", 2]), 1L) + expect_equal(unname(NexusTokensToInteger(read, "last")["Taxon_A", 2]), 2L) +}) + +test_that("NexusTokensToInteger() converts token matrix to integer", { + tokens <- matrix(c("0", "(12)", "1", "?", "-"), + nrow = 1L, + dimnames = list("Tax", paste0("C", 1:5))) + + # Default: polymorphisms and ambiguities become NA + result <- NexusTokensToInteger(tokens) + expect_equal(dim(result), c(1L, 5L)) + expect_equal(dimnames(result), list("Tax", paste0("C", 1:5))) + expect_equal(unname(result["Tax", "C1"]), 0L) + expect_equal(unname(result["Tax", "C2"]), NA_integer_) + expect_equal(unname(result["Tax", "C3"]), 1L) + expect_equal(unname(result["Tax", "C4"]), NA_integer_) + expect_equal(unname(result["Tax", "C5"]), NA_integer_) + + # polymorphism = "first": take first digit inside brackets + result_f <- NexusTokensToInteger(tokens, polymorphism = "first") + expect_equal(unname(result_f["Tax", "C2"]), 1L) + expect_equal(unname(result_f["Tax", "C4"]), NA_integer_) + + # polymorphism = "last": take last digit inside brackets + result_l <- NexusTokensToInteger(tokens, polymorphism = "last") + expect_equal(unname(result_l["Tax", "C2"]), 2L) + + # Braces form {01} + tokens2 <- matrix(c("{01}", "0"), nrow = 1L, dimnames = list("T1", c("C1", "C2"))) + expect_equal(unname(NexusTokensToInteger(tokens2)["T1", "C1"]), NA_integer_) + expect_equal(unname(NexusTokensToInteger(tokens2, "first")["T1", "C1"]), 0L) + expect_equal(unname(NexusTokensToInteger(tokens2, "last")["T1", "C1"]), 1L) + + # Named vector input (no dim attribute). + vec <- c(a = "0", b = "(12)", c = "?", d = "1") + vecOut <- NexusTokensToInteger(vec) + expect_null(dim(vecOut)) + expect_equal(names(vecOut), c("a", "b", "c", "d")) + expect_equal(unname(vecOut), c(0L, NA_integer_, NA_integer_, 1L)) + expect_equal(unname(NexusTokensToInteger(vec, "first")), c(0L, 1L, NA_integer_, 1L)) + expect_equal(unname(NexusTokensToInteger(vec, "last")), c(0L, 2L, NA_integer_, 1L)) + + # No-digit polymorphism token: must not crash, must yield NA in every mode. + noDigit <- matrix(c("(AB)", "0"), nrow = 1L, + dimnames = list("T", c("C1", "C2"))) + expect_silent(NexusTokensToInteger(noDigit)) + expect_equal(unname(NexusTokensToInteger(noDigit)["T", "C1"]), NA_integer_) + expect_equal(unname(NexusTokensToInteger(noDigit, "first")["T", "C1"]), + NA_integer_) + expect_equal(unname(NexusTokensToInteger(noDigit, "last")["T", "C1"]), + NA_integer_) + expect_equal(unname(NexusTokensToInteger(noDigit, "first")["T", "C2"]), 0L) + + # state.labels and other matrix attributes round-trip through the result. + tokens3 <- matrix(c("0", "(12)", "1"), nrow = 1L, + dimnames = list("Tax", c("C1", "C2", "C3"))) + attr(tokens3, "state.labels") <- list(c("absent", "present"), + c("a", "b", "c"), + c("absent", "present")) + out3 <- NexusTokensToInteger(tokens3) + expect_equal(attr(out3, "state.labels"), attr(tokens3, "state.labels")) + + # phyDat input routes through PhyDatToMatrix(ambigNA = TRUE, inappNA = TRUE). + phy <- MatrixToPhyDat(matrix(c("0", "(12)", "1", "?", "-"), + nrow = 1L, + dimnames = list("Tax", paste0("C", 1:5)))) + viaPhy <- NexusTokensToInteger(phy) + viaMat <- NexusTokensToInteger(PhyDatToMatrix(phy, ambigNA = TRUE, + inappNA = TRUE)) + expect_equal(unname(viaPhy), unname(viaMat)) +}) + test_that("NexusTokens() fails gracefully", { expect_error(NexusTokens("0123012301230123", integer(0))) expect_equal("Character number must be between 1 and 16.",