ms609 · ms609 · Jun 1, 2026 · May 31, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -32,5 +32,6 @@ vignettes/.RData
 ^data-raw$
 ^pkgdown$
 ^revdep$
+^dev$
 ^\.positai$
 ^\.claude$
diff --git a/NEWS.md b/NEWS.md
@@ -10,6 +10,13 @@
 - `ReadTntCharacters()` attaches an `xgroup` attribute (factor) when a TNT
   `xgroup` partition block is present, replacing the stand-alone `ReadXgroup()`.
 
+## Performance
+
+- `Consensus()` computes majority-rule and threshold consensus trees in time
+  linear in the number of trees (previously quadratic), after
+  Jansson, Shen & Sung (2016); implementation informed by their `FACT` package.
+  `SplitFrequency()` inherits the same single-pass speed-up.
+
 ## Fixes
 
 - `NexusTokens()` once again handles polymorphism tokens with internal

diff --git a/R/Consensus.R b/R/Consensus.R
@@ -1,14 +1,29 @@
 #' Construct consensus trees
 #'
 #' `Consensus()` calculates the consensus of a set of trees, using the
-#' algorithm of \insertCite{Day1985}{TreeTools}.
+#' cluster-table approach of \insertCite{Day1985}{TreeTools}.
+#'
+#' The strict consensus (`p = 1`) compares the clusters of the first tree
+#' against every other tree in linear time.  The majority-rule and threshold
+#' consensus (`0.5 <= p < 1`) instead count the frequency of every split across
+#' all trees in a single pass and retain those occurring in a proportion `p` or
+#' more of trees; this runs in time linear in the number of trees, after
+#' \insertCite{Jansson2016}{TreeTools} (implementation informed by the
+#' \acronym{FACT} package of Jansson, Shen and Sung).  By default the count uses
+#' a 128-bit hash, whose results are exact with overwhelming probability; set
+#' `exact = TRUE` for a slower but guaranteed-exact count.
 #'
 #' @param trees List of trees, optionally of class `multiPhylo`.
 #' @param p Proportion of trees that must contain a split for it to be reported
 #' in the consensus.  `p = 0.5` gives the majority-rule consensus; `p = 1` (the
 #' default) gives the strict consensus.
 #' @param check.labels Logical specifying whether to check that all trees have
 #' identical labels.  Defaults to `TRUE`, which is slower.
+#' @param exact Logical; if `TRUE`, majority/threshold consensus uses a slower
+#' but guaranteed-exact split count instead of the default 128-bit hashing
+#' (whose results are exact unless a hash collision conflates two distinct
+#' splits, which is vanishingly unlikely).  Ignored when `p = 1`, which is
+#' always exact.
 #'
 #' @return `Consensus()` returns an object of class `phylo`, rooted as in the
 #' first entry of `trees`.
@@ -23,7 +38,7 @@
 #' @references
 #' \insertAllCited{}
 #' @export
-Consensus <- function(trees, p = 1, check.labels = TRUE) {
+Consensus <- function(trees, p = 1, check.labels = TRUE, exact = FALSE) {
   if (length(trees) == 1L) {
     return(trees[[1]])
   }
@@ -69,7 +84,7 @@ Consensus <- function(trees, p = 1, check.labels = TRUE) {
 
   # Return:
   RootTree(.PreorderTree(
-    edge = splits_to_edge(consensus_tree(trees, p), nTip),
+    edge = splits_to_edge(consensus_tree(trees, p, exact = isTRUE(exact)), nTip),
     tip.label = TipLabels(trees[[1]])
   ), root)
 }

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -25,12 +25,12 @@ as_newick <- function(edge) {
     .Call(`_TreeTools_as_newick`, edge)
 }
 
-split_frequencies <- function(trees) {
-    .Call(`_TreeTools_split_frequencies`, trees)
+split_frequencies <- function(trees, exact = FALSE) {
+    .Call(`_TreeTools_split_frequencies`, trees, exact)
 }
 
-consensus_tree <- function(trees, p) {
-    .Call(`_TreeTools_consensus_tree`, trees, p)
+consensus_tree <- function(trees, p, exact = FALSE) {
+    .Call(`_TreeTools_consensus_tree`, trees, p, exact)
 }
 
 descendant_edges <- function(parent, child, postorder) {

diff --git a/R/Support.R b/R/Support.R
@@ -14,6 +14,13 @@
 #' or a `Splits` object. See
 #' [vignette](https://ms609.github.io/TreeTools/articles/load-trees.html) for
 #' possible methods of loading trees into R.
+#' @param exact Logical specifying whether to use the slower but guaranteed
+#' exact algorithm when counting the frequencies of _all_ splits (i.e. when
+#' `reference = NULL`).  The default (`FALSE`) uses a faster hashing approach
+#' whose results are exact with overwhelming probability (a 128-bit hash
+#' collision, which would conflate two distinct splits, is vanishingly
+#' unlikely); set `exact = TRUE` if certainty is required.  Ignored when
+#' `reference` is a tree or `Splits` object.
 #'
 #' @return `SplitFrequency()` returns the number of trees in `forest` that
 #' contain each split in `reference`.
@@ -31,7 +38,7 @@
 #' @template MRS
 #' @family Splits operations
 #' @export
-SplitFrequency <- function(reference, forest = NULL) {
+SplitFrequency <- function(reference, forest = NULL, exact = FALSE) {
   if (is.null(reference) || is.null(forest)) {
     if (is.null(forest)) forest <- reference
     if (inherits(forest, "phylo")) forest <- list(forest)
@@ -49,7 +56,7 @@ SplitFrequency <- function(reference, forest = NULL) {
     }
     forest <- RenumberTips(forest, tipLabels)
     forest <- Preorder(forest)
-    result <- split_frequencies(forest)
+    result <- split_frequencies(forest, exact = isTRUE(exact))
     splits <- result[["splits"]]
     counts <- result[["counts"]]
     nTip <- length(tipLabels)

diff --git a/dev/red-team/bench-majority-scaling.R b/dev/red-team/bench-majority-scaling.R
@@ -0,0 +1,57 @@
+# Benchmark: majority consensus scaling in k (number of trees) at fixed n.
+# The new core scales ~linearly in k; both count modes (hashed default, exact
+# opt-in) are timed.
+#
+# Historical result vs the previous O(k^2 n) core (random trees, n = 100):
+#   k:        100    200    400    800   1600
+#   new (s):  0.014  0.027  0.050  0.100  0.209   (exponent a = 0.97, linear)
+#   old (s):  0.027  0.095  0.353  1.36   5.36    (exponent a = 1.91, quadratic)
+#   speed-up: 1.9x   3.6x   7.0x   13.6x  25.6x
+#
+# Run:  Rscript dev/red-team/bench-majority-scaling.R
+suppressMessages(devtools::load_all(".", quiet = TRUE))
+
+# Random trees: their majority consensus is ~unresolved, so the legacy core
+# never hits its perfectly-resolved early exit and pays the full O(k^2 n).
+n_tip <- 100L
+ks <- c(100, 200, 400, 800, 1600)
+reps <- 2L
+set.seed(1)
+make_forest <- function(k) {
+  forest <- lapply(seq_len(k), function(i) ape::rtree(n_tip, br = NULL))
+  forest <- Preorder(RenumberTips(forest, forest[[1]]))
+  structure(forest, class = "multiPhylo")
+}
+
+bench1 <- function(fn, forest, p) {
+  best <- Inf
+  for (i in seq_len(reps)) {
+    t0 <- Sys.time()
+    fn(forest, p)
+    best <- min(best, as.numeric(Sys.time() - t0, units = "secs"))
+  }
+  best
+}
+
+hashed <- function(f, p) TreeTools:::consensus_tree(f, p)
+exact  <- function(f, p) TreeTools:::consensus_tree(f, p, exact = TRUE)
+
+cat(sprintf("Majority consensus, fixed n = %d tips, p = 0.5\n", n_tip))
+cat(sprintf("%6s %12s %12s\n", "k", "hashed (s)", "exact (s)"))
+res <- data.frame()
+for (k in ks) {
+  forest <- make_forest(k)
+  th <- bench1(hashed, forest, 0.5)
+  te <- bench1(exact, forest, 0.5)
+  cat(sprintf("%6d %12.4f %12.4f\n", k, th, te))
+  res <- rbind(res, data.frame(k = k, hashed = th, exact = te))
+}
+
+# Scaling exponent: slope of log(time) vs log(k).
+fit_exp <- function(k, t) {
+  ok <- t > 0
+  unname(coef(lm(log(t[ok]) ~ log(k[ok])))[2])
+}
+cat(sprintf("\nEmpirical scaling exponent (time ~ k^a; target ~1.0, linear):\n"))
+cat(sprintf("  hashed a = %.2f\n", fit_exp(res$k, res$hashed)))
+cat(sprintf("  exact  a = %.2f\n", fit_exp(res$k, res$exact)))
diff --git a/dev/red-team/verify-consensus.R b/dev/red-team/verify-consensus.R
@@ -0,0 +1,102 @@
+# Consistency cross-check for the O(kn) consensus / split-frequency machinery on
+# adversarial inputs: the hashed (default) and exact split counts must agree, in
+# both Consensus() and SplitFrequency().
+#
+# Historical note: during development this script also compared the new core
+# against a transient legacy O(k^2 n) oracle (`consensus_tree_legacy`); that gate
+# passed with 0 failures across all fixtures below before the oracle was removed.
+# The independent oracle for ongoing regression is `ape::consensus()`, exercised
+# by tests/testthat/test-consensus.R.
+#
+# Run:  Rscript dev/red-team/verify-consensus.R
+suppressMessages(devtools::load_all(".", quiet = TRUE))
+set.seed(1)
+
+# Canonical set of splits from a packed RawMatrix (order-independent).
+split_set <- function(m, n_tip) {
+  if (is.null(m) || nrow(m) == 0) return(character(0))
+  out <- character(nrow(m))
+  for (r in seq_len(nrow(m))) {
+    bits <- as.logical(rawToBits(as.raw(m[r, ])))[seq_len(n_tip)]
+    if (isTRUE(bits[1])) bits <- !bits          # tip 1 on the FALSE side
+    side <- which(bits)
+    if (length(side) < 2 || length(side) > n_tip - 2) next  # skip trivial
+    out[r] <- paste(side, collapse = ",")
+  }
+  sort(unique(out[out != ""]))
+}
+
+fails <- 0L
+check <- function(cond, msg) {
+  if (!isTRUE(cond)) { cat("  FAIL:", msg, "\n"); fails <<- fails + 1L }
+}
+
+ps <- c(0.5, 0.55, 0.6, 2/3, 0.75, 0.8, 0.9, 0.99, 1)
+
+# ---- Random forests across a range of (n, k) ----------------------------------
+message("== random forests ==")
+# k = 1 is handled by R's Consensus() wrapper (returns the single tree), never
+# reaching consensus_tree(); the legacy oracle is not robust to it, so skip.
+for (n_tip in c(4, 5, 6, 8, 13, 20, 50)) {
+  for (k in c(2, 3, 5, 7, 8, 15, 32, 60)) {
+    message(sprintf("  n=%d k=%d", n_tip, k))
+    forest <- lapply(seq_len(k), function(i) ape::rtree(n_tip, br = NULL))
+    forest <- RenumberTips(forest, forest[[1]])
+    forest <- Preorder(forest)
+    class(forest) <- "multiPhylo"
+    for (p in ps) {
+      hashed <- split_set(TreeTools:::consensus_tree(forest, p), n_tip)
+      exact  <- split_set(TreeTools:::consensus_tree(forest, p, exact = TRUE), n_tip)
+      check(identical(hashed, exact),
+            sprintf("consensus hashed!=exact n=%d k=%d p=%.3g", n_tip, k, p))
+    }
+  }
+}
+
+# ---- Adversarial structured fixtures -----------------------------------------
+cat("== structured fixtures ==\n")
+adversarial <- list(
+  all_identical = rep(list(BalancedTree(8)), 7),
+  star_disagree = list(ape::read.tree(text = "((a,b),(c,d));"),
+                       ape::read.tree(text = "((a,c),(b,d));")),
+  single_split  = list(ape::read.tree(text = "((a,b,c,d),(e,f,g));"),
+                       ape::read.tree(text = "((a,b,c,d),(e,f,g));")),
+  one_off_bal   = c(rep(list(BalancedTree(10)), 3), list(PectinateTree(10))),
+  threshold_tie = c(rep(list(BalancedTree(8)), 2), rep(list(PectinateTree(8)), 2)),
+  mixed_resoln  = list(BalancedTree(8), CollapseNode(BalancedTree(8), 11:12),
+                       PectinateTree(8))
+)
+for (nm in names(adversarial)) {
+  forest <- adversarial[[nm]]
+  forest <- RenumberTips(forest, forest[[1]])
+  forest <- Preorder(forest)
+  class(forest) <- "multiPhylo"
+  n_tip <- NTip(forest[[1]])
+  for (p in ps) {
+    hashed <- split_set(TreeTools:::consensus_tree(forest, p), n_tip)
+    exact  <- split_set(TreeTools:::consensus_tree(forest, p, exact = TRUE), n_tip)
+    check(identical(hashed, exact), sprintf("%s p=%.3g: hashed!=exact", nm, p))
+  }
+}
+
+# ---- Hashed vs exact split frequencies (gate 5) ------------------------------
+cat("== hashed vs exact SplitFrequency ==\n")
+for (n_tip in c(4, 6, 8, 13, 30)) {
+  for (k in c(1, 2, 5, 12, 40)) {
+    forest <- lapply(seq_len(k), function(i) ape::rtree(n_tip, br = NULL))
+    class(forest) <- "multiPhylo"
+    sh <- SplitFrequency(forest, exact = FALSE)
+    se <- SplitFrequency(forest, exact = TRUE)
+    # Compare as (split -> count) maps, order-independent
+    key <- function(s) {
+      if (length(s) == 0) return(character(0))
+      paste(as.character(s), attr(s, "count"))
+    }
+    check(setequal(key(sh), key(se)),
+          sprintf("SplitFrequency n=%d k=%d: hashed != exact", n_tip, k))
+  }
+}
+
+cat(sprintf("\n==== %s : %d failures ====\n",
+            if (fails == 0) "PASS" else "FAIL", fails))
+quit(status = if (fails == 0) 0 else 1)
diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib
@@ -376,3 +376,14 @@ @article{Wilkinson1992
   pages = {375--385},
   doi = {10.1111/j.1096-0031.1992.tb00079.x}
 }
+
+@article{Jansson2016,
+  title = {Improved algorithms for constructing consensus trees},
+  author = {Jansson, Jesper and Shen, Chuanqi and Sung, Wing-Kin},
+  journal = {Journal of the ACM},
+  volume = {63},
+  number = {3},
+  pages = {28:1--28:24},
+  year = {2016},
+  doi = {10.1145/2898436}
+}
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -11,6 +11,7 @@ Foulds
 Guillerme
 HCL
 Hennig's
+Jansson
 Klopfstein
 Krzywinski
 Lemant
@@ -36,6 +37,7 @@ Rdpack
 Reweight
 SPR
 Sackin's
+Shen
 Spasojevic
 Stemmier
 Stemwardness

diff --git a/man/Consensus.Rd b/man/Consensus.Rd
diff --git a/man/SplitFrequency.Rd b/man/SplitFrequency.Rd