MAPLE-Lab · konradswierczek · Sep 11, 2025 · Sep 11, 2025
diff --git a/R/permutation_tests.R b/R/permutation_tests.R
@@ -0,0 +1,277 @@
+# ============================================================================ #
+#' Permutation Test for Two Groups
+#'
+#' Performs a permutation test comparing the difference in means of a numeric
+#' variable between two groups. The null distribution is generated by randomly
+#' shuffling group labels.
+#'
+#' @param df A data.frame or tibble containing the data.
+#' @param groups A string specifying the grouping variable (must have exactly two levels).
+#' @param vals A string specifying the numeric response variable.
+#' @param n Number of permutations to perform. Default is 1000.
+#' @param type Type of test: either `"two-tailed"` (default) or `"one-tailed"`.
+#' @param seed Integer to set random seed for reproducibility. Default is 123.
+#' @param ci Confidence level for the null distribution (default 0.95).
+#'
+#' @return A list with components:
+#' \describe{
+#'   \item{permutations}{Vector of permuted test statistics.}
+#'   \item{observed_statistic}{Observed difference in group means.}
+#'   \item{confidence}{Confidence interval for the null distribution.}
+#'   \item{p_value}{Permutation p-value.}
+#' }
+#' @examples
+#' set.seed(123)
+#' df <- data.frame(
+#'   group = rep(c("A", "B"), each = 10),
+#'   value = c(rnorm(10, 0), rnorm(10, 1))
+#' )
+#' permutation_test(df, groups = "group", vals = "value", n = 500)
+#'
+#' @export
+permutation_test <- function(df, groups, vals, n = 1000,
+                             type = "two-tailed", seed = 123,
+                             ci = 0.95) {
+  set.seed(seed)
+  if (!groups %in% names(df)) stop("Group variable not found in dataframe.")
+  if (length(unique(df[[groups]])) != 2) stop("Group variable must have exactly two levels.")
+
+  group_levels <- sort(unique(df[[groups]]))
+  observed_statistic <- diff(tapply(df[[vals]], df[[groups]], mean)[group_levels])
+
+  permutations <- replicate(n, {
+    shuffled <- sample(df[[groups]])
+    diff(tapply(df[[vals]], shuffled, mean)[group_levels])
+  })
+
+  p_value <- switch(type,
+                    "one-tailed" = mean(permutations > observed_statistic),
+                    "two-tailed" = mean(abs(permutations) > abs(observed_statistic)),
+                    stop("Invalid type"))
+
+  alpha <- (1 - ci) / 2
+  confidence <- quantile(permutations, probs = c(alpha, 1 - alpha), na.rm = TRUE)
+
+  list(
+    permutations = as.vector(permutations),
+    observed_statistic = observed_statistic,
+    confidence = confidence,
+    p_value = p_value
+  )
+}
+
+# ============================================================================ #
+#' Pairwise Permutation Tests
+#'
+#' Performs pairwise permutation tests for all combinations of groups in a factor.
+#'
+#' @param df A data.frame or tibble containing the data.
+#' @param group The grouping variable.
+#' @param vals Numeric response variable.
+#' @param n Number of permutations. Default 1000.
+#' @param type Type of test: `"two-tailed"` or `"one-tailed"`.
+#' @param seed Random seed for reproducibility.
+#' @param ci Confidence level for null distribution (default 0.95).
+#'
+#' @return A tibble with columns:
+#' \describe{
+#'   \item{group1}{First group in the comparison.}
+#'   \item{group2}{Second group in the comparison.}
+#'   \item{p_value}{Permutation p-value.}
+#'   \item{observed_statistic}{Observed difference in means.}
+#'   \item{ci_low}{Lower bound of confidence interval.}
+#'   \item{ci_high}{Upper bound of confidence interval.}
+#' }
+#' @export
+pairwise_permutation_tests <- function(df, group, vals, n = 1000,
+                                       type = "two-tailed", seed = 123, ci = 0.95) {
+  comparisons <- combn(unique(df[[group]]), 2)
+  results <- apply(comparisons, 2, function(pair) {
+    subset_data <- df[df[[group]] %in% pair, , drop = FALSE]
+    permutation_test(subset_data, group, vals, n = n, type = type, seed = seed, ci = ci)
+  })
+
+  tibble(
+    group1 = comparisons[1, ],
+    group2 = comparisons[2, ],
+    p_value = sapply(results, `[[`, "p_value"),
+    observed_statistic = sapply(results, `[[`, "observed_statistic"),
+    ci_low = sapply(results, function(x) x$confidence[1]),
+    ci_high = sapply(results, function(x) x$confidence[2])
+  )
+}
+
+# ============================================================================ #
+#' Permutation ANOVA (one-way)
+#'
+#' Performs a one-way permutation ANOVA.
+#'
+#' @param df Data frame or tibble.
+#' @param group Factor variable.
+#' @param vals Numeric response variable.
+#' @param n Number of permutations.
+#' @param seed Random seed.
+#' @param ci Confidence level for null distribution (default 0.95).
+#'
+#' @return List with:
+#' \describe{
+#'   \item{observed_statistic}{Observed F-statistic.}
+#'   \item{permutations}{Permuted F-statistics.}
+#'   \item{confidence}{Confidence interval for null distribution.}
+#'   \item{p_value}{Permutation p-value.}
+#' }
+#' @export
+permutation_test_anova <- function(df, group, vals, n = 1000, seed = 123, ci = 0.95) {
+  set.seed(seed)
+  if (!group %in% names(df)) stop("Group variable not found in dataframe.")
+
+  fit <- aov(df[[vals]] ~ df[[group]])
+  observed_f <- summary(fit)[[1]][["F value"]][1]
+
+  permuted_f <- replicate(n, {
+    perm_group <- sample(df[[group]])
+    summary(aov(df[[vals]] ~ perm_group))[[1]][["F value"]][1]
+  })
+
+  p_value <- mean(permuted_f >= observed_f)
+  alpha <- (1 - ci) / 2
+  confidence <- quantile(permuted_f, c(alpha, 1 - alpha), na.rm = TRUE)
+
+  list(
+    observed_statistic = observed_f,
+    permutations = permuted_f,
+    confidence = confidence,
+    p_value = p_value
+  )
+}
+
+# ============================================================================ #
+#' Factorial Permutation ANOVA (two-way)
+#'
+#' Performs a two-way permutation ANOVA, reporting main effects and interaction.
+#'
+#' @param df Data frame or tibble.
+#' @param factor1 First factor.
+#' @param factor2 Second factor.
+#' @param vals Numeric response variable.
+#' @param n Number of permutations.
+#' @param seed Random seed.
+#' @param ci Confidence level for null distribution (default 0.95).
+#'
+#' @return A tibble with:
+#' \describe{
+#'   \item{factor}{Factor or interaction term.}
+#'   \item{observed_statistic}{Observed F-statistic.}
+#'   \item{p_value}{Permutation p-value.}
+#'   \item{ci_low}{Lower bound of confidence interval.}
+#'   \item{ci_high}{Upper bound of confidence interval.}
+#' }
+#' @export
+factorial_permutation_anova <- function(
+    df,
+    factor1,
+    factor2,
+    vals,
+    n = 1000,
+    seed = 123,
+    ci = 0.95
+) {
+  set.seed(seed)
+  fit <- aov(df[[vals]] ~ df[[factor1]] * df[[factor2]])
+  observed_f <- summary(fit)[[1]][["F value"]][1:3]
+
+  permuted_f1 <- numeric(n)
+  permuted_f2 <- numeric(n)
+  permuted_interaction <- numeric(n)
+
+  for (i in 1:n) {
+    perm_factor1 <- sample(df[[factor1]])
+    perm_factor2 <- sample(df[[factor2]])
+    fit_perm <- aov(df[[vals]] ~ perm_factor1 * perm_factor2)
+    permuted_f1[i] <- summary(fit_perm)[[1]][["F value"]][1]
+    permuted_f2[i] <- summary(fit_perm)[[1]][["F value"]][2]
+    permuted_interaction[i] <- summary(fit_perm)[[1]][["F value"]][3]
+  }
+
+  alpha <- (1 - ci) / 2
+  tibble(
+    factor = c(factor1, factor2, paste0(factor1, ":", factor2)),
+    observed_statistic = observed_f,
+    p_value = c(
+      mean(permuted_f1 >= observed_f[1]),
+      mean(permuted_f2 >= observed_f[2]),
+      mean(permuted_interaction >= observed_f[3])
+    ),
+    ci_low = c(
+      quantile(permuted_f1, alpha),
+      quantile(permuted_f2, alpha),
+      quantile(permuted_interaction, alpha)
+    ),
+    ci_high = c(
+      quantile(permuted_f1, 1-alpha),
+      quantile(permuted_f2, 1-alpha),
+      quantile(permuted_interaction, 1-alpha)
+    )
+  )
+}
+
+# ============================================================================ #
+#' General Permutation ANOVA (multi-factor)
+#'
+#' Performs permutation ANOVA for any number of factors.
+#'
+#' @param df Data frame or tibble.
+#' @param factors Character vector of factor names.
+#' @param response Response variable.
+#' @param n Number of permutations.
+#' @param seed Random seed.
+#' @param ci Confidence level for null distribution (default 0.95).
+#'
+#' @return A tibble with:
+#' \describe{
+#'   \item{observed_statistic}{Observed F-statistics.}
+#'   \item{permutations}{List of permuted F-statistics.}
+#'   \item{p_value}{Permutation p-values for each factor/interaction.}
+#'   \item{ci_low}{Lower bound of confidence interval.}
+#'   \item{ci_high}{Upper bound of confidence interval.}
+#' }
+#' @export
+permutation_anova <- function(
+    df,
+    factors,
+    response,
+    n = 1000,
+    seed = 123,
+    ci = 0.95
+) {
+  set.seed(seed)
+  for (f in factors) if (!f %in% names(df)) stop(
+    paste("Factor", f, "not found in dataframe.")
+  )
+
+  formula <- as.formula(
+    paste(
+      response,
+      "~",
+      paste(factors, collapse = "*")
+    )
+  )
+  observed_f <- anova(lm(formula, data = df))$`F value`
+
+  permutations <- replicate(n, {
+    permuted_response <- sample(df[[response]])
+    permuted_df <- df
+    permuted_df[[response]] <- permuted_response
+    anova(lm(formula, data = permuted_df))$`F value`
+  })
+
+  alpha <- (1 - ci) / 2
+  tibble(
+    observed_statistic = observed_f,
+    permutations = I(list(permutations)),
+    p_value = sapply(observed_f, function(x) mean(permutations >= x)),
+    ci_low = quantile(permutations, alpha),
+    ci_high = quantile(permutations, 1-alpha)
+  )
+}
+# ============================================================================ #