diff --git a/R/permutation_tests.R b/R/permutation_tests.R new file mode 100644 index 0000000..3183e7f --- /dev/null +++ b/R/permutation_tests.R @@ -0,0 +1,277 @@ +# ============================================================================ # +#' Permutation Test for Two Groups +#' +#' Performs a permutation test comparing the difference in means of a numeric +#' variable between two groups. The null distribution is generated by randomly +#' shuffling group labels. +#' +#' @param df A data.frame or tibble containing the data. +#' @param groups A string specifying the grouping variable (must have exactly two levels). +#' @param vals A string specifying the numeric response variable. +#' @param n Number of permutations to perform. Default is 1000. +#' @param type Type of test: either `"two-tailed"` (default) or `"one-tailed"`. +#' @param seed Integer to set random seed for reproducibility. Default is 123. +#' @param ci Confidence level for the null distribution (default 0.95). +#' +#' @return A list with components: +#' \describe{ +#' \item{permutations}{Vector of permuted test statistics.} +#' \item{observed_statistic}{Observed difference in group means.} +#' \item{confidence}{Confidence interval for the null distribution.} +#' \item{p_value}{Permutation p-value.} +#' } +#' @examples +#' set.seed(123) +#' df <- data.frame( +#' group = rep(c("A", "B"), each = 10), +#' value = c(rnorm(10, 0), rnorm(10, 1)) +#' ) +#' permutation_test(df, groups = "group", vals = "value", n = 500) +#' +#' @export +permutation_test <- function(df, groups, vals, n = 1000, + type = "two-tailed", seed = 123, + ci = 0.95) { + set.seed(seed) + if (!groups %in% names(df)) stop("Group variable not found in dataframe.") + if (length(unique(df[[groups]])) != 2) stop("Group variable must have exactly two levels.") + + group_levels <- sort(unique(df[[groups]])) + observed_statistic <- diff(tapply(df[[vals]], df[[groups]], mean)[group_levels]) + + permutations <- replicate(n, { + shuffled <- sample(df[[groups]]) + diff(tapply(df[[vals]], shuffled, mean)[group_levels]) + }) + + p_value <- switch(type, + "one-tailed" = mean(permutations > observed_statistic), + "two-tailed" = mean(abs(permutations) > abs(observed_statistic)), + stop("Invalid type")) + + alpha <- (1 - ci) / 2 + confidence <- quantile(permutations, probs = c(alpha, 1 - alpha), na.rm = TRUE) + + list( + permutations = as.vector(permutations), + observed_statistic = observed_statistic, + confidence = confidence, + p_value = p_value + ) +} + +# ============================================================================ # +#' Pairwise Permutation Tests +#' +#' Performs pairwise permutation tests for all combinations of groups in a factor. +#' +#' @param df A data.frame or tibble containing the data. +#' @param group The grouping variable. +#' @param vals Numeric response variable. +#' @param n Number of permutations. Default 1000. +#' @param type Type of test: `"two-tailed"` or `"one-tailed"`. +#' @param seed Random seed for reproducibility. +#' @param ci Confidence level for null distribution (default 0.95). +#' +#' @return A tibble with columns: +#' \describe{ +#' \item{group1}{First group in the comparison.} +#' \item{group2}{Second group in the comparison.} +#' \item{p_value}{Permutation p-value.} +#' \item{observed_statistic}{Observed difference in means.} +#' \item{ci_low}{Lower bound of confidence interval.} +#' \item{ci_high}{Upper bound of confidence interval.} +#' } +#' @export +pairwise_permutation_tests <- function(df, group, vals, n = 1000, + type = "two-tailed", seed = 123, ci = 0.95) { + comparisons <- combn(unique(df[[group]]), 2) + results <- apply(comparisons, 2, function(pair) { + subset_data <- df[df[[group]] %in% pair, , drop = FALSE] + permutation_test(subset_data, group, vals, n = n, type = type, seed = seed, ci = ci) + }) + + tibble( + group1 = comparisons[1, ], + group2 = comparisons[2, ], + p_value = sapply(results, `[[`, "p_value"), + observed_statistic = sapply(results, `[[`, "observed_statistic"), + ci_low = sapply(results, function(x) x$confidence[1]), + ci_high = sapply(results, function(x) x$confidence[2]) + ) +} + +# ============================================================================ # +#' Permutation ANOVA (one-way) +#' +#' Performs a one-way permutation ANOVA. +#' +#' @param df Data frame or tibble. +#' @param group Factor variable. +#' @param vals Numeric response variable. +#' @param n Number of permutations. +#' @param seed Random seed. +#' @param ci Confidence level for null distribution (default 0.95). +#' +#' @return List with: +#' \describe{ +#' \item{observed_statistic}{Observed F-statistic.} +#' \item{permutations}{Permuted F-statistics.} +#' \item{confidence}{Confidence interval for null distribution.} +#' \item{p_value}{Permutation p-value.} +#' } +#' @export +permutation_test_anova <- function(df, group, vals, n = 1000, seed = 123, ci = 0.95) { + set.seed(seed) + if (!group %in% names(df)) stop("Group variable not found in dataframe.") + + fit <- aov(df[[vals]] ~ df[[group]]) + observed_f <- summary(fit)[[1]][["F value"]][1] + + permuted_f <- replicate(n, { + perm_group <- sample(df[[group]]) + summary(aov(df[[vals]] ~ perm_group))[[1]][["F value"]][1] + }) + + p_value <- mean(permuted_f >= observed_f) + alpha <- (1 - ci) / 2 + confidence <- quantile(permuted_f, c(alpha, 1 - alpha), na.rm = TRUE) + + list( + observed_statistic = observed_f, + permutations = permuted_f, + confidence = confidence, + p_value = p_value + ) +} + +# ============================================================================ # +#' Factorial Permutation ANOVA (two-way) +#' +#' Performs a two-way permutation ANOVA, reporting main effects and interaction. +#' +#' @param df Data frame or tibble. +#' @param factor1 First factor. +#' @param factor2 Second factor. +#' @param vals Numeric response variable. +#' @param n Number of permutations. +#' @param seed Random seed. +#' @param ci Confidence level for null distribution (default 0.95). +#' +#' @return A tibble with: +#' \describe{ +#' \item{factor}{Factor or interaction term.} +#' \item{observed_statistic}{Observed F-statistic.} +#' \item{p_value}{Permutation p-value.} +#' \item{ci_low}{Lower bound of confidence interval.} +#' \item{ci_high}{Upper bound of confidence interval.} +#' } +#' @export +factorial_permutation_anova <- function( + df, + factor1, + factor2, + vals, + n = 1000, + seed = 123, + ci = 0.95 +) { + set.seed(seed) + fit <- aov(df[[vals]] ~ df[[factor1]] * df[[factor2]]) + observed_f <- summary(fit)[[1]][["F value"]][1:3] + + permuted_f1 <- numeric(n) + permuted_f2 <- numeric(n) + permuted_interaction <- numeric(n) + + for (i in 1:n) { + perm_factor1 <- sample(df[[factor1]]) + perm_factor2 <- sample(df[[factor2]]) + fit_perm <- aov(df[[vals]] ~ perm_factor1 * perm_factor2) + permuted_f1[i] <- summary(fit_perm)[[1]][["F value"]][1] + permuted_f2[i] <- summary(fit_perm)[[1]][["F value"]][2] + permuted_interaction[i] <- summary(fit_perm)[[1]][["F value"]][3] + } + + alpha <- (1 - ci) / 2 + tibble( + factor = c(factor1, factor2, paste0(factor1, ":", factor2)), + observed_statistic = observed_f, + p_value = c( + mean(permuted_f1 >= observed_f[1]), + mean(permuted_f2 >= observed_f[2]), + mean(permuted_interaction >= observed_f[3]) + ), + ci_low = c( + quantile(permuted_f1, alpha), + quantile(permuted_f2, alpha), + quantile(permuted_interaction, alpha) + ), + ci_high = c( + quantile(permuted_f1, 1-alpha), + quantile(permuted_f2, 1-alpha), + quantile(permuted_interaction, 1-alpha) + ) + ) +} + +# ============================================================================ # +#' General Permutation ANOVA (multi-factor) +#' +#' Performs permutation ANOVA for any number of factors. +#' +#' @param df Data frame or tibble. +#' @param factors Character vector of factor names. +#' @param response Response variable. +#' @param n Number of permutations. +#' @param seed Random seed. +#' @param ci Confidence level for null distribution (default 0.95). +#' +#' @return A tibble with: +#' \describe{ +#' \item{observed_statistic}{Observed F-statistics.} +#' \item{permutations}{List of permuted F-statistics.} +#' \item{p_value}{Permutation p-values for each factor/interaction.} +#' \item{ci_low}{Lower bound of confidence interval.} +#' \item{ci_high}{Upper bound of confidence interval.} +#' } +#' @export +permutation_anova <- function( + df, + factors, + response, + n = 1000, + seed = 123, + ci = 0.95 +) { + set.seed(seed) + for (f in factors) if (!f %in% names(df)) stop( + paste("Factor", f, "not found in dataframe.") + ) + + formula <- as.formula( + paste( + response, + "~", + paste(factors, collapse = "*") + ) + ) + observed_f <- anova(lm(formula, data = df))$`F value` + + permutations <- replicate(n, { + permuted_response <- sample(df[[response]]) + permuted_df <- df + permuted_df[[response]] <- permuted_response + anova(lm(formula, data = permuted_df))$`F value` + }) + + alpha <- (1 - ci) / 2 + tibble( + observed_statistic = observed_f, + permutations = I(list(permutations)), + p_value = sapply(observed_f, function(x) mean(permutations >= x)), + ci_low = quantile(permutations, alpha), + ci_high = quantile(permutations, 1-alpha) + ) +} +# ============================================================================ #