Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
277 changes: 277 additions & 0 deletions R/permutation_tests.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
# ============================================================================ #
#' Permutation Test for Two Groups
#'
#' Performs a permutation test comparing the difference in means of a numeric
#' variable between two groups. The null distribution is generated by randomly
#' shuffling group labels.
#'
#' @param df A data.frame or tibble containing the data.
#' @param groups A string specifying the grouping variable (must have exactly two levels).
#' @param vals A string specifying the numeric response variable.
#' @param n Number of permutations to perform. Default is 1000.
#' @param type Type of test: either `"two-tailed"` (default) or `"one-tailed"`.
#' @param seed Integer to set random seed for reproducibility. Default is 123.
#' @param ci Confidence level for the null distribution (default 0.95).
#'
#' @return A list with components:
#' \describe{
#' \item{permutations}{Vector of permuted test statistics.}
#' \item{observed_statistic}{Observed difference in group means.}
#' \item{confidence}{Confidence interval for the null distribution.}
#' \item{p_value}{Permutation p-value.}
#' }
#' @examples
#' set.seed(123)
#' df <- data.frame(
#' group = rep(c("A", "B"), each = 10),
#' value = c(rnorm(10, 0), rnorm(10, 1))
#' )
#' permutation_test(df, groups = "group", vals = "value", n = 500)
#'
#' @export
permutation_test <- function(df, groups, vals, n = 1000,
type = "two-tailed", seed = 123,
ci = 0.95) {
set.seed(seed)
if (!groups %in% names(df)) stop("Group variable not found in dataframe.")
if (length(unique(df[[groups]])) != 2) stop("Group variable must have exactly two levels.")

group_levels <- sort(unique(df[[groups]]))
observed_statistic <- diff(tapply(df[[vals]], df[[groups]], mean)[group_levels])

permutations <- replicate(n, {
shuffled <- sample(df[[groups]])
diff(tapply(df[[vals]], shuffled, mean)[group_levels])
})

p_value <- switch(type,
"one-tailed" = mean(permutations > observed_statistic),
"two-tailed" = mean(abs(permutations) > abs(observed_statistic)),
stop("Invalid type"))

alpha <- (1 - ci) / 2
confidence <- quantile(permutations, probs = c(alpha, 1 - alpha), na.rm = TRUE)

list(
permutations = as.vector(permutations),
observed_statistic = observed_statistic,
confidence = confidence,
p_value = p_value
)
}

# ============================================================================ #
#' Pairwise Permutation Tests
#'
#' Performs pairwise permutation tests for all combinations of groups in a factor.
#'
#' @param df A data.frame or tibble containing the data.
#' @param group The grouping variable.
#' @param vals Numeric response variable.
#' @param n Number of permutations. Default 1000.
#' @param type Type of test: `"two-tailed"` or `"one-tailed"`.
#' @param seed Random seed for reproducibility.
#' @param ci Confidence level for null distribution (default 0.95).
#'
#' @return A tibble with columns:
#' \describe{
#' \item{group1}{First group in the comparison.}
#' \item{group2}{Second group in the comparison.}
#' \item{p_value}{Permutation p-value.}
#' \item{observed_statistic}{Observed difference in means.}
#' \item{ci_low}{Lower bound of confidence interval.}
#' \item{ci_high}{Upper bound of confidence interval.}
#' }
#' @export
pairwise_permutation_tests <- function(df, group, vals, n = 1000,
type = "two-tailed", seed = 123, ci = 0.95) {
comparisons <- combn(unique(df[[group]]), 2)
results <- apply(comparisons, 2, function(pair) {
subset_data <- df[df[[group]] %in% pair, , drop = FALSE]
permutation_test(subset_data, group, vals, n = n, type = type, seed = seed, ci = ci)
})

tibble(
group1 = comparisons[1, ],
group2 = comparisons[2, ],
p_value = sapply(results, `[[`, "p_value"),
observed_statistic = sapply(results, `[[`, "observed_statistic"),
ci_low = sapply(results, function(x) x$confidence[1]),
ci_high = sapply(results, function(x) x$confidence[2])
)
}

# ============================================================================ #
#' Permutation ANOVA (one-way)
#'
#' Performs a one-way permutation ANOVA.
#'
#' @param df Data frame or tibble.
#' @param group Factor variable.
#' @param vals Numeric response variable.
#' @param n Number of permutations.
#' @param seed Random seed.
#' @param ci Confidence level for null distribution (default 0.95).
#'
#' @return List with:
#' \describe{
#' \item{observed_statistic}{Observed F-statistic.}
#' \item{permutations}{Permuted F-statistics.}
#' \item{confidence}{Confidence interval for null distribution.}
#' \item{p_value}{Permutation p-value.}
#' }
#' @export
permutation_test_anova <- function(df, group, vals, n = 1000, seed = 123, ci = 0.95) {
set.seed(seed)
if (!group %in% names(df)) stop("Group variable not found in dataframe.")

fit <- aov(df[[vals]] ~ df[[group]])
observed_f <- summary(fit)[[1]][["F value"]][1]

permuted_f <- replicate(n, {
perm_group <- sample(df[[group]])
summary(aov(df[[vals]] ~ perm_group))[[1]][["F value"]][1]
})

p_value <- mean(permuted_f >= observed_f)
alpha <- (1 - ci) / 2
confidence <- quantile(permuted_f, c(alpha, 1 - alpha), na.rm = TRUE)

list(
observed_statistic = observed_f,
permutations = permuted_f,
confidence = confidence,
p_value = p_value
)
}

# ============================================================================ #
#' Factorial Permutation ANOVA (two-way)
#'
#' Performs a two-way permutation ANOVA, reporting main effects and interaction.
#'
#' @param df Data frame or tibble.
#' @param factor1 First factor.
#' @param factor2 Second factor.
#' @param vals Numeric response variable.
#' @param n Number of permutations.
#' @param seed Random seed.
#' @param ci Confidence level for null distribution (default 0.95).
#'
#' @return A tibble with:
#' \describe{
#' \item{factor}{Factor or interaction term.}
#' \item{observed_statistic}{Observed F-statistic.}
#' \item{p_value}{Permutation p-value.}
#' \item{ci_low}{Lower bound of confidence interval.}
#' \item{ci_high}{Upper bound of confidence interval.}
#' }
#' @export
factorial_permutation_anova <- function(
df,
factor1,
factor2,
vals,
n = 1000,
seed = 123,
ci = 0.95
) {
set.seed(seed)
fit <- aov(df[[vals]] ~ df[[factor1]] * df[[factor2]])
observed_f <- summary(fit)[[1]][["F value"]][1:3]

permuted_f1 <- numeric(n)
permuted_f2 <- numeric(n)
permuted_interaction <- numeric(n)

for (i in 1:n) {
perm_factor1 <- sample(df[[factor1]])
perm_factor2 <- sample(df[[factor2]])
fit_perm <- aov(df[[vals]] ~ perm_factor1 * perm_factor2)
permuted_f1[i] <- summary(fit_perm)[[1]][["F value"]][1]
permuted_f2[i] <- summary(fit_perm)[[1]][["F value"]][2]
permuted_interaction[i] <- summary(fit_perm)[[1]][["F value"]][3]
}

alpha <- (1 - ci) / 2
tibble(
factor = c(factor1, factor2, paste0(factor1, ":", factor2)),
observed_statistic = observed_f,
p_value = c(
mean(permuted_f1 >= observed_f[1]),
mean(permuted_f2 >= observed_f[2]),
mean(permuted_interaction >= observed_f[3])
),
ci_low = c(
quantile(permuted_f1, alpha),
quantile(permuted_f2, alpha),
quantile(permuted_interaction, alpha)
),
ci_high = c(
quantile(permuted_f1, 1-alpha),
quantile(permuted_f2, 1-alpha),
quantile(permuted_interaction, 1-alpha)
)
)
}

# ============================================================================ #
#' General Permutation ANOVA (multi-factor)
#'
#' Performs permutation ANOVA for any number of factors.
#'
#' @param df Data frame or tibble.
#' @param factors Character vector of factor names.
#' @param response Response variable.
#' @param n Number of permutations.
#' @param seed Random seed.
#' @param ci Confidence level for null distribution (default 0.95).
#'
#' @return A tibble with:
#' \describe{
#' \item{observed_statistic}{Observed F-statistics.}
#' \item{permutations}{List of permuted F-statistics.}
#' \item{p_value}{Permutation p-values for each factor/interaction.}
#' \item{ci_low}{Lower bound of confidence interval.}
#' \item{ci_high}{Upper bound of confidence interval.}
#' }
#' @export
permutation_anova <- function(
df,
factors,
response,
n = 1000,
seed = 123,
ci = 0.95
) {
set.seed(seed)
for (f in factors) if (!f %in% names(df)) stop(
paste("Factor", f, "not found in dataframe.")
)

formula <- as.formula(
paste(
response,
"~",
paste(factors, collapse = "*")
)
)
observed_f <- anova(lm(formula, data = df))$`F value`

permutations <- replicate(n, {
permuted_response <- sample(df[[response]])
permuted_df <- df
permuted_df[[response]] <- permuted_response
anova(lm(formula, data = permuted_df))$`F value`
})

alpha <- (1 - ci) / 2
tibble(
observed_statistic = observed_f,
permutations = I(list(permutations)),
p_value = sapply(observed_f, function(x) mean(permutations >= x)),
ci_low = quantile(permutations, alpha),
ci_high = quantile(permutations, 1-alpha)
)
}
# ============================================================================ #
Loading