From 5e6c20710b9a94beb11cbc8a1caefffa4824e3ca Mon Sep 17 00:00:00 2001 From: Markus Ulmer Date: Tue, 9 Dec 2025 16:48:35 +0100 Subject: [PATCH 01/15] test tree estimation helper function --- tests/testthat/test-tree.R | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-tree.R b/tests/testthat/test-tree.R index 637e5f9..520b69c 100644 --- a/tests/testthat/test-tree.R +++ b/tests/testthat/test-tree.R @@ -34,6 +34,14 @@ tree <- SDTree(x = X[, 1], y = Y, Q_type = 'no_deconfounding', expect_equal(tree$predictions, as.vector(predict(tree, data.frame(X = X[, 1])))) expect_equal(tree$predictions[1], predict(tree, data.frame(X = X[1, 1]))) + +#test tree with bootstrap sample +boot_index <- sample(1:50, 30, replace = T) +estimate_tree(Y = Y, X = X, A = NULL, max_leaves = 100, cp = 0, min_sample = 1, + mtry = NULL, fast = TRUE, Q_type = "trim", trim_quantile = 0.7, + q_hat = 3, Qf = NULL, gamma = NULL, max_candidates = 200, + Q_scale = TRUE, predictors = NULL, boot_index = boot_index) + #### does it work with only one covariate? 
set.seed(1) @@ -68,4 +76,4 @@ partDependence(tree, 1, X, subSample = 10) tree <- SDTree(x = X[, 1], y = Y, Q_type = 'no_deconfounding', cp = 0, min_sample = 5) expect_equal(tree$predictions, as.vector(predict(tree, data.frame(X = X[, 1])))) -expect_equal(tree$predictions[1], predict(tree, data.frame(X = X[1, 1]))) \ No newline at end of file +expect_equal(tree$predictions[1], predict(tree, data.frame(X = X[1, 1]))) From 15aeab31e10be6092bbaf4d6d342cba7a52f9166 Mon Sep 17 00:00:00 2001 From: Markus Ulmer Date: Tue, 9 Dec 2025 16:49:14 +0100 Subject: [PATCH 02/15] add tree estimation helper function reduces ram usage in multicoring --- R/utility.R | 264 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 263 insertions(+), 1 deletion(-) diff --git a/R/utility.R b/R/utility.R index 754a820..d7ff1a5 100644 --- a/R/utility.R +++ b/R/utility.R @@ -82,7 +82,7 @@ split_names <- function(node, var_names = NULL, digits = 2){ } -# finds all the reasonable spliting points in a data matrix +# finds all the reasonable splitting points in a data matrix find_s <- function(X, max_candidates = 100){ p <- ncol(X) if(p == 1){ @@ -268,4 +268,266 @@ Bbasis <- function(x, breaks){ return(Bx) } +estimate_tree <- function(boot_index, Y, X, A, max_leaves, cp, min_sample, mtry, fast, + Q_type, trim_quantile, q_hat, Qf, gamma, max_candidates, + Q_scale, predictors){ + if(is.null(boot_index)){ + boot_index <- 1:nrow(X) + tree_in_forest <- FALSE + }else{ + tree_in_forest <- TRUE + } + n <- length(boot_index) + + # estimate spectral transformation + if(!is.null(A)){ + if(is.null(gamma)) stop('gamma must be provided if A is provided') + if(is.vector(A)) A <- matrix(A) + if(!is.matrix(A)) stop('A must be a matrix') + if(nrow(A) != nrow(X)) stop('A must have n rows') + Wf <- get_Wf(matrix(A[boot_index, ], ncol = ncol(A)), gamma) + }else { + Wf <- function(v) v + } + if(is.null(Qf)){ + if(!is.null(A)){ + Qf <- function(v) get_Qf(Wf(X[boot_index, ]), Q_type, trim_quantile, q_hat, 
Q_scale)(Wf(v)) + }else{ + Qf <- get_Qf(X[boot_index, ], Q_type, trim_quantile, q_hat, Q_scale) + } + }else{ + if(!is.function(Qf)) stop('Q must be a function') + if(length(Qf(rnorm(n))) == n) stop('Q must map from n to n') + } + + #selection of predictors + if(!is.null(predictors)){ + if(is.character(predictors)){ + if(!all(predictors %in% colnames(X))) + stop("predictors must either be numeric columne index or in colnames of X") + predictors <- which(colnames(X) %in% predictors) + } + if(is.numeric(predictors)){ + if(!all(predictors > 0 & predictors <= ncol(X))) + stop("predictors must either be numeric columne index or in colnames of X") + } + pred_names <- colnames(X) + X <- matrix(X[, predictors], ncol = length(predictors)) + if(!is.null(pred_names)){ + colnames(X) <- pred_names[predictors] + } + } + + # number of covariates + p <- ncol(X) + if(!is.null(mtry) && mtry > p) stop('mtry must be at most p') + + # calculate first estimate + E <- matrix(1, n, 1) + E_tilde <- Qf(E) + Ue <- E_tilde / sqrt(sum(E_tilde ** 2)) + Y_tilde <- Qf(Y[boot_index]) + + # solve linear model + c_hat <- qr.coef(qr(E_tilde), Y_tilde) + c_hat <- as.numeric(c_hat) + + loss_start <- as.numeric(sum((Y_tilde - c_hat) ** 2) / n) + loss_temp <- loss_start + + # initialize tree + treeInfo <- c("name", "left", "right", "j", "s", "value", "dloss", + "res_dloss", "cp", "n_samples", "leaf") + d <- length(treeInfo) + + tree <- matrix(0, ncol = d, nrow = 1, dimnames = list(NULL, treeInfo)) + tree[1, c("name", "value", "dloss", "cp", "n_samples", "leaf")] <- + c(1, c_hat, loss_start, 10, n, 1) + treeSize <- 1 + + # memory for optimal splits + memory <- list() + potential_splits <- 1 + + # variable importance + var_imp <- rep(0, p) + names(var_imp) <- colnames(X) + + after_mtry <- 0 + + for(i in 1:max_leaves){ + # iterate over all possible splits every time + # for slow but slightly better solution + if(!fast){ + potential_splits <- 1:i + to_small <- sapply(potential_splits, + function(x){sum(E[, 
x]) < min_sample*2}) + potential_splits <- potential_splits[!to_small] + } + + #iterate over new to estimate splits + for(branch in potential_splits){ + # get samples in branch to evaluate + E_branch <- E[, branch] + index <- which(E_branch == 1) + X_branch <- matrix(X[boot_index[index], ], nrow = length(index)) + + # get potential splitting candidates + s <- find_s(X_branch, max_candidates = max_candidates) + n_splits <- nrow(s) + + # remove splits resulting in to small leaves + if(min_sample > 1) { + s <- s[-c(0:(min_sample - 1), (n_splits - min_sample + 2):(n_splits+1)), ] + } + s <- matrix(s, ncol = p) + + optSplits <- lapply(1:p, function(j){ + s_j <- unique(s[, j]) + E_next <- lapply(s_j, function(si) { + E_next <- matrix(0, nrow = n, ncol = 1) + E_next[index[X_branch[, j] > si], ] <- 1 + if(sum(E_next) == 0)return(NULL) + E_next + }) + E_next <- do.call(cbind, E_next) + if(is.null(E_next)) return(c(-10, j, 0, branch)) + U_next_prime <- Qf_temp(E_next, Ue, Qf) + U_next_size <- colSums(U_next_prime ** 2) + dloss <- as.numeric(crossprod(U_next_prime, Y_tilde))**2 / U_next_size + + opt <- which.max(unlist(dloss)) + c(dloss[[opt]], j, s_j[opt], branch) + }) + memory[[branch]] <- do.call(rbind, optSplits) + } + + if(i > after_mtry && !is.null(mtry)){ + Losses_dec <- lapply(memory, function(branch){ + branch[sample(1:p, mtry), ]}) + Losses_dec <- do.call(rbind, Losses_dec) + }else { + Losses_dec <- do.call(rbind, memory) + } + + loc <- which.max(Losses_dec[, 1]) + best_branch <- Losses_dec[loc, 4] + j <- Losses_dec[loc, 2] + s <- Losses_dec[loc, 3] + + if(Losses_dec[loc, 1] <= 0){ + break + } + + # divide observations in leaf + index <- which(E[, best_branch] == 1) + index_n_branches <- index[X[boot_index[index], j] > s] + + # new indicator matrix + E <- cbind(E, matrix(0, n, 1)) + E[index_n_branches, best_branch] <- 0 + E[index_n_branches, i+1] <- 1 + + E_tilde_branch <- E_tilde[, best_branch] + suppressWarnings({ + E_tilde[, best_branch] <- Qf(E[, best_branch]) + 
}) + E_tilde <- cbind(E_tilde, matrix(E_tilde_branch - E_tilde[, best_branch])) + + c_hat <- qr.coef(qr(E_tilde), Y_tilde) + + u_next_prime <- Qf_temp(E[, i + 1], Ue, Qf) + Ue <- cbind(Ue, u_next_prime / sqrt(sum(u_next_prime ** 2))) + + # check if loss decrease is larger than minimum loss decrease + # and if linear model could be estimated + if(sum(is.na(as.numeric(c_hat))) > 0){ + warning('singulaer matrix QE, tree might be to large, consider increasing cp') + break + } + + loss_dec <- as.numeric(loss_temp - loss(Y_tilde, E_tilde %*% c_hat)) + loss_temp <- loss_temp - loss_dec + + if(loss_dec <= cp * loss_start){ + break + } + # add loss decrease to variable importance + var_imp[j] <- var_imp[j] + loss_dec + + # add space for the two new leaves + tree <- rbind(tree, matrix(0, nrow = 2, ncol = d)) + + # select leaf to split + leaves <- tree[, "leaf"] == 1 + toSplit <- leaves & (tree[, "name"] == best_branch) + if(sum(toSplit) != 1) stop("Tries to split more than one leaf") + + # save split rule + tree[toSplit, c("left", "right", "j", "s", "res_dloss", "leaf")] <- + c(treeSize + 1, treeSize + 2, j, s, loss_dec, 2) + + # add new leaves + tree[treeSize + 1, c("name", "dloss", "cp", "n_samples", "leaf")] <- + c(tree[toSplit, "name"], loss_dec, loss_dec / loss_start, sum(E[, best_branch] == 1), 1) + tree[treeSize + 2, c("name", "dloss", "cp", "n_samples", "leaf")] <- + c(i + 1, loss_dec, loss_dec / loss_start, sum(E[, i + 1] == 1), 1) + treeSize <- treeSize + 2 + + # add estimates to tree leaves + c_hat <- as.numeric(c_hat) + # access leaf estimates by leaf names (i.e. 
columns of E) + tree[tree[, "leaf"] == 1, "value"] <- c_hat[tree[tree[, "leaf"] == 1, "name"]] + + # the two new partitions need to be checked for optimal splits in next iteration + potential_splits <- c(best_branch, i + 1) + + # a partition with less than min_sample observations or unique samples + # are not available for further splits + to_small <- sapply(potential_splits, function(x){ + new_samples <- nrow(unique(matrix(X[boot_index[as.logical(E[, x])],], nrow = sum(E[, x])))) + if(is.null(new_samples)) new_samples <- 0 + (new_samples < min_sample * 2) + }) + if(sum(to_small) > 0){ + for(el in potential_splits[to_small]){ + # to small partitions cannot decrease the loss + memory[[el]] <- matrix(0, p, 4) + } + potential_splits <- potential_splits[!to_small] + } + } + + if(i == max_leaves){ + warning('maximum number of iterations was reached, consider increasing m!') + } + + # predict the test set + if(tree_in_forest){ + f_X_hat <- NULL + }else{ + f_X_hat <- traverse_tree(tree, X) + } + + + var_names <- colnames(data.frame(X)) + names(var_imp) <- var_names + + # cp max of all splits after + new_cp <- getCp_max(tree) + tree[new_cp[[2]], "cp"] <- new_cp[[1]] + + # use max cp over siblings to ensure binary tree + for(i in 1:nrow(tree)){ + if(tree[i, c("j")] != 0){ + tree[tree[i, c("left", "right")], "cp"] <- + max(tree[tree[i, c("left", "right")], "cp"]) + } + } + + res <- list(predictions = f_X_hat, tree = tree, + var_names = var_names, var_importance = var_imp) + class(res) <- 'SDTree' + res +} From ff49465e8e41558ae10127ab18e13f38e9115cd1 Mon Sep 17 00:00:00 2001 From: Markus Ulmer Date: Tue, 9 Dec 2025 17:24:24 +0100 Subject: [PATCH 03/15] prepare clean future usage --- R/predict.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/R/predict.R b/R/predict.R index 033acd1..8af3fd5 100644 --- a/R/predict.R +++ b/R/predict.R @@ -74,6 +74,13 @@ predict.SDForest <- function(object, newdata, mc.cores = 1, ...){ }) preds_i } + #if(mc.cores > 1){ + # plan <- if 
(parallelly::supportsMulticore()) "multicore" else "multisession" + # with(future::plan(plan, workers = mc.cores), local = TRUE) + #} + #preds_list <- future.apply::future_lapply(future.seed = TRUE, + # X = object$forest, worker_fun) + if(mc.cores > 1){ if(Sys.info()[["sysname"]] == "Linux"){ preds_list <- parallel::mclapply(object$forest, From dcbedd9b2f124cb154d9ef7d7b9d6e835281b7ce Mon Sep 17 00:00:00 2001 From: Markus Ulmer Date: Tue, 9 Dec 2025 17:24:43 +0100 Subject: [PATCH 04/15] use helper function to estimate tree --- R/SDTree.R | 251 +---------------------------------------------------- 1 file changed, 3 insertions(+), 248 deletions(-) diff --git a/R/SDTree.R b/R/SDTree.R index 4d0a54e..6c63fc7 100644 --- a/R/SDTree.R +++ b/R/SDTree.R @@ -138,253 +138,8 @@ SDTree <- function(formula = NULL, data = NULL, x = NULL, y = NULL, max_leaves = if(!is.null(mtry) && mtry < 1) stop('mtry must be larger than 0') if(n < 2 * min_sample) stop('n must be at least 2 * min_sample') if(max_candidates < 1) stop('max_candidates must be at least 1') - - # estimate spectral transformation - - if(!is.null(A)){ - if(is.null(gamma)) stop('gamma must be provided if A is provided') - if(is.vector(A)) A <- matrix(A) - if(!is.matrix(A)) stop('A must be a matrix') - if(nrow(A) != n) stop('A must have n rows') - Wf <- get_Wf(A, gamma) - }else { - Wf <- function(v) v - } - - if(is.null(Qf)){ - if(!is.null(A)){ - Qf <- function(v) get_Qf(Wf(X), Q_type, trim_quantile, q_hat, Q_scale)(Wf(v)) - }else{ - Qf <- get_Qf(X, Q_type, trim_quantile, q_hat, Q_scale) - } - }else{ - if(!is.function(Qf)) stop('Q must be a function') - if(length(Qf(rnorm(n))) == n) stop('Q must map from n to n') - } - - #selection of predictors - if(!is.null(predictors)){ - if(is.character(predictors)){ - if(!all(predictors %in% colnames(X))) - stop("predictors must either be numeric columne index or in colnames of X") - predictors <- which(colnames(X) %in% predictors) - } - if(is.numeric(predictors)){ - 
if(!all(predictors > 0 & predictors <= ncol(X))) - stop("predictors must either be numeric columne index or in colnames of X") - } - pred_names <- colnames(X) - X <- matrix(X[, predictors], ncol = length(predictors)) - if(!is.null(pred_names)){ - colnames(X) <- pred_names[predictors] - } - } - - # number of covariates - p <- ncol(X) - - if(!is.null(mtry) && mtry > p) stop('mtry must be at most p') - - # calculate first estimate - E <- matrix(1, n, 1) - E_tilde <- Qf(E) - Ue <- E_tilde / sqrt(sum(E_tilde ** 2)) - Y_tilde <- Qf(Y) - - # solve linear model - c_hat <- qr.coef(qr(E_tilde), Y_tilde) - c_hat <- as.numeric(c_hat) - - loss_start <- as.numeric(sum((Y_tilde - c_hat) ** 2) / n) - loss_temp <- loss_start - - # initialize tree - treeInfo <- c("name", "left", "right", "j", "s", "value", "dloss", - "res_dloss", "cp", "n_samples", "leaf") - d <- length(treeInfo) - - tree <- matrix(0, ncol = d, nrow = 1, dimnames = list(NULL, treeInfo)) - tree[1, c("name", "value", "dloss", "cp", "n_samples", "leaf")] <- - c(1, c_hat, loss_start, 10, n, 1) - treeSize <- 1 - - # memory for optimal splits - memory <- list() - potential_splits <- 1 - - # variable importance - var_imp <- rep(0, p) - names(var_imp) <- colnames(X) - - after_mtry <- 0 - - for(i in 1:max_leaves){ - # iterate over all possible splits every time - # for slow but slightly better solution - if(!fast){ - potential_splits <- 1:i - to_small <- sapply(potential_splits, - function(x){sum(E[, x]) < min_sample*2}) - potential_splits <- potential_splits[!to_small] - } - - #iterate over new to estimate splits - for(branch in potential_splits){ - # get samples in branch to evaluate - E_branch <- E[, branch] - index <- which(E_branch == 1) - X_branch <- matrix(X[index, ], nrow = length(index)) - - # get potential splitting candidates - s <- find_s(X_branch, max_candidates = max_candidates) - n_splits <- nrow(s) - - # remove splits resulting in to small leaves - if(min_sample > 1) { - s <- s[-c(0:(min_sample - 1), 
(n_splits - min_sample + 2):(n_splits+1)), ] - } - s <- matrix(s, ncol = p) - - optSplits <- lapply(1:p, function(j){ - s_j <- unique(s[, j]) - E_next <- lapply(s_j, function(si) { - E_next <- matrix(0, nrow = n, ncol = 1) - E_next[index[X_branch[, j] > si], ] <- 1 - if(sum(E_next) == 0)return(NULL) - E_next - }) - E_next <- do.call(cbind, E_next) - if(is.null(E_next)) return(c(-10, j, 0, branch)) - U_next_prime <- Qf_temp(E_next, Ue, Qf) - U_next_size <- colSums(U_next_prime ** 2) - dloss <- as.numeric(crossprod(U_next_prime, Y_tilde))**2 / U_next_size - - opt <- which.max(unlist(dloss)) - c(dloss[[opt]], j, s_j[opt], branch) - }) - memory[[branch]] <- do.call(rbind, optSplits) - } - - if(i > after_mtry && !is.null(mtry)){ - Losses_dec <- lapply(memory, function(branch){ - branch[sample(1:p, mtry), ]}) - Losses_dec <- do.call(rbind, Losses_dec) - }else { - Losses_dec <- do.call(rbind, memory) - } - - loc <- which.max(Losses_dec[, 1]) - best_branch <- Losses_dec[loc, 4] - j <- Losses_dec[loc, 2] - s <- Losses_dec[loc, 3] - - if(Losses_dec[loc, 1] <= 0){ - break - } - - # divide observations in leaf - index <- which(E[, best_branch] == 1) - index_n_branches <- index[X[index, j] > s] - - # new indicator matrix - E <- cbind(E, matrix(0, n, 1)) - E[index_n_branches, best_branch] <- 0 - E[index_n_branches, i+1] <- 1 - - E_tilde_branch <- E_tilde[, best_branch] - suppressWarnings({ - E_tilde[, best_branch] <- Qf(E[, best_branch]) - }) - E_tilde <- cbind(E_tilde, matrix(E_tilde_branch - E_tilde[, best_branch])) - - c_hat <- qr.coef(qr(E_tilde), Y_tilde) - - u_next_prime <- Qf_temp(E[, i + 1], Ue, Qf) - Ue <- cbind(Ue, u_next_prime / sqrt(sum(u_next_prime ** 2))) - - # check if loss decrease is larger than minimum loss decrease - # and if linear model could be estimated - if(sum(is.na(as.numeric(c_hat))) > 0){ - warning('singulaer matrix QE, tree might be to large, consider increasing cp') - break - } - - loss_dec <- as.numeric(loss_temp - loss(Y_tilde, E_tilde %*% c_hat)) 
- loss_temp <- loss_temp - loss_dec - - if(loss_dec <= cp * loss_start){ - break - } - # add loss decrease to variable importance - var_imp[j] <- var_imp[j] + loss_dec - - # add space for the two new leaves - tree <- rbind(tree, matrix(0, nrow = 2, ncol = d)) - - # select leaf to split - leaves <- tree[, "leaf"] == 1 - toSplit <- leaves & (tree[, "name"] == best_branch) - if(sum(toSplit) != 1) stop("Tries to split more than one leaf") - - # save split rule - tree[toSplit, c("left", "right", "j", "s", "res_dloss", "leaf")] <- - c(treeSize + 1, treeSize + 2, j, s, loss_dec, 2) - - # add new leaves - tree[treeSize + 1, c("name", "dloss", "cp", "n_samples", "leaf")] <- - c(tree[toSplit, "name"], loss_dec, loss_dec / loss_start, sum(E[, best_branch] == 1), 1) - tree[treeSize + 2, c("name", "dloss", "cp", "n_samples", "leaf")] <- - c(i + 1, loss_dec, loss_dec / loss_start, sum(E[, i + 1] == 1), 1) - treeSize <- treeSize + 2 - - # add estimates to tree leaves - c_hat <- as.numeric(c_hat) - # access leaf estimates by leaf names (i.e. 
columns of E) - tree[tree[, "leaf"] == 1, "value"] <- c_hat[tree[tree[, "leaf"] == 1, "name"]] - - # the two new partitions need to be checked for optimal splits in next iteration - potential_splits <- c(best_branch, i + 1) - - # a partition with less than min_sample observations or unique samples - # are not available for further splits - to_small <- sapply(potential_splits, function(x){ - new_samples <- nrow(unique(matrix(X[as.logical(E[, x]),], nrow = sum(E[, x])))) - if(is.null(new_samples)) new_samples <- 0 - (new_samples < min_sample * 2) - }) - if(sum(to_small) > 0){ - for(el in potential_splits[to_small]){ - # to small partitions cannot decrease the loss - memory[[el]] <- matrix(0, p, 4) - } - potential_splits <- potential_splits[!to_small] - } - } - - if(i == max_leaves){ - warning('maximum number of iterations was reached, consider increasing m!') - } - - # predict the test set - f_X_hat <- traverse_tree(tree, X) - - var_names <- colnames(data.frame(X)) - names(var_imp) <- var_names - - # cp max of all splits after - new_cp <- getCp_max(tree) - tree[new_cp[[2]], "cp"] <- new_cp[[1]] - # use max cp over siblings to ensure binary tree - for(i in 1:nrow(tree)){ - if(tree[i, c("j")] != 0){ - tree[tree[i, c("left", "right")], "cp"] <- - max(tree[tree[i, c("left", "right")], "cp"]) - } - } - - res <- list(predictions = f_X_hat, tree = tree, - var_names = var_names, var_importance = var_imp) - class(res) <- 'SDTree' - res + return(estimate_tree(boot_index = NULL, Y, X, A, max_leaves, cp, min_sample, mtry, fast, + Q_type, trim_quantile, q_hat, Qf, gamma, max_candidates, + Q_scale, predictors)) } From d3665a6efc54fcd89ad3b74858e534fc3f5bb07a Mon Sep 17 00:00:00 2001 From: Markus Ulmer Date: Tue, 9 Dec 2025 17:25:01 +0100 Subject: [PATCH 05/15] update future usage --- R/SDForest.R | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/R/SDForest.R b/R/SDForest.R index 7580dd6..962ef7f 100644 --- 
a/R/SDForest.R +++ b/R/SDForest.R @@ -276,27 +276,21 @@ SDForest <- function(formula = NULL, data = NULL, x = NULL, y = NULL, nTree = 10 ind <- do.call(c, ind) } - #use random generater that works with multiprocessing + #use random generator that works with multiprocessing ok <- RNGkind("L'Ecuyer-CMRG") - + # Worker wrapper for bagged trees worker_fun <- function(i) { - Xi <- matrix(X[i, ], ncol = ncol(X)) - colnames(Xi) <- colnames(X) - if(!is.null(A)){ - Ai <- matrix(A[i, ], ncol = ncol(A)) - }else{ - Ai <- NULL - } - # protect SDTree call res_i <- tryCatch({ - tree_obj <- SDTree(x = Xi, y = Y[i], - cp = cp, min_sample = min_sample, + tree_obj <- estimate_tree(X = X, Y = Y, Qf = NULL, + cp = cp, min_sample = min_sample, max_leaves = n, Q_type = Q_type, trim_quantile = trim_quantile, - q_hat = q_hat, mtry = mtry, A = Ai, gamma = gamma, - max_candidates = max_candidates, - Q_scale = Q_scale, predictors = predictors) + q_hat = q_hat, mtry = mtry, A = A, gamma = gamma, + max_candidates = max_candidates, fast = TRUE, + Q_scale = Q_scale, predictors = predictors, + boot_index = i) + list(ok = TRUE, tree = tree_obj) }, error = function(e) { list(ok = FALSE, error = conditionMessage(e)) @@ -304,30 +298,28 @@ SDForest <- function(formula = NULL, data = NULL, x = NULL, y = NULL, nTree = 10 # convert warnings to tagged results if needed list(ok = TRUE, tree = NULL, warning = conditionMessage(w)) }) + p(sprintf("i=%g", i)) + res_i } + state <- progressr::handlers(global = NA) + if(verbose & !state) progressr::handlers(global = TRUE) + p <- progressr::progressor(along = ind) if(mc.cores > 1){ - if(Sys.info()[["sysname"]] == "Linux"){ - if(verbose) print('mclapply') - res_list <- parallel::mclapply(ind, worker_fun, mc.cores = mc.cores) - }else{ - if(verbose) print('future') - future::plan('multisession', workers = mc.cores) - res_list <- future.apply::future_lapply(future.seed = TRUE, X = ind, worker_fun) - } - }else{ - res_list <- pbapply::pblapply(ind, worker_fun) + plan <- 
if (parallelly::supportsMulticore()) "multicore" else "multisession" + with(future::plan(plan, workers = mc.cores), local = TRUE) } - RNGkind(ok[1]) + res_list <- future.apply::future_lapply(future.seed = TRUE, X = ind, worker_fun) - #check worker statuses + # check worker statuses failed_workers <- which(vapply(res_list, function(z) !isTRUE(z$ok), logical(1))) if (length(failed_workers) > 0) { stop(sprintf("SDForest: %d worker(s) failed, first error: %s", length(failed_workers), res_list[[failed_workers[1]]]$error)) } res <- lapply(res_list, function(res) res$tree) + if(verbose & !state) progressr::handlers(global = FALSE) #selection of predictors if(!is.null(predictors)){ @@ -437,6 +429,7 @@ SDForest <- function(formula = NULL, data = NULL, x = NULL, y = NULL, nTree = 10 output$ooEnv_predictions <- ooEnv_predictions } + RNGkind(ok[1]) class(output) <- 'SDForest' output } From 34fd2938830d07442d8dc08cc9d7ad9572343d0f Mon Sep 17 00:00:00 2001 From: markus Date: Wed, 10 Dec 2025 21:38:15 +0100 Subject: [PATCH 06/15] change verbose --- DESCRIPTION | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 904ee4f..0a56aeb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,7 +21,9 @@ Imports: tidyr, fda, grplasso, - rlang + rlang, + progressr, + parallelly Suggests: plotly, datasets, From 8094d7f8cb55aca965c4dec1f05874201fdfcf99 Mon Sep 17 00:00:00 2001 From: markus Date: Wed, 10 Dec 2025 21:39:33 +0100 Subject: [PATCH 07/15] update/fix progess reporting and parallelization --- R/SDForest.R | 17 ++++++----- R/partDependence.R | 41 ++++++++++++++----------- R/paths.R | 48 +++++++++++++++++++++++------- R/plot.R | 42 +++++++++++++++----------- R/predict.R | 37 ++++++++--------------- man/SDForest.Rd | 8 +++-- man/partDependence.Rd | 17 +++++++++-- man/plot.SDForest.Rd | 5 +++- man/predict.SDForest.Rd | 10 +++++-- man/regPath.SDForest.Rd | 18 ++++++++++- man/stabilitySelection.SDForest.Rd | 5 +++- tests/testthat/test-parallel.R | 
2 +- 12 files changed, 159 insertions(+), 91 deletions(-) diff --git a/R/SDForest.R b/R/SDForest.R index 962ef7f..13578be 100644 --- a/R/SDForest.R +++ b/R/SDForest.R @@ -34,8 +34,9 @@ #' @param mtry Number of randomly selected covariates to consider for a split, #' if \code{NULL} half of the covariates are available for each split. #' \eqn{\text{mtry} = \lfloor \frac{p}{2} \rfloor} -#' @param mc.cores Number of cores to use for parallel processing, -#' if \code{mc.cores > 1} the trees are estimated in parallel. +#' @param mc.cores Number of cores to use for parallel computation `vignette("Runtime")`. +#' The `future` package is used for parallel processing. +#' To use custom processing plans mc.cores has to be <= 1, see [`future` package](https://future.futureverse.org/). #' @param Q_type Type of deconfounding, one of 'trim', 'pca', 'no_deconfounding'. #' 'trim' corresponds to the Trim transform \insertCite{Cevid2020SpectralModels}{SDModels} #' as implemented in the Doubly debiased lasso \insertCite{Guo2022DoublyConfounding}{SDModels}, @@ -67,7 +68,8 @@ #' @param Q_scale Should data be scaled to estimate the spectral transformation? #' Default is \code{TRUE} to not reduce the signal of high variance covariates, #' and we do not know of a scenario where this hurts. -#' @param verbose If \code{TRUE} fitting information is shown. +#' @param verbose If \code{TRUE} progress updates are shown using the `progressr` package. +#' To customize the progress bar, see [`progressr` package](https://progressr.futureverse.org/articles/progressr-intro.html) #' @param predictors Subset of colnames(X) or numerical indices of the covariates #' for which an effect on y should be estimated. All the other covariates are only #' used for deconfounding. 
@@ -298,19 +300,19 @@ SDForest <- function(formula = NULL, data = NULL, x = NULL, y = NULL, nTree = 10 # convert warnings to tagged results if needed list(ok = TRUE, tree = NULL, warning = conditionMessage(w)) }) - p(sprintf("i=%g", i)) + p() res_i } - state <- progressr::handlers(global = NA) - if(verbose & !state) progressr::handlers(global = TRUE) - p <- progressr::progressor(along = ind) + progressr::with_progress({ + p <- progressr::progressor(along = ind, enable = verbose) if(mc.cores > 1){ plan <- if (parallelly::supportsMulticore()) "multicore" else "multisession" with(future::plan(plan, workers = mc.cores), local = TRUE) } res_list <- future.apply::future_lapply(future.seed = TRUE, X = ind, worker_fun) + }) # check worker statuses failed_workers <- which(vapply(res_list, function(z) !isTRUE(z$ok), logical(1))) @@ -319,7 +321,6 @@ SDForest <- function(formula = NULL, data = NULL, x = NULL, y = NULL, nTree = 10 length(failed_workers), res_list[[failed_workers[1]]]$error)) } res <- lapply(res_list, function(res) res$tree) - if(verbose & !state) progressr::handlers(global = FALSE) #selection of predictors if(!is.null(predictors)){ diff --git a/R/partDependence.R b/R/partDependence.R index c514ade..5c87a1f 100644 --- a/R/partDependence.R +++ b/R/partDependence.R @@ -17,8 +17,11 @@ #' If NULL, tries to extract the dataset from the model object. #' @param subSample Number of samples to draw from the original data for the empirical #' partial dependence. If NULL, all the observations are used. -#' @param mc.cores Number of cores to use for parallel computation. -#' Parallel computing is only supported for unix. +#' @param verbose If \code{TRUE} progress updates are shown using the `progressr` package. +#' To customize the progress bar, see [`progressr` package](https://progressr.futureverse.org/) +#' @param mc.cores Number of cores to use for parallel computation `vignette("Runtime")`. +#' The `future` package is used for parallel processing. 
+#' To use custom processing plans mc.cores has to be <= 1, see [`future` package](https://future.futureverse.org/). #' @return An object of class \code{partDependence} containing #' \item{preds_mean}{The average prediction for each value of the variable of interest.} #' \item{x_seq}{The sequence of values for the variable of interest.} @@ -34,7 +37,8 @@ #' plot(pd) #' @seealso \code{\link{SDForest}}, \code{\link{SDTree}} #' @export -partDependence <- function(object, j, X = NULL, subSample = NULL, mc.cores = 1){ +partDependence <- function(object, j, X = NULL, subSample = NULL, + verbose = TRUE, mc.cores = 1){ j_name <- j if(is.null(X)){ @@ -58,21 +62,22 @@ partDependence <- function(object, j, X = NULL, subSample = NULL, mc.cores = 1){ x_seq <- seq(min(X[, j]), max(X[, j]), length.out = 100) - if(mc.cores > 1){ - preds <- parallel::mclapply(x_seq, function(x){ - X_new <- X - X_new[, j] <- x - pred <- predict(object, newdata = X_new) - return(pred) - }, mc.cores = mc.cores) - }else{ - preds <- pbapply::pblapply(x_seq, function(x){ - X_new <- X - X_new[, j] <- x - pred <- predict(object, newdata = X_new) - return(pred) - }) - } + progressr::with_progress({ + p <- progressr::progressor(along = x_seq, enable = verbose) + if(mc.cores > 1){ + plan <- if (parallelly::supportsMulticore()) "multicore" else "multisession" + with(future::plan(plan, workers = mc.cores), local = TRUE) + } + preds <- future.apply::future_lapply(future.seed = TRUE, + X = x_seq, + function(x){ + X_new <- X + X_new[, j] <- x + pred <- predict(object, newdata = X_new) + p(sprintf("x=%g", x)) + return(pred) + }) + }) preds <- do.call(rbind, preds) preds_mean <- rowMeans(preds) diff --git a/R/paths.R b/R/paths.R index 406553b..9a9ad96 100644 --- a/R/paths.R +++ b/R/paths.R @@ -58,6 +58,11 @@ regPath.SDTree <- function(object, cp_seq = NULL, ...){ #' @param X The training data, if NULL the data from the forest object is used. 
#' @param Y The training response variable, if NULL the data from the forest object is used. #' @param Q The transformation matrix, if NULL the data from the forest object is used. +#' @param verbose If \code{TRUE} progress updates are shown using the `progressr` package. +#' To customize the progress bar, see [`progressr` package](https://progressr.futureverse.org/articles/progressr-intro.html) +#' @param mc.cores Number of cores to use for parallel computation `vignette("Runtime")`. +#' The `future` package is used for parallel processing. +#' To use custom processing plans mc.cores has to be <= 1, see [`future` package](https://future.futureverse.org/). #' @param ... Further arguments passed to or from other methods. #' @return An object of class \code{paths} containing #' \item{cp}{The sequence of complexity parameters.} @@ -84,21 +89,29 @@ regPath.SDTree <- function(object, cp_seq = NULL, ...){ #' #' @export regPath.SDForest <- function(object, cp_seq = NULL, X = NULL, Y = NULL, Q = NULL, - ...){ + verbose = TRUE, mc.cores = 1, ...){ if(is.null(cp_seq)) cp_seq <- get_cp_seq(object) cp_seq <- sort(cp_seq) - res <- pbapply::pblapply(cp_seq, function(cp){ - pruned_object <- prune(object, cp, X, Y, Q, pred = FALSE) - return(list(var_importance = pruned_object$var_importance, - oob_SDloss = pruned_object$oob_SDloss, - oob_loss = pruned_object$oob_loss))}) + progressr::with_progress({ + p <- progressr::progressor(along = cp_seq, enable = verbose) + if(mc.cores > 1){ + plan <- if (parallelly::supportsMulticore()) "multicore" else "multisession" + with(future::plan(plan, workers = mc.cores), local = TRUE) + } + res <- future.apply::future_lapply(future.seed = TRUE, + X = cp_seq, + function(cp){ + pruned_object <- prune(object, cp, X, Y, Q, pred = FALSE) + p(sprintf("cp=%g", cp)) + return(list(var_importance = pruned_object$var_importance, + oob_SDloss = pruned_object$oob_SDloss, + oob_loss = pruned_object$oob_loss))}) + }) - #varImp_path <- t(sapply(res, 
function(x)x$var_importance)) varImp_path <- do.call(rbind, lapply(res, function(x)x$var_importance)) colnames(varImp_path) <- object$var_names - #loss_path <- t(sapply(res, function(x) c(x$oob_SDloss, x$oob_loss))) loss_path <- do.call(rbind, lapply(res, function(x) c(x$oob_SDloss, x$oob_loss))) colnames(loss_path) <- c('oob SDE', 'oob MSE') paths <- list(cp = cp_seq, varImp_path = varImp_path, loss_path = loss_path, @@ -125,6 +138,8 @@ stabilitySelection <- function(object, ...) UseMethod('stabilitySelection') #' @param object an SDForest object #' @param cp_seq A sequence of complexity parameters. #' If NULL, the sequence is calculated automatically using only relevant values. +#' @param verbose If \code{TRUE} progress updates are shown using the `progressr` package. +#' To customize the progress bar, see [`progressr` package](https://progressr.futureverse.org/articles/progressr-intro.html) #' @param ... Further arguments passed to or from other methods. #' @return An object of class \code{paths} containing #' \item{cp}{The sequence of complexity parameters.} @@ -145,11 +160,22 @@ stabilitySelection <- function(object, ...) 
UseMethod('stabilitySelection') #' plot(paths, plotly = TRUE) #' } #' @export -stabilitySelection.SDForest <- function(object, cp_seq = NULL, ...){ +stabilitySelection.SDForest <- function(object, cp_seq = NULL, + verbose = TRUE, ...){ if(is.null(cp_seq)) cp_seq <- get_cp_seq(object) cp_seq <- sort(cp_seq) - - imp <- pbapply::pblapply(object$forest, function(x)regPath(x, cp_seq)$varImp_path > 0) + + progressr::with_progress({ + p <- progressr::progressor(along = 1:length(object$forest), enable = verbose) + imp <- lapply(1:length(object$forest), + function(i){ + path <- regPath(object$forest[[i]], + cp_seq, vebose = FALSE)$varImp_path > 0 + p() + path + }) + }) + imp <- lapply(imp, function(x)matrix(as.numeric(x), ncol = ncol(x))) imp <- Reduce('+', imp) / length(object$forest) diff --git a/R/plot.R b/R/plot.R index 1eb2f67..6f79785 100644 --- a/R/plot.R +++ b/R/plot.R @@ -44,7 +44,7 @@ plot.SDTree <- function(x, main = "", digits = 2, digits_decisions = 2, if(weighted){ #res_dloss <- edges$res_dloss #re scale edge weights - edges$res_dloss <- (edges$res_dloss - min(edges$res_dloss)) / (max(edges$res_dloss) - min(edges$res_dloss)) * 2 + 0.5 + edges$res_dloss <- (edges$res_dloss - min(edges$res_dloss)) / ((max(edges$res_dloss) - min(edges$res_dloss)) * 2 + 0.1) + 0.5 }else{ edges$res_dloss <- 0.5 } @@ -66,7 +66,8 @@ plot.SDTree <- function(x, main = "", digits = 2, digits_decisions = 2, ggplot2::annotate("segment", x = nLeaves*1.02, y = depth*0.98, xend = nLeaves*1.1, yend = depth*0.98, arrow = ggplot2::arrow(length = ggplot2::unit(0.1, "inches")), color = "black") + ggplot2::annotate("text", x = nLeaves * 1.05, y = depth, label = "no", size = 4) + - ggplot2::ggtitle(main) + ggplot2::ggtitle(main) + + ggplot2::ylim(-0.1*depth, depth*1.1) } #' Plot performance of SDForest against number of trees @@ -75,6 +76,8 @@ plot.SDTree <- function(x, main = "", digits = 2, digits_decisions = 2, #' not stabilize one can fit another SDForest and merge the two. 
#' @author Markus Ulmer #' @param x Fitted object of class \code{SDForest}. +#' @param verbose If \code{TRUE} progress updates are shown using the `progressr` package. +#' To customize the progress bar, see [`progressr` package](https://progressr.futureverse.org/articles/progressr-intro.html) #' @param ... Further arguments passed to or from other methods. #' @return A ggplot object #' @seealso \code{\link{SDForest}} @@ -85,26 +88,31 @@ plot.SDTree <- function(x, main = "", digits = 2, digits_decisions = 2, #' model <- SDForest(x = X, y = y, Q_type = 'no_deconfounding', cp = 0.5, nTree = 500) #' plot(model) #' @export -plot.SDForest <- function(x, ...){ +plot.SDForest <- function(x, verbose = TRUE, ...){ Y_ <- x$Q(x$Y) # iterate over observations - preds <- pbapply::pblapply(1:length(x$Y), function(i){ - if(length(x$oob_ind[[i]]) == 0){ - return(NA) - } - xi <- matrix(x$X[i, ], nrow = 1) - - # predict for each tree - pred <- rep(NA, length(x$forest)) - model_idx <- x$oob_ind[[i]] - model_idx <- model_idx[model_idx <= length(x$forest)] - predictions <- sapply(model_idx, function(model){ - traverse_tree(x$forest[[model]]$tree, xi) + progressr::with_progress({ + p <- progressr::progressor(along = 1:length(x$Y), enable = verbose) + preds <- lapply(1:length(x$Y), function(i){ + if(length(x$oob_ind[[i]]) == 0){ + return(NA) + } + xi <- matrix(x$X[i, ], nrow = 1) + + # predict for each tree + pred <- rep(NA, length(x$forest)) + model_idx <- x$oob_ind[[i]] + model_idx <- model_idx[model_idx <= length(x$forest)] + predictions <- sapply(model_idx, function(model){ + traverse_tree(x$forest[[model]]$tree, xi) + }) + pred[model_idx] <- predictions + p() + pred }) - pred[model_idx] <- predictions - pred }) + preds <- do.call(rbind, preds) diff --git a/R/predict.R b/R/predict.R index 8af3fd5..b8c8483 100644 --- a/R/predict.R +++ b/R/predict.R @@ -35,8 +35,11 @@ predict.SDTree <- function(object, newdata, ...){ #' @param object Fitted object of class \code{SDForest}. 
#' @param newdata New test data of class \code{data.frame} containing #' the covariates for which to predict the response. -#' @param mc.cores Number of cores to use for parallel processing, -#' if \code{mc.cores > 1} the trees predict in parallel. +#' @param mc.cores Number of cores to use for parallel computation `vignette("Runtime")`. +#' The `future` package is used for parallel processing. +#' To use custom processing plans mc.cores has to be <= 1, see [`future` package](https://future.futureverse.org/). +#' @param verbose If \code{TRUE} progress updates are shown using the `progressr` package. +#' To customize the progress bar, see [`progressr` package](https://progressr.futureverse.org/articles/progressr-intro.html) #' @param ... Further arguments passed to or from other methods. #' @return A vector of predictions for the new data. #' @examples @@ -48,7 +51,7 @@ predict.SDTree <- function(object, newdata, ...){ #' predict(model, newdata = data.frame(X)) #' @seealso \code{\link{SDForest}} #' @export -predict.SDForest <- function(object, newdata, mc.cores = 1, ...){ +predict.SDForest <- function(object, newdata, mc.cores = 1, verbose = FALSE, ...){ # predict function for the spectral deconfounded random forest # using the mean over all trees as the prediction # check data type @@ -62,9 +65,9 @@ predict.SDForest <- function(object, newdata, mc.cores = 1, ...){ if(any(is.na(X))) stop('X must not contain missing values') - worker_fun <- function(tree){ + worker_fun <- function(i){ preds_i <- tryCatch({ - preds <- traverse_tree(tree[["tree"]], X) + preds <- traverse_tree(object$forest[[i]][["tree"]], X) list(ok = TRUE, preds = preds) }, error = function(e) { list(ok = FALSE, error = conditionMessage(e)) @@ -74,28 +77,14 @@ predict.SDForest <- function(object, newdata, mc.cores = 1, ...){ }) preds_i } - #if(mc.cores > 1){ - # plan <- if (parallelly::supportsMulticore()) "multicore" else "multisession" - # with(future::plan(plan, workers = mc.cores), local = TRUE) - 
#} - #preds_list <- future.apply::future_lapply(future.seed = TRUE, - # X = object$forest, worker_fun) if(mc.cores > 1){ - if(Sys.info()[["sysname"]] == "Linux"){ - preds_list <- parallel::mclapply(object$forest, - worker_fun, - mc.cores = mc.cores) - }else{ - future::plan('multisession', workers = mc.cores) - preds_list <- future.apply::future_lapply(future.seed = TRUE, - X = object$forest, - worker_fun) - } - }else{ - preds_list <- pbapply::pblapply(object$forest, worker_fun) + plan <- if (parallelly::supportsMulticore()) "multicore" else "multisession" + with(future::plan(plan, workers = mc.cores), local = TRUE) } - + preds_list <- future.apply::future_lapply(future.seed = TRUE, + X = 1:length(object$forest), + worker_fun) #check worker statuses failed_workers <- which(vapply(preds_list, function(z) !isTRUE(z$ok), logical(1))) if (length(failed_workers) > 0) { diff --git a/man/SDForest.Rd b/man/SDForest.Rd index ab85532..14ca259 100644 --- a/man/SDForest.Rd +++ b/man/SDForest.Rd @@ -57,8 +57,9 @@ A split is only performed if both resulting leaves have at least if \code{NULL} half of the covariates are available for each split. \eqn{\text{mtry} = \lfloor \frac{p}{2} \rfloor}} -\item{mc.cores}{Number of cores to use for parallel processing, -if \code{mc.cores > 1} the trees are estimated in parallel.} +\item{mc.cores}{Number of cores to use for parallel computation `vignette("Runtime")`. +The `future` package is used for parallel processing. +To use custom processing plans mc.cores has to be <= 1, see [`future` package](https://future.futureverse.org/).} \item{Q_type}{Type of deconfounding, one of 'trim', 'pca', 'no_deconfounding'. 
'trim' corresponds to the Trim transform \insertCite{Cevid2020SpectralModels}{SDModels} @@ -105,7 +106,8 @@ proposed at each node for each covariate.} Default is \code{TRUE} to not reduce the signal of high variance covariates, and we do not know of a scenario where this hurts.} -\item{verbose}{If \code{TRUE} fitting information is shown.} +\item{verbose}{If \code{TRUE} progress updates are shown using the `progressr` package. +To customize the progress bar, see [`progressr` package](https://progressr.futureverse.org/articles/progressr-intro.html)} \item{predictors}{Subset of colnames(X) or numerical indices of the covariates for which an effect on y should be estimated. All the other covariates are only diff --git a/man/partDependence.Rd b/man/partDependence.Rd index 4af75a6..ea9b932 100644 --- a/man/partDependence.Rd +++ b/man/partDependence.Rd @@ -4,7 +4,14 @@ \alias{partDependence} \title{Partial dependence} \usage{ -partDependence(object, j, X = NULL, subSample = NULL, mc.cores = 1) +partDependence( + object, + j, + X = NULL, + subSample = NULL, + verbose = TRUE, + mc.cores = 1 +) } \arguments{ \item{object}{A model object that has a predict method that takes newdata as argument @@ -20,8 +27,12 @@ If NULL, tries to extract the dataset from the model object.} \item{subSample}{Number of samples to draw from the original data for the empirical partial dependence. If NULL, all the observations are used.} -\item{mc.cores}{Number of cores to use for parallel computation. -Parallel computing is only supported for unix.} +\item{verbose}{If \code{TRUE} progress updates are shown using the `progressr` package. +To customize the progress bar, see [`progressr` package](https://progressr.futureverse.org/)} + +\item{mc.cores}{Number of cores to use for parallel computation `vignette("Runtime")`. +The `future` package is used for parallel processing. 
+To use custom processing plans mc.cores has to be <= 1, see [`future` package](https://future.futureverse.org/).} } \value{ An object of class \code{partDependence} containing diff --git a/man/plot.SDForest.Rd b/man/plot.SDForest.Rd index 2e8261d..b1f5efe 100644 --- a/man/plot.SDForest.Rd +++ b/man/plot.SDForest.Rd @@ -4,11 +4,14 @@ \alias{plot.SDForest} \title{Plot performance of SDForest against number of trees} \usage{ -\method{plot}{SDForest}(x, ...) +\method{plot}{SDForest}(x, verbose = TRUE, ...) } \arguments{ \item{x}{Fitted object of class \code{SDForest}.} +\item{verbose}{If \code{TRUE} progress updates are shown using the `progressr` package. +To customize the progress bar, see [`progressr` package](https://progressr.futureverse.org/articles/progressr-intro.html)} + \item{...}{Further arguments passed to or from other methods.} } \value{ diff --git a/man/predict.SDForest.Rd b/man/predict.SDForest.Rd index af78ba5..c78120a 100644 --- a/man/predict.SDForest.Rd +++ b/man/predict.SDForest.Rd @@ -4,7 +4,7 @@ \alias{predict.SDForest} \title{Predictions for the SDForest} \usage{ -\method{predict}{SDForest}(object, newdata, mc.cores = 1, ...) +\method{predict}{SDForest}(object, newdata, mc.cores = 1, verbose = FALSE, ...) } \arguments{ \item{object}{Fitted object of class \code{SDForest}.} @@ -12,8 +12,12 @@ \item{newdata}{New test data of class \code{data.frame} containing the covariates for which to predict the response.} -\item{mc.cores}{Number of cores to use for parallel processing, -if \code{mc.cores > 1} the trees predict in parallel.} +\item{mc.cores}{Number of cores to use for parallel computation `vignette("Runtime")`. +The `future` package is used for parallel processing. +To use custom processing plans mc.cores has to be <= 1, see [`future` package](https://future.futureverse.org/).} + +\item{verbose}{If \code{TRUE} progress updates are shown using the `progressr` package. 
+To customize the progress bar, see [`progressr` package](https://progressr.futureverse.org/articles/progressr-intro.html)} \item{...}{Further arguments passed to or from other methods.} } diff --git a/man/regPath.SDForest.Rd b/man/regPath.SDForest.Rd index 24f0b75..3172a02 100644 --- a/man/regPath.SDForest.Rd +++ b/man/regPath.SDForest.Rd @@ -5,7 +5,16 @@ \alias{regPath} \title{Calculate the regularization path of an SDForest} \usage{ -\method{regPath}{SDForest}(object, cp_seq = NULL, X = NULL, Y = NULL, Q = NULL, ...) +\method{regPath}{SDForest}( + object, + cp_seq = NULL, + X = NULL, + Y = NULL, + Q = NULL, + verbose = TRUE, + mc.cores = 1, + ... +) } \arguments{ \item{object}{an SDForest object} @@ -19,6 +28,13 @@ If NULL, the sequence is calculated automatically using only relevant values.} \item{Q}{The transformation matrix, if NULL the data from the forest object is used.} +\item{verbose}{If \code{TRUE} progress updates are shown using the `progressr` package. +To customize the progress bar, see [`progressr` package](https://progressr.futureverse.org/articles/progressr-intro.html)} + +\item{mc.cores}{Number of cores to use for parallel computation `vignette("Runtime")`. +The `future` package is used for parallel processing. +To use custom processing plans mc.cores has to be <= 1, see [`future` package](https://future.futureverse.org/).} + \item{...}{Further arguments passed to or from other methods.} } \value{ diff --git a/man/stabilitySelection.SDForest.Rd b/man/stabilitySelection.SDForest.Rd index d48f305..bf7f994 100644 --- a/man/stabilitySelection.SDForest.Rd +++ b/man/stabilitySelection.SDForest.Rd @@ -5,7 +5,7 @@ \alias{stabilitySelection} \title{Calculate the stability selection of an SDForest} \usage{ -\method{stabilitySelection}{SDForest}(object, cp_seq = NULL, ...) +\method{stabilitySelection}{SDForest}(object, cp_seq = NULL, verbose = TRUE, ...) 
} \arguments{ \item{object}{an SDForest object} @@ -13,6 +13,9 @@ \item{cp_seq}{A sequence of complexity parameters. If NULL, the sequence is calculated automatically using only relevant values.} +\item{verbose}{If \code{TRUE} progress updates are shown using the `progressr` package. +To customize the progress bar, see [`progressr` package](https://progressr.futureverse.org/articles/progressr-intro.html)} + \item{...}{Further arguments passed to or from other methods.} } \value{ diff --git a/tests/testthat/test-parallel.R b/tests/testthat/test-parallel.R index b094ca2..e5347c4 100644 --- a/tests/testthat/test-parallel.R +++ b/tests/testthat/test-parallel.R @@ -1,5 +1,5 @@ set.seed(1) -n <- 50 +n <- 20 X <- matrix(rnorm(n * 20), nrow = n) Y <- rnorm(n) From 30fc45693b10811c1a14fdf35da08aae8773b43d Mon Sep 17 00:00:00 2001 From: Markus Ulmer Date: Fri, 12 Dec 2025 11:38:11 +0100 Subject: [PATCH 08/15] remove dependence pbapply is replaced by progressr --- DESCRIPTION | 1 - 1 file changed, 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0a56aeb..5f56d20 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,7 +16,6 @@ Imports: ggraph, gridExtra, parallel, - pbapply, Rdpack, tidyr, fda, From f77960b53aa2b6826f82a6d85f365bcf482cac2f Mon Sep 17 00:00:00 2001 From: Markus Ulmer Date: Fri, 12 Dec 2025 11:38:35 +0100 Subject: [PATCH 09/15] update parallelization and progress reporting --- R/SDAM.R | 85 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 23 deletions(-) diff --git a/R/SDAM.R b/R/SDAM.R index 527b80a..b5057dd 100644 --- a/R/SDAM.R +++ b/R/SDAM.R @@ -36,9 +36,11 @@ #' Default is \code{TRUE} to not reduce the signal of high variance covariates. #' @param ind_lin A vector of indices specifying which covariates to model linearly (i.e. not expanded into basis function). #' Default is `NULL`. 
-#' @param mc.cores Number of cores to use for parallel processing, if \code{mc.cores > 1} -#' the cross validation is parallelized. Default is `1`. (only supported for unix) -#' @param verbose If \code{TRUE} fitting information is shown. +#' @param mc.cores Number of cores to use for parallel computation `vignette("Runtime")`. +#' The `future` package is used for parallel processing. +#' To use custom processing plans mc.cores has to be <= 1, see [`future` package](https://future.futureverse.org/). +#' @param verbose If \code{TRUE} progress updates are shown using the `progressr` package. +#' To customize the progress bar, see [`progressr` package](https://progressr.futureverse.org/articles/progressr-intro.html) #' @param notRegularized A vector of indices specifying which covariates not to regularize. #' Default is `NULL`. #' @return An object of class `SDAM` containing the following elements: @@ -98,7 +100,8 @@ #' # predict #' predict(model, newdata = wine[42, ]) #' -#' ## alternative function call +#' ## alternative function call with customized progress bar +#' progressr::handlers(handler_txtprogressbar(char = cli::col_red(cli::symbol$heart))) #' mod_none <- SDAM(x = as.matrix(wine[1:10, -c(1, 2)]), y = wine$alcohol[1:10], #' Q_type = "no_deconfounding", nfolds = 2, n_K = 4, #' n_lambda1 = 4, n_lambda2 = 8) @@ -156,8 +159,15 @@ SDAM <- function(formula = NULL, data = NULL, x = NULL, y = NULL, n_unique_X <- apply(X, 2, function(x){length(unique(x))}) # Generate the design and model parameters for every K in vK - lmodK <- list() - for (i in 1:length(vK)){ + progressr::with_progress({ + pr <- progressr::progressor(along = 1:(n_K), enable = verbose) + pr(sprintf("Design generation"), amount = 0, class = "sticky") + if(mc.cores > 1){ + plan <- if (parallelly::supportsMulticore()) "multicore" else "multisession" + with(future::plan(plan, workers = min(mc.cores, n_K)), local = TRUE) + } + + lmodK <- future.apply::future_lapply(future.seed = TRUE, 1:length(vK), 
function(i){ K <- vK[i] # effective number of basis functions for each Xj, j = 1,..., p # K_eff[j] can be at most equal to the number of unique values of Xj @@ -213,9 +223,11 @@ SDAM <- function(formula = NULL, data = NULL, x = NULL, y = NULL, lambda <- rep(0, n_lambda1) index <- rep(1, length(index)) } - lmodK[[i]] <- list(Rlist = Rlist, lbreaks = lbreaks, index = index, B = B, - QB = QB, lambda = lambda, K = K, K_eff = K_eff) - } + pr() + list(Rlist = Rlist, lbreaks = lbreaks, index = index, + QB = QB, lambda = lambda, K = K, K_eff = K_eff) + }) + }) # generate folds for CV ind <- sample(rep(1:nfolds, length.out = n), replace = FALSE) @@ -236,20 +248,34 @@ SDAM <- function(formula = NULL, data = NULL, x = NULL, y = NULL, QYpred <- predict(mod, newdata = listK$QB[test, ]) mse <- apply(QYpred, 2, function(y){mean((y - QY[test])^2)}) + pr() return(mse) } mse_fold <- function(l){ - MSEl <- lapply(lmodK, function(listK){mse_fold_K(l, listK)}) + MSEl <- future.apply::future_lapply(future.seed = TRUE, lmodK, + mse_fold_K, + l = l) return(unname(do.call(rbind, MSEl))) } - if(verbose) print("Initial cross-validation") - if(mc.cores == 1){ - MSES <- pbapply::pblapply(1:nfolds, mse_fold) - } else { - MSES <- parallel::mclapply(1:nfolds, mse_fold, mc.cores = mc.cores) - } + #use random generator that works with multiprocessing + ok <- RNGkind("L'Ecuyer-CMRG") + progressr::with_progress({ + pr <- progressr::progressor(along = 1:(nfolds * n_K), enable = verbose) + pr(sprintf("Initial cross-validation"), amount = 0, class = "sticky") + if(mc.cores > 1){ + plan <- if (parallelly::supportsMulticore()) "multicore" else "multisession" + with(future::plan(plan, workers = min(mc.cores, nfolds)), local = TRUE) + } + MSES <- lapply(X = 1:nfolds, mse_fold) + }) + + #if(mc.cores == 1){ + # MSES <- pbapply::pblapply(1:nfolds, mse_fold) + #} else { + # MSES <- parallel::mclapply(1:nfolds, mse_fold, mc.cores = mc.cores) + #} # aggregate MSEs over folds MSES.agg <- Reduce("+", MSES) / nfolds 
@@ -267,13 +293,25 @@ SDAM <- function(formula = NULL, data = NULL, x = NULL, y = NULL, length.out = n_lambda2)) } - if(verbose) print("Second stage cross-validation") - if(mc.cores == 1){ - MSES1 <- pbapply::pblapply(1:nfolds, mse_fold_K, listK = modK.min) - } else { - MSES1 <- parallel::mclapply(1:nfolds, mse_fold_K, listK = modK.min, - mc.cores = mc.cores) - } + progressr::with_progress({ + pr <- progressr::progressor(along = 1:nfolds, enable = verbose) + pr(sprintf("Second stage cross-validation"), amount = 0, class = "sticky") + if(mc.cores > 1){ + plan <- if (parallelly::supportsMulticore()) "multicore" else "multisession" + with(future::plan(plan, workers = min(mc.cores, nfolds)), local = TRUE) + } + MSES1 <- future.apply::future_lapply(future.seed = TRUE, + X = 1:nfolds, + mse_fold_K, + listK = modK.min) + }) + #if(verbose) print("Second stage cross-validation") + #if(mc.cores == 1){ + # MSES1 <- pbapply::pblapply(1:nfolds, mse_fold_K, listK = modK.min) + #} else { + # MSES1 <- parallel::mclapply(1:nfolds, mse_fold_K, listK = modK.min, + # mc.cores = mc.cores) + #} MSES1 <- do.call(rbind, MSES1) MSE1.agg <- apply(MSES1, 2, mean) @@ -339,6 +377,7 @@ SDAM <- function(formula = NULL, data = NULL, x = NULL, y = NULL, # estimated active set lreturn$active <- active class(lreturn) <- "SDAM" + RNGkind(ok[1]) return(lreturn) } From 5c811bb97c1285b6588e532e3b2d552571e5930f Mon Sep 17 00:00:00 2001 From: Markus Ulmer Date: Fri, 12 Dec 2025 11:39:00 +0100 Subject: [PATCH 10/15] update examples with custom progress bar --- R/SDForest.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/SDForest.R b/R/SDForest.R index 13578be..ef61cc2 100644 --- a/R/SDForest.R +++ b/R/SDForest.R @@ -129,6 +129,8 @@ #' # comparison to classical random forest #' fit_ranger <- ranger::ranger(Y ~ ., train_data, importance = 'impurity') #' +#' # you can customize the progress bar see parameter verbose +#' progressr::handlers("cli") #' fit <- SDForest(x = X, y = Y, nTree = 100, Q_type = 
'pca', q_hat = 2) #' fit <- SDForest(Y ~ ., nTree = 100, train_data) #' fit @@ -139,6 +141,7 @@ #' plot(fit) #' #' # a few more might be helpfull +#' progressr::handlers(handler_txtprogressbar(char = cli::col_red(cli::symbol$heart))) #' fit2 <- SDForest(Y ~ ., nTree = 50, train_data) #' fit <- mergeForest(fit, fit2) #' From cfcf4f7cb988dea2aad609230907e783a0fafe4e Mon Sep 17 00:00:00 2001 From: Markus Ulmer Date: Fri, 12 Dec 2025 11:39:28 +0100 Subject: [PATCH 11/15] add reference to the future package --- inst/REFERENCES.bib | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index 08e990b..b2f964a 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -186,4 +186,16 @@ @misc{londschien2025domain archivePrefix={arXiv}, primaryClass={stat.AP}, url={https://arxiv.org/abs/2507.21783}, +} + +@Article{RJ-2021-048, + author = {Henrik Bengtsson}, + title = {A Unifying Framework for Parallel and Distributed Processing in R using Futures}, + year = {2021}, + journal = {The R Journal}, + doi = {10.32614/RJ-2021-048}, + url = {https://doi.org/10.32614/RJ-2021-048}, + pages = {208--227}, + volume = {13}, + number = {2}, } \ No newline at end of file From d14ebb83836f70218a520a1a6587114b0b93290f Mon Sep 17 00:00:00 2001 From: Markus Ulmer Date: Fri, 12 Dec 2025 11:40:11 +0100 Subject: [PATCH 12/15] update description of parallelization and progress report --- man/SDAM.Rd | 11 +++++++---- man/SDForest.Rd | 3 +++ vignettes/Runtime.Rmd | 27 ++++++++++++++++++++------- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/man/SDAM.Rd b/man/SDAM.Rd index 703d338..7aa2df8 100644 --- a/man/SDAM.Rd +++ b/man/SDAM.Rd @@ -66,10 +66,12 @@ Default is \code{TRUE} to not reduce the signal of high variance covariates.} \item{ind_lin}{A vector of indices specifying which covariates to model linearly (i.e. not expanded into basis function). 
Default is `NULL`.} -\item{mc.cores}{Number of cores to use for parallel processing, if \code{mc.cores > 1} -the cross validation is parallelized. Default is `1`. (only supported for unix)} +\item{mc.cores}{Number of cores to use for parallel computation `vignette("Runtime")`. +The `future` package is used for parallel processing. +To use custom processing plans mc.cores has to be <= 1, see [`future` package](https://future.futureverse.org/).} -\item{verbose}{If \code{TRUE} fitting information is shown.} +\item{verbose}{If \code{TRUE} progress updates are shown using the `progressr` package. +To customize the progress bar, see [`progressr` package](https://progressr.futureverse.org/articles/progressr-intro.html)} \item{notRegularized}{A vector of indices specifying which covariates not to regularize. Default is `NULL`.} @@ -139,7 +141,8 @@ plot(partDependence(model, mostImp)) # predict predict(model, newdata = wine[42, ]) -## alternative function call +## alternative function call with customized progress bar +progressr::handlers(handler_txtprogressbar(char = cli::col_red(cli::symbol$heart))) mod_none <- SDAM(x = as.matrix(wine[1:10, -c(1, 2)]), y = wine$alcohol[1:10], Q_type = "no_deconfounding", nfolds = 2, n_K = 4, n_lambda1 = 4, n_lambda2 = 8) diff --git a/man/SDForest.Rd b/man/SDForest.Rd index 14ca259..0991e7b 100644 --- a/man/SDForest.Rd +++ b/man/SDForest.Rd @@ -185,6 +185,8 @@ sim_data$j # comparison to classical random forest fit_ranger <- ranger::ranger(Y ~ ., train_data, importance = 'impurity') +# you can customize the progress bar see parameter verbose +progressr::handlers("cli") fit <- SDForest(x = X, y = Y, nTree = 100, Q_type = 'pca', q_hat = 2) fit <- SDForest(Y ~ ., nTree = 100, train_data) fit @@ -195,6 +197,7 @@ fit plot(fit) # a few more might be helpfull +progressr::handlers(handler_txtprogressbar(char = cli::col_red(cli::symbol$heart))) fit2 <- SDForest(Y ~ ., nTree = 50, train_data) fit <- mergeForest(fit, fit2) diff --git 
a/vignettes/Runtime.Rmd b/vignettes/Runtime.Rmd index 0e051df..29683ad 100644 --- a/vignettes/Runtime.Rmd +++ b/vignettes/Runtime.Rmd @@ -20,22 +20,35 @@ For this package, we have written methods to estimate regressions trees and rand $$\hat{f} = \text{argmin}_{f' \in \mathcal{F}} \frac{||Q(\mathbf{Y} - f'(\mathbf{X}))||_2^2}{n}$$ The package is currently fully written in @RCoreTeam2024R:Computing for now and it gets quite slow for larger sample sizes. There might be a faster cpp version in the future, but for now, there are a few ways to increase the computations if you apply the methods to larger data sets. -## Computations +## Parallel Processing -Some speedup can be achieved by taking advantage of modern hardware. +Many of our functions support parallel processing using the parameter `mc.cores` to control the number of cores used. -### Multicore - -When estimating an SDForest, the most obvious way to increase the computations is to fit the individual trees on different cores in parallel. Parallel computing is supported for both Unix and Windows. Depending on how your system is set up, some linear algebra libraries might already run in parallel. In this case, the speed improvement from choosing more than one core to run on might not be that large. Be aware of potential RAM-overflows. - -```{r core, eval=FALSE} +```{r core1, eval=FALSE} # fits the individual SDTrees in parallel on 22 cores fit <- SDForest(x = X, y = Y, mc.cores = 22) +# predicts with the individual SDTrees in parallel +predict(fit, newdata = data.frame(X), mc.cores = 10) + +# evaluates different strengths of regularization in parallel +paths <- regPath(fit, mc.cores = 10) + +# predicts potential outcomes for different values of covariate one in parallel +pd <- partDependence(model, 1, mc.cores = 10) + # performs cross validation in parallel model <- SDAM(X, Y, cv_k = 5, mc.cores = 5) ``` +To support parallelization, we use the R package [future](https://future.futureverse.org/) @RJ-2021-048. 
If `mc.cores` is larger than one, `multicore` (forking of processes) is used if possible, and `multisession` otherwise. If `mc.cores` is smaller than two, we process sequentially or use a pre-specified plan. This way, a user can freely choose and set up any [backend](https://future.futureverse.org/articles/future-2b-backend.html). + +```{r core2, eval=FALSE} +# predefined plan +future::plan(multisession, workers = 2) +# fits the individual SDTrees in parallel on 2 cores +fit <- SDForest(x = X, y = Y) +``` ## Approximations From 2fa590e1dd10b169245f0ec463c0ce5179f300f5 Mon Sep 17 00:00:00 2001 From: Markus Ulmer Date: Fri, 12 Dec 2025 12:01:38 +0100 Subject: [PATCH 13/15] fix details --- DESCRIPTION | 1 + R/SDForest.R | 2 +- README.Rmd | 2 +- README.md | 12 ++++-------- man/SDForest.Rd | 2 +- vignettes/articles/SDTree.Rmd | 2 +- 6 files changed, 9 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5f56d20..1a0931a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,6 +32,7 @@ Suggests: ranger, HDclassif, qpdf, + cli, testthat (>= 3.0.0) RdMacros: Rdpack Encoding: UTF-8 diff --git a/R/SDForest.R b/R/SDForest.R index ef61cc2..d7d180b 100644 --- a/R/SDForest.R +++ b/R/SDForest.R @@ -141,7 +141,7 @@ #' plot(fit) #' #' # a few more might be helpfull -#' progressr::handlers(handler_txtprogressbar(char = cli::col_red(cli::symbol$heart))) +#' progressr::handlers(progressr::handler_txtprogressbar(char = cli::col_red(cli::symbol$heart))) #' fit2 <- SDForest(Y ~ ., nTree = 50, train_data) #' fit <- mergeForest(fit, fit2) #' diff --git a/README.Rmd b/README.Rmd index e2f3708..0ade575 100644 --- a/README.Rmd +++ b/README.Rmd @@ -75,7 +75,7 @@ You can also estimate just one Spectrally Deconfounded Regression Tree using the ```{r SDTree, fig.height=7} Tree <- SDTree(Y ~ ., train_data, cp = 0.01) -plot(Tree) +#plot(Tree) ``` Or you can estimate a Spectrally Deconfounded Additive Model, with theoretical guarantees, using the `SDAM` function. 
See also the article [SDAM](https://www.markus-ulmer.ch/SDModels/articles/SDAM.html). diff --git a/README.md b/README.md index 62bc28f..ba5d343 100644 --- a/README.md +++ b/README.md @@ -76,8 +76,8 @@ fit #> #> Number of trees: 100 #> Number of covariates: 50 -#> OOB loss: 0.1554798 -#> OOB spectral loss: 0.05246865 +#> OOB loss: 0.1617913 +#> OOB spectral loss: 0.05095329 ``` You can also estimate just one Spectrally Deconfounded Regression Tree @@ -86,24 +86,20 @@ using the `SDTree` function. See also the article ``` r Tree <- SDTree(Y ~ ., train_data, cp = 0.01) -plot(Tree) +#plot(Tree) ``` - - Or you can estimate a Spectrally Deconfounded Additive Model, with theoretical guarantees, using the `SDAM` function. See also the article [SDAM](https://www.markus-ulmer.ch/SDModels/articles/SDAM.html). ``` r model <- SDAM(Y ~ ., train_data) -#> [1] "Initial cross-validation" -#> [1] "Second stage cross-validation" model #> SDAM result #> #> Number of covariates: 50 -#> Number of active covariates: 4 +#> Number of active covariates: 3 ```
Date: Fri, 12 Dec 2025 12:07:04 +0100 Subject: [PATCH 14/15] fix progress example --- R/SDAM.R | 2 +- man/SDAM.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/SDAM.R b/R/SDAM.R index b5057dd..4195e90 100644 --- a/R/SDAM.R +++ b/R/SDAM.R @@ -101,7 +101,7 @@ #' predict(model, newdata = wine[42, ]) #' #' ## alternative function call with customized progress bar -#' progressr::handlers(handler_txtprogressbar(char = cli::col_red(cli::symbol$heart))) +#' progressr::handlers(progressr::handler_txtprogressbar(char = cli::col_red(cli::symbol$heart))) #' mod_none <- SDAM(x = as.matrix(wine[1:10, -c(1, 2)]), y = wine$alcohol[1:10], #' Q_type = "no_deconfounding", nfolds = 2, n_K = 4, #' n_lambda1 = 4, n_lambda2 = 8) diff --git a/man/SDAM.Rd b/man/SDAM.Rd index 7aa2df8..286888b 100644 --- a/man/SDAM.Rd +++ b/man/SDAM.Rd @@ -142,7 +142,7 @@ plot(partDependence(model, mostImp)) predict(model, newdata = wine[42, ]) ## alternative function call with customized progress bar -progressr::handlers(handler_txtprogressbar(char = cli::col_red(cli::symbol$heart))) +progressr::handlers(progressr::handler_txtprogressbar(char = cli::col_red(cli::symbol$heart))) mod_none <- SDAM(x = as.matrix(wine[1:10, -c(1, 2)]), y = wine$alcohol[1:10], Q_type = "no_deconfounding", nfolds = 2, n_K = 4, n_lambda1 = 4, n_lambda2 = 8) From 2b111479cdaffb8abf3589c882d7c9325d7bf11d Mon Sep 17 00:00:00 2001 From: Markus Ulmer Date: Fri, 12 Dec 2025 12:27:07 +0100 Subject: [PATCH 15/15] version update --- DESCRIPTION | 2 +- NEWS.md | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1a0931a..9abb785 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: SDModels Title: Spectrally Deconfounded Models -Version: 2.0.1 +Version: 2.0.2 Authors@R: c( person("Markus", "Ulmer", email = "markus.ulmer@stat.math.ethz.ch", role = c("aut", "cre", "cph"), comment = c(ORCID = "0000-0001-7783-8475")), diff --git a/NEWS.md b/NEWS.md index 
03671ab..00376d5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# SDModels 2.0.2 + +* Switch all the parallelization to futures. See `vignette("Runtime")` +* Switch all the progress updates to progressr. Progress updates are now also available for parallel processing and are customizable. +* Processes are much more RAM efficient now. + # SDModels 2.0.1 * Fix bug in SDTree and SDForest where an error occurred, if X had columns with only one unique value.