From 7356265561a363f7a26613d22eee2ce03f09a5b6 Mon Sep 17 00:00:00 2001 From: Hugo Rodrigues Date: Fri, 12 Jun 2026 14:25:00 -0400 Subject: [PATCH] fix(benchmark): restore offline AfSP WRB benchmark in the suite (v0.9.111) The offline AfSP (Africa Soil Profiles) WRB benchmark was silently dropped from run_all_benchmarks()'s consolidated report. Two bugs, one mechanism: 1. load_afsp_sample() returns a wrapper list(pedons, pulled_on, source, filter) -- length 4 -- mirroring load_wosis_stratified_sample() / load_kssl_sample(). The suite fed that wrapper straight into benchmark_afsp() instead of its $pedons slot. 2. benchmark_afsp() then iterated the 4 wrapper elements and hit $site on the atomic ones (pulled_on, source) -> "$ operator is invalid for atomic vectors", which .suite_run_afsp()'s tryCatch swallowed to NULL, so AfSP never appeared. The bundled inst/extdata/afsp_sample.rds was NOT a stub: it already holds the full stratified 5 x 24 = 120-pedon sample. Fix (engine untouched -- harness only): - New internal .afsp_as_pedons() unwraps $pedons when handed the wrapper and passes a bare pedon list through unchanged. - benchmark_afsp() calls it (accepts either form) and skips any non-PedonRecord member instead of erroring. - .suite_run_afsp() unwraps before its length-guard and max_n slice. - load_afsp_sample() docs corrected from the never-shipped "130 / 26 RSGs" to the actual 120 / 24, and clarify the wrapper return. Result: run_all_benchmarks() surfaces a real afsp_sample/wrb2022 row (n=120, acc 25%, bal-acc 25%, macro-F1 0.22, kappa 0.22, NIR 4.2%, bootstrap CI) -- a 2nd independent OFFLINE WRB benchmark alongside FEBR -- with the full v0.9.110 metric set. Tests: - New test-v09111-afsp-benchmark-harness.R: benchmark_afsp(load_afsp_sample()) (raw wrapper) returns non-NULL with a non-empty confusion matrix, junk members are skipped, .suite_run_afsp() yields a full-metric row, and run_all_benchmarks() surfaces AfSP. - test-v09106-benchmark-suite.R: replace the stale assertion that encoded the old buggy "only canonical survives" behaviour with one that asserts external datasets are skipped while offline rows (canonical + AfSP) appear. Full suite: 1702 tests, 0 failures, 0 errors. Co-Authored-By: Claude Opus 4.8 (1M context) --- DESCRIPTION | 4 +- NEWS.md | 38 +++++++ R/afsp.R | 44 +++++++-- R/benchmark-suite.R | 6 +- man/benchmark_afsp.Rd | 6 +- man/dot-afsp_as_pedons.Rd | 20 ++++ man/load_afsp_sample.Rd | 13 ++- tests/testthat/test-v09106-benchmark-suite.R | 11 ++- .../test-v09111-afsp-benchmark-harness.R | 99 +++++++++++++++++++ 9 files changed, 222 insertions(+), 19 deletions(-) create mode 100644 man/dot-afsp_as_pedons.Rd create mode 100644 tests/testthat/test-v09111-afsp-benchmark-harness.R diff --git a/DESCRIPTION b/DESCRIPTION index 15227dd94..b8980fcf3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: soilKey Type: Package Title: Automated Soil Profile Classification per WRB 2022, SiBCS 5 and USDA Soil Taxonomy 13 -Version: 0.9.110 -Date: 2026-06-11 +Version: 0.9.111 +Date: 2026-06-12 Authors@R: person("Hugo", "Rodrigues", email = "rodrigues.machado.hugo@gmail.com", diff --git a/NEWS.md b/NEWS.md index 6f0674461..b242f8467 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,41 @@ +# soilKey 0.9.111 (2026-06-12) + +A benchmark-harness bug fix split out of the v0.9.110 methodology work (B1): +the offline **AfSP (Africa Soil Profiles) WRB benchmark** was silently dropped +from the consolidated report. \strong{The classification engine is unchanged} +-- every edit is in the AfSP loader/benchmark and the suite wiring. + +## AfSP offline WRB benchmark restored + +\itemize{ + \item \code{load_afsp_sample()} returns a \strong{wrapper} + \code{list(pedons, pulled_on, source, filter)} (mirroring + \code{load_wosis_stratified_sample()} / \code{load_kssl_sample()}), but + the suite fed that 4-element wrapper straight into + \code{benchmark_afsp()} instead of its \code{$pedons} slot. The + benchmark then iterated the wrapper and hit \code{$site} on the atomic + members (\code{pulled_on}, \code{source}) -- \emph{"$ operator is + invalid for atomic vectors"} -- which \code{.suite_run_afsp()}'s + \code{tryCatch} swallowed to \code{NULL}, so AfSP never appeared in + \code{run_all_benchmarks()}. + \item New internal \code{.afsp_as_pedons()} unwraps the \code{$pedons} slot + when handed the wrapper and passes a bare pedon list through unchanged. + \code{benchmark_afsp()} now calls it (so it accepts \strong{either} + form) and skips any non-\code{PedonRecord} member instead of erroring; + \code{.suite_run_afsp()} unwraps before its length-guard and + \code{max_n} slice. + \item Result: \code{run_all_benchmarks()} surfaces a real + \strong{afsp_sample/wrb2022} row -- a second independent \strong{offline} + WRB benchmark (Africa, RSG level, n = 120 across 24 RSGs) alongside + \strong{FEBR} -- with the full v0.9.110 metric set (balanced accuracy, + macro-F1, kappa, NIR, bootstrap CI). + \item The bundled \code{inst/extdata/afsp_sample.rds} was \strong{not} a stub: + it already held the full stratified 5 x 24 = 120-pedon sample. The + \code{load_afsp_sample()} docs are corrected from the never-shipped + "130 / 26 RSGs" to the actual 120 / 24, and clarify the wrapper return. +} + + # soilKey 0.9.110 (2026-06-11) The "**benchmark methodology**" release (front B1 of the accuracy work). A diff --git a/R/afsp.R b/R/afsp.R index 4b3f84422..9c00334df 100644 --- a/R/afsp.R +++ b/R/afsp.R @@ -249,16 +249,21 @@ load_afsp_pedons <- function(afsp_dir, #' Load the bundled AfSP stratified sample (v0.9.77) #' -#' Returns a 130-profile snapshot from AfSP v1.2 stratified by WRB -#' RSG (5 profiles per RSG x 26 RSGs), pre-built so users can run +#' Returns a 120-profile snapshot from AfSP v1.2 stratified by WRB +#' RSG (5 profiles per RSG x 24 RSGs), pre-built so users can run #' the African WRB benchmark offline without the 35 MB ZIP download. #' #' This is the African analogue of #' \code{\link{load_wosis_stratified_sample}} (global WoSIS) and #' \code{\link{load_kssl_nasis_sample}} (US KSSL+NASIS). #' -#' @return A list with \code{pedons}, \code{pulled_on}, \code{source}, -#' \code{filter}. +#' Like its siblings, this returns a \strong{wrapper} list -- the pedons +#' are in the \code{$pedons} slot. Pass \code{$pedons} to per-pedon loops, +#' or hand the whole wrapper to \code{\link{benchmark_afsp}}, which unwraps +#' it for you. +#' +#' @return A list with \code{pedons} (120 \code{\link{PedonRecord}}s), +#' \code{pulled_on}, \code{source}, \code{filter}. #' #' @section Reference: #' Leenaars, J. G. B., van Oostrum, A. J. M., & Ruiperez Gonzalez, M. @@ -276,27 +281,50 @@ load_afsp_sample <- function() { } +#' Coerce an AfSP sample to a bare list of PedonRecords +#' +#' \code{\link{load_afsp_sample}} returns a wrapper +#' \code{list(pedons, pulled_on, source, filter)} (mirroring +#' \code{\link{load_wosis_stratified_sample}} / +#' \code{\link{load_kssl_sample}}), but the benchmark + suite loops iterate +#' pedons. This unwraps the \code{$pedons} slot when handed the wrapper and +#' otherwise passes a bare pedon list through unchanged, so callers can feed +#' either form. Without it, iterating the 4-element wrapper hits +#' \code{$site} on atomic elements (\code{pulled_on}, \code{source}) and +#' throws "$ operator is invalid for atomic vectors". +#' @keywords internal +.afsp_as_pedons <- function(x) { + if (is.list(x) && !is.null(x$pedons) && is.list(x$pedons)) x$pedons else x +} + + #' Benchmark soilKey WRB predictions against AfSP ground truth #' -#' @param pedons List of \code{\link{PedonRecord}} from -#' \code{\link{load_afsp_pedons}} or \code{\link{load_afsp_sample}}. +#' @param pedons A list of \code{\link{PedonRecord}} (e.g.\ from +#' \code{\link{load_afsp_pedons}}), or the wrapper returned by +#' \code{\link{load_afsp_sample}} (its \code{$pedons} slot is +#' unwrapped automatically). #' @param verbose Print progress. #' #' @return List with \code{accuracy}, \code{n_compared}, \code{confusion}, #' \code{per_class_recall}. #' @export benchmark_afsp <- function(pedons, verbose = TRUE) { + pedons <- .afsp_as_pedons(pedons) if (length(pedons) == 0L) stop("Empty pedon list.") if (isTRUE(verbose)) cat(sprintf("[afsp] benchmarking %d pedons\n", length(pedons))) preds <- vapply(pedons, function(pr) { + if (!inherits(pr, "PedonRecord")) return(NA_character_) res <- tryCatch(classify_wrb2022(pr, on_missing = "silent"), error = function(e) NULL) if (is.null(res)) NA_character_ else sub("s$", "", res$rsg_or_order) }, character(1)) - refs <- vapply(pedons, function(p) p$site$reference_wrb %||% NA_character_, - character(1)) + refs <- vapply(pedons, function(p) { + if (!inherits(p, "PedonRecord")) return(NA_character_) + p$site$reference_wrb %||% NA_character_ + }, character(1)) in_scope <- !is.na(refs) & !is.na(preds) n_correct <- sum(in_scope & refs == preds) diff --git a/R/benchmark-suite.R b/R/benchmark-suite.R index ea1a270c6..3ebe8239a 100644 --- a/R/benchmark-suite.R +++ b/R/benchmark-suite.R @@ -189,7 +189,11 @@ run_all_benchmarks <- function(datasets = "auto", paths = NULL, max_n = 300L, # AfSP offline sample, classified under WRB. .suite_run_afsp <- function(max_n, verbose) { - peds <- tryCatch(load_afsp_sample(), error = function(e) NULL) + # load_afsp_sample() returns the wrapper list(pedons, pulled_on, ...); + # unwrap to the bare pedon list so the length-guard and max_n slice + # operate on the 120 pedons, not the 4 wrapper elements (which would + # otherwise slice/feed atomic members into benchmark_afsp and crash). + peds <- .afsp_as_pedons(tryCatch(load_afsp_sample(), error = function(e) NULL)) if (is.null(peds) || !length(peds)) return(NULL) if (!is.null(max_n)) peds <- peds[seq_len(min(max_n, length(peds)))] res <- tryCatch(benchmark_afsp(peds, verbose = FALSE), error = function(e) NULL) diff --git a/man/benchmark_afsp.Rd b/man/benchmark_afsp.Rd index 92ff3d18e..2c8c4d701 100644 --- a/man/benchmark_afsp.Rd +++ b/man/benchmark_afsp.Rd @@ -7,8 +7,10 @@ benchmark_afsp(pedons, verbose = TRUE) } \arguments{ -\item{pedons}{List of \code{\link{PedonRecord}} from -\code{\link{load_afsp_pedons}} or \code{\link{load_afsp_sample}}.} +\item{pedons}{A list of \code{\link{PedonRecord}} (e.g.\ from +\code{\link{load_afsp_pedons}}), or the wrapper returned by +\code{\link{load_afsp_sample}} (its \code{$pedons} slot is +unwrapped automatically).} \item{verbose}{Print progress.} } diff --git a/man/dot-afsp_as_pedons.Rd b/man/dot-afsp_as_pedons.Rd new file mode 100644 index 000000000..8765e113f --- /dev/null +++ b/man/dot-afsp_as_pedons.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/afsp.R +\name{.afsp_as_pedons} +\alias{.afsp_as_pedons} +\title{Coerce an AfSP sample to a bare list of PedonRecords} +\usage{ +.afsp_as_pedons(x) +} +\description{ +\code{\link{load_afsp_sample}} returns a wrapper +\code{list(pedons, pulled_on, source, filter)} (mirroring +\code{\link{load_wosis_stratified_sample}} / +\code{\link{load_kssl_sample}}), but the benchmark + suite loops iterate +pedons. This unwraps the \code{$pedons} slot when handed the wrapper and +otherwise passes a bare pedon list through unchanged, so callers can feed +either form. Without it, iterating the 4-element wrapper hits +\code{$site} on atomic elements (\code{pulled_on}, \code{source}) and +throws "$ operator is invalid for atomic vectors". +} +\keyword{internal} diff --git a/man/load_afsp_sample.Rd b/man/load_afsp_sample.Rd index 7320b43cd..4dd65102b 100644 --- a/man/load_afsp_sample.Rd +++ b/man/load_afsp_sample.Rd @@ -7,18 +7,23 @@ load_afsp_sample() } \value{ -A list with \code{pedons}, \code{pulled_on}, \code{source}, - \code{filter}. +A list with \code{pedons} (120 \code{\link{PedonRecord}}s), + \code{pulled_on}, \code{source}, \code{filter}. } \description{ -Returns a 130-profile snapshot from AfSP v1.2 stratified by WRB -RSG (5 profiles per RSG x 26 RSGs), pre-built so users can run +Returns a 120-profile snapshot from AfSP v1.2 stratified by WRB +RSG (5 profiles per RSG x 24 RSGs), pre-built so users can run the African WRB benchmark offline without the 35 MB ZIP download. } \details{ This is the African analogue of \code{\link{load_wosis_stratified_sample}} (global WoSIS) and \code{\link{load_kssl_nasis_sample}} (US KSSL+NASIS). + +Like its siblings, this returns a \strong{wrapper} list -- the pedons +are in the \code{$pedons} slot. Pass \code{$pedons} to per-pedon loops, +or hand the whole wrapper to \code{\link{benchmark_afsp}}, which unwraps +it for you. } \section{Reference}{ diff --git a/tests/testthat/test-v09106-benchmark-suite.R b/tests/testthat/test-v09106-benchmark-suite.R index ac37757c0..25497f300 100644 --- a/tests/testthat/test-v09106-benchmark-suite.R +++ b/tests/testthat/test-v09106-benchmark-suite.R @@ -28,8 +28,15 @@ test_that("auto-detection skips absent datasets without error", { lucas_csv = empty, esdb_root = empty, redape = empty) res <- expect_no_error(suppressWarnings( run_all_benchmarks(datasets = "auto", paths = paths, verbose = FALSE))) - # only the canonical row survives (external datasets skipped, no error) - expect_true(all(res$summary$dataset == "canonical")) + # Every external (network/path-gated) dataset is skipped, leaving only the + # offline rows: the canonical sanity row always, plus the bundled AfSP sample + # when its .rds is present (v0.9.111 -- previously dropped by the harness bug). + external <- c("bdsolos", "febr", "kssl", "lucas_esdb", "redape") + expect_false(any(res$summary$dataset %in% external)) + expect_true("canonical" %in% res$summary$dataset) + if (nzchar(system.file("extdata", "afsp_sample.rds", package = "soilKey")) || + file.exists(file.path("inst", "extdata", "afsp_sample.rds"))) + expect_true("afsp_sample" %in% res$summary$dataset) }) diff --git a/tests/testthat/test-v09111-afsp-benchmark-harness.R b/tests/testthat/test-v09111-afsp-benchmark-harness.R new file mode 100644 index 000000000..5badbf814 --- /dev/null +++ b/tests/testthat/test-v09111-afsp-benchmark-harness.R @@ -0,0 +1,99 @@ +# ============================================================================= +# Tests for v0.9.111 -- AfSP offline benchmark harness fix. +# +# Two distinct bugs kept the AfSP/wrb2022 row out of the consolidated suite: +# 1. load_afsp_sample() returns the WRAPPER list(pedons, pulled_on, source, +# filter) -- length 4 -- and the harness fed that straight into +# benchmark_afsp() instead of its $pedons slot. +# 2. benchmark_afsp() then iterated the 4 wrapper elements, hitting $site on +# the atomic ones (pulled_on, source) -> "$ operator is invalid for atomic +# vectors", caught by .suite_run_afsp()'s tryCatch -> silent NULL -> AfSP +# never appeared in run_all_benchmarks(). +# The fix: .afsp_as_pedons() unwraps the wrapper, benchmark_afsp() accepts +# either form and skips non-PedonRecord elements, and .suite_run_afsp() unwraps +# before its length-guard / max_n slice. +# ============================================================================= + + +.afsp_sample_present <- function() { + file.exists(file.path("inst", "extdata", "afsp_sample.rds")) || + nzchar(system.file("extdata", "afsp_sample.rds", package = "soilKey")) +} + + +# ---- unwrap helper --------------------------------------------------------- + +test_that("v0.9.111: .afsp_as_pedons() unwraps the wrapper but passes a bare list", { + bare <- list("a", "b", "c") + expect_identical(soilKey:::.afsp_as_pedons(bare), bare) + + wrapped <- list(pedons = bare, pulled_on = "2026-05-09", source = "x", + filter = list()) + expect_identical(soilKey:::.afsp_as_pedons(wrapped), bare) + + # NULL (failed load) flows straight through for the downstream length-guard. + expect_null(soilKey:::.afsp_as_pedons(NULL)) +}) + + +# ---- bug #2 regression: benchmark_afsp() on the raw wrapper ----------------- + +test_that("v0.9.111: benchmark_afsp() accepts the load_afsp_sample() wrapper directly", { + testthat::skip_if_not(.afsp_sample_present(), "Bundled AfSP sample not present") + peds <- load_afsp_sample() + expect_length(peds, 4L) # it IS the 4-element wrapper + + # Previously: "$ operator is invalid for atomic vectors". Now it unwraps. + res <- suppressWarnings(benchmark_afsp(peds, verbose = FALSE)) + expect_false(is.null(res)) + expect_true(is.table(res$confusion)) + expect_gt(sum(res$confusion), 0L) # non-empty confusion matrix + expect_equal(res$n_total, 120L) + + # Wrapper and its $pedons slot yield the identical benchmark. + res2 <- suppressWarnings(benchmark_afsp(peds$pedons, verbose = FALSE)) + expect_identical(res$confusion, res2$confusion) + expect_equal(res$accuracy, res2$accuracy) +}) + + +test_that("v0.9.111: benchmark_afsp() skips non-PedonRecord members without erroring", { + testthat::skip_if_not(.afsp_sample_present(), "Bundled AfSP sample not present") + peds <- load_afsp_sample()$pedons + # Inject atomic junk among the pedons; the refs/preds loops must not crash. + poisoned <- c(peds[1:5], list("not-a-pedon", 42L, NA)) + res <- suppressWarnings(benchmark_afsp(poisoned, verbose = FALSE)) + expect_false(is.null(res)) + expect_equal(res$n_total, 8L) # 5 real + 3 junk + expect_lte(res$n_compared, 5L) # only real pedons are in scope +}) + + +# ---- bug #1 regression: the suite surfaces a real AfSP row ------------------ + +test_that("v0.9.111: .suite_run_afsp() returns a full-metric AfSP/wrb2022 row", { + testthat::skip_if_not(.afsp_sample_present(), "Bundled AfSP sample not present") + row <- suppressWarnings(soilKey:::.suite_run_afsp(300L, verbose = FALSE)) + expect_false(is.null(row)) + expect_equal(nrow(row), 1L) + expect_identical(row$dataset, "afsp_sample") + expect_identical(row$system, "wrb2022") + expect_equal(row$n_compared, 120L) + # v0.9.110 metric columns are all present (confusion -> full metric set). + expect_true(all(c("accuracy", "acc_lo", "acc_hi", "balanced_acc", + "macro_f1", "kappa", "nir") %in% names(row))) + expect_false(is.na(row$balanced_acc)) + expect_false(is.na(row$kappa)) +}) + + +test_that("v0.9.111: run_all_benchmarks() surfaces AfSP as an offline WRB row", { + testthat::skip_if_not(.afsp_sample_present(), "Bundled AfSP sample not present") + # "canonical" runs no network datasets but still attaches the offline AfSP row. + out <- suppressWarnings(run_all_benchmarks(datasets = "canonical", + verbose = FALSE)) + hit <- out$summary$dataset == "afsp_sample" & out$summary$system == "wrb2022" + expect_true(any(hit)) + expect_equal(out$summary$n_compared[hit], 120L) + expect_gt(out$summary$accuracy[hit], 0.15) +})