From 7356265561a363f7a26613d22eee2ce03f09a5b6 Mon Sep 17 00:00:00 2001
From: Hugo Rodrigues <rodrigues.h@Hugos-MacBook-Pro.local>
Date: Fri, 12 Jun 2026 14:25:00 -0400
Subject: [PATCH] fix(benchmark): restore offline AfSP WRB benchmark in the
 suite (v0.9.111)

The offline AfSP (Africa Soil Profiles) WRB benchmark was silently dropped
from run_all_benchmarks()'s consolidated report. Two bugs, one mechanism:

  1. load_afsp_sample() returns a wrapper list(pedons, pulled_on, source,
     filter) -- length 4 -- mirroring load_wosis_stratified_sample() /
     load_kssl_sample(). The suite fed that wrapper straight into
     benchmark_afsp() instead of its $pedons slot.
  2. benchmark_afsp() then iterated the 4 wrapper elements and hit $site on
     the atomic ones (pulled_on, source) -> "$ operator is invalid for
     atomic vectors", which .suite_run_afsp()'s tryCatch swallowed to NULL,
     so AfSP never appeared.

The bundled inst/extdata/afsp_sample.rds was NOT a stub: it already holds the
full stratified 5 x 24 = 120-pedon sample.

Fix (engine untouched -- harness only):
  - New internal .afsp_as_pedons() unwraps $pedons when handed the wrapper
    and passes a bare pedon list through unchanged.
  - benchmark_afsp() calls it (accepts either form) and skips any
    non-PedonRecord member instead of erroring.
  - .suite_run_afsp() unwraps before its length-guard and max_n slice.
  - load_afsp_sample() docs corrected from the never-shipped "130 / 26 RSGs"
    to the actual 120 / 24, and clarify the wrapper return.

Result: run_all_benchmarks() surfaces a real afsp_sample/wrb2022 row (n=120,
acc 25%, bal-acc 25%, macro-F1 0.22, kappa 0.22, NIR 4.2%, bootstrap CI) -- a
2nd independent OFFLINE WRB benchmark alongside FEBR -- with the full v0.9.110
metric set.

Tests:
  - New test-v09111-afsp-benchmark-harness.R: benchmark_afsp(load_afsp_sample())
    (raw wrapper) returns non-NULL with a non-empty confusion matrix, junk
    members are skipped, .suite_run_afsp() yields a full-metric row, and
    run_all_benchmarks() surfaces AfSP.
  - test-v09106-benchmark-suite.R: replace the stale assertion that encoded the
    old buggy "only canonical survives" behaviour with one that asserts external
    datasets are skipped while offline rows (canonical + AfSP) appear.

Full suite: 1702 tests, 0 failures, 0 errors.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 DESCRIPTION                                   |  4 +-
 NEWS.md                                       | 38 +++++++
 R/afsp.R                                      | 44 +++++++--
 R/benchmark-suite.R                           |  6 +-
 man/benchmark_afsp.Rd                         |  6 +-
 man/dot-afsp_as_pedons.Rd                     | 20 ++++
 man/load_afsp_sample.Rd                       | 13 ++-
 tests/testthat/test-v09106-benchmark-suite.R  | 11 ++-
 .../test-v09111-afsp-benchmark-harness.R      | 99 +++++++++++++++++++
 9 files changed, 222 insertions(+), 19 deletions(-)
 create mode 100644 man/dot-afsp_as_pedons.Rd
 create mode 100644 tests/testthat/test-v09111-afsp-benchmark-harness.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 15227dd94..b8980fcf3 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: soilKey
 Type: Package
 Title: Automated Soil Profile Classification per WRB 2022, SiBCS 5 and USDA Soil Taxonomy 13
-Version: 0.9.110
-Date: 2026-06-11
+Version: 0.9.111
+Date: 2026-06-12
 Authors@R:
     person("Hugo", "Rodrigues",
            email = "rodrigues.machado.hugo@gmail.com",
diff --git a/NEWS.md b/NEWS.md
index 6f0674461..b242f8467 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,41 @@
+# soilKey 0.9.111 (2026-06-12)
+
+A benchmark-harness bug fix split out of the v0.9.110 methodology work (B1):
+the offline **AfSP (Africa Soil Profiles) WRB benchmark** was silently dropped
+from the consolidated report. \strong{The classification engine is unchanged}
+-- every edit is in the AfSP loader/benchmark and the suite wiring.
+
+## AfSP offline WRB benchmark restored
+
+\itemize{
+  \item \code{load_afsp_sample()} returns a \strong{wrapper}
+        \code{list(pedons, pulled_on, source, filter)} (mirroring
+        \code{load_wosis_stratified_sample()} / \code{load_kssl_sample()}), but
+        the suite fed that 4-element wrapper straight into
+        \code{benchmark_afsp()} instead of its \code{$pedons} slot. The
+        benchmark then iterated the wrapper and hit \code{$site} on the atomic
+        members (\code{pulled_on}, \code{source}) -- \emph{"$ operator is
+        invalid for atomic vectors"} -- which \code{.suite_run_afsp()}'s
+        \code{tryCatch} swallowed to \code{NULL}, so AfSP never appeared in
+        \code{run_all_benchmarks()}.
+  \item New internal \code{.afsp_as_pedons()} unwraps the \code{$pedons} slot
+        when handed the wrapper and passes a bare pedon list through unchanged.
+        \code{benchmark_afsp()} now calls it (so it accepts \strong{either}
+        form) and skips any non-\code{PedonRecord} member instead of erroring;
+        \code{.suite_run_afsp()} unwraps before its length-guard and
+        \code{max_n} slice.
+  \item Result: \code{run_all_benchmarks()} surfaces a real
+        \strong{afsp_sample/wrb2022} row -- a second independent \strong{offline}
+        WRB benchmark (Africa, RSG level, n = 120 across 24 RSGs) alongside
+        \strong{FEBR} -- with the full v0.9.110 metric set (balanced accuracy,
+        macro-F1, kappa, NIR, bootstrap CI).
+  \item The bundled \code{inst/extdata/afsp_sample.rds} was \strong{not} a stub:
+        it already held the full stratified 5 x 24 = 120-pedon sample. The
+        \code{load_afsp_sample()} docs are corrected from the never-shipped
+        "130 / 26 RSGs" to the actual 120 / 24, and clarify the wrapper return.
+}
+
+
 # soilKey 0.9.110 (2026-06-11)
 
 The "**benchmark methodology**" release (front B1 of the accuracy work). A
diff --git a/R/afsp.R b/R/afsp.R
index 4b3f84422..9c00334df 100644
--- a/R/afsp.R
+++ b/R/afsp.R
@@ -249,16 +249,21 @@ load_afsp_pedons <- function(afsp_dir,
 
 #' Load the bundled AfSP stratified sample (v0.9.77)
 #'
-#' Returns a 130-profile snapshot from AfSP v1.2 stratified by WRB
-#' RSG (5 profiles per RSG x 26 RSGs), pre-built so users can run
+#' Returns a 120-profile snapshot from AfSP v1.2 stratified by WRB
+#' RSG (5 profiles per RSG x 24 RSGs), pre-built so users can run
 #' the African WRB benchmark offline without the 35 MB ZIP download.
 #'
 #' This is the African analogue of
 #' \code{\link{load_wosis_stratified_sample}} (global WoSIS) and
 #' \code{\link{load_kssl_nasis_sample}} (US KSSL+NASIS).
 #'
-#' @return A list with \code{pedons}, \code{pulled_on}, \code{source},
-#'   \code{filter}.
+#' Like its siblings, this returns a \strong{wrapper} list -- the pedons
+#' are in the \code{$pedons} slot. Pass \code{$pedons} to per-pedon loops,
+#' or hand the whole wrapper to \code{\link{benchmark_afsp}}, which unwraps
+#' it for you.
+#'
+#' @return A list with \code{pedons} (120 \code{\link{PedonRecord}}s),
+#'   \code{pulled_on}, \code{source}, \code{filter}.
 #'
 #' @section Reference:
 #' Leenaars, J. G. B., van Oostrum, A. J. M., & Ruiperez Gonzalez, M.
@@ -276,27 +281,50 @@ load_afsp_sample <- function() {
 }
 
 
+#' Coerce an AfSP sample to a bare list of PedonRecords
+#'
+#' \code{\link{load_afsp_sample}} returns a wrapper
+#' \code{list(pedons, pulled_on, source, filter)} (mirroring
+#' \code{\link{load_wosis_stratified_sample}} /
+#' \code{\link{load_kssl_sample}}), but the benchmark + suite loops iterate
+#' pedons. This unwraps the \code{$pedons} slot when handed the wrapper and
+#' otherwise passes a bare pedon list through unchanged, so callers can feed
+#' either form. Without it, iterating the 4-element wrapper hits
+#' \code{$site} on atomic elements (\code{pulled_on}, \code{source}) and
+#' throws "$ operator is invalid for atomic vectors".
+#' @keywords internal
+.afsp_as_pedons <- function(x) {
+  if (is.list(x) && !is.null(x$pedons) && is.list(x$pedons)) x$pedons else x
+}
+
+
 #' Benchmark soilKey WRB predictions against AfSP ground truth
 #'
-#' @param pedons List of \code{\link{PedonRecord}} from
-#'        \code{\link{load_afsp_pedons}} or \code{\link{load_afsp_sample}}.
+#' @param pedons A list of \code{\link{PedonRecord}} (e.g.\ from
+#'        \code{\link{load_afsp_pedons}}), or the wrapper returned by
+#'        \code{\link{load_afsp_sample}} (its \code{$pedons} slot is
+#'        unwrapped automatically).
 #' @param verbose Print progress.
 #'
 #' @return List with \code{accuracy}, \code{n_compared}, \code{confusion},
 #'   \code{per_class_recall}.
 #' @export
 benchmark_afsp <- function(pedons, verbose = TRUE) {
+  pedons <- .afsp_as_pedons(pedons)
   if (length(pedons) == 0L) stop("Empty pedon list.")
   if (isTRUE(verbose))
     cat(sprintf("[afsp] benchmarking %d pedons\n", length(pedons)))
 
   preds <- vapply(pedons, function(pr) {
+    if (!inherits(pr, "PedonRecord")) return(NA_character_)
     res <- tryCatch(classify_wrb2022(pr, on_missing = "silent"),
                      error = function(e) NULL)
     if (is.null(res)) NA_character_ else sub("s$", "", res$rsg_or_order)
   }, character(1))
-  refs <- vapply(pedons, function(p) p$site$reference_wrb %||% NA_character_,
-                  character(1))
+  refs <- vapply(pedons, function(p) {
+    if (!inherits(p, "PedonRecord")) return(NA_character_)
+    p$site$reference_wrb %||% NA_character_
+  }, character(1))
 
   in_scope <- !is.na(refs) & !is.na(preds)
   n_correct <- sum(in_scope & refs == preds)
diff --git a/R/benchmark-suite.R b/R/benchmark-suite.R
index ea1a270c6..3ebe8239a 100644
--- a/R/benchmark-suite.R
+++ b/R/benchmark-suite.R
@@ -189,7 +189,11 @@ run_all_benchmarks <- function(datasets = "auto", paths = NULL, max_n = 300L,
 
 # AfSP offline sample, classified under WRB.
 .suite_run_afsp <- function(max_n, verbose) {
-  peds <- tryCatch(load_afsp_sample(), error = function(e) NULL)
+  # load_afsp_sample() returns the wrapper list(pedons, pulled_on, ...);
+  # unwrap to the bare pedon list so the length-guard and max_n slice
+  # operate on the 120 pedons, not the 4 wrapper elements (which would
+  # otherwise slice/feed atomic members into benchmark_afsp and crash).
+  peds <- .afsp_as_pedons(tryCatch(load_afsp_sample(), error = function(e) NULL))
   if (is.null(peds) || !length(peds)) return(NULL)
   if (!is.null(max_n)) peds <- peds[seq_len(min(max_n, length(peds)))]
   res <- tryCatch(benchmark_afsp(peds, verbose = FALSE), error = function(e) NULL)
diff --git a/man/benchmark_afsp.Rd b/man/benchmark_afsp.Rd
index 92ff3d18e..2c8c4d701 100644
--- a/man/benchmark_afsp.Rd
+++ b/man/benchmark_afsp.Rd
@@ -7,8 +7,10 @@
 benchmark_afsp(pedons, verbose = TRUE)
 }
 \arguments{
-\item{pedons}{List of \code{\link{PedonRecord}} from
-\code{\link{load_afsp_pedons}} or \code{\link{load_afsp_sample}}.}
+\item{pedons}{A list of \code{\link{PedonRecord}} (e.g.\ from
+\code{\link{load_afsp_pedons}}), or the wrapper returned by
+\code{\link{load_afsp_sample}} (its \code{$pedons} slot is
+unwrapped automatically).}
 
 \item{verbose}{Print progress.}
 }
diff --git a/man/dot-afsp_as_pedons.Rd b/man/dot-afsp_as_pedons.Rd
new file mode 100644
index 000000000..8765e113f
--- /dev/null
+++ b/man/dot-afsp_as_pedons.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/afsp.R
+\name{.afsp_as_pedons}
+\alias{.afsp_as_pedons}
+\title{Coerce an AfSP sample to a bare list of PedonRecords}
+\usage{
+.afsp_as_pedons(x)
+}
+\description{
+\code{\link{load_afsp_sample}} returns a wrapper
+\code{list(pedons, pulled_on, source, filter)} (mirroring
+\code{\link{load_wosis_stratified_sample}} /
+\code{\link{load_kssl_sample}}), but the benchmark + suite loops iterate
+pedons. This unwraps the \code{$pedons} slot when handed the wrapper and
+otherwise passes a bare pedon list through unchanged, so callers can feed
+either form. Without it, iterating the 4-element wrapper hits
+\code{$site} on atomic elements (\code{pulled_on}, \code{source}) and
+throws "$ operator is invalid for atomic vectors".
+}
+\keyword{internal}
diff --git a/man/load_afsp_sample.Rd b/man/load_afsp_sample.Rd
index 7320b43cd..4dd65102b 100644
--- a/man/load_afsp_sample.Rd
+++ b/man/load_afsp_sample.Rd
@@ -7,18 +7,23 @@
 load_afsp_sample()
 }
 \value{
-A list with \code{pedons}, \code{pulled_on}, \code{source},
-  \code{filter}.
+A list with \code{pedons} (120 \code{\link{PedonRecord}}s),
+  \code{pulled_on}, \code{source}, \code{filter}.
 }
 \description{
-Returns a 130-profile snapshot from AfSP v1.2 stratified by WRB
-RSG (5 profiles per RSG x 26 RSGs), pre-built so users can run
+Returns a 120-profile snapshot from AfSP v1.2 stratified by WRB
+RSG (5 profiles per RSG x 24 RSGs), pre-built so users can run
 the African WRB benchmark offline without the 35 MB ZIP download.
 }
 \details{
 This is the African analogue of
 \code{\link{load_wosis_stratified_sample}} (global WoSIS) and
 \code{\link{load_kssl_nasis_sample}} (US KSSL+NASIS).
+
+Like its siblings, this returns a \strong{wrapper} list -- the pedons
+are in the \code{$pedons} slot. Pass \code{$pedons} to per-pedon loops,
+or hand the whole wrapper to \code{\link{benchmark_afsp}}, which unwraps
+it for you.
 }
 \section{Reference}{
 
diff --git a/tests/testthat/test-v09106-benchmark-suite.R b/tests/testthat/test-v09106-benchmark-suite.R
index ac37757c0..25497f300 100644
--- a/tests/testthat/test-v09106-benchmark-suite.R
+++ b/tests/testthat/test-v09106-benchmark-suite.R
@@ -28,8 +28,15 @@ test_that("auto-detection skips absent datasets without error", {
                 lucas_csv = empty, esdb_root = empty, redape = empty)
   res <- expect_no_error(suppressWarnings(
     run_all_benchmarks(datasets = "auto", paths = paths, verbose = FALSE)))
-  # only the canonical row survives (external datasets skipped, no error)
-  expect_true(all(res$summary$dataset == "canonical"))
+  # Every external (network/path-gated) dataset is skipped, leaving only the
+  # offline rows: the canonical sanity row always, plus the bundled AfSP sample
+  # when its .rds is present (v0.9.111 -- previously dropped by the harness bug).
+  external <- c("bdsolos", "febr", "kssl", "lucas_esdb", "redape")
+  expect_false(any(res$summary$dataset %in% external))
+  expect_true("canonical" %in% res$summary$dataset)
+  if (nzchar(system.file("extdata", "afsp_sample.rds", package = "soilKey")) ||
+      file.exists(file.path("inst", "extdata", "afsp_sample.rds")))
+    expect_true("afsp_sample" %in% res$summary$dataset)
 })
 
 
diff --git a/tests/testthat/test-v09111-afsp-benchmark-harness.R b/tests/testthat/test-v09111-afsp-benchmark-harness.R
new file mode 100644
index 000000000..5badbf814
--- /dev/null
+++ b/tests/testthat/test-v09111-afsp-benchmark-harness.R
@@ -0,0 +1,99 @@
+# =============================================================================
+# Tests for v0.9.111 -- AfSP offline benchmark harness fix.
+#
+# Two distinct bugs kept the AfSP/wrb2022 row out of the consolidated suite:
+#   1. load_afsp_sample() returns the WRAPPER list(pedons, pulled_on, source,
+#      filter) -- length 4 -- and the harness fed that straight into
+#      benchmark_afsp() instead of its $pedons slot.
+#   2. benchmark_afsp() then iterated the 4 wrapper elements, hitting $site on
+#      the atomic ones (pulled_on, source) -> "$ operator is invalid for atomic
+#      vectors", caught by .suite_run_afsp()'s tryCatch -> silent NULL -> AfSP
+#      never appeared in run_all_benchmarks().
+# The fix: .afsp_as_pedons() unwraps the wrapper, benchmark_afsp() accepts
+# either form and skips non-PedonRecord elements, and .suite_run_afsp() unwraps
+# before its length-guard / max_n slice.
+# =============================================================================
+
+
+.afsp_sample_present <- function() {
+  file.exists(file.path("inst", "extdata", "afsp_sample.rds")) ||
+    nzchar(system.file("extdata", "afsp_sample.rds", package = "soilKey"))
+}
+
+
+# ---- unwrap helper ---------------------------------------------------------
+
+test_that("v0.9.111: .afsp_as_pedons() unwraps the wrapper but passes a bare list", {
+  bare <- list("a", "b", "c")
+  expect_identical(soilKey:::.afsp_as_pedons(bare), bare)
+
+  wrapped <- list(pedons = bare, pulled_on = "2026-05-09", source = "x",
+                  filter = list())
+  expect_identical(soilKey:::.afsp_as_pedons(wrapped), bare)
+
+  # NULL (failed load) flows straight through for the downstream length-guard.
+  expect_null(soilKey:::.afsp_as_pedons(NULL))
+})
+
+
+# ---- bug #2 regression: benchmark_afsp() on the raw wrapper -----------------
+
+test_that("v0.9.111: benchmark_afsp() accepts the load_afsp_sample() wrapper directly", {
+  testthat::skip_if_not(.afsp_sample_present(), "Bundled AfSP sample not present")
+  peds <- load_afsp_sample()
+  expect_length(peds, 4L)              # it IS the 4-element wrapper
+
+  # Previously: "$ operator is invalid for atomic vectors". Now it unwraps.
+  res <- suppressWarnings(benchmark_afsp(peds, verbose = FALSE))
+  expect_false(is.null(res))
+  expect_true(is.table(res$confusion))
+  expect_gt(sum(res$confusion), 0L)    # non-empty confusion matrix
+  expect_equal(res$n_total, 120L)
+
+  # Wrapper and its $pedons slot yield the identical benchmark.
+  res2 <- suppressWarnings(benchmark_afsp(peds$pedons, verbose = FALSE))
+  expect_identical(res$confusion, res2$confusion)
+  expect_equal(res$accuracy, res2$accuracy)
+})
+
+
+test_that("v0.9.111: benchmark_afsp() skips non-PedonRecord members without erroring", {
+  testthat::skip_if_not(.afsp_sample_present(), "Bundled AfSP sample not present")
+  peds <- load_afsp_sample()$pedons
+  # Inject atomic junk among the pedons; the refs/preds loops must not crash.
+  poisoned <- c(peds[1:5], list("not-a-pedon", 42L, NA))
+  res <- suppressWarnings(benchmark_afsp(poisoned, verbose = FALSE))
+  expect_false(is.null(res))
+  expect_equal(res$n_total, 8L)        # 5 real + 3 junk
+  expect_lte(res$n_compared, 5L)       # only real pedons are in scope
+})
+
+
+# ---- bug #1 regression: the suite surfaces a real AfSP row ------------------
+
+test_that("v0.9.111: .suite_run_afsp() returns a full-metric AfSP/wrb2022 row", {
+  testthat::skip_if_not(.afsp_sample_present(), "Bundled AfSP sample not present")
+  row <- suppressWarnings(soilKey:::.suite_run_afsp(300L, verbose = FALSE))
+  expect_false(is.null(row))
+  expect_equal(nrow(row), 1L)
+  expect_identical(row$dataset, "afsp_sample")
+  expect_identical(row$system, "wrb2022")
+  expect_equal(row$n_compared, 120L)
+  # v0.9.110 metric columns are all present (confusion -> full metric set).
+  expect_true(all(c("accuracy", "acc_lo", "acc_hi", "balanced_acc",
+                    "macro_f1", "kappa", "nir") %in% names(row)))
+  expect_false(is.na(row$balanced_acc))
+  expect_false(is.na(row$kappa))
+})
+
+
+test_that("v0.9.111: run_all_benchmarks() surfaces AfSP as an offline WRB row", {
+  testthat::skip_if_not(.afsp_sample_present(), "Bundled AfSP sample not present")
+  # "canonical" runs no network datasets but still attaches the offline AfSP row.
+  out <- suppressWarnings(run_all_benchmarks(datasets = "canonical",
+                                             verbose = FALSE))
+  hit <- out$summary$dataset == "afsp_sample" & out$summary$system == "wrb2022"
+  expect_true(any(hit))
+  expect_equal(out$summary$n_compared[hit], 120L)
+  expect_gt(out$summary$accuracy[hit], 0.15)
+})