From b44f2b9828cf2c1b42b3caae0afe9b85ef3addd6 Mon Sep 17 00:00:00 2001 From: andrewpbray Date: Tue, 26 May 2026 13:34:01 -0700 Subject: [PATCH] Add weighted MAE to results table and tests - compute_mae_and_isp now returns MAE, wMAE, and ISP separately; wMAE uses rubric point weights from metadata when supplied, NA otherwise - generate_results_row passes metadata.json to compute_mae_and_isp and populates wMAE_ columns alongside MAE_ and ISP_ - generate_gt_results_table adds wMAE spanner and label stripping - New test file test-compute-mae-and-isp.R with 18 tests covering all three metrics across no-metadata, with-metadata, and edge cases --- R/computing-accuracy.R | 5 +- R/generate-results-table.R | 26 +++++-- tests/testthat/test-compute-mae-and-isp.R | 93 +++++++++++++++++++++++ 3 files changed, 116 insertions(+), 8 deletions(-) create mode 100644 tests/testthat/test-compute-mae-and-isp.R diff --git a/R/computing-accuracy.R b/R/computing-accuracy.R index 02574ec..75f3675 100644 --- a/R/computing-accuracy.R +++ b/R/computing-accuracy.R @@ -274,8 +274,9 @@ compute_mae_and_isp <- function(file1, file2, metadata_file = NULL){ eval1 <- readr::read_csv(file1, show_col_types = FALSE) eval2 <- readr::read_csv(file2, show_col_types = FALSE) weights <- if (!is.null(metadata_file)) scores_from_metadata(metadata_file) else NULL - list(MAE = rubric_mae(eval1, eval2, weights = weights), - ISP = isp(eval1, eval2)) + list(MAE = rubric_mae(eval1, eval2), + wMAE = if (!is.null(weights)) rubric_mae(eval1, eval2, weights = weights) else NA_real_, + ISP = isp(eval1, eval2)) } diff --git a/R/generate-results-table.R b/R/generate-results-table.R index 855eeef..ca8fc69 100644 --- a/R/generate-results-table.R +++ b/R/generate-results-table.R @@ -42,6 +42,10 @@ generate_gt_results_table <- function(results_table){ label = "MAE", columns = starts_with("MAE_") ) |> + gt::tab_spanner( + label = "wMAE", + columns = starts_with("wMAE_") + ) |> gt::tab_spanner( label = "ISP", columns = starts_with("ISP_") @@ -50,6 +54,10 @@ generate_gt_results_table <- function(results_table){ tidyr::starts_with("MAE_"), ~ sub("^MAE_", "", .x) ) |> + gt::cols_label_with( + tidyr::starts_with("wMAE_"), + ~ sub("^wMAE_", "", .x) + ) |> gt::cols_label_with( tidyr::starts_with("ISP_"), ~ sub("^ISP_", "", .x) @@ -75,8 +83,11 @@ generate_results_row <- function(dir) { name = sub("-.*$", "", files) ) - mae_vals <- list() - isp_vals <- list() + metadata_file <- paste0(dir, "/metadata.json") + + mae_vals <- list() + wmae_vals <- list() + isp_vals <- list() for (status in unique(df$status)) { group <- df[df$status == status, ] @@ -94,10 +105,12 @@ generate_results_row <- function(dir) { pair_name <- paste0(n1, ".v.", n2) - metrics <- compute_mae_and_isp(paste0(dir, f1), paste0(dir, f2)) + metrics <- compute_mae_and_isp(paste0(dir, f1), paste0(dir, f2), + metadata_file = metadata_file) - mae_vals[[pair_name]] <- metrics$MAE - isp_vals[[pair_name]] <- metrics$ISP + mae_vals[[pair_name]] <- metrics$MAE + wmae_vals[[pair_name]] <- metrics$wMAE + isp_vals[[pair_name]] <- metrics$ISP } } @@ -116,7 +129,8 @@ generate_results_row <- function(dir) { ) for (nm in names(mae_vals)) { - results_row[[paste0("MAE_", nm)]] <- mae_vals[[nm]] + results_row[[paste0("MAE_", nm)]] <- mae_vals[[nm]] + results_row[[paste0("wMAE_", nm)]] <- wmae_vals[[nm]] } for (nm in names(isp_vals)) { diff --git a/tests/testthat/test-compute-mae-and-isp.R b/tests/testthat/test-compute-mae-and-isp.R new file mode 100644 index 0000000..8df97e5 --- /dev/null +++ b/tests/testthat/test-compute-mae-and-isp.R @@ -0,0 +1,93 @@ +# Shared fixtures ----------------------------------------------------------- + +experts_csv <- system.file("extdata", "experts-calibrated.csv", + package = "GradingAccuracy") +students_csv <- system.file("extdata", "students-calibrated.csv", + package = "GradingAccuracy") +metadata_json <- system.file("extdata", "metadata.json", + package = "GradingAccuracy") + +# compute_mae_and_isp - without metadata ------------------------------------ + +test_that("compute_mae_and_isp - returns MAE, wMAE (NA), and ISP without metadata", { + result <- compute_mae_and_isp(experts_csv, students_csv) + + expect_named(result, c("MAE", "wMAE", "ISP")) + expect_true(is.numeric(result$MAE)) + expect_true(is.na(result$wMAE)) + expect_true(is.numeric(result$ISP)) +}) + +test_that("compute_mae_and_isp - MAE is non-negative", { + result <- compute_mae_and_isp(experts_csv, students_csv) + expect_gte(result$MAE, 0) +}) + +test_that("compute_mae_and_isp - ISP is between 0 and 1", { + result <- compute_mae_and_isp(experts_csv, students_csv) + expect_gte(result$ISP, 0) + expect_lte(result$ISP, 1) +}) + +# compute_mae_and_isp - with metadata --------------------------------------- + +test_that("compute_mae_and_isp - returns numeric wMAE when metadata supplied", { + result <- compute_mae_and_isp(experts_csv, students_csv, + metadata_file = metadata_json) + + expect_named(result, c("MAE", "wMAE", "ISP")) + expect_true(is.numeric(result$wMAE)) + expect_false(is.na(result$wMAE)) +}) + +test_that("compute_mae_and_isp - wMAE is non-negative", { + result <- compute_mae_and_isp(experts_csv, students_csv, + metadata_file = metadata_json) + expect_gte(result$wMAE, 0) +}) + +test_that("compute_mae_and_isp - MAE unaffected by presence of metadata", { + result_no_meta <- compute_mae_and_isp(experts_csv, students_csv) + result_with_meta <- compute_mae_and_isp(experts_csv, students_csv, + metadata_file = metadata_json) + + expect_equal(result_no_meta$MAE, result_with_meta$MAE) + expect_equal(result_no_meta$ISP, result_with_meta$ISP) +}) + +test_that("compute_mae_and_isp - wMAE equals MAE when all weights are equal", { + # Write a temp metadata with equal weights matching the 4-item rubric + equal_weight_meta <- tempfile(fileext = ".json") + jsonlite::write_json( + list( + rubric = list( + calibrated = list(scores = list(1, 1, 1, 1)) + ) + ), + equal_weight_meta, auto_unbox = TRUE + ) + result <- compute_mae_and_isp(experts_csv, students_csv, + metadata_file = equal_weight_meta) + expect_equal(result$wMAE, result$MAE) +}) + +test_that("compute_mae_and_isp - wMAE correct for known example", { + # experts vs students differ only on R3 for SID 1002 (weight 0.5) + # All other rows match. Expected wMAE = 0.5 / 5 = 0.1 + weights <- c(1.0, 0.5, 0.5, 0.0) + result <- compute_mae_and_isp(experts_csv, students_csv, + metadata_file = metadata_json) + expected_wmae <- (0 + 0.5 + 0 + 0 + 0) / 5 + expect_equal(result$wMAE, expected_wmae) +}) + +# compute_mae_and_isp - identical files ------------------------------------- + +test_that("compute_mae_and_isp - identical files give MAE=0, wMAE=0, ISP=1", { + result <- compute_mae_and_isp(experts_csv, experts_csv, + metadata_file = metadata_json) + + expect_equal(result$MAE, 0) + expect_equal(result$wMAE, 0) + expect_equal(result$ISP, 1) +})