grading-accuracy-study · andrewpbray · May 26, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/NAMESPACE b/NAMESPACE
@@ -16,6 +16,7 @@ export(isp)
 export(normalize_full_credit)
 export(read_evals)
 export(rubric_mae)
+export(scores_from_metadata)
 export(update_scores)
 export(update_scores_in_metadata)
 export(validate_metadata_json)

diff --git a/R/computing-accuracy.R b/R/computing-accuracy.R
@@ -261,20 +261,70 @@ find_differences_wrt_students <- function(experts_file, student_file,
 #'
 #' @param file1 file path for first grades csv
 #' @param file2 file path for second grades csv
+#' @param metadata_file optional path to a metadata JSON file. When supplied,
+#'   rubric item point values are extracted and passed as weights to
+#'   `rubric_mae()`, producing a point-weighted MAE.
 #'
 #' @return a list
 #'
 #' @importFrom readr read_csv
 #'
 #' @export
-compute_mae_and_isp <- function(file1, file2){
+compute_mae_and_isp <- function(file1, file2, metadata_file = NULL){
   eval1 <- readr::read_csv(file1, show_col_types = FALSE)
   eval2 <- readr::read_csv(file2, show_col_types = FALSE)
-  list(MAE = rubric_mae(eval1, eval2),
+  weights <- if (!is.null(metadata_file)) scores_from_metadata(metadata_file) else NULL
+  list(MAE = rubric_mae(eval1, eval2, weights = weights),
        ISP = isp(eval1, eval2))
 }
 
 
+#' Extract Rubric Item Scores from a Metadata JSON File
+#'
+#' Reads a metadata JSON file and returns the ordered numeric vector of rubric
+#' item point values. This vector can be passed as the `weights` argument to
+#' `rubric_mae()` to compute a point-weighted MAE.
+#'
+#' @param metadata_file path to a metadata JSON file
+#' @param calibrated logical; if `TRUE` (default) extract scores from
+#'   `rubric$calibrated$scores`, otherwise from `rubric$uncalibrated$scores`
+#'
+#' @return a numeric vector of point values, one per rubric item
+#'
+#' @importFrom jsonlite read_json
+#' @export
+scores_from_metadata <- function(metadata_file, calibrated = TRUE) {
+  meta <- jsonlite::read_json(metadata_file)
+  rubric_type <- if (calibrated) "calibrated" else "uncalibrated"
+  scores <- meta$rubric[[rubric_type]]$scores
+  if (is.null(scores)) {
+    stop(paste0("No scores found in metadata for rubric type '", rubric_type, "'"))
+  }
+  scores_unlisted <- unlist(scores, use.names = FALSE)
+  numeric_scores <- suppressWarnings(as.numeric(scores_unlisted))
+
+  if (any(is.na(numeric_scores) & !is.na(scores_unlisted))) {
+    stop(
+      paste0(
+        "Scores in metadata for rubric type '", rubric_type,
+        "' must be numeric and contain no non-numeric values"
+      )
+    )
+  }
+
+  if (any(is.na(numeric_scores))) {
+    stop(
+      paste0(
+        "Scores in metadata for rubric type '", rubric_type,
+        "' must not contain NA values"
+      )
+    )
+  }
+
+  numeric_scores
+}
+
+
 #' Calculate Proportion of Identical Scores
 #'
 #' This function calculates the proportion of identical scores
@@ -324,21 +374,36 @@ isp <- function(eval1, eval2){
 #' It's recommended to `normalize_full_credit()` for `eval1` and `eval2` prior
 #' to using this function.
 #'
+#' When `weights` is supplied, each rubric item's disagreement is scaled by its
+#' point value before summing, so a mismatch on a 1-point item contributes more
+#' than a mismatch on a 0.5-point item. Use `scores_from_metadata()` to extract
+#' the weights vector from a metadata JSON file.
+#'
 #' @param eval1 first dataframe of Gradescope evaluations
 #' @param eval2 second dataframe of Gradescope evaluations
+#' @param weights optional numeric vector of point values, one per rubric item
+#'   (in the same order as the R1, R2, ... columns). When `NULL` (default),
+#'   all items are treated as equally weighted.
 #'
 #' @return double for mean absolute error
 #'
 #' @export
-rubric_mae <- function(eval1, eval2){
+rubric_mae <- function(eval1, eval2, weights = NULL){
+  if (!is.null(weights)) {
+    rubric_cols <- grep("^R[0-9]+$", names(eval1), value = TRUE)
+    if (length(weights) != length(rubric_cols)) {
+      stop(paste0("Length of weights (", length(weights), ") must match ",
+                  "number of rubric columns (", length(rubric_cols), ")"))
+    }
+  }
   # find differences in rubric toggles
-  error_per_student <- find_differences(eval1, eval2)$error_per_student
+  error_per_student <- find_differences(eval1, eval2, weights = weights)$error_per_student
   # mean absolute error calculation
   mean(error_per_student)
 }
 
 
-find_differences <- function(eval1, eval2){
+find_differences <- function(eval1, eval2, weights = NULL){
   if (!("SID" %in% colnames(eval1)) || !("SID" %in% colnames(eval2))){
     stop("Missing SID")
   }
@@ -369,7 +434,12 @@ find_differences <- function(eval1, eval2){
   # elementwise matrix comparison
   check_equal <- rubric1 != rubric2
 
-  error_per_student <- rowSums(check_equal)
+  if (!is.null(weights)) {
+    error_per_student <- as.vector(check_equal %*% weights)
+    names(error_per_student) <- students
+  } else {
+    error_per_student <- rowSums(check_equal)
+  }
 
   return (list(error_per_student = error_per_student,
               rubric1 = rubric1,

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -31,3 +31,4 @@ reference:
   - compute_mae_and_isp
   - isp
   - rubric_mae
+  - scores_from_metadata
diff --git a/inst/extdata/metadata-calibrated.json b/inst/extdata/metadata-calibrated.json
@@ -15,7 +15,9 @@
     "scoring_type": "positive",
     "is_proctored": true,
     "n_submissions": 0,
-    "mean_score": 0.0
+    "mean_score": 0.0,
+    "total_points": 2,
+    "rubric_type": "positive disjoint"
   },
   "rubric": {
     "calibrated": {

diff --git a/inst/extdata/metadata-uncalibrated.json b/inst/extdata/metadata-uncalibrated.json
@@ -15,7 +15,9 @@
     "scoring_type": "positive",
     "is_proctored": true,
     "n_submissions": 0,
-    "mean_score": 0.0
+    "mean_score": 0.0,
+    "total_points": 2,
+    "rubric_type": "positive disjoint"
   },
   "rubric": {
     "calibrated": {

diff --git a/inst/extdata/metadata.json b/inst/extdata/metadata.json
@@ -15,7 +15,9 @@
     "scoring_type": "positive",
     "is_proctored": true,
     "n_submissions": 5,
-    "mean_score": 0.6
+    "mean_score": 0.6,
+    "total_points": 2,
+    "rubric_type": "positive disjoint"
   },
   "rubric": {
     "calibrated": {

diff --git a/man/compute_mae_and_isp.Rd b/man/compute_mae_and_isp.Rd
diff --git a/man/rubric_mae.Rd b/man/rubric_mae.Rd
diff --git a/man/scores_from_metadata.Rd b/man/scores_from_metadata.Rd
diff --git a/tests/testthat/test-computing-accuracy.R b/tests/testthat/test-computing-accuracy.R
@@ -271,6 +271,55 @@ test_that("normalize_full_credit - missing rubric items, row names", {
                                         rubric_items = c("R2", "R4")))
 })
 
+test_that("rubric_mae - weighted, basic", {
+  eval1 <- data.frame(
+    SID = c(1111, 2222, 3333),
+    R1 = c(T, T, F),
+    R2 = c(T, F, T)
+  )
+  eval2 <- data.frame(
+    SID = c(1111, 2222, 3333),
+    R1 = c(T, F, F),  # 2222 differs on R1 (weight 1.0)
+    R2 = c(T, F, F)   # 3333 differs on R2 (weight 0.5)
+  )
+  # 1111: no diff -> 0; 2222: R1 diff -> 1.0; 3333: R2 diff -> 0.5
+  # MAE = (0 + 1.0 + 0.5) / 3
+  actual_mae <- rubric_mae(eval1, eval2, weights = c(1.0, 0.5))
+  expect_equal(actual_mae, (0 + 1.0 + 0.5) / 3)
+})
+
+test_that("rubric_mae - weighted equals unweighted when all weights are 1", {
+  eval1 <- data.frame(
+    SID = c(1111, 2222, 3333, 4444, 5555),
+    R1 = c(T, T, T, F, F),
+    R2 = c(T, F, T, F, T)
+  )
+  eval2 <- data.frame(
+    SID = c(1111, 2222, 3333, 4444, 5555),
+    R1 = c(T, T, F, F, F),
+    R2 = c(T, T, T, T, T)
+  )
+  expect_equal(rubric_mae(eval1, eval2, weights = c(1, 1)),
+               rubric_mae(eval1, eval2))
+})
+
+test_that("rubric_mae - wrong weights length errors", {
+  eval1 <- data.frame(SID = 1111, R1 = TRUE, R2 = FALSE)
+  eval2 <- data.frame(SID = 1111, R1 = TRUE, R2 = TRUE)
+  expect_error(rubric_mae(eval1, eval2, weights = c(1.0, 0.5, 0.5)))
+})
+
+test_that("scores_from_metadata - calibrated", {
+  path <- system.file("extdata", "metadata-calibrated.json", package = "GradingAccuracy")
+  scores <- scores_from_metadata(path)
+  expect_equal(scores, c(1.0, 0.5, 0.5, 0.0))
+})
+
+test_that("scores_from_metadata - uncalibrated returns error for null", {
+  path <- system.file("extdata", "metadata-calibrated.json", package = "GradingAccuracy")
+  expect_error(scores_from_metadata(path, calibrated = FALSE))
+})
+
 test_that("normalize_full_credit - missing rubric items, indices", {
   eval_before <- data.frame(
     SID = c(1111, 3333, 2222, 4444, 5555),

diff --git a/vignettes/creating-metadata-json.Rmd b/vignettes/creating-metadata-json.Rmd
@@ -41,6 +41,8 @@ The `course_info` object must have the following keys (and their corresponding v
 - `is_proctored` : whether the assignment was taken in a proctored environment or if it was take-home
 - `n_submissions`: the number of student submissions, which can be programmatically updated with `update_scores_in_metadata()`
 - `mean_score`: the mean score of experts using, which can be programmatically updated with `update_scores_in_metadata()` (which uses `mean(experts$Score/max(experts$Score))`)
+- `total_points`: the total number of points the question is worth
+- `rubric_type`: the type of rubric (e.g. `"positive disjoint"`)
 
 
 The `rubric` object must have a `calibrated` rubric object and optionally an `uncalibrated` object that follows the same structure. A rubric object (either `calibrated` or `uncalibrated`) must have the following structure: