From c979c11c10a1c60c8081e9f994f897862974de13 Mon Sep 17 00:00:00 2001 From: Trevor Riley Date: Mon, 1 Jun 2026 10:05:19 -0400 Subject: [PATCH 1/7] feat: deferred manual deduplication (Goal 1) + Shiny wiring Enable an auto-dedup-now, review-later workflow so users can pause after automatic deduplication, export, and complete manual review on re-import. Package: - export_dedup_candidates() / reimport_dedup_candidates(): persist and restore the $manual_dedup candidate pairs across an export/reimport boundary (IDs kept as character for re-merging). - export_csv() gains manual_dedup_complete flag (written as a column on full exports; read back by reimport_csv()) as a UX guard. - reimport_csv() now reads all columns as character, matching the canonical all-character types from dedup_citations(). Required so a reimported set can re-enter dedup_citations_add_manual() without column-type clashes (read.csv otherwise infers integer ids/years). - Tests in test-reimport.R cover the round-trip and merge. Shiny app: - file_reimport observer now handles multiple files and routes by content (candidate-pairs CSV vs deduplicated citation set vs RIS), fixing a latent length>1 condition error on multi-file selection. - Restoring candidate pairs repopulates the Manual deduplication tab on a reimported set; the result column is dropped so merges follow the user's row selection. - Export tab: Candidate Pairs (CSV) download; CSV export sets the manual_dedup_complete flag based on whether pairs remain pending. 
--- NAMESPACE | 2 + NEWS.md | 11 ++++ R/export.R | 60 ++++++++++++++++++- R/reimport.R | 54 ++++++++++++++++- inst/shiny-app/CiteSource/app.R | 99 ++++++++++++++++++++++++++------ man/export_csv.Rd | 10 +++- man/export_dedup_candidates.Rd | 38 ++++++++++++ man/reimport_dedup_candidates.Rd | 39 +++++++++++++ tests/testthat/test-reimport.R | 74 ++++++++++++++++++++++++ 9 files changed, 367 insertions(+), 20 deletions(-) create mode 100644 man/export_dedup_candidates.Rd create mode 100644 man/reimport_dedup_candidates.Rd create mode 100644 tests/testthat/test-reimport.R diff --git a/NAMESPACE b/NAMESPACE index 060ad70c..138bd851 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ export(dedup_citations_add_manual) export(dedup_log) export(export_bib) export(export_csv) +export(export_dedup_candidates) export(export_ris) export(plot_contributions) export(plot_source_overlap_heatmap) @@ -25,6 +26,7 @@ export(read_citations) export(record_counts) export(record_level_table) export(reimport_csv) +export(reimport_dedup_candidates) export(reimport_ris) export(runShiny) export(run_shiny) diff --git a/NEWS.md b/NEWS.md index 04029592..76b8a86f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,17 @@ ## New features +- Deferred manual deduplication: run automatic dedup now and complete manual + review later. `export_dedup_candidates()` / `reimport_dedup_candidates()` + persist and restore the `$manual_dedup` candidate pairs, and `export_csv()` + gains a `manual_dedup_complete` flag (written as a column, read back by + `reimport_csv()`) so downstream steps know whether review is still pending. + Re-import, mark `result == "match"`, and merge with + `dedup_citations_add_manual()`. +- `reimport_csv()` now reads all columns as character, matching the canonical + (all-character) types produced by `dedup_citations()`. This is required so a + reimported set can re-enter `dedup_citations_add_manual()` (and future + re-deduplication) without column-type clashes. 
- `read_citations()` now warns when `cite_label` values are outside the standard vocabulary (`search`, `screened`, `final`), since phase-analysis functions depend on those exact strings. diff --git a/R/export.R b/R/export.R index 6be27c27..262816f5 100644 --- a/R/export.R +++ b/R/export.R @@ -15,6 +15,12 @@ #' @param trim_abstracts Some databases may return full-text that is misidentified as an abstract. This inflates file size and may lead to issues with Excel, #' which cannot deal with more than 32,000 characters per field. Therefore, the default is to trim very long abstracts to 32,000 characters. Set a lower number to reduce file size, or #' NULL to retain abstracts as they are. +#' @param manual_dedup_complete Logical. Records, in a `manual_dedup_complete` +#' column, whether manual deduplication has been completed for this set +#' (default `FALSE`). Set `TRUE` after confirming manual pairs with +#' [dedup_citations_add_manual()]. This flag is read back by [reimport_csv()] +#' and lets later steps know whether candidate pairs still need review. Only +#' written when `fields = "full"`. #' @return No return value, called for side effects. Saves the deduplicated citations as a 'CSV' file to the specified location. #' @export #' @examples @@ -28,7 +34,7 @@ #' export_csv(dedup_results, tempfile(fileext = ".csv"), fields = "standard") #' } -export_csv <- function(unique_citations, filename, fields = "full", separate = NULL, trim_abstracts = 32000) { +export_csv <- function(unique_citations, filename, fields = "full", separate = NULL, trim_abstracts = 32000, manual_dedup_complete = FALSE) { # Warn if the filename doesn't end with .csv if (tolower(tools::file_ext(filename)) != "csv") { warning("Function saves a CSV file, so filename should (usually) end in .csv. 
For now, name is used as provided.") @@ -78,9 +84,61 @@ export_csv <- function(unique_citations, filename, fields = "full", separate = N dplyr::select(-tidyselect::all_of(separate)) |> dplyr::bind_cols(separated) } + + # Record manual-dedup status on reimportable (full) exports only, so that + # later steps (and the Shiny app) know whether manual review is still pending. + if (identical(fields, "full")) { + unique_citations$manual_dedup_complete <- isTRUE(manual_dedup_complete) + } + utils::write.csv(unique_citations, filename, row.names = FALSE) } +#' Export manual-review candidate pairs to a CSV file +#' +#' Saves the candidate duplicate pairs returned as the `$manual_dedup` element +#' of `dedup_citations(manual = TRUE)` so that manual review can be completed +#' later. Combine with [export_csv()] to defer manual deduplication: export the +#' automatically deduplicated unique citations *and* these candidate pairs now, +#' then re-import both later with [reimport_csv()] and +#' [reimport_dedup_candidates()] to finish the review. Note that *existing files +#' are overwritten without warning.* +#' +#' @param manual_dedup Data frame of candidate pairs, i.e. the `$manual_dedup` +#' element of `dedup_citations(manual = TRUE)`. +#' @param filename Name (and path) of file, should end in .csv +#' @return No return value, called for side effects. Saves the candidate pairs +#' as a 'CSV' file to the specified location. 
+#' @export +#' @seealso [reimport_dedup_candidates()], [dedup_citations_add_manual()] +#' @examples +#' if (interactive()) { +#' examplecitations_path <- system.file("extdata", "examplecitations.rds", package = "CiteSource") +#' examplecitations <- readRDS(examplecitations_path) +#' dedup_results <- dedup_citations(examplecitations, manual = TRUE) +#' export_dedup_candidates(dedup_results$manual_dedup, tempfile(fileext = ".csv")) +#' } +export_dedup_candidates <- function(manual_dedup, filename) { + if (tolower(tools::file_ext(filename)) != "csv") { + warning("Function saves a CSV file, so filename should (usually) end in .csv. For now, name is used as provided.") + } + + if (!all(c("duplicate_id.x", "duplicate_id.y") %in% names(manual_dedup))) { + stop( + "manual_dedup must contain duplicate_id.x and duplicate_id.y columns. ", + "Pass the $manual_dedup element of dedup_citations(manual = TRUE)." + ) + } + + # Seed an empty result column to prompt reviewers to mark confirmed duplicates + # (dedup_citations_add_manual() merges only rows where result == "match"). + if (!"result" %in% names(manual_dedup)) { + manual_dedup$result <- "" + } + + utils::write.csv(manual_dedup, filename, row.names = FALSE) +} + #' Export data frame to RIS file #' #' This function saves a data frame as a RIS file with specified columns mapped to RIS fields. Note that diff --git a/R/reimport.R b/R/reimport.R index e4df627f..79fbe3a3 100644 --- a/R/reimport.R +++ b/R/reimport.R @@ -17,8 +17,11 @@ reimport_csv <- function(filename) { # Warn if the filename doesn't end with .csv if (tolower(tools::file_ext(filename)) != "csv") warning("Function reads a CSV file, so filename should (usually) end in .csv. For now, name is used as provided.") - # Read the CSV file - unique_citations_imported <- utils::read.csv(filename, stringsAsFactors = FALSE) + # Read the CSV file. All columns are read as character so the reimported data + # matches the (all-character) types produced by dedup_citations(). 
This keeps + # the reimport faithful and is required for re-deduplication / adding manual + # pairs via dedup_citations_add_manual(), which fails on mixed column types. + unique_citations_imported <- utils::read.csv(filename, stringsAsFactors = FALSE, colClasses = "character") # Check if the required columns are present if (!all(c("cite_source", "cite_label", "cite_string", "duplicate_id", "record_ids") %in% names(unique_citations_imported))) { @@ -29,9 +32,56 @@ reimport_csv <- function(filename) { ) } + # The manual_dedup_complete flag (written by export_csv) is left as character + # ("TRUE"/"FALSE") like every other column, so the reimported set can be passed + # through dedup_citations_add_manual() / re-deduplication without column-type + # clashes. Test it with e.g. `df$manual_dedup_complete[1] == "TRUE"`. + unique_citations_imported } +#' Reimport manual-review candidate pairs exported from CiteSource +#' +#' Reads a CSV of candidate duplicate pairs previously written by +#' [export_dedup_candidates()] (i.e. the `$manual_dedup` element of +#' `dedup_citations(manual = TRUE)`). This supports a deferred workflow: run +#' automatic deduplication now, export both the unique citations and the +#' candidate pairs, and complete the manual review later after re-importing. +#' +#' After review, set the `result` column to `"match"` for confirmed duplicates +#' and pass the result, together with the reimported unique citations, to +#' [dedup_citations_add_manual()]. +#' +#' @param filename Name (and path) of the candidate-pairs CSV, should end in .csv +#' @return A data frame of candidate pairs with `duplicate_id.x` / `duplicate_id.y` +#' read as character (matching the unique citations from [reimport_csv()]), ready +#' for review and [dedup_citations_add_manual()]. 
+#' @export +#' @seealso [export_dedup_candidates()], [dedup_citations_add_manual()] +#' @examples +#' if (interactive()) { +#' candidates <- reimport_dedup_candidates("path/to/candidates.csv") +#' # mark confirmed duplicates, then merge into the reimported unique set +#' candidates$result <- ifelse(candidates$result == "match", "match", "no_match") +#' final <- dedup_citations_add_manual(reimport_csv("unique.csv"), candidates) +#' } +reimport_dedup_candidates <- function(filename) { + if (tolower(tools::file_ext(filename)) != "csv") { + warning("Function reads a CSV file, so filename should (usually) end in .csv. For now, name is used as provided.") + } + + candidates <- utils::read.csv(filename, stringsAsFactors = FALSE, colClasses = "character") + + if (!all(c("duplicate_id.x", "duplicate_id.y") %in% names(candidates))) { + stop( + "Columns duplicate_id.x and duplicate_id.y were not found in ", filename, + ". This function expects a candidate-pairs file written by export_dedup_candidates()." + ) + } + + candidates +} + #' Reimport a RIS-file exported from CiteSource #' #' This function reimports a RIS file that was tagged and deduplicated by CiteSource. 
diff --git a/inst/shiny-app/CiteSource/app.R b/inst/shiny-app/CiteSource/app.R index 20d298fb..a210af59 100644 --- a/inst/shiny-app/CiteSource/app.R +++ b/inst/shiny-app/CiteSource/app.R @@ -358,6 +358,8 @@ ui <- shiny::navbarPage("CiteSource", ), shiny::hr(), shiny::h5("OR: Re-upload an .ris or .csv exported from CiteSource"), + shiny::p("You can also add a candidate-pairs CSV to resume manual deduplication.", + style = "font-size:0.8em;color:#6c757d;margin-top:-6px;"), shiny::fileInput("file_reimport", "", multiple = TRUE, accept = c(".ris", ".csv") @@ -818,6 +820,14 @@ ui <- shiny::navbarPage("CiteSource", "CSV of every merged duplicate pair, flagged as automated or manual.", style="color:#6c757d;font-size:0.82em;margin-bottom:8px;"), shiny::downloadButton("downloadDedupLog", "Dedup Log (CSV)", + style="margin-bottom:4px;"), + shiny::tags$hr(style="margin:14px 0 10px 0;"), + shiny::tags$strong("Manual review candidates", + style="font-size:0.88em;display:block;margin-bottom:4px;"), + shiny::p( + "CSV of unresolved candidate pairs. 
Re-upload it together with the citations CSV to finish manual deduplication later.", + style="color:#6c757d;font-size:0.82em;margin-bottom:8px;"), + shiny::downloadButton("downloadCandidates", "Candidate Pairs (CSV)", style="margin-bottom:4px;") ) ), @@ -1194,22 +1204,59 @@ server <- function(input, output, session) { }) shiny::observeEvent(input$file_reimport, { - file_extension <- tolower(tools::file_ext(input$file_reimport$datapath)) - - if (file_extension == "csv") { - rv$latest_unique <- reimport_csv(input$file_reimport$datapath) - } else if (file_extension == "ris") { - rv$latest_unique <- reimport_ris(input$file_reimport$datapath) - } else { - warning("Invalid file extension, needs to be .ris or .csv") + files <- input$file_reimport # data frame: one row per file (multiple = TRUE) + + n_unique_imported <- 0L + n_candidates_imported <- 0L + errors <- character(0) + + for (i in seq_len(nrow(files))) { + path <- files$datapath[i] + nm <- files$name[i] + ext <- tolower(tools::file_ext(nm)) + + tryCatch({ + if (ext == "ris") { + rv$latest_unique <- reimport_ris(path) + n_unique_imported <- nrow(rv$latest_unique) + } else if (ext == "csv") { + # Route by content: a candidate-pairs file has duplicate_id.x / .y; + # a deduplicated citation set has the cite_* / duplicate_id columns. + hdr <- names(utils::read.csv(path, nrows = 1, stringsAsFactors = FALSE)) + if (all(c("duplicate_id.x", "duplicate_id.y") %in% hdr)) { + cand <- reimport_dedup_candidates(path) + # Drop the R-workflow `result` column: in the app the merge decision + # is the user's row selection, not result == "match". 
+ cand$result <- NULL + rv$pairs_to_check <- cand + n_candidates_imported <- nrow(cand) + } else { + rv$latest_unique <- reimport_csv(path) + n_unique_imported <- nrow(rv$latest_unique) + } + } else { + errors <- c(errors, paste0(nm, " (unsupported type)")) + } + }, error = function(e) { + errors <<- c(errors, paste0(nm, ": ", conditionMessage(e))) + }) + } + + if (n_unique_imported > 0) rv$n_unique <- count_unique(rv$latest_unique) + + if (length(errors) > 0) { + show_toastr("Some files could not be re-imported", + paste(errors, collapse = "; "), type = "error") + } + if (n_unique_imported > 0 || n_candidates_imported > 0) { + msg <- character(0) + if (n_unique_imported > 0) + msg <- c(msg, paste("Imported", n_unique_imported, "deduplicated citations.")) + if (n_candidates_imported > 0) + msg <- c(msg, paste("Restored", n_candidates_imported, + "candidate pair(s) — finish review on the Deduplicate tab.")) + show_toastr("Re-import successful", paste(msg, collapse = " "), type = "success") } - - rv$n_unique <- count_unique(rv$latest_unique) - - show_toastr("Re-import successful", - paste("Imported", nrow(rv$latest_unique), "citations. You can now proceed to visualisation and tables."), - type = "success") - }) ## Update filters @@ -2937,7 +2984,11 @@ server <- function(input, output, session) { cols <- input$csv_custom_cols if (is.null(cols) || length(cols) == 0) "full" else cols } - export_csv(rv$latest_unique, file, fields = fields) + # Flag the set as manually reviewed when no candidate pairs remain + # pending (UX guard, read back by reimport_csv()). Only written on full + # exports by export_csv(). 
+ export_csv(rv$latest_unique, file, fields = fields, + manual_dedup_complete = (nrow(rv$pairs_to_check) == 0)) } else { stop("No data to download!") shiny::req(FALSE) @@ -2987,6 +3038,22 @@ server <- function(input, output, session) { } ) + # ---- Manual-review candidate pairs: export to resume review later ---- + output$downloadCandidates <- shiny::downloadHandler( + filename = function() paste0("candidate-pairs-", Sys.Date(), ".csv"), + content = function(file) { + pairs <- rv$pairs_to_check + if (!is.data.frame(pairs) || nrow(pairs) == 0) { + utils::write.csv( + data.frame(note = "No unresolved candidate pairs."), + file, row.names = FALSE + ) + } else { + export_dedup_candidates(pairs, file) + } + } + ) + # Export hub — plot downloads (mirror the Visualise tab download handlers) output$export_heatplot <- shiny::downloadHandler( filename = function() { diff --git a/man/export_csv.Rd b/man/export_csv.Rd index 93fbf04e..a6aebb9e 100644 --- a/man/export_csv.Rd +++ b/man/export_csv.Rd @@ -9,7 +9,8 @@ export_csv( filename, fields = "full", separate = NULL, - trim_abstracts = 32000 + trim_abstracts = 32000, + manual_dedup_complete = FALSE ) } \arguments{ @@ -28,6 +29,13 @@ selection. Note that exports other than \code{"full"} cannot be reimported into \item{trim_abstracts}{Some databases may return full-text that is misidentified as an abstract. This inflates file size and may lead to issues with Excel, which cannot deal with more than 32,000 characters per field. Therefore, the default is to trim very long abstracts to 32,000 characters. Set a lower number to reduce file size, or NULL to retain abstracts as they are.} + +\item{manual_dedup_complete}{Logical. Records, in a \code{manual_dedup_complete} +column, whether manual deduplication has been completed for this set +(default \code{FALSE}). Set \code{TRUE} after confirming manual pairs with +\code{\link[=dedup_citations_add_manual]{dedup_citations_add_manual()}}. 
This flag is read back by \code{\link[=reimport_csv]{reimport_csv()}} +and lets later steps know whether candidate pairs still need review. Only +written when \code{fields = "full"}.} } \value{ No return value, called for side effects. Saves the deduplicated citations as a 'CSV' file to the specified location. diff --git a/man/export_dedup_candidates.Rd b/man/export_dedup_candidates.Rd new file mode 100644 index 00000000..253e7d2b --- /dev/null +++ b/man/export_dedup_candidates.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/export.R +\name{export_dedup_candidates} +\alias{export_dedup_candidates} +\title{Export manual-review candidate pairs to a CSV file} +\usage{ +export_dedup_candidates(manual_dedup, filename) +} +\arguments{ +\item{manual_dedup}{Data frame of candidate pairs, i.e. the \verb{$manual_dedup} +element of \code{dedup_citations(manual = TRUE)}.} + +\item{filename}{Name (and path) of file, should end in .csv} +} +\value{ +No return value, called for side effects. Saves the candidate pairs +as a 'CSV' file to the specified location. +} +\description{ +Saves the candidate duplicate pairs returned as the \verb{$manual_dedup} element +of \code{dedup_citations(manual = TRUE)} so that manual review can be completed +later. Combine with \code{\link[=export_csv]{export_csv()}} to defer manual deduplication: export the +automatically deduplicated unique citations \emph{and} these candidate pairs now, +then re-import both later with \code{\link[=reimport_csv]{reimport_csv()}} and +\code{\link[=reimport_dedup_candidates]{reimport_dedup_candidates()}} to finish the review. 
Note that \emph{existing files +are overwritten without warning.} +} +\examples{ +if (interactive()) { + examplecitations_path <- system.file("extdata", "examplecitations.rds", package = "CiteSource") + examplecitations <- readRDS(examplecitations_path) + dedup_results <- dedup_citations(examplecitations, manual = TRUE) + export_dedup_candidates(dedup_results$manual_dedup, tempfile(fileext = ".csv")) +} +} +\seealso{ +\code{\link[=reimport_dedup_candidates]{reimport_dedup_candidates()}}, \code{\link[=dedup_citations_add_manual]{dedup_citations_add_manual()}} +} diff --git a/man/reimport_dedup_candidates.Rd b/man/reimport_dedup_candidates.Rd new file mode 100644 index 00000000..03c5aa13 --- /dev/null +++ b/man/reimport_dedup_candidates.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/reimport.R +\name{reimport_dedup_candidates} +\alias{reimport_dedup_candidates} +\title{Reimport manual-review candidate pairs exported from CiteSource} +\usage{ +reimport_dedup_candidates(filename) +} +\arguments{ +\item{filename}{Name (and path) of the candidate-pairs CSV, should end in .csv} +} +\value{ +A data frame of candidate pairs with \code{duplicate_id.x} / \code{duplicate_id.y} +read as character (matching the unique citations from \code{\link[=reimport_csv]{reimport_csv()}}), ready +for review and \code{\link[=dedup_citations_add_manual]{dedup_citations_add_manual()}}. +} +\description{ +Reads a CSV of candidate duplicate pairs previously written by +\code{\link[=export_dedup_candidates]{export_dedup_candidates()}} (i.e. the \verb{$manual_dedup} element of +\code{dedup_citations(manual = TRUE)}). This supports a deferred workflow: run +automatic deduplication now, export both the unique citations and the +candidate pairs, and complete the manual review later after re-importing. 
+} +\details{ +After review, set the \code{result} column to \code{"match"} for confirmed duplicates +and pass the result, together with the reimported unique citations, to +\code{\link[=dedup_citations_add_manual]{dedup_citations_add_manual()}}. +} +\examples{ +if (interactive()) { + candidates <- reimport_dedup_candidates("path/to/candidates.csv") + # mark confirmed duplicates, then merge into the reimported unique set + candidates$result <- ifelse(candidates$result == "match", "match", "no_match") + final <- dedup_citations_add_manual(reimport_csv("unique.csv"), candidates) +} +} +\seealso{ +\code{\link[=export_dedup_candidates]{export_dedup_candidates()}}, \code{\link[=dedup_citations_add_manual]{dedup_citations_add_manual()}} +} diff --git a/tests/testthat/test-reimport.R b/tests/testthat/test-reimport.R new file mode 100644 index 00000000..742c6ec6 --- /dev/null +++ b/tests/testthat/test-reimport.R @@ -0,0 +1,74 @@ +# Tests for the deferred manual-deduplication workflow: +# auto-dedup now -> export -> reimport -> complete manual review later. + +example_citations <- function() { + readRDS(system.file("extdata", "examplecitations.rds", package = "CiteSource")) +} + +test_that("reimport_csv round-trips all columns as character", { + auto <- suppressWarnings(suppressMessages(dedup_citations(example_citations()))) + f <- tempfile(fileext = ".csv") + export_csv(auto, f) + re <- reimport_csv(f) + + expect_true(all(c("cite_source", "cite_label", "cite_string", + "duplicate_id", "record_ids") %in% names(re))) + # duplicate_id must stay character (read.csv would otherwise infer integer), + # which is what dedup_citations_add_manual() / re-dedup require. 
+ expect_type(re$duplicate_id, "character") + expect_type(re$year, "character") +}) + +test_that("export_csv writes the manual_dedup_complete flag on full exports", { + auto <- suppressWarnings(suppressMessages(dedup_citations(example_citations()))) + + f1 <- tempfile(fileext = ".csv") + export_csv(auto, f1) # default FALSE + expect_equal(unique(reimport_csv(f1)$manual_dedup_complete), "FALSE") + + f2 <- tempfile(fileext = ".csv") + export_csv(auto, f2, manual_dedup_complete = TRUE) + expect_equal(unique(reimport_csv(f2)$manual_dedup_complete), "TRUE") + + # Not written for non-reimportable (standard) exports + f3 <- tempfile(fileext = ".csv") + suppressWarnings(export_csv(auto, f3, fields = "standard")) + expect_false("manual_dedup_complete" %in% names(utils::read.csv(f3))) +}) + +test_that("candidate pairs round-trip and seed a result column", { + pairs <- data.frame( + duplicate_id.x = c("1001", "1005"), + duplicate_id.y = c("1002", "1009"), + title1 = c("A", "B"), title2 = c("A", "B"), + stringsAsFactors = FALSE + ) + f <- tempfile(fileext = ".csv") + export_dedup_candidates(pairs, f) + back <- reimport_dedup_candidates(f) + + expect_true(all(c("duplicate_id.x", "duplicate_id.y", "result") %in% names(back))) + expect_type(back$duplicate_id.x, "character") + expect_equal(nrow(back), 2) + + expect_error(export_dedup_candidates(data.frame(a = 1), f), "duplicate_id") + bad <- tempfile(fileext = ".csv") + utils::write.csv(data.frame(a = 1), bad, row.names = FALSE) + expect_error(reimport_dedup_candidates(bad), "duplicate_id") +}) + +test_that("manual pairs can be merged into a reimported (auto-deduped) set", { + auto <- suppressWarnings(suppressMessages(dedup_citations(example_citations()))) + f <- tempfile(fileext = ".csv") + export_csv(auto, f) + re <- reimport_csv(f) + + # Force two real records to be a confirmed manual duplicate + ids <- as.character(re$duplicate_id) + pair <- data.frame(duplicate_id.x = ids[1], duplicate_id.y = ids[2], + result = "match", 
stringsAsFactors = FALSE) + + final <- suppressWarnings(suppressMessages(dedup_citations_add_manual(re, pair))) + expect_equal(nrow(final), nrow(re) - 1) + expect_true(all(c("cite_source", "record_ids", "duplicate_id") %in% names(final))) +}) From 06aa754a75114b1fd7db27e002183c3e978d8bbb Mon Sep 17 00:00:00 2001 From: Trevor Riley Date: Mon, 1 Jun 2026 10:38:31 -0400 Subject: [PATCH 2/7] =?UTF-8?q?feat:=20incremental=20deduplication=20?= =?UTF-8?q?=E2=80=94=20add=20sources=20to=20a=20deduped=20set=20(Goal=202)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dedup_citations_add_sources(existing, new_raw) adds new raw citations to a previously deduplicated set and re-deduplicates across both, preserving prior auto/manual merge decisions and the original record_ids provenance. For the same data it produces the same unique set as deduplicating everything from scratch (validated on the gambling-harms vignette data: 163 existing + 431 new -> 278 unique, == from-scratch; 645 underlying record_ids preserved). Implementation reconciles IDs (existing duplicate_id -> record_id; new records get fresh non-colliding ids based on the max underlying id), drops duplicate_id/record_ids so the engine's format_rerun rename can't clash on record_id, re-runs dedup_citations(), then expands record_ids back to the original underlying IDs via a provenance lookup. Works in manual = TRUE mode to surface new candidate pairs. Shiny app: - file_reimport sets rv$existing_dedup_present when a deduplicated set is re-imported. - identify_dups: with an existing re-imported set present, "Find duplicates" merges new uploads in via dedup_citations_add_sources(); otherwise deduplicates the uploads as before. Uploads (and the upload form) are cleared after a merge to prevent adding the same records twice. - Deduplicate tab hint describes the add-sources flow. Tests in test-add-sources.R. 
--- NAMESPACE | 1 + NEWS.md | 7 +++ R/dedup.R | 98 ++++++++++++++++++++++++++++++ inst/shiny-app/CiteSource/app.R | 81 ++++++++++++++++-------- man/dedup_citations_add_sources.Rd | 57 +++++++++++++++++ tests/testthat/test-add-sources.R | 80 ++++++++++++++++++++++++ 6 files changed, 299 insertions(+), 25 deletions(-) create mode 100644 man/dedup_citations_add_sources.Rd create mode 100644 tests/testthat/test-add-sources.R diff --git a/NAMESPACE b/NAMESPACE index 138bd851..c0c773b7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,6 +14,7 @@ export(create_initial_record_table) export(create_precision_sensitivity_table) export(dedup_citations) export(dedup_citations_add_manual) +export(dedup_citations_add_sources) export(dedup_log) export(export_bib) export(export_csv) diff --git a/NEWS.md b/NEWS.md index 76b8a86f..b506ee4d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,13 @@ ## New features +- Incremental deduplication: `dedup_citations_add_sources()` adds new raw + citations to a previously deduplicated set and deduplicates across both, + preserving prior automatic and manual merge decisions and the original + `record_ids` provenance. For the same data it yields the same unique set as + deduplicating everything from scratch. Exposed in the Shiny app — re-upload a + deduplicated set, add new citation files, and "Find duplicates" merges them + in. Works in `manual = TRUE` mode to surface new candidate pairs for review. - Deferred manual deduplication: run automatic dedup now and complete manual review later. `export_dedup_candidates()` / `reimport_dedup_candidates()` persist and restore the `$manual_dedup` candidate pairs, and `export_csv()` diff --git a/R/dedup.R b/R/dedup.R index df8eeb12..46be8768 100644 --- a/R/dedup.R +++ b/R/dedup.R @@ -181,6 +181,104 @@ dedup_citations_add_manual <- function(unique_citations, additional_pairs) { } +#' Add new citations to a previously deduplicated set and re-deduplicate +#' +#' Adds further citations (e.g. 
an additional database search) to a set that was +#' already deduplicated, and deduplicates the new records against both the +#' existing set and each other — without discarding the work already done. Each +#' existing unique record enters as a single row, so prior automatic and manual +#' merge decisions are preserved; the new records are integrated and full +#' provenance (the original `record_ids` behind every merged record) is carried +#' through. +#' +#' This is the incremental counterpart to running [dedup_citations()] on all +#' sources from scratch and, for the same data, produces the same unique set. +#' +#' @export +#' @param existing_citations A previously deduplicated set (from +#' [dedup_citations()], [reimport_csv()] or [reimport_ris()]) — must contain a +#' `duplicate_id` column. +#' @param new_citations New raw citations to add, as returned by +#' [read_citations()] (with `cite_source` / `cite_label` / `cite_string`). +#' @param manual logical. If TRUE, return the full result list including +#' `$manual_dedup` candidate pairs for review (see [dedup_citations()]). +#' Default FALSE. +#' @param show_unknown_tags When a label, source, or other merged field is +#' missing, show it as "unknown"? Default FALSE. +#' @return When `manual = FALSE`: a dataframe of unique citations across both +#' sets. When `manual = TRUE`: a list with `$unique`, `$manual_dedup` and +#' `$auto_pairs` (as in [dedup_citations()]). In both cases `record_ids` +#' retains the original record IDs behind every merged record. 
+#' @seealso [dedup_citations()], [dedup_citations_add_manual()] +#' +#' @examples +#' if (interactive()) { +#' existing <- dedup_citations(read_citations(old_files, cite_sources = old_srcs)) +#' new_raw <- read_citations(new_files, cite_sources = new_srcs) +#' combined <- dedup_citations_add_sources(existing, new_raw) +#' } +dedup_citations_add_sources <- function(existing_citations, new_citations, + manual = FALSE, show_unknown_tags = FALSE) { + + if (!"duplicate_id" %in% names(existing_citations)) { + stop("existing_citations must contain a `duplicate_id` column — pass a set ", + "returned by dedup_citations(), reimport_csv() or reimport_ris().") + } + + # Work in character throughout (the dedup engine's canonical type) so the two + # frames bind without column-type clashes. + ex <- dplyr::mutate(existing_citations, dplyr::across(dplyr::everything(), as.character)) + if (!"record_ids" %in% names(ex)) ex$record_ids <- ex$duplicate_id + + # Provenance lookup: existing duplicate_id -> its underlying original record_ids + prov <- stats::setNames(as.character(ex$record_ids), as.character(ex$duplicate_id)) + + # Each existing unique record enters as one input keyed by its duplicate_id + ex$record_id <- as.character(ex$duplicate_id) + + # New records get fresh ids that cannot collide with any existing id. Base the + # offset on the max of ALL underlying record_ids (duplicate_id is the cluster + # minimum, so a new id keyed off it could otherwise reuse an existing id). 
+ existing_ids <- c(as.character(ex$duplicate_id), + unlist(strsplit(paste(ex$record_ids, collapse = ", "), ",\\s*"))) + existing_ids <- existing_ids[!is.na(existing_ids) & !existing_ids %in% c("", "NA")] + max_id <- suppressWarnings(max(as.numeric(existing_ids), na.rm = TRUE)) + + nw <- dplyr::mutate(new_citations, dplyr::across(dplyr::everything(), as.character)) + nw <- dplyr::select(nw, -dplyr::any_of(c("duplicate_id", "record_ids", "record_id"))) + nw$record_id <- if (is.finite(max_id)) { + as.character(max_id + seq_len(nrow(nw))) + } else { + paste0("new_", seq_len(nrow(nw))) + } + + # Drop the merged-set metadata that would otherwise trigger a record_id clash + # (format_rerun renames duplicate_id -> record_id) or be stale after re-dedup. + ex <- dplyr::select(ex, -dplyr::any_of(c("duplicate_id", "record_ids", "manual_dedup_complete"))) + + combined <- dplyr::bind_rows(ex, nw) + + result <- dedup_citations(combined, manual = manual, show_unknown_tags = show_unknown_tags) + + # Restore original provenance: expand existing-duplicate-id tokens in the + # rebuilt record_ids back to their underlying original record IDs. 
+ unique_out <- if (manual) result$unique else result + unique_out$record_ids <- vapply(unique_out$record_ids, function(rids) { + toks <- trimws(strsplit(rids, ",\\s*")[[1]]) + toks <- ifelse(toks %in% names(prov), prov[toks], toks) + toks <- unlist(strsplit(paste(toks, collapse = ", "), ",\\s*")) + paste(unique(trimws(toks[toks != ""])), collapse = ", ") + }, character(1), USE.NAMES = FALSE) + + if (manual) { + result$unique <- unique_out + result + } else { + unique_out + } +} + + #' Add missing columns to a citations dataframe #' #' @param raw_citations Citation dataframe diff --git a/inst/shiny-app/CiteSource/app.R b/inst/shiny-app/CiteSource/app.R index a210af59..29860613 100644 --- a/inst/shiny-app/CiteSource/app.R +++ b/inst/shiny-app/CiteSource/app.R @@ -382,6 +382,8 @@ ui <- shiny::navbarPage("CiteSource", br(), shiny::h5("Step 3: Deduplicate"), shiny::p("Click the button below to detect and remove duplicates automatically"), + shiny::p("Already re-uploaded a deduplicated set? Add new citation files on the File upload tab, then click Find duplicates to merge them into the existing set.", + style = "font-size:0.82em;color:#6c757d;"), # Action button: identify duplicates in uploaded dataset shinyWidgets::actionBttn( @@ -900,6 +902,7 @@ server <- function(input, output, session) { rv$pairs_to_check <- data.frame()#for potential duplicates/manual dedup rv$pairs_removed <- data.frame()#for removed records rv$auto_pairs <- data.frame()#auto-merged pairs, for the dedup log + rv$existing_dedup_present <- FALSE # TRUE when latest_unique is a reimported deduped set that new uploads should be merged INTO (Goal 2) rv$file_meta <- list() # Per-file metadata (source/label/string); keyed by file.datapath # Card view state rv$selected_pairs_card <- integer(0) @@ -1218,6 +1221,7 @@ server <- function(input, output, session) { tryCatch({ if (ext == "ris") { rv$latest_unique <- reimport_ris(path) + rv$existing_dedup_present <- TRUE n_unique_imported <- nrow(rv$latest_unique) 
} else if (ext == "csv") { # Route by content: a candidate-pairs file has duplicate_id.x / .y; @@ -1232,6 +1236,7 @@ server <- function(input, output, session) { n_candidates_imported <- nrow(cand) } else { rv$latest_unique <- reimport_csv(path) + rv$existing_dedup_present <- TRUE n_unique_imported <- nrow(rv$latest_unique) } } else { @@ -1411,11 +1416,15 @@ server <- function(input, output, session) { # when dedup button clicked, deduplicate shiny::observeEvent(input$identify_dups, { - if (nrow(rv$upload_df) == 0) { - if (nrow(rv$latest_unique) > 0) { - show_toastr("Deduplication already complete", - "You have reimported a dataset that has already been deduplicated. Further deduplication is not possible here.", - type = "error") + has_new <- is.data.frame(rv$upload_df) && nrow(rv$upload_df) > 0 + has_existing <- isTRUE(rv$existing_dedup_present) && + is.data.frame(rv$latest_unique) && nrow(rv$latest_unique) > 0 + + if (!has_new) { + if (is.data.frame(rv$latest_unique) && nrow(rv$latest_unique) > 0) { + show_toastr("Already deduplicated", + "This set is already deduplicated. To add more sources, upload new citation files above, then click Find duplicates to merge them in.", + type = "info") } else { show_toastr("Data needed", "Please import your citations first.", type = "error") } @@ -1444,35 +1453,57 @@ server <- function(input, output, session) { # Assign unique IDs to avoid issues with manual deduplication rv$upload_df <- rv$upload_df %>% dplyr::mutate(record_id = as.character(1000 + dplyr::row_number())) - - # Perform deduplication - dedup_results <- CiteSource::dedup_citations(rv$upload_df, manual = TRUE, show_unknown_tags = FALSE) + + n_new <- nrow(rv$upload_df) # capture before any clearing + + # Perform deduplication. With a reimported deduplicated set present, merge + # the new uploads INTO it (Goal 2); otherwise deduplicate the uploads. 
+ if (has_existing) { + dedup_results <- CiteSource::dedup_citations_add_sources( + rv$latest_unique, rv$upload_df, manual = TRUE, show_unknown_tags = FALSE) + } else { + dedup_results <- CiteSource::dedup_citations( + rv$upload_df, manual = TRUE, show_unknown_tags = FALSE) + } rv$pairs_to_check <- dedup_results$manual_dedup rv$latest_unique <- dedup_results$unique rv$auto_pairs <- if (is.null(dedup_results$auto_pairs)) data.frame() else dedup_results$auto_pairs rv$pairs_removed <- data.frame() # reset manual log on a fresh dedup run rv$n_unique <- count_unique(rv$latest_unique) # Generate the n_unique data - - # Generate a summary message based on deduplication results - n_citations <- nrow(rv$upload_df) - n_unique_records <- nrow(rv$latest_unique) - n_duplicates_removed <- n_citations - n_unique_records - n_pairs_manual <- nrow(rv$pairs_to_check) + n_unique_records <- nrow(rv$latest_unique) + n_pairs_manual <- nrow(rv$pairs_to_check) fmt <- function(x) format(x, big.mark = ",", scientific = FALSE) - message <- if (n_pairs_manual > 0) { - paste0("Total citations uploaded: ", fmt(n_citations), "\n", - "Unique citations after deduplication: ", fmt(n_unique_records), "\n", - "Duplicates removed: ", fmt(n_duplicates_removed), "\n\n", - n_pairs_manual, " potential duplicate pair(s) flagged for manual review.") + + if (has_existing) { + # The combined set is now a standalone deduplicated set. Clear the uploads + # (and the upload form) so the same new records can't be added twice, and + # keep the merged set flagged as an existing set for further additions. + rv$df <- data.frame(); rv$upload_df <- data.frame(); rv$file_meta <- list() + rv$existing_dedup_present <- TRUE + + review_msg <- if (n_pairs_manual > 0) + paste0(n_pairs_manual, " potential duplicate pair(s) flagged for manual review.") + else "No potential duplicates for manual review." 
+ message <- paste0("Added ", fmt(n_new), " new citation(s) to the existing set.\n", + "Unique citations after merge: ", fmt(n_unique_records), "\n\n", review_msg) + show_toastr("Sources added", message, type = "success") } else { - paste0("Total citations uploaded: ", fmt(n_citations), "\n", - "Unique citations after deduplication: ", fmt(n_unique_records), "\n", - "Duplicates removed: ", fmt(n_duplicates_removed), "\n\n", - "No potential duplicates for manual review. You can proceed to the visualization tab.") + rv$existing_dedup_present <- FALSE + n_duplicates_removed <- n_new - n_unique_records + message <- if (n_pairs_manual > 0) { + paste0("Total citations uploaded: ", fmt(n_new), "\n", + "Unique citations after deduplication: ", fmt(n_unique_records), "\n", + "Duplicates removed: ", fmt(n_duplicates_removed), "\n\n", + n_pairs_manual, " potential duplicate pair(s) flagged for manual review.") + } else { + paste0("Total citations uploaded: ", fmt(n_new), "\n", + "Unique citations after deduplication: ", fmt(n_unique_records), "\n", + "Duplicates removed: ", fmt(n_duplicates_removed), "\n\n", + "No potential duplicates for manual review. 
You can proceed to the visualization tab.") + } + show_toastr("Auto-deduplication complete", message, type = "success") } - - show_toastr("Auto-deduplication complete", message, type = "success") }) # ---- Post-dedup summary card ---- diff --git a/man/dedup_citations_add_sources.Rd b/man/dedup_citations_add_sources.Rd new file mode 100644 index 00000000..1efcd781 --- /dev/null +++ b/man/dedup_citations_add_sources.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dedup.R +\name{dedup_citations_add_sources} +\alias{dedup_citations_add_sources} +\title{Add new citations to a previously deduplicated set and re-deduplicate} +\usage{ +dedup_citations_add_sources( + existing_citations, + new_citations, + manual = FALSE, + show_unknown_tags = FALSE +) +} +\arguments{ +\item{existing_citations}{A previously deduplicated set (from +\code{\link[=dedup_citations]{dedup_citations()}}, \code{\link[=reimport_csv]{reimport_csv()}} or \code{\link[=reimport_ris]{reimport_ris()}}) — must contain a +\code{duplicate_id} column.} + +\item{new_citations}{New raw citations to add, as returned by +\code{\link[=read_citations]{read_citations()}} (with \code{cite_source} / \code{cite_label} / \code{cite_string}).} + +\item{manual}{logical. If TRUE, return the full result list including +\verb{$manual_dedup} candidate pairs for review (see \code{\link[=dedup_citations]{dedup_citations()}}). +Default FALSE.} + +\item{show_unknown_tags}{When a label, source, or other merged field is +missing, show it as "unknown"? Default FALSE.} +} +\value{ +When \code{manual = FALSE}: a dataframe of unique citations across both +sets. When \code{manual = TRUE}: a list with \verb{$unique}, \verb{$manual_dedup} and +\verb{$auto_pairs} (as in \code{\link[=dedup_citations]{dedup_citations()}}). In both cases \code{record_ids} +retains the original record IDs behind every merged record. +} +\description{ +Adds further citations (e.g. 
an additional database search) to a set that was +already deduplicated, and deduplicates the new records against both the +existing set and each other — without discarding the work already done. Each +existing unique record enters as a single row, so prior automatic and manual +merge decisions are preserved; the new records are integrated and full +provenance (the original \code{record_ids} behind every merged record) is carried +through. +} +\details{ +This is the incremental counterpart to running \code{\link[=dedup_citations]{dedup_citations()}} on all +sources from scratch and, for the same data, produces the same unique set. +} +\examples{ +if (interactive()) { + existing <- dedup_citations(read_citations(old_files, cite_sources = old_srcs)) + new_raw <- read_citations(new_files, cite_sources = new_srcs) + combined <- dedup_citations_add_sources(existing, new_raw) +} +} +\seealso{ +\code{\link[=dedup_citations]{dedup_citations()}}, \code{\link[=dedup_citations_add_manual]{dedup_citations_add_manual()}} +} diff --git a/tests/testthat/test-add-sources.R b/tests/testthat/test-add-sources.R new file mode 100644 index 00000000..495416f1 --- /dev/null +++ b/tests/testthat/test-add-sources.R @@ -0,0 +1,80 @@ +# Tests for dedup_citations_add_sources(): adding new citations to a +# previously deduplicated set (Goal 2 — incremental deduplication). + +# Build a tiny raw set with controllable duplicates across sources. 
+make_raw <- function() { + base <- data.frame( + title = c("Alpha study of fish", "Beta review of coral", "Gamma trial of kelp", + "Delta survey of crabs", "Epsilon report on eels"), + author = c("Smith J", "Jones A", "Lee K", "Brown R", "Davis M"), + year = c("2010", "2011", "2012", "2013", "2014"), + journal = c("Mar Biol", "Coral J", "Kelp Sci", "Crab Rev", "Eel Rep"), + abstract = c("aaa fish abundance", "bbb coral cover", "ccc kelp density", + "ddd crab counts", "eee eel migration"), + doi = c("10.1/a", "10.1/b", "10.1/c", "10.1/d", "10.1/e"), + pages = c("1-10", "11-20", "21-30", "31-40", "41-50"), + volume = c("1", "2", "3", "4", "5"), + number = c("1", "1", "1", "1", "1"), + isbn = c("111", "222", "333", "444", "555"), + stringsAsFactors = FALSE + ) + base +} + +test_that("add_sources errors without a duplicate_id column", { + raw <- make_raw() + expect_error( + dedup_citations_add_sources(raw, raw), + "duplicate_id" + ) +}) + +test_that("incremental dedup matches from-scratch and preserves provenance", { + # Source A = records 1-4, Source B = records 3-5 (records 3,4 overlap A/B). 
+ a <- make_raw()[1:4, ]; a$cite_source <- "A"; a$cite_label <- ""; a$cite_string <- "" + b <- make_raw()[3:5, ]; b$cite_source <- "B"; b$cite_label <- ""; b$cite_string <- "" + + existing <- suppressWarnings(suppressMessages(dedup_citations(a))) + combined <- suppressWarnings(suppressMessages(dedup_citations_add_sources(existing, b))) + scratch <- suppressWarnings(suppressMessages( + dedup_citations(dplyr::bind_rows(a, b)) + )) + + # Same number of unique records as deduping everything at once + expect_equal(nrow(combined), nrow(scratch)) + + # All five distinct titles are represented + expect_setequal(unique(combined$title), unique(make_raw()$title)) + + # Both sources present after the merge + srcs <- unique(trimws(unlist(strsplit(paste(combined$cite_source, collapse = ", "), ",\\s*")))) + expect_true(all(c("A", "B") %in% srcs)) + + # Output is reimport-shaped + expect_true(all(c("duplicate_id", "record_ids", "cite_source") %in% names(combined))) +}) + +test_that("works on a reimported (all-character) existing set and in manual mode", { + a <- make_raw()[1:4, ]; a$cite_source <- "A"; a$cite_label <- ""; a$cite_string <- "" + b <- make_raw()[3:5, ]; b$cite_source <- "B"; b$cite_label <- ""; b$cite_string <- "" + + existing <- suppressWarnings(suppressMessages(dedup_citations(a))) + f <- tempfile(fileext = ".csv") + export_csv(existing, f) + existing_re <- reimport_csv(f) + + res <- suppressWarnings(suppressMessages( + dedup_citations_add_sources(existing_re, b, manual = TRUE) + )) + expect_type(res, "list") + expect_true(all(c("unique", "manual_dedup", "auto_pairs") %in% names(res))) + expect_true(all(c("duplicate_id", "record_ids") %in% names(res$unique))) + + # Candidate pairs reference duplicate_ids present in the unique output, so the + # set is ready for dedup_citations_add_manual() / the Shiny manual review tab. 
+ if (nrow(res$manual_dedup) > 0) { + ids <- as.character(res$unique$duplicate_id) + expect_true(all(as.character(res$manual_dedup$duplicate_id.x) %in% ids)) + expect_true(all(as.character(res$manual_dedup$duplicate_id.y) %in% ids)) + } +}) From 25a5bcacef7e03d6a5c62443e1a486a1758e6fcc Mon Sep 17 00:00:00 2001 From: Trevor Riley Date: Mon, 1 Jun 2026 10:56:37 -0400 Subject: [PATCH 3/7] feat: show read-only source overview when a deduped set is re-imported On the File upload tab, re-importing a previously deduplicated/exported set now renders a view-only summary card listing per-source (and label/string) record counts, so users can see what is already in the set before adding more references. Tokens are de-duplicated within each record, so each unique record counts once per distinct source/label/string. The card also notes the total record count and whether manual deduplication was marked complete. It is kept separate from the new-uploads metadata form and does not allow editing source/label/string. 
--- inst/shiny-app/CiteSource/app.R | 83 +++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 5 deletions(-) diff --git a/inst/shiny-app/CiteSource/app.R b/inst/shiny-app/CiteSource/app.R index 29860613..a4828363 100644 --- a/inst/shiny-app/CiteSource/app.R +++ b/inst/shiny-app/CiteSource/app.R @@ -367,6 +367,7 @@ ui <- shiny::navbarPage("CiteSource", ), # Main panel for displaying outputs ---- shiny::mainPanel( + shiny::uiOutput("reimport_summary"), shiny::uiOutput("metadata_form"), shiny::uiOutput("post_upload_guide") ) @@ -998,14 +999,86 @@ server <- function(input, output, session) { ), check.names = FALSE ) - }, - striped = TRUE, - hover = TRUE, - width = "100%", + }, + striped = TRUE, + hover = TRUE, + width = "100%", align = "l", sanitize.text.function = function(x) x # Allows the tags to work ) - + + # --- Re-imported deduplicated set: read-only source overview --------------- + # When a previously deduplicated/exported set is re-uploaded, show what is + # already in it (sources, and any labels/strings) so the user can see the + # existing content before adding more references. View only — this set is kept + # separate from the newly uploaded files and its tags cannot be edited here. + reimport_summary_data <- shiny::reactive({ + shiny::req(isTRUE(rv$existing_dedup_present), + is.data.frame(rv$latest_unique), nrow(rv$latest_unique) > 0) + + # Count, per field value, the number of unique records that include it. + # Tokens are de-duplicated within each record so a record merged from + # several sources counts once per distinct source/label/string. 
+ tally_field <- function(col, field_name) { + if (!col %in% names(rv$latest_unique)) return(NULL) + vals <- rv$latest_unique[[col]] + vals <- vals[!is.na(vals) & !vals %in% c("", "NA")] + if (length(vals) == 0) return(NULL) + toks <- unlist(lapply(strsplit(vals, ",\\s*"), function(t) unique(trimws(t)))) + toks <- toks[toks != "" & toks != "NA"] + if (length(toks) == 0) return(NULL) + tbl <- sort(table(toks), decreasing = TRUE) + data.frame(Field = field_name, Value = names(tbl), + Records = as.integer(tbl), check.names = FALSE, + row.names = NULL, stringsAsFactors = FALSE) + } + + out <- dplyr::bind_rows( + tally_field("cite_source", "Source"), + tally_field("cite_label", "Label"), + tally_field("cite_string", "String") + ) + if (is.null(out) || nrow(out) == 0) return(NULL) + out + }) + + output$reimport_summary <- shiny::renderUI({ + summ <- reimport_summary_data() + if (is.null(summ)) return(NULL) + + flag <- rv$latest_unique$manual_dedup_complete + reviewed <- length(flag) > 0 && as.character(flag[1]) %in% c("TRUE", "T", "1") + + bslib::card( + bslib::card_header( + shiny::tags$i(class = "fa fa-database", style = "margin-right:7px;"), + "Re-imported deduplicated set" + ), + bslib::card_body( + shiny::p( + paste0(format(nrow(rv$latest_unique), big.mark = ","), + " unique records re-imported", + if (reviewed) " (manual deduplication marked complete)." else "."), + style = "color:#6c757d;font-size:0.88em;margin-bottom:4px;" + ), + shiny::p( + "View only — this set is kept separate from any new files you upload. 
Add new citation files on the left, then click Find duplicates to merge them in.", + style = "color:#6c757d;font-size:0.8em;margin-bottom:10px;" + ), + shiny::tableOutput("reimport_summary_tbl") + ) + ) + }) + + output$reimport_summary_tbl <- shiny::renderTable( + { + summ <- reimport_summary_data() + shiny::req(summ) + summ + }, + striped = TRUE, hover = TRUE, width = "100%", align = "l" + ) + # --- Google Analytics Integration --- # Flag to ensure GA script is inserted only once per session #### Upload files tab section ------ From 14b5fe13cfb71c0ad76850834964333277cce41a Mon Sep 17 00:00:00 2001 From: Trevor Riley Date: Mon, 1 Jun 2026 11:02:58 -0400 Subject: [PATCH 4/7] docs: document incremental/deferred dedup in user guide; clarify upload page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User Guide (in-app www/user_guide.md): - Step 1: re-import section rewritten — re-importing is no longer a dead end; describes the read-only source-overview card and the three paths (continue to analysis, add new sources, or finish manual review by re-uploading candidate pairs). Adds a "growing a review over time" note. - Step 2: note that Find duplicates merges new uploads into a re-imported set. - Step 3: how to pause and finish manual review later via candidate-pairs export. - Step 6: document Dedup Log and Candidate Pairs downloads and the manual-dedup-complete flag in the full CSV. File upload page (app.R sidebar): clearer labels and helper text distinguishing "upload new files to deduplicate" from "re-upload a CiteSource export" (and that the two can be combined to add sources or resume manual review). README: note incremental add-sources and deferred manual review on re-import. 
--- README.md | 2 +- inst/shiny-app/CiteSource/app.R | 15 +++++---- inst/shiny-app/CiteSource/www/user_guide.md | 36 +++++++++++++++++++-- 3 files changed, 44 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a6a9854a..a330857b 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Once records are deduplicated, users are able to easily create plots and tables **Exporting and Re-importing Data** -Once records have been processed, users are able to export data in .csv, .ris, and .bib formats. Furthermore, users are able to reimport .csv and .ris files in order to recreate plots and tables. +Once records have been processed, users are able to export data in .csv, .ris, and .bib formats. Furthermore, users are able to reimport .csv and .ris files in order to recreate plots and tables. Re-importing also lets a review grow over time: new database or search results can be added to a previously deduplicated set and deduplicated against it without starting over (`dedup_citations_add_sources()`), and automatic deduplication can be done now with manual review completed later by exporting and re-importing the candidate pairs (`export_dedup_candidates()` / `reimport_dedup_candidates()`). 
## Getting Started **Installation** diff --git a/inst/shiny-app/CiteSource/app.R b/inst/shiny-app/CiteSource/app.R index a4828363..6bbae9f1 100644 --- a/inst/shiny-app/CiteSource/app.R +++ b/inst/shiny-app/CiteSource/app.R @@ -351,16 +351,19 @@ ui <- shiny::navbarPage("CiteSource", # Sidebar layout ---- shiny::sidebarLayout( shiny::sidebarPanel( # Input: Select a file ---- - shiny::h5("Step 1: Upload your citation files"), - shiny::fileInput("file", "", + shiny::h5("Step 1: Upload citation files"), + shiny::p("New search/database exports to deduplicate.", + style = "font-size:0.8em;color:#6c757d;margin-top:-4px;margin-bottom:6px;"), + shiny::fileInput("file", "Add new files (.ris, .bib, .txt)", multiple = TRUE, accept = c(".ris", ".txt", ".bib") ), shiny::hr(), - shiny::h5("OR: Re-upload an .ris or .csv exported from CiteSource"), - shiny::p("You can also add a candidate-pairs CSV to resume manual deduplication.", - style = "font-size:0.8em;color:#6c757d;margin-top:-6px;"), - shiny::fileInput("file_reimport", "", + shiny::h5("Re-upload a CiteSource export"), + shiny::p( + "A previously deduplicated set (.csv or .ris) to keep working with — view its sources below, add new files above to merge in, or finish manual review by also re-uploading a candidate-pairs .csv.", + style = "font-size:0.8em;color:#6c757d;margin-top:-4px;margin-bottom:6px;"), + shiny::fileInput("file_reimport", "Re-upload exported file(s)", multiple = TRUE, accept = c(".ris", ".csv") ) diff --git a/inst/shiny-app/CiteSource/www/user_guide.md b/inst/shiny-app/CiteSource/www/user_guide.md index ab12c5d8..6f20e872 100644 --- a/inst/shiny-app/CiteSource/www/user_guide.md +++ b/inst/shiny-app/CiteSource/www/user_guide.md @@ -29,7 +29,20 @@ You can upload files in multiple batches. Each new upload adds rows to the form
Re-importing previously processed CiteSource data -> If you have a `.ris` or `.csv` previously exported from CiteSource, use the **"Re-upload an .ris or .csv exported from CiteSource"** input below the main upload area. These files contain embedded `cite_source`, `cite_label`, and `cite_string` columns. Re-importing skips deduplication entirely and takes you directly to Visualise and Tables with your prior results. +> If you have a `.ris` or `.csv` previously exported from CiteSource, use the **"Re-upload a CiteSource export"** input below the main upload area. These files carry embedded `cite_source`, `cite_label`, and `cite_string` columns, so they don't need to be re-deduplicated against themselves. +> +> When you re-import a deduplicated set, a read-only **"Re-imported deduplicated set"** card appears in the main panel showing the record count per source (and per label/string), so you can see what's already in the set before doing anything else. You can then: +> +> - **Go straight to Visualise and Tables** with your prior results, or +> - **Add new sources** — upload new search files using the box above, then go to **Deduplicate → Find duplicates** to merge them into the existing set (see Step 2), or +> - **Finish manual review later** — if you also re-upload the **candidate-pairs `.csv`** you exported earlier (see Step 6), the flagged pairs are restored to the Manual deduplication tab so you can complete the review (see Step 3). + +
+ +
+Adding more sources to a finished review + +> CiteSource lets you grow a review over time. Re-import a previously deduplicated set, upload the new database/search files, and run **Find duplicates** — the new records are deduplicated against both the existing set and each other, while the merges and manual decisions you already made are preserved. You do not have to start over.
@@ -56,6 +69,13 @@ Once complete, a summary card shows: - A per-source record count breakdown - Whether any pairs were flagged for manual review +
+Adding new sources to a re-imported set + +> If you re-imported a previously deduplicated set (Step 1) and then uploaded new citation files, clicking **Find duplicates** merges the new records into the existing set rather than starting from scratch. Prior automatic and manual merge decisions are kept, full record provenance is preserved, and the result is the same as if every source had been deduplicated together. After the merge the upload form clears so the same files can't be added twice — upload another batch any time to keep growing the set. + +
+ --- @@ -80,6 +100,13 @@ The default **Card View** shows each potential pair side-by-side with color-code +
+Finishing manual review later + +> You don't have to complete manual review in one sitting. On the **Export** tab, download both the **Citations** file and the **Candidate Pairs (CSV)** (see Step 6). Later, re-upload the citations file *and* the candidate-pairs file together (Step 1); the flagged pairs reappear here so you can finish reviewing them. When you export again with no pairs left pending, the file is marked as having manual deduplication complete. + +
+ --- @@ -136,7 +163,12 @@ Navigate to the **Tables** tab. Use the sidebar filters to select the subset of Navigate to the **Export** tab. Three sections are available: -**Citations** — download the full deduplicated dataset as `.csv`, `.ris`, or `.bib`. Provenance metadata (`cite_source`, `cite_label`, `cite_string`) is embedded in standard bibliographic fields (`.ris` uses C1, C2, C7, C8, DB). Only `.csv` and `.ris` can be re-imported into CiteSource later. +**Citations** — download the full deduplicated dataset as `.csv`, `.ris`, or `.bib`. Provenance metadata (`cite_source`, `cite_label`, `cite_string`) is embedded in standard bibliographic fields (`.ris` uses C1, C2, C7, C8, DB). Only `.csv` and `.ris` can be re-imported into CiteSource later. The full `.csv` also records whether manual deduplication has been completed, so a re-imported set knows whether review is still pending. + +This section also provides two supporting downloads: + +- **Dedup Log (CSV)** — every merged duplicate pair, flagged as automated or manual. Useful as a supplementary file documenting your deduplication for a systematic review. +- **Candidate Pairs (CSV)** — the pairs still flagged for manual review. Download this alongside the citations file if you want to pause and finish manual deduplication later (see Step 3). **Plots** — download any of the three visualizations as PNG files. Content reflects your current filter selections on the Visualise tab. From ef318a0faa4652cc1c02f3f9870395ce1f1d632a Mon Sep 17 00:00:00 2001 From: Trevor Riley Date: Mon, 1 Jun 2026 11:11:01 -0400 Subject: [PATCH 5/7] chore: bump version to 0.2.1 - DESCRIPTION: Version 0.2.0 -> 0.2.1, Date 2026-06-01. 
- NEWS.md: add 0.2.1 section covering incremental deduplication (dedup_citations_add_sources), deferred manual deduplication (export/reimport_dedup_candidates + manual_dedup_complete flag), the Shiny re-import source overview and multi-file content-routed re-upload, the all-character reimport_csv fix, and the doc updates. These were moved out of the released 0.2.0 section. - CITATION.cff: version 0.2.1, date-released 2026-06-01. --- CITATION.cff | 4 ++-- DESCRIPTION | 4 ++-- NEWS.md | 46 +++++++++++++++++++++++++++++++++------------- 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index b5a86494..c072d6cb 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -20,8 +20,8 @@ authors: given-names: "Matthew J." orcid: "https://orcid.org/0000-0001-8426-6495" title: "CiteSource: An R Package for Data-Driven Search Strategy Development and Enhanced Evidence Synthesis Reporting" -version: 0.2.0 -date-released: 2026-05-13 +version: 0.2.1 +date-released: 2026-06-01 doi: TBD url: "https://github.com/ESHackathon/CiteSource" preferred-citation: diff --git a/DESCRIPTION b/DESCRIPTION index 216fbfe0..5c4d1e1d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: CiteSource Title: Data-Driven Search Strategy Development and Evidence Synthesis Reporting -Version: 0.2.0 -Date: 2026-05-11 +Version: 0.2.1 +Date: 2026-06-01 Authors@R: c( person("Trevor", "Riley", , "tnriley@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6834-9802")), diff --git a/NEWS.md b/NEWS.md index b506ee4d..c1ea8a5a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,18 +1,14 @@ -# CiteSource 0.2.0 - -## Breaking changes - -- Requires R >= 4.1.0 (native pipe and `across()` syntax used throughout). 
+# CiteSource 0.2.1 ## New features -- Incremental deduplication: `dedup_citations_add_sources()` adds new raw - citations to a previously deduplicated set and deduplicates across both, - preserving prior automatic and manual merge decisions and the original - `record_ids` provenance. For the same data it yields the same unique set as - deduplicating everything from scratch. Exposed in the Shiny app — re-upload a - deduplicated set, add new citation files, and "Find duplicates" merges them - in. Works in `manual = TRUE` mode to surface new candidate pairs for review. +- Incremental deduplication: `dedup_citations_add_sources()` adds new citations + to a previously deduplicated set and deduplicates across both, preserving + prior automatic and manual merge decisions and the original `record_ids` + provenance. For the same data it yields the same unique set as deduplicating + everything from scratch. Exposed in the Shiny app — re-upload a deduplicated + set, add new citation files, and "Find duplicates" merges them in. Works in + `manual = TRUE` mode to surface new candidate pairs for review. - Deferred manual deduplication: run automatic dedup now and complete manual review later. `export_dedup_candidates()` / `reimport_dedup_candidates()` persist and restore the `$manual_dedup` candidate pairs, and `export_csv()` @@ -20,10 +16,34 @@ `reimport_csv()`) so downstream steps know whether review is still pending. Re-import, mark `result == "match"`, and merge with `dedup_citations_add_manual()`. +- Shiny app: re-importing a deduplicated set now shows a read-only source + overview (records per source, and per label/string) on the upload page so you + can see what is already in the set before adding more; the re-upload input + accepts a candidate-pairs CSV and several files at once. + +## Bug fixes + - `reimport_csv()` now reads all columns as character, matching the canonical (all-character) types produced by `dedup_citations()`. 
This is required so a - reimported set can re-enter `dedup_citations_add_manual()` (and future + reimported set can re-enter `dedup_citations_add_manual()` (and incremental re-deduplication) without column-type clashes. +- Shiny app: the re-upload (re-import) input no longer errors when more than one + file is selected; each file is routed by content (deduplicated set vs. + candidate-pairs CSV vs. RIS). + +## Documentation + +- In-app User Guide and README updated to document incremental and deferred + deduplication; the file upload page labels were clarified. + +# CiteSource 0.2.0 + +## Breaking changes + +- Requires R >= 4.1.0 (native pipe and `across()` syntax used throughout). + +## New features + - `read_citations()` now warns when `cite_label` values are outside the standard vocabulary (`search`, `screened`, `final`), since phase-analysis functions depend on those exact strings. From 148199b8a482df893abf842839f224e7934775c3 Mon Sep 17 00:00:00 2001 From: Trevor Riley Date: Mon, 1 Jun 2026 11:47:27 -0400 Subject: [PATCH 6/7] chore: CRAN prep for 0.2.1 (R CMD check --as-cran clean) R CMD check --as-cran is now 0 errors | 0 warnings | 1 note (the note is the expected "New submission" feasibility notice plus transient URL-check resets). - R/dedup.R: replace non-ASCII em-dashes (incl. one in a stop() string) with ASCII hyphens; regenerate affected .Rd files. Clears the "non-ASCII characters in R code" WARNING. - .Rbuildignore: exclude CLAUDE.md, guide/, and .tmp* so dev/session files are not bundled into the build tarball. Clears the "non-standard top-level files" NOTE. - cran-comments.md: rewritten for the 0.2.1 feature update. Note: networkD3 remains in Suggests but is unused anywhere in the package (harmless on CRAN; flagged for optional removal). 
--- .Rbuildignore | 3 ++ R/dedup.R | 10 +++--- cran-comments.md | 53 ++++++++++++++---------------- man/dedup_citations.Rd | 2 +- man/dedup_citations_add_sources.Rd | 4 +-- man/dedup_log.Rd | 2 +- 6 files changed, 37 insertions(+), 37 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index aaf4a03f..cb80a70e 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -20,3 +20,6 @@ ^vignettes/valid_data$ ^tests/shinytest$ ^CRAN-SUBMISSION$ +^CLAUDE\.md$ +^guide$ +^\.tmp.*$ diff --git a/R/dedup.R b/R/dedup.R index 46be8768..43c9011c 100644 --- a/R/dedup.R +++ b/R/dedup.R @@ -18,7 +18,7 @@ #' @return When `manual = FALSE`: a dataframe of unique citations. When #' `manual = TRUE`: a list with `$unique` (unique citations), #' `$manual_dedup` (potential pairs for review), and `$auto_pairs` -#' (pairs that were merged automatically — feed to [dedup_log()] together +#' (pairs that were merged automatically - feed to [dedup_log()] together #' with confirmed manual pairs to build a full provenance log). #' #' @examples @@ -77,7 +77,7 @@ dedup_citations <- function(raw_citations, manual = FALSE, show_unknown_tags = F #' #' Combines automatically merged pairs and user-confirmed manual pairs into a #' single tibble with a `method` column (`"auto"` / `"manual"`). Useful for -#' reporting and auditing — e.g. as supplementary material for a systematic +#' reporting and auditing - e.g. as supplementary material for a systematic #' review. #' #' @export @@ -185,7 +185,7 @@ dedup_citations_add_manual <- function(unique_citations, additional_pairs) { #' #' Adds further citations (e.g. an additional database search) to a set that was #' already deduplicated, and deduplicates the new records against both the -#' existing set and each other — without discarding the work already done. Each +#' existing set and each other - without discarding the work already done. 
Each #' existing unique record enters as a single row, so prior automatic and manual #' merge decisions are preserved; the new records are integrated and full #' provenance (the original `record_ids` behind every merged record) is carried @@ -196,7 +196,7 @@ dedup_citations_add_manual <- function(unique_citations, additional_pairs) { #' #' @export #' @param existing_citations A previously deduplicated set (from -#' [dedup_citations()], [reimport_csv()] or [reimport_ris()]) — must contain a +#' [dedup_citations()], [reimport_csv()] or [reimport_ris()]) - must contain a #' `duplicate_id` column. #' @param new_citations New raw citations to add, as returned by #' [read_citations()] (with `cite_source` / `cite_label` / `cite_string`). @@ -221,7 +221,7 @@ dedup_citations_add_sources <- function(existing_citations, new_citations, manual = FALSE, show_unknown_tags = FALSE) { if (!"duplicate_id" %in% names(existing_citations)) { - stop("existing_citations must contain a `duplicate_id` column — pass a set ", + stop("existing_citations must contain a `duplicate_id` column - pass a set ", "returned by dedup_citations(), reimport_csv() or reimport_ris().") } diff --git a/cran-comments.md b/cran-comments.md index ecca3d5b..27867e09 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,46 +1,43 @@ -## Resubmission +## Submission -This is a resubmission addressing reviewer feedback: +This is a minor feature update (0.2.0 -> 0.2.1). -* **Title / description formatting**: Removed "An R Package for" from the - `Title` field. Added single quotes around `'shiny'` in `Description` to - conform to the requirement that package names appear in single quotes. +New in this version: -* **Missing `\value` tags**: Added `@return` documentation to all five - flagged functions (`export_bib`, `export_ris`, `plot_contributions`, - `plot_source_overlap_upset`, `reimport_ris`) and re-ran - `devtools::document()` to regenerate the `.Rd` files. 
Functions called - for side effects use the phrasing "No return value, called for side - effects." Functions that return objects describe the class and meaning - of the output. +* `dedup_citations_add_sources()` adds new citations to a previously + deduplicated set and deduplicates across both, preserving prior merge + decisions and record provenance (incremental deduplication). +* `export_dedup_candidates()` / `reimport_dedup_candidates()`, and a + `manual_dedup_complete` flag in `export_csv()`, support performing automatic + deduplication now and completing manual review later. +* `reimport_csv()` now reads all columns as character so a re-imported set can + re-enter the manual-merge / incremental-deduplication functions without + column-type clashes. +* Documentation and bundled 'shiny' application updates. -* **Writing to home filespace**: Removed default `filename` values from - `export_csv()`, `export_ris()`, and `export_bib()` so users must supply - an explicit path. Updated all examples to write to `tempfile()` rather - than a bare filename. Fixed the `file = TRUE` fallback in the internal - `write_refs()` helper to write to `tempdir()` instead of `getwd()`. +## Test environments ---- +* Local: Windows 11, R 4.5.0 +* win-builder: R-release and R-devel ## R CMD check results 0 errors | 0 warnings | 1 note -Tested on: -- Windows 11, R 4.5.0: 0 errors | 0 warnings | 1 note -- Windows Server 2022, R-devel (r90065 ucrt): 0 errors | 0 warnings | 1 note +* checking CRAN incoming feasibility ... NOTE + Maintainer: 'Trevor Riley ' -The note on Windows 11 is "unable to verify current time", a transient -network issue on the checking machine unrelated to the package. -The note on Win-devel is the standard "New submission" CRAN feasibility -notice and requires no action. + The "New submission" wording appears only because the previous version is not + yet published on CRAN. Any flagged URLs (e.g. 
the GNU GPL license pages) are + valid and resolve in a browser; the check machine reported transient + connection resets. ## Reverse dependencies -None — this is a new submission with no downstream dependents. +None - CiteSource has no downstream dependents on CRAN. ## Notes R/asys_dedup.R contains code vendored from the ASySD package -(GPL >= 3, CAMARADES Group / Kaitlyn Hair) with attribution in -the file header. CiteSource is also GPL >= 3, so licenses are compatible. +(GPL >= 3, CAMARADES Group / Kaitlyn Hair) with attribution in the file +header. CiteSource is also GPL >= 3, so the licenses are compatible. diff --git a/man/dedup_citations.Rd b/man/dedup_citations.Rd index f6c07c99..602643bb 100644 --- a/man/dedup_citations.Rd +++ b/man/dedup_citations.Rd @@ -19,7 +19,7 @@ missing, show it as "unknown"? Default FALSE.} When \code{manual = FALSE}: a dataframe of unique citations. When \code{manual = TRUE}: a list with \verb{$unique} (unique citations), \verb{$manual_dedup} (potential pairs for review), and \verb{$auto_pairs} -(pairs that were merged automatically — feed to \code{\link[=dedup_log]{dedup_log()}} together +(pairs that were merged automatically - feed to \code{\link[=dedup_log]{dedup_log()}} together with confirmed manual pairs to build a full provenance log). 
} \description{ diff --git a/man/dedup_citations_add_sources.Rd b/man/dedup_citations_add_sources.Rd index 1efcd781..cf535a5d 100644 --- a/man/dedup_citations_add_sources.Rd +++ b/man/dedup_citations_add_sources.Rd @@ -13,7 +13,7 @@ dedup_citations_add_sources( } \arguments{ \item{existing_citations}{A previously deduplicated set (from -\code{\link[=dedup_citations]{dedup_citations()}}, \code{\link[=reimport_csv]{reimport_csv()}} or \code{\link[=reimport_ris]{reimport_ris()}}) — must contain a +\code{\link[=dedup_citations]{dedup_citations()}}, \code{\link[=reimport_csv]{reimport_csv()}} or \code{\link[=reimport_ris]{reimport_ris()}}) - must contain a \code{duplicate_id} column.} \item{new_citations}{New raw citations to add, as returned by @@ -35,7 +35,7 @@ retains the original record IDs behind every merged record. \description{ Adds further citations (e.g. an additional database search) to a set that was already deduplicated, and deduplicates the new records against both the -existing set and each other — without discarding the work already done. Each +existing set and each other - without discarding the work already done. Each existing unique record enters as a single row, so prior automatic and manual merge decisions are preserved; the new records are integrated and full provenance (the original \code{record_ids} behind every merged record) is carried diff --git a/man/dedup_log.Rd b/man/dedup_log.Rd index 75331acf..228d6ab6 100644 --- a/man/dedup_log.Rd +++ b/man/dedup_log.Rd @@ -23,7 +23,7 @@ common bibliographic fields (\code{title1/2}, \code{author1/2}, \code{year1/2}, \description{ Combines automatically merged pairs and user-confirmed manual pairs into a single tibble with a \code{method} column (\code{"auto"} / \code{"manual"}). Useful for -reporting and auditing — e.g. as supplementary material for a systematic +reporting and auditing - e.g. as supplementary material for a systematic review. 
} \examples{ From 5aa432ae3d4288a8d0470fb8f8a4f31d1fc0fe04 Mon Sep 17 00:00:00 2001 From: Trevor Riley <89118428+TNRiley@users.noreply.github.com> Date: Mon, 1 Jun 2026 12:59:34 -0400 Subject: [PATCH 7/7] refactor: collect existing record ids per row in dedup_citations_add_sources() Potential fix for a pull request finding: split each `record_ids` value directly instead of pasting all rows into one string first, and drop the explicit NA/empty-string filter, since non-numeric entries become NA in as.numeric() and are removed by max(..., na.rm = TRUE). Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- R/dedup.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/dedup.R b/R/dedup.R index 43c9011c..d4f0eeeb 100644 --- a/R/dedup.R +++ b/R/dedup.R @@ -239,9 +239,10 @@ dedup_citations_add_sources <- function(existing_citations, new_citations, # New records get fresh ids that cannot collide with any existing id. Base the # offset on the max of ALL underlying record_ids (duplicate_id is the cluster # minimum, so a new id keyed off it could otherwise reuse an existing id). - existing_ids <- c(as.character(ex$duplicate_id), - unlist(strsplit(paste(ex$record_ids, collapse = ", "), ",\\s*"))) - existing_ids <- existing_ids[!is.na(existing_ids) & !existing_ids %in% c("", "NA")] + existing_ids <- c( + as.character(ex$duplicate_id), + unlist(strsplit(as.character(ex$record_ids), ",\\s*")) + ) max_id <- suppressWarnings(max(as.numeric(existing_ids), na.rm = TRUE)) nw <- dplyr::mutate(new_citations, dplyr::across(dplyr::everything(), as.character))