ESHackathon · TNRiley · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -20,3 +20,6 @@
 ^vignettes/valid_data$
 ^tests/shinytest$
 ^CRAN-SUBMISSION$
+^CLAUDE\.md$
+^guide$
+^\.tmp.*$
diff --git a/CITATION.cff b/CITATION.cff
@@ -20,8 +20,8 @@ authors:
     given-names: "Matthew J."
     orcid: "https://orcid.org/0000-0001-8426-6495"
 title: "CiteSource: An R Package for Data-Driven Search Strategy Development and Enhanced Evidence Synthesis Reporting"
-version: 0.2.0
-date-released: 2026-05-13
+version: 0.2.1
+date-released: 2026-06-01
 doi: TBD 
 url: "https://github.com/ESHackathon/CiteSource"
 preferred-citation:

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: CiteSource
 Title: Data-Driven Search Strategy Development and Evidence Synthesis Reporting
-Version: 0.2.0
-Date: 2026-05-11
+Version: 0.2.1
+Date: 2026-06-01
 Authors@R: c(
     person("Trevor", "Riley", , "tnriley@gmail.com", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-6834-9802")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -14,9 +14,11 @@ export(create_initial_record_table)
 export(create_precision_sensitivity_table)
 export(dedup_citations)
 export(dedup_citations_add_manual)
+export(dedup_citations_add_sources)
 export(dedup_log)
 export(export_bib)
 export(export_csv)
+export(export_dedup_candidates)
 export(export_ris)
 export(plot_contributions)
 export(plot_source_overlap_heatmap)
@@ -25,6 +27,7 @@ export(read_citations)
 export(record_counts)
 export(record_level_table)
 export(reimport_csv)
+export(reimport_dedup_candidates)
 export(reimport_ris)
 export(runShiny)
 export(run_shiny)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,41 @@
+# CiteSource 0.2.1
+
+## New features
+
+- Incremental deduplication: `dedup_citations_add_sources()` adds new citations
+  to a previously deduplicated set and deduplicates across both, preserving
+  prior automatic and manual merge decisions and the original `record_ids`
+  provenance. For the same data it yields the same unique set as deduplicating
+  everything from scratch. Exposed in the Shiny app — re-upload a deduplicated
+  set, add new citation files, and "Find duplicates" merges them in. Works in
+  `manual = TRUE` mode to surface new candidate pairs for review.
+- Deferred manual deduplication: run automatic dedup now and complete manual
+  review later. `export_dedup_candidates()` / `reimport_dedup_candidates()`
+  persist and restore the `$manual_dedup` candidate pairs, and `export_csv()`
+  gains a `manual_dedup_complete` flag (written as a column on `fields = "full"`
+  exports, read back by `reimport_csv()`) so downstream steps know whether review
+  is still pending. Re-import, set `result` to `"match"` for confirmed pairs, and
+  merge with `dedup_citations_add_manual()`.
+- Shiny app: re-importing a deduplicated set now shows a read-only source
+  overview (records per source, and per label/string) on the upload page so you
+  can see what is already in the set before adding more; the re-upload input
+  accepts a candidate-pairs CSV and several files at once.
+
+## Bug fixes
+
+- `reimport_csv()` now reads all columns as character, matching the canonical
+  (all-character) types produced by `dedup_citations()`. This is required so a
+  reimported set can re-enter `dedup_citations_add_manual()` (and incremental
+  re-deduplication) without column-type clashes.
+- Shiny app: the re-upload (re-import) input no longer errors when more than one
+  file is selected; each file is routed by content (deduplicated set vs.
+  candidate-pairs CSV vs. RIS).
+
+## Documentation
+
+- In-app User Guide and README updated to document incremental and deferred
+  deduplication; the file upload page labels were clarified.
+
 # CiteSource 0.2.0
 
 ## Breaking changes

diff --git a/R/dedup.R b/R/dedup.R
@@ -18,7 +18,7 @@
 #' @return When `manual = FALSE`: a dataframe of unique citations. When
 #'   `manual = TRUE`: a list with `$unique` (unique citations),
 #'   `$manual_dedup` (potential pairs for review), and `$auto_pairs`
-#'   (pairs that were merged automatically — feed to [dedup_log()] together
+#'   (pairs that were merged automatically - feed to [dedup_log()] together
 #'   with confirmed manual pairs to build a full provenance log).
 #'
 #' @examples
@@ -77,7 +77,7 @@ dedup_citations <- function(raw_citations, manual = FALSE, show_unknown_tags = F
 #'
 #' Combines automatically merged pairs and user-confirmed manual pairs into a
 #' single tibble with a `method` column (`"auto"` / `"manual"`). Useful for
-#' reporting and auditing — e.g. as supplementary material for a systematic
+#' reporting and auditing - e.g. as supplementary material for a systematic
 #' review.
 #'
 #' @export
@@ -181,6 +181,105 @@ dedup_citations_add_manual <- function(unique_citations, additional_pairs) {
 }
 
 
+#' Add new citations to a previously deduplicated set and re-deduplicate
+#'
+#' Adds further citations (e.g. an additional database search) to a set that was
+#' already deduplicated, and deduplicates the new records against both the
+#' existing set and each other - without discarding the work already done. Each
+#' existing unique record enters as a single row, so prior automatic and manual
+#' merge decisions are preserved; the new records are integrated and full
+#' provenance (the original `record_ids` behind every merged record) is carried
+#' through.
+#'
+#' This is the incremental counterpart to running [dedup_citations()] on all
+#' sources from scratch and, for the same data, produces the same unique set.
+#'
+#' @export
+#' @param existing_citations A previously deduplicated set (from
+#'   [dedup_citations()], [reimport_csv()] or [reimport_ris()]) - must contain a
+#'   `duplicate_id` column.
+#' @param new_citations New raw citations to add, as returned by
+#'   [read_citations()] (with `cite_source` / `cite_label` / `cite_string`).
+#' @param manual logical. If TRUE, return the full result list including
+#'   `$manual_dedup` candidate pairs for review (see [dedup_citations()]).
+#'   Default FALSE.
+#' @param show_unknown_tags When a label, source, or other merged field is
+#'   missing, show it as "unknown"? Default FALSE.
+#' @return When `manual = FALSE`: a dataframe of unique citations across both
+#'   sets. When `manual = TRUE`: a list with `$unique`, `$manual_dedup` and
+#'   `$auto_pairs` (as in [dedup_citations()]). In both cases `record_ids`
+#'   retains the original record IDs behind every merged record.
+#' @seealso [dedup_citations()], [dedup_citations_add_manual()]
+#'
+#' @examples
+#' if (interactive()) {
+#'   existing <- dedup_citations(read_citations(old_files, cite_sources = old_srcs))
+#'   new_raw  <- read_citations(new_files, cite_sources = new_srcs)
+#'   combined <- dedup_citations_add_sources(existing, new_raw)
+#' }
+dedup_citations_add_sources <- function(existing_citations, new_citations,
+                                        manual = FALSE, show_unknown_tags = FALSE) {
+
+  if (!"duplicate_id" %in% names(existing_citations)) {
+    stop("existing_citations must contain a `duplicate_id` column - pass a set ",
+         "returned by dedup_citations(), reimport_csv() or reimport_ris().")
+  }
+
+  # Work in character throughout (the dedup engine's canonical type) so the two
+  # frames bind without column-type clashes.
+  ex <- dplyr::mutate(existing_citations, dplyr::across(dplyr::everything(), as.character))
+  if (!"record_ids" %in% names(ex)) ex$record_ids <- ex$duplicate_id
+
+  # Provenance lookup: existing duplicate_id -> its underlying original record_ids
+  prov <- stats::setNames(as.character(ex$record_ids), as.character(ex$duplicate_id))
+
+  # Each existing unique record enters as one input keyed by its duplicate_id
+  ex$record_id <- as.character(ex$duplicate_id)
+
+  # New records get fresh ids that cannot collide with any existing id. Base the
+  # offset on the max of ALL underlying record_ids (duplicate_id is the cluster
+  # minimum, so a new id keyed off it could otherwise reuse an existing id).
+  existing_ids <- c(
+    as.character(ex$duplicate_id),
+    unlist(strsplit(as.character(ex$record_ids), ",\\s*"))
+  )
+  max_id <- suppressWarnings(max(as.numeric(existing_ids), na.rm = TRUE))
+
+  nw <- dplyr::mutate(new_citations, dplyr::across(dplyr::everything(), as.character))
+  nw <- dplyr::select(nw, -dplyr::any_of(c("duplicate_id", "record_ids", "record_id")))
+  n_new <- nrow(nw)
+  nw$record_id <- if (is.finite(max_id)) {
+    # format(), not as.character(): the latter renders a double 1e5 as "1e+05".
+    format(max_id + seq_len(n_new), scientific = FALSE, trim = TRUE)
+  } else {
+    # No numeric ids: skip any "new_<i>" already taken by an earlier incremental run.
+    pool <- paste0("new_", seq_len(n_new + length(existing_ids)))
+    setdiff(pool, existing_ids)[seq_len(n_new)]
+  }
+
+  # Drop the merged-set metadata that would otherwise trigger a record_id clash
+  # (format_rerun renames duplicate_id -> record_id) or be stale after re-dedup.
+  ex <- dplyr::select(ex, -dplyr::any_of(c("duplicate_id", "record_ids", "manual_dedup_complete")))
+
+  combined <- dplyr::bind_rows(ex, nw)
+  result <- dedup_citations(combined, manual = manual, show_unknown_tags = show_unknown_tags)
+
+  # Restore original provenance: expand existing-duplicate-id tokens in the
+  # rebuilt record_ids back to their underlying original record IDs.
+  unique_out <- if (manual) result$unique else result
+  unique_out$record_ids <- vapply(unique_out$record_ids, function(rids) {
+    toks <- trimws(strsplit(rids, ",\\s*")[[1]])
+    toks <- ifelse(toks %in% names(prov), prov[toks], toks)
+    toks <- unlist(strsplit(paste(toks, collapse = ", "), ",\\s*"))
+    paste(unique(trimws(toks[toks != ""])), collapse = ", ")
+  }, character(1), USE.NAMES = FALSE)
+
+  if (!manual) return(unique_out)
+  result$unique <- unique_out
+  result
+}
+
+
 #' Add missing columns to a citations dataframe
 #'
 #' @param raw_citations Citation dataframe

diff --git a/R/export.R b/R/export.R
@@ -15,6 +15,12 @@
 #' @param trim_abstracts Some databases may return full-text that is misidentified as an abstract. This inflates file size and may lead to issues with Excel,
 #' which cannot deal with more than 32,000 characters per field. Therefore, the default is to trim very long abstracts to 32,000 characters. Set a lower number to reduce file size, or
 #' NULL to retain abstracts as they are.
+#' @param manual_dedup_complete Logical. Records, in a `manual_dedup_complete`
+#'   column, whether manual deduplication has been completed for this set
+#'   (default `FALSE`). Set `TRUE` after confirming manual pairs with
+#'   [dedup_citations_add_manual()]. This flag is read back by [reimport_csv()]
+#'   and lets later steps know whether candidate pairs still need review. Only
+#'   written when `fields = "full"`.
 #' @return No return value, called for side effects. Saves the deduplicated citations as a 'CSV' file to the specified location.
 #' @export
 #' @examples
@@ -28,7 +34,7 @@
 #'   export_csv(dedup_results, tempfile(fileext = ".csv"), fields = "standard")
 #' }
 
-export_csv <- function(unique_citations, filename, fields = "full", separate = NULL, trim_abstracts = 32000) {
+export_csv <- function(unique_citations, filename, fields = "full", separate = NULL, trim_abstracts = 32000, manual_dedup_complete = FALSE) {
   # Warn if the filename doesn't end with .csv
   if (tolower(tools::file_ext(filename)) != "csv") {
     warning("Function saves a CSV file, so filename should (usually) end in .csv. For now, name is used as provided.")
@@ -78,9 +84,61 @@ export_csv <- function(unique_citations, filename, fields = "full", separate = N
       dplyr::select(-tidyselect::all_of(separate)) |>
       dplyr::bind_cols(separated)
   }
+
+  # Record manual-dedup status on reimportable (full) exports only, so that
+  # later steps (and the Shiny app) know whether manual review is still pending.
+  if (identical(fields, "full")) {
+    unique_citations$manual_dedup_complete <- isTRUE(manual_dedup_complete)
+  }
+
   utils::write.csv(unique_citations, filename, row.names = FALSE)
 }
 
+#' Export manual-review candidate pairs to a CSV file
+#'
+#' Saves the candidate duplicate pairs returned as the `$manual_dedup` element
+#' of `dedup_citations(manual = TRUE)` so that manual review can be completed
+#' later. Combine with [export_csv()] to defer manual deduplication: export the
+#' automatically deduplicated unique citations *and* these candidate pairs now,
+#' then re-import both later with [reimport_csv()] and
+#' [reimport_dedup_candidates()] to finish the review. Note that *existing files
+#' are overwritten without warning.*
+#'
+#' @param manual_dedup Data frame of candidate pairs, i.e. the `$manual_dedup`
+#'   element of `dedup_citations(manual = TRUE)`.
+#' @param filename Name (and path) of file, should end in .csv
+#' @return No return value, called for side effects. Saves the candidate pairs
+#'   as a 'CSV' file to the specified location.
+#' @export
+#' @seealso [reimport_dedup_candidates()], [dedup_citations_add_manual()]
+#' @examples
+#' if (interactive()) {
+#'   examplecitations_path <- system.file("extdata", "examplecitations.rds", package = "CiteSource")
+#'   examplecitations <- readRDS(examplecitations_path)
+#'   dedup_results <- dedup_citations(examplecitations, manual = TRUE)
+#'   export_dedup_candidates(dedup_results$manual_dedup, tempfile(fileext = ".csv"))
+#' }
+export_dedup_candidates <- function(manual_dedup, filename) {
+  if (tolower(tools::file_ext(filename)) != "csv") {
+    warning("Function saves a CSV file, so filename should (usually) end in .csv. For now, name is used as provided.")
+  }
+  if (!all(c("duplicate_id.x", "duplicate_id.y") %in% names(manual_dedup))) {
+    stop(
+      "manual_dedup must contain duplicate_id.x and duplicate_id.y columns. ",
+      "Pass the $manual_dedup element of dedup_citations(manual = TRUE)."
+    )
+  }
+
+  # Seed an empty result column to prompt reviewers to mark confirmed duplicates
+  # (dedup_citations_add_manual() merges only rows where result == "match").
+  # rep() keeps this valid when there are no candidate pairs (zero rows), where
+  # assigning a length-one "" to a base data.frame column would error.
+  if (!"result" %in% names(manual_dedup)) {
+    manual_dedup$result <- rep("", nrow(manual_dedup))
+  }
+  utils::write.csv(manual_dedup, filename, row.names = FALSE)
+}
+
 #' Export data frame to RIS file
 #'
 #' This function saves a data frame as a RIS file with specified columns mapped to RIS fields. Note that

diff --git a/R/reimport.R b/R/reimport.R
@@ -17,8 +17,11 @@ reimport_csv <- function(filename) {
   # Warn if the filename doesn't end with .csv
   if (tolower(tools::file_ext(filename)) != "csv") warning("Function reads a CSV file, so filename should (usually) end in .csv. For now, name is used as provided.")
 
-  # Read the CSV file
-  unique_citations_imported <- utils::read.csv(filename, stringsAsFactors = FALSE)
+  # Read the CSV file. All columns are read as character so the reimported data
+  # matches the (all-character) types produced by dedup_citations(). This keeps
+  # the reimport faithful and is required for re-deduplication / adding manual
+  # pairs via dedup_citations_add_manual(), which fails on mixed column types.
+  unique_citations_imported <- utils::read.csv(filename, stringsAsFactors = FALSE, colClasses = "character")
 
   # Check if the required columns are present
   if (!all(c("cite_source", "cite_label", "cite_string", "duplicate_id", "record_ids") %in% names(unique_citations_imported))) {
@@ -29,9 +32,56 @@ reimport_csv <- function(filename) {
     )
   }
 
+  # The manual_dedup_complete flag (written by export_csv) is left as character
+  # ("TRUE"/"FALSE") like every other column, so the reimported set can be passed
+  # through dedup_citations_add_manual() / re-deduplication without column-type
+  # clashes. Test it with e.g. `df$manual_dedup_complete[1] == "TRUE"`.
+
   unique_citations_imported
 }
 
+#' Reimport manual-review candidate pairs exported from CiteSource
+#'
+#' Reads a CSV of candidate duplicate pairs previously written by
+#' [export_dedup_candidates()] (i.e. the `$manual_dedup` element of
+#' `dedup_citations(manual = TRUE)`). This supports a deferred workflow: run
+#' automatic deduplication now, export both the unique citations and the
+#' candidate pairs, and complete the manual review later after re-importing.
+#'
+#' After review, set the `result` column to `"match"` for confirmed duplicates
+#' and pass the result, together with the reimported unique citations, to
+#' [dedup_citations_add_manual()].
+#'
+#' @param filename Name (and path) of the candidate-pairs CSV, should end in .csv
+#' @return A data frame of candidate pairs with every column, including
+#'   `duplicate_id.x` / `duplicate_id.y`, read as character (matching the unique
+#'   citations from [reimport_csv()]), ready for review and [dedup_citations_add_manual()].
+#' @export
+#' @seealso [export_dedup_candidates()], [dedup_citations_add_manual()]
+#' @examples
+#' if (interactive()) {
+#'   candidates <- reimport_dedup_candidates("path/to/candidates.csv")
+#'   # keep rows reviewed as "match" (anything else becomes "no_match"), then merge
+#'   candidates$result <- ifelse(candidates$result == "match", "match", "no_match")
+#'   final <- dedup_citations_add_manual(reimport_csv("unique.csv"), candidates)
+#' }
+reimport_dedup_candidates <- function(filename) {
+  ext <- tolower(tools::file_ext(filename))
+  if (ext != "csv") {
+    warning("Function reads a CSV file, so filename should (usually) end in .csv. For now, name is used as provided.")
+  }
+  # Everything is read as character so pair ids match reimport_csv() output.
+  pairs <- utils::read.csv(filename, stringsAsFactors = FALSE, colClasses = "character")
+  id_cols <- c("duplicate_id.x", "duplicate_id.y")
+  if (length(setdiff(id_cols, names(pairs))) > 0) {
+    stop(
+      "Columns duplicate_id.x and duplicate_id.y were not found in ", filename,
+      ". This function expects a candidate-pairs file written by export_dedup_candidates()."
+    )
+  }
+  pairs
+}
+
 #' Reimport a RIS-file exported from CiteSource
 #'
 #' This function reimports a RIS file that was tagged and deduplicated by CiteSource.

diff --git a/README.md b/README.md
@@ -44,7 +44,7 @@ Once records are deduplicated, users are able to easily create plots and tables
 
 **Exporting and Re-importing Data**
 
-Once records have been processed, users are able to export data in .csv, .ris, and .bib formats. Furthermore, users are able to reimport .csv and .ris files in order to recreate plots and tables.
+Once records have been processed, users are able to export data in .csv, .ris, and .bib formats. Furthermore, users are able to reimport .csv and .ris files in order to recreate plots and tables. Re-importing also lets a review grow over time: new database or search results can be added to a previously deduplicated set and deduplicated against it without starting over (`dedup_citations_add_sources()`), and automatic deduplication can be done now with manual review completed later by exporting and re-importing the candidate pairs (`export_dedup_candidates()` / `reimport_dedup_candidates()`).
 
 ## Getting Started
 **Installation**