diff --git a/.Rbuildignore b/.Rbuildignore index aaf4a03f..cb80a70e 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -20,3 +20,6 @@ ^vignettes/valid_data$ ^tests/shinytest$ ^CRAN-SUBMISSION$ +^CLAUDE\.md$ +^guide$ +^\.tmp.*$ diff --git a/CITATION.cff b/CITATION.cff index b5a86494..c072d6cb 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -20,8 +20,8 @@ authors: given-names: "Matthew J." orcid: "https://orcid.org/0000-0001-8426-6495" title: "CiteSource: An R Package for Data-Driven Search Strategy Development and Enhanced Evidence Synthesis Reporting" -version: 0.2.0 -date-released: 2026-05-13 +version: 0.2.1 +date-released: 2026-06-01 doi: TBD url: "https://github.com/ESHackathon/CiteSource" preferred-citation: diff --git a/DESCRIPTION b/DESCRIPTION index 216fbfe0..5c4d1e1d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: CiteSource Title: Data-Driven Search Strategy Development and Evidence Synthesis Reporting -Version: 0.2.0 -Date: 2026-05-11 +Version: 0.2.1 +Date: 2026-06-01 Authors@R: c( person("Trevor", "Riley", , "tnriley@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6834-9802")), diff --git a/NAMESPACE b/NAMESPACE index 060ad70c..c0c773b7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,9 +14,11 @@ export(create_initial_record_table) export(create_precision_sensitivity_table) export(dedup_citations) export(dedup_citations_add_manual) +export(dedup_citations_add_sources) export(dedup_log) export(export_bib) export(export_csv) +export(export_dedup_candidates) export(export_ris) export(plot_contributions) export(plot_source_overlap_heatmap) @@ -25,6 +27,7 @@ export(read_citations) export(record_counts) export(record_level_table) export(reimport_csv) +export(reimport_dedup_candidates) export(reimport_ris) export(runShiny) export(run_shiny) diff --git a/NEWS.md b/NEWS.md index 04029592..c1ea8a5a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,41 @@ +# CiteSource 0.2.1 + +## New features + +- Incremental deduplication: 
`dedup_citations_add_sources()` adds new citations + to a previously deduplicated set and deduplicates across both, preserving + prior automatic and manual merge decisions and the original `record_ids` + provenance. For the same data it yields the same unique set as deduplicating + everything from scratch. Exposed in the Shiny app — re-upload a deduplicated + set, add new citation files, and "Find duplicates" merges them in. Works in + `manual = TRUE` mode to surface new candidate pairs for review. +- Deferred manual deduplication: run automatic dedup now and complete manual + review later. `export_dedup_candidates()` / `reimport_dedup_candidates()` + persist and restore the `$manual_dedup` candidate pairs, and `export_csv()` + gains a `manual_dedup_complete` flag (written as a column, read back by + `reimport_csv()`) so downstream steps know whether review is still pending. + Re-import, mark `result == "match"`, and merge with + `dedup_citations_add_manual()`. +- Shiny app: re-importing a deduplicated set now shows a read-only source + overview (records per source, and per label/string) on the upload page so you + can see what is already in the set before adding more; the re-upload input + accepts a candidate-pairs CSV and several files at once. + +## Bug fixes + +- `reimport_csv()` now reads all columns as character, matching the canonical + (all-character) types produced by `dedup_citations()`. This is required so a + reimported set can re-enter `dedup_citations_add_manual()` (and incremental + re-deduplication) without column-type clashes. +- Shiny app: the re-upload (re-import) input no longer errors when more than one + file is selected; each file is routed by content (deduplicated set vs. + candidate-pairs CSV vs. RIS). + +## Documentation + +- In-app User Guide and README updated to document incremental and deferred + deduplication; the file upload page labels were clarified. 
+ # CiteSource 0.2.0 ## Breaking changes diff --git a/R/dedup.R b/R/dedup.R index df8eeb12..d4f0eeeb 100644 --- a/R/dedup.R +++ b/R/dedup.R @@ -18,7 +18,7 @@ #' @return When `manual = FALSE`: a dataframe of unique citations. When #' `manual = TRUE`: a list with `$unique` (unique citations), #' `$manual_dedup` (potential pairs for review), and `$auto_pairs` -#' (pairs that were merged automatically — feed to [dedup_log()] together +#' (pairs that were merged automatically - feed to [dedup_log()] together #' with confirmed manual pairs to build a full provenance log). #' #' @examples @@ -77,7 +77,7 @@ dedup_citations <- function(raw_citations, manual = FALSE, show_unknown_tags = F #' #' Combines automatically merged pairs and user-confirmed manual pairs into a #' single tibble with a `method` column (`"auto"` / `"manual"`). Useful for -#' reporting and auditing — e.g. as supplementary material for a systematic +#' reporting and auditing - e.g. as supplementary material for a systematic #' review. #' #' @export @@ -181,6 +181,105 @@ dedup_citations_add_manual <- function(unique_citations, additional_pairs) { } +#' Add new citations to a previously deduplicated set and re-deduplicate +#' +#' Adds further citations (e.g. an additional database search) to a set that was +#' already deduplicated, and deduplicates the new records against both the +#' existing set and each other - without discarding the work already done. Each +#' existing unique record enters as a single row, so prior automatic and manual +#' merge decisions are preserved; the new records are integrated and full +#' provenance (the original `record_ids` behind every merged record) is carried +#' through. +#' +#' This is the incremental counterpart to running [dedup_citations()] on all +#' sources from scratch and, for the same data, produces the same unique set. 
+#' +#' @export +#' @param existing_citations A previously deduplicated set (from +#' [dedup_citations()], [reimport_csv()] or [reimport_ris()]) - must contain a +#' `duplicate_id` column. +#' @param new_citations New raw citations to add, as returned by +#' [read_citations()] (with `cite_source` / `cite_label` / `cite_string`). +#' @param manual logical. If TRUE, return the full result list including +#' `$manual_dedup` candidate pairs for review (see [dedup_citations()]). +#' Default FALSE. +#' @param show_unknown_tags When a label, source, or other merged field is +#' missing, show it as "unknown"? Default FALSE. +#' @return When `manual = FALSE`: a dataframe of unique citations across both +#' sets. When `manual = TRUE`: a list with `$unique`, `$manual_dedup` and +#' `$auto_pairs` (as in [dedup_citations()]). In both cases `record_ids` +#' retains the original record IDs behind every merged record. +#' @seealso [dedup_citations()], [dedup_citations_add_manual()] +#' +#' @examples +#' if (interactive()) { +#' existing <- dedup_citations(read_citations(old_files, cite_sources = old_srcs)) +#' new_raw <- read_citations(new_files, cite_sources = new_srcs) +#' combined <- dedup_citations_add_sources(existing, new_raw) +#' } +dedup_citations_add_sources <- function(existing_citations, new_citations, + manual = FALSE, show_unknown_tags = FALSE) { + + if (!"duplicate_id" %in% names(existing_citations)) { + stop("existing_citations must contain a `duplicate_id` column - pass a set ", + "returned by dedup_citations(), reimport_csv() or reimport_ris().") + } + + # Work in character throughout (the dedup engine's canonical type) so the two + # frames bind without column-type clashes. 
+ ex <- dplyr::mutate(existing_citations, dplyr::across(dplyr::everything(), as.character)) + if (!"record_ids" %in% names(ex)) ex$record_ids <- ex$duplicate_id + + # Provenance lookup: existing duplicate_id -> its underlying original record_ids + prov <- stats::setNames(as.character(ex$record_ids), as.character(ex$duplicate_id)) + + # Each existing unique record enters as one input keyed by its duplicate_id + ex$record_id <- as.character(ex$duplicate_id) + + # New ids must not collide with any existing id: offset from the max of ALL + # underlying record_ids (not just the cluster-minimum duplicate_id), in fixed + # notation (as.character(1e5) is "1e+05"); non-numeric ids use make.unique(). + existing_ids <- c( + as.character(ex$duplicate_id), + unlist(strsplit(as.character(ex$record_ids), ",\\s*")) + ) + max_id <- suppressWarnings(max(as.numeric(existing_ids), na.rm = TRUE)) + + nw <- dplyr::mutate(new_citations, dplyr::across(dplyr::everything(), as.character)) + nw <- dplyr::select(nw, -dplyr::any_of(c("duplicate_id", "record_ids", "record_id"))) + nw$record_id <- if (is.finite(max_id)) { + format(max_id + seq_len(nrow(nw)), scientific = FALSE, trim = TRUE, digits = 15) + } else { + utils::tail(make.unique(c(existing_ids, paste0("new_", seq_len(nrow(nw))))), nrow(nw)) + } + + # Drop the merged-set metadata that would otherwise trigger a record_id clash + # (format_rerun renames duplicate_id -> record_id) or be stale after re-dedup. + ex <- dplyr::select(ex, -dplyr::any_of(c("duplicate_id", "record_ids", "manual_dedup_complete"))) + + combined <- dplyr::bind_rows(ex, nw) + + result <- dedup_citations(combined, manual = manual, show_unknown_tags = show_unknown_tags) + + # Restore original provenance: expand existing-duplicate-id tokens in the + # rebuilt record_ids back to their underlying original record IDs.
+ unique_out <- if (manual) result$unique else result + unique_out$record_ids <- vapply(unique_out$record_ids, function(rids) { + toks <- trimws(strsplit(rids, ",\\s*")[[1]]) + toks <- ifelse(toks %in% names(prov), prov[toks], toks) + toks <- unlist(strsplit(paste(toks, collapse = ", "), ",\\s*")) + paste(unique(trimws(toks[toks != ""])), collapse = ", ") + }, character(1), USE.NAMES = FALSE) + + if (manual) { + result$unique <- unique_out + result + } else { + unique_out + } +} + + #' Add missing columns to a citations dataframe #' #' @param raw_citations Citation dataframe diff --git a/R/export.R b/R/export.R index 6be27c27..262816f5 100644 --- a/R/export.R +++ b/R/export.R @@ -15,6 +15,12 @@ #' @param trim_abstracts Some databases may return full-text that is misidentified as an abstract. This inflates file size and may lead to issues with Excel, #' which cannot deal with more than 32,000 characters per field. Therefore, the default is to trim very long abstracts to 32,000 characters. Set a lower number to reduce file size, or #' NULL to retain abstracts as they are. +#' @param manual_dedup_complete Logical. Records, in a `manual_dedup_complete` +#' column, whether manual deduplication has been completed for this set +#' (default `FALSE`). Set `TRUE` after confirming manual pairs with +#' [dedup_citations_add_manual()]. This flag is read back by [reimport_csv()] +#' and lets later steps know whether candidate pairs still need review. Only +#' written when `fields = "full"`. #' @return No return value, called for side effects. Saves the deduplicated citations as a 'CSV' file to the specified location. 
#' @export #' @examples @@ -28,7 +34,7 @@ #' export_csv(dedup_results, tempfile(fileext = ".csv"), fields = "standard") #' } -export_csv <- function(unique_citations, filename, fields = "full", separate = NULL, trim_abstracts = 32000) { +export_csv <- function(unique_citations, filename, fields = "full", separate = NULL, trim_abstracts = 32000, manual_dedup_complete = FALSE) { # Warn if the filename doesn't end with .csv if (tolower(tools::file_ext(filename)) != "csv") { warning("Function saves a CSV file, so filename should (usually) end in .csv. For now, name is used as provided.") @@ -78,9 +84,61 @@ export_csv <- function(unique_citations, filename, fields = "full", separate = N dplyr::select(-tidyselect::all_of(separate)) |> dplyr::bind_cols(separated) } + + # Record manual-dedup status on reimportable (full) exports only, so later steps + # (and the Shiny app) know if review is pending; rep() allows 0-row input. + if (identical(fields, "full")) { + unique_citations$manual_dedup_complete <- rep(isTRUE(manual_dedup_complete), nrow(unique_citations)) + } + utils::write.csv(unique_citations, filename, row.names = FALSE) } +#' Export manual-review candidate pairs to a CSV file +#' +#' Saves the candidate duplicate pairs returned as the `$manual_dedup` element +#' of `dedup_citations(manual = TRUE)` so that manual review can be completed +#' later. Combine with [export_csv()] to defer manual deduplication: export the +#' automatically deduplicated unique citations *and* these candidate pairs now, +#' then re-import both later with [reimport_csv()] and +#' [reimport_dedup_candidates()] to finish the review. Note that *existing files +#' are overwritten without warning.* +#' +#' @param manual_dedup Data frame of candidate pairs, i.e. the `$manual_dedup` +#' element of `dedup_citations(manual = TRUE)`. +#' @param filename Name (and path) of file, should end in .csv +#' @return No return value, called for side effects. Saves the candidate pairs +#' as a 'CSV' file to the specified location.
+#' @export +#' @seealso [reimport_dedup_candidates()], [dedup_citations_add_manual()] +#' @examples +#' if (interactive()) { +#' examplecitations_path <- system.file("extdata", "examplecitations.rds", package = "CiteSource") +#' examplecitations <- readRDS(examplecitations_path) +#' dedup_results <- dedup_citations(examplecitations, manual = TRUE) +#' export_dedup_candidates(dedup_results$manual_dedup, tempfile(fileext = ".csv")) +#' } +export_dedup_candidates <- function(manual_dedup, filename) { + if (tolower(tools::file_ext(filename)) != "csv") { + warning("Function saves a CSV file, so filename should (usually) end in .csv. For now, name is used as provided.") + } + + if (!all(c("duplicate_id.x", "duplicate_id.y") %in% names(manual_dedup))) { + stop( + "manual_dedup must contain duplicate_id.x and duplicate_id.y columns. ", + "Pass the $manual_dedup element of dedup_citations(manual = TRUE)." + ) + } + + # Seed an empty result column to prompt reviewers to mark confirmed duplicates + # (only result == "match" rows get merged); rep() also handles zero pairs. + if (!"result" %in% names(manual_dedup)) { + manual_dedup$result <- rep("", nrow(manual_dedup)) + } + + utils::write.csv(manual_dedup, filename, row.names = FALSE) +} + #' Export data frame to RIS file #' #' This function saves a data frame as a RIS file with specified columns mapped to RIS fields. Note that diff --git a/R/reimport.R b/R/reimport.R index e4df627f..79fbe3a3 100644 --- a/R/reimport.R +++ b/R/reimport.R @@ -17,8 +17,11 @@ reimport_csv <- function(filename) { # Warn if the filename doesn't end with .csv if (tolower(tools::file_ext(filename)) != "csv") warning("Function reads a CSV file, so filename should (usually) end in .csv. For now, name is used as provided.") - # Read the CSV file - unique_citations_imported <- utils::read.csv(filename, stringsAsFactors = FALSE) + # Read the CSV file. All columns are read as character so the reimported data + # matches the (all-character) types produced by dedup_citations().
This keeps + # the reimport faithful and is required for re-deduplication / adding manual + # pairs via dedup_citations_add_manual(), which fails on mixed column types. + unique_citations_imported <- utils::read.csv(filename, stringsAsFactors = FALSE, colClasses = "character") # Check if the required columns are present if (!all(c("cite_source", "cite_label", "cite_string", "duplicate_id", "record_ids") %in% names(unique_citations_imported))) { @@ -29,9 +32,56 @@ reimport_csv <- function(filename) { ) } + # The manual_dedup_complete flag (written by export_csv) is left as character + # ("TRUE"/"FALSE") like every other column, so the reimported set can be passed + # through dedup_citations_add_manual() / re-deduplication without column-type + # clashes. Test it with e.g. `df$manual_dedup_complete[1] == "TRUE"`. + unique_citations_imported } +#' Reimport manual-review candidate pairs exported from CiteSource +#' +#' Reads a CSV of candidate duplicate pairs previously written by +#' [export_dedup_candidates()] (i.e. the `$manual_dedup` element of +#' `dedup_citations(manual = TRUE)`). This supports a deferred workflow: run +#' automatic deduplication now, export both the unique citations and the +#' candidate pairs, and complete the manual review later after re-importing. +#' +#' After review, set the `result` column to `"match"` for confirmed duplicates +#' and pass the result, together with the reimported unique citations, to +#' [dedup_citations_add_manual()]. +#' +#' @param filename Name (and path) of the candidate-pairs CSV, should end in .csv +#' @return A data frame of candidate pairs with `duplicate_id.x` / `duplicate_id.y` +#' read as character (matching the unique citations from [reimport_csv()]), ready +#' for review and [dedup_citations_add_manual()]. 
+#' @export +#' @seealso [export_dedup_candidates()], [dedup_citations_add_manual()] +#' @examples +#' if (interactive()) { +#' candidates <- reimport_dedup_candidates("path/to/candidates.csv") +#' # mark confirmed duplicates, then merge into the reimported unique set +#' candidates$result <- ifelse(candidates$result == "match", "match", "no_match") +#' final <- dedup_citations_add_manual(reimport_csv("unique.csv"), candidates) +#' } +reimport_dedup_candidates <- function(filename) { + if (tolower(tools::file_ext(filename)) != "csv") { + warning("Function reads a CSV file, so filename should (usually) end in .csv. For now, name is used as provided.") + } + + candidates <- utils::read.csv(filename, stringsAsFactors = FALSE, colClasses = "character") + + if (!all(c("duplicate_id.x", "duplicate_id.y") %in% names(candidates))) { + stop( + "Columns duplicate_id.x and duplicate_id.y were not found in ", filename, + ". This function expects a candidate-pairs file written by export_dedup_candidates()." + ) + } + + candidates +} + #' Reimport a RIS-file exported from CiteSource #' #' This function reimports a RIS file that was tagged and deduplicated by CiteSource. diff --git a/README.md b/README.md index 1ff86bfb..201937be 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Once records are deduplicated, users are able to easily create plots and tables **Exporting and Re-importing Data** -Once records have been processed, users are able to export data in .csv, .ris, and .bib formats. Furthermore, users are able to reimport .csv and .ris files in order to recreate plots and tables. +Once records have been processed, users are able to export data in .csv, .ris, and .bib formats. Furthermore, users are able to reimport .csv and .ris files in order to recreate plots and tables. 
Re-importing also lets a review grow over time: new database or search results can be added to a previously deduplicated set and deduplicated against it without starting over (`dedup_citations_add_sources()`), and automatic deduplication can be done now with manual review completed later by exporting and re-importing the candidate pairs (`export_dedup_candidates()` / `reimport_dedup_candidates()`). ## Getting Started **Installation** diff --git a/cran-comments.md b/cran-comments.md index ecca3d5b..27867e09 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,46 +1,43 @@ -## Resubmission +## Submission -This is a resubmission addressing reviewer feedback: +This is a minor feature update (0.2.0 -> 0.2.1). -* **Title / description formatting**: Removed "An R Package for" from the - `Title` field. Added single quotes around `'shiny'` in `Description` to - conform to the requirement that package names appear in single quotes. +New in this version: -* **Missing `\value` tags**: Added `@return` documentation to all five - flagged functions (`export_bib`, `export_ris`, `plot_contributions`, - `plot_source_overlap_upset`, `reimport_ris`) and re-ran - `devtools::document()` to regenerate the `.Rd` files. Functions called - for side effects use the phrasing "No return value, called for side - effects." Functions that return objects describe the class and meaning - of the output. +* `dedup_citations_add_sources()` adds new citations to a previously + deduplicated set and deduplicates across both, preserving prior merge + decisions and record provenance (incremental deduplication). +* `export_dedup_candidates()` / `reimport_dedup_candidates()`, and a + `manual_dedup_complete` flag in `export_csv()`, support performing automatic + deduplication now and completing manual review later. +* `reimport_csv()` now reads all columns as character so a re-imported set can + re-enter the manual-merge / incremental-deduplication functions without + column-type clashes. 
+* Documentation and bundled 'shiny' application updates. -* **Writing to home filespace**: Removed default `filename` values from - `export_csv()`, `export_ris()`, and `export_bib()` so users must supply - an explicit path. Updated all examples to write to `tempfile()` rather - than a bare filename. Fixed the `file = TRUE` fallback in the internal - `write_refs()` helper to write to `tempdir()` instead of `getwd()`. +## Test environments ---- +* Local: Windows 11, R 4.5.0 +* win-builder: R-release and R-devel ## R CMD check results 0 errors | 0 warnings | 1 note -Tested on: -- Windows 11, R 4.5.0: 0 errors | 0 warnings | 1 note -- Windows Server 2022, R-devel (r90065 ucrt): 0 errors | 0 warnings | 1 note +* checking CRAN incoming feasibility ... NOTE + Maintainer: 'Trevor Riley <tnriley@gmail.com>' -The note on Windows 11 is "unable to verify current time", a transient -network issue on the checking machine unrelated to the package. -The note on Win-devel is the standard "New submission" CRAN feasibility -notice and requires no action. + The "New submission" wording appears only because the previous version is not + yet published on CRAN. Any flagged URLs (e.g. the GNU GPL license pages) are + valid and resolve in a browser; the check machine reported transient + connection resets. ## Reverse dependencies -None — this is a new submission with no downstream dependents. +None - CiteSource has no downstream dependents on CRAN. ## Notes R/asys_dedup.R contains code vendored from the ASySD package -(GPL >= 3, CAMARADES Group / Kaitlyn Hair) with attribution in -the file header. CiteSource is also GPL >= 3, so licenses are compatible. +(GPL >= 3, CAMARADES Group / Kaitlyn Hair) with attribution in the file +header. CiteSource is also GPL >= 3, so the licenses are compatible.
diff --git a/inst/shiny-app/CiteSource/app.R b/inst/shiny-app/CiteSource/app.R index 20d298fb..6bbae9f1 100644 --- a/inst/shiny-app/CiteSource/app.R +++ b/inst/shiny-app/CiteSource/app.R @@ -351,20 +351,26 @@ ui <- shiny::navbarPage("CiteSource", # Sidebar layout ---- shiny::sidebarLayout( shiny::sidebarPanel( # Input: Select a file ---- - shiny::h5("Step 1: Upload your citation files"), - shiny::fileInput("file", "", + shiny::h5("Step 1: Upload citation files"), + shiny::p("New search/database exports to deduplicate.", + style = "font-size:0.8em;color:#6c757d;margin-top:-4px;margin-bottom:6px;"), + shiny::fileInput("file", "Add new files (.ris, .bib, .txt)", multiple = TRUE, accept = c(".ris", ".txt", ".bib") ), shiny::hr(), - shiny::h5("OR: Re-upload an .ris or .csv exported from CiteSource"), - shiny::fileInput("file_reimport", "", + shiny::h5("Re-upload a CiteSource export"), + shiny::p( + "A previously deduplicated set (.csv or .ris) to keep working with — view its sources below, add new files above to merge in, or finish manual review by also re-uploading a candidate-pairs .csv.", + style = "font-size:0.8em;color:#6c757d;margin-top:-4px;margin-bottom:6px;"), + shiny::fileInput("file_reimport", "Re-upload exported file(s)", multiple = TRUE, accept = c(".ris", ".csv") ) ), # Main panel for displaying outputs ---- shiny::mainPanel( + shiny::uiOutput("reimport_summary"), shiny::uiOutput("metadata_form"), shiny::uiOutput("post_upload_guide") ) @@ -380,6 +386,8 @@ ui <- shiny::navbarPage("CiteSource", br(), shiny::h5("Step 3: Deduplicate"), shiny::p("Click the button below to detect and remove duplicates automatically"), + shiny::p("Already re-uploaded a deduplicated set? 
Add new citation files on the File upload tab, then click Find duplicates to merge them into the existing set.", + style = "font-size:0.82em;color:#6c757d;"), # Action button: identify duplicates in uploaded dataset shinyWidgets::actionBttn( @@ -818,6 +826,14 @@ ui <- shiny::navbarPage("CiteSource", "CSV of every merged duplicate pair, flagged as automated or manual.", style="color:#6c757d;font-size:0.82em;margin-bottom:8px;"), shiny::downloadButton("downloadDedupLog", "Dedup Log (CSV)", + style="margin-bottom:4px;"), + shiny::tags$hr(style="margin:14px 0 10px 0;"), + shiny::tags$strong("Manual review candidates", + style="font-size:0.88em;display:block;margin-bottom:4px;"), + shiny::p( + "CSV of unresolved candidate pairs. Re-upload it together with the citations CSV to finish manual deduplication later.", + style="color:#6c757d;font-size:0.82em;margin-bottom:8px;"), + shiny::downloadButton("downloadCandidates", "Candidate Pairs (CSV)", style="margin-bottom:4px;") ) ), @@ -890,6 +906,7 @@ server <- function(input, output, session) { rv$pairs_to_check <- data.frame()#for potential duplicates/manual dedup rv$pairs_removed <- data.frame()#for removed records rv$auto_pairs <- data.frame()#auto-merged pairs, for the dedup log + rv$existing_dedup_present <- FALSE # TRUE when latest_unique is a reimported deduped set that new uploads should be merged INTO (Goal 2) rv$file_meta <- list() # Per-file metadata (source/label/string); keyed by file.datapath # Card view state rv$selected_pairs_card <- integer(0) @@ -985,14 +1002,86 @@ server <- function(input, output, session) { ), check.names = FALSE ) - }, - striped = TRUE, - hover = TRUE, - width = "100%", + }, + striped = TRUE, + hover = TRUE, + width = "100%", align = "l", sanitize.text.function = function(x) x # Allows the tags to work ) - + + # --- Re-imported deduplicated set: read-only source overview --------------- + # When a previously deduplicated/exported set is re-uploaded, show what is + # already in it 
(sources, and any labels/strings) so the user can see the + # existing content before adding more references. View only — this set is kept + # separate from the newly uploaded files and its tags cannot be edited here. + reimport_summary_data <- shiny::reactive({ + shiny::req(isTRUE(rv$existing_dedup_present), + is.data.frame(rv$latest_unique), nrow(rv$latest_unique) > 0) + + # Count, per field value, the number of unique records that include it. + # Tokens are de-duplicated within each record so a record merged from + # several sources counts once per distinct source/label/string. + tally_field <- function(col, field_name) { + if (!col %in% names(rv$latest_unique)) return(NULL) + vals <- rv$latest_unique[[col]] + vals <- vals[!is.na(vals) & !vals %in% c("", "NA")] + if (length(vals) == 0) return(NULL) + toks <- unlist(lapply(strsplit(vals, ",\\s*"), function(t) unique(trimws(t)))) + toks <- toks[toks != "" & toks != "NA"] + if (length(toks) == 0) return(NULL) + tbl <- sort(table(toks), decreasing = TRUE) + data.frame(Field = field_name, Value = names(tbl), + Records = as.integer(tbl), check.names = FALSE, + row.names = NULL, stringsAsFactors = FALSE) + } + + out <- dplyr::bind_rows( + tally_field("cite_source", "Source"), + tally_field("cite_label", "Label"), + tally_field("cite_string", "String") + ) + if (is.null(out) || nrow(out) == 0) return(NULL) + out + }) + + output$reimport_summary <- shiny::renderUI({ + summ <- reimport_summary_data() + if (is.null(summ)) return(NULL) + + flag <- rv$latest_unique$manual_dedup_complete + reviewed <- length(flag) > 0 && as.character(flag[1]) %in% c("TRUE", "T", "1") + + bslib::card( + bslib::card_header( + shiny::tags$i(class = "fa fa-database", style = "margin-right:7px;"), + "Re-imported deduplicated set" + ), + bslib::card_body( + shiny::p( + paste0(format(nrow(rv$latest_unique), big.mark = ","), + " unique records re-imported", + if (reviewed) " (manual deduplication marked complete)." 
else "."), + style = "color:#6c757d;font-size:0.88em;margin-bottom:4px;" + ), + shiny::p( + "View only — this set is kept separate from any new files you upload. Add new citation files on the left, then click Find duplicates to merge them in.", + style = "color:#6c757d;font-size:0.8em;margin-bottom:10px;" + ), + shiny::tableOutput("reimport_summary_tbl") + ) + ) + }) + + output$reimport_summary_tbl <- shiny::renderTable( + { + summ <- reimport_summary_data() + shiny::req(summ) + summ + }, + striped = TRUE, hover = TRUE, width = "100%", align = "l" + ) + # --- Google Analytics Integration --- # Flag to ensure GA script is inserted only once per session #### Upload files tab section ------ @@ -1194,22 +1283,61 @@ server <- function(input, output, session) { }) shiny::observeEvent(input$file_reimport, { - file_extension <- tolower(tools::file_ext(input$file_reimport$datapath)) - - if (file_extension == "csv") { - rv$latest_unique <- reimport_csv(input$file_reimport$datapath) - } else if (file_extension == "ris") { - rv$latest_unique <- reimport_ris(input$file_reimport$datapath) - } else { - warning("Invalid file extension, needs to be .ris or .csv") + files <- input$file_reimport # data frame: one row per file (multiple = TRUE) + + n_unique_imported <- 0L + n_candidates_imported <- 0L + errors <- character(0) + + for (i in seq_len(nrow(files))) { + path <- files$datapath[i] + nm <- files$name[i] + ext <- tolower(tools::file_ext(nm)) + + tryCatch({ + if (ext == "ris") { + rv$latest_unique <- reimport_ris(path) + rv$existing_dedup_present <- TRUE + n_unique_imported <- nrow(rv$latest_unique) + } else if (ext == "csv") { + # Route by content: a candidate-pairs file has duplicate_id.x / .y; + # a deduplicated citation set has the cite_* / duplicate_id columns. 
+ hdr <- names(utils::read.csv(path, nrows = 1, stringsAsFactors = FALSE)) + if (all(c("duplicate_id.x", "duplicate_id.y") %in% hdr)) { + cand <- reimport_dedup_candidates(path) + # Drop the R-workflow `result` column: in the app the merge decision + # is the user's row selection, not result == "match". + cand$result <- NULL + rv$pairs_to_check <- cand + n_candidates_imported <- nrow(cand) + } else { + rv$latest_unique <- reimport_csv(path) + rv$existing_dedup_present <- TRUE + n_unique_imported <- nrow(rv$latest_unique) + } + } else { + errors <- c(errors, paste0(nm, " (unsupported type)")) + } + }, error = function(e) { + errors <<- c(errors, paste0(nm, ": ", conditionMessage(e))) + }) + } + + if (n_unique_imported > 0) rv$n_unique <- count_unique(rv$latest_unique) + + if (length(errors) > 0) { + show_toastr("Some files could not be re-imported", + paste(errors, collapse = "; "), type = "error") + } + if (n_unique_imported > 0 || n_candidates_imported > 0) { + msg <- character(0) + if (n_unique_imported > 0) + msg <- c(msg, paste("Imported", n_unique_imported, "deduplicated citations.")) + if (n_candidates_imported > 0) + msg <- c(msg, paste("Restored", n_candidates_imported, + "candidate pair(s) — finish review on the Deduplicate tab.")) + show_toastr("Re-import successful", paste(msg, collapse = " "), type = "success") } - - rv$n_unique <- count_unique(rv$latest_unique) - - show_toastr("Re-import successful", - paste("Imported", nrow(rv$latest_unique), "citations. You can now proceed to visualisation and tables."), - type = "success") - }) ## Update filters @@ -1364,11 +1492,15 @@ server <- function(input, output, session) { # when dedup button clicked, deduplicate shiny::observeEvent(input$identify_dups, { - if (nrow(rv$upload_df) == 0) { - if (nrow(rv$latest_unique) > 0) { - show_toastr("Deduplication already complete", - "You have reimported a dataset that has already been deduplicated. 
Further deduplication is not possible here.", - type = "error") + has_new <- is.data.frame(rv$upload_df) && nrow(rv$upload_df) > 0 + has_existing <- isTRUE(rv$existing_dedup_present) && + is.data.frame(rv$latest_unique) && nrow(rv$latest_unique) > 0 + + if (!has_new) { + if (is.data.frame(rv$latest_unique) && nrow(rv$latest_unique) > 0) { + show_toastr("Already deduplicated", + "This set is already deduplicated. To add more sources, upload new citation files above, then click Find duplicates to merge them in.", + type = "info") } else { show_toastr("Data needed", "Please import your citations first.", type = "error") } @@ -1397,35 +1529,57 @@ server <- function(input, output, session) { # Assign unique IDs to avoid issues with manual deduplication rv$upload_df <- rv$upload_df %>% dplyr::mutate(record_id = as.character(1000 + dplyr::row_number())) - - # Perform deduplication - dedup_results <- CiteSource::dedup_citations(rv$upload_df, manual = TRUE, show_unknown_tags = FALSE) + + n_new <- nrow(rv$upload_df) # capture before any clearing + + # Perform deduplication. With a reimported deduplicated set present, merge + # the new uploads INTO it (Goal 2); otherwise deduplicate the uploads. 
+ if (has_existing) { + dedup_results <- CiteSource::dedup_citations_add_sources( + rv$latest_unique, rv$upload_df, manual = TRUE, show_unknown_tags = FALSE) + } else { + dedup_results <- CiteSource::dedup_citations( + rv$upload_df, manual = TRUE, show_unknown_tags = FALSE) + } rv$pairs_to_check <- dedup_results$manual_dedup rv$latest_unique <- dedup_results$unique rv$auto_pairs <- if (is.null(dedup_results$auto_pairs)) data.frame() else dedup_results$auto_pairs rv$pairs_removed <- data.frame() # reset manual log on a fresh dedup run rv$n_unique <- count_unique(rv$latest_unique) # Generate the n_unique data - - # Generate a summary message based on deduplication results - n_citations <- nrow(rv$upload_df) - n_unique_records <- nrow(rv$latest_unique) - n_duplicates_removed <- n_citations - n_unique_records - n_pairs_manual <- nrow(rv$pairs_to_check) + n_unique_records <- nrow(rv$latest_unique) + n_pairs_manual <- nrow(rv$pairs_to_check) fmt <- function(x) format(x, big.mark = ",", scientific = FALSE) - message <- if (n_pairs_manual > 0) { - paste0("Total citations uploaded: ", fmt(n_citations), "\n", - "Unique citations after deduplication: ", fmt(n_unique_records), "\n", - "Duplicates removed: ", fmt(n_duplicates_removed), "\n\n", - n_pairs_manual, " potential duplicate pair(s) flagged for manual review.") + + if (has_existing) { + # The combined set is now a standalone deduplicated set. Clear the uploads + # (and the upload form) so the same new records can't be added twice, and + # keep the merged set flagged as an existing set for further additions. + rv$df <- data.frame(); rv$upload_df <- data.frame(); rv$file_meta <- list() + rv$existing_dedup_present <- TRUE + + review_msg <- if (n_pairs_manual > 0) + paste0(n_pairs_manual, " potential duplicate pair(s) flagged for manual review.") + else "No potential duplicates for manual review." 
+ message <- paste0("Added ", fmt(n_new), " new citation(s) to the existing set.\n", + "Unique citations after merge: ", fmt(n_unique_records), "\n\n", review_msg) + show_toastr("Sources added", message, type = "success") } else { - paste0("Total citations uploaded: ", fmt(n_citations), "\n", - "Unique citations after deduplication: ", fmt(n_unique_records), "\n", - "Duplicates removed: ", fmt(n_duplicates_removed), "\n\n", - "No potential duplicates for manual review. You can proceed to the visualization tab.") + rv$existing_dedup_present <- FALSE + n_duplicates_removed <- n_new - n_unique_records + message <- if (n_pairs_manual > 0) { + paste0("Total citations uploaded: ", fmt(n_new), "\n", + "Unique citations after deduplication: ", fmt(n_unique_records), "\n", + "Duplicates removed: ", fmt(n_duplicates_removed), "\n\n", + n_pairs_manual, " potential duplicate pair(s) flagged for manual review.") + } else { + paste0("Total citations uploaded: ", fmt(n_new), "\n", + "Unique citations after deduplication: ", fmt(n_unique_records), "\n", + "Duplicates removed: ", fmt(n_duplicates_removed), "\n\n", + "No potential duplicates for manual review. You can proceed to the visualization tab.") + } + show_toastr("Auto-deduplication complete", message, type = "success") } - - show_toastr("Auto-deduplication complete", message, type = "success") }) # ---- Post-dedup summary card ---- @@ -2937,7 +3091,11 @@ server <- function(input, output, session) { cols <- input$csv_custom_cols if (is.null(cols) || length(cols) == 0) "full" else cols } - export_csv(rv$latest_unique, file, fields = fields) + # Flag the set as manually reviewed when no candidate pairs remain + # pending (UX guard, read back by reimport_csv()). Only written on full + # exports by export_csv(). 
+ export_csv(rv$latest_unique, file, fields = fields, + manual_dedup_complete = isTRUE(nrow(rv$pairs_to_check) == 0)) } else { stop("No data to download!") shiny::req(FALSE) @@ -2987,6 +3145,22 @@ server <- function(input, output, session) { } ) + # ---- Manual-review candidate pairs: export to resume review later ---- + output$downloadCandidates <- shiny::downloadHandler( + filename = function() paste0("candidate-pairs-", Sys.Date(), ".csv"), + content = function(file) { + pairs <- rv$pairs_to_check + if (!is.data.frame(pairs) || nrow(pairs) == 0) { + utils::write.csv( + data.frame(note = "No unresolved candidate pairs."), + file, row.names = FALSE + ) + } else { + export_dedup_candidates(pairs, file) + } + } + ) + # Export hub — plot downloads (mirror the Visualise tab download handlers) output$export_heatplot <- shiny::downloadHandler( filename = function() { diff --git a/inst/shiny-app/CiteSource/www/user_guide.md b/inst/shiny-app/CiteSource/www/user_guide.md index 5acecb7c..a7651812 100644 --- a/inst/shiny-app/CiteSource/www/user_guide.md +++ b/inst/shiny-app/CiteSource/www/user_guide.md @@ -26,7 +26,20 @@ You can upload files in multiple batches. Each new upload adds rows to the form
Re-importing previously processed CiteSource data -> If you have a `.ris` or `.csv` previously exported from CiteSource, use the **"Re-upload an .ris or .csv exported from CiteSource"** input below the main upload area. These files contain embedded `cite_source`, `cite_label`, and `cite_string` columns. Re-importing skips deduplication entirely and takes you directly to Visualise and Tables with your prior results. +> If you have a `.ris` or `.csv` previously exported from CiteSource, use the **"Re-upload a CiteSource export"** input below the main upload area. These files carry embedded `cite_source`, `cite_label`, and `cite_string` columns, so they don't need to be re-deduplicated against themselves. +> +> When you re-import a deduplicated set, a read-only **"Re-imported deduplicated set"** card appears in the main panel showing the record count per source (and per label/string), so you can see what's already in the set before doing anything else. You can then: +> +> - **Go straight to Visualise and Tables** with your prior results, or +> - **Add new sources** — upload new search files using the box above, then go to **Deduplicate → Find duplicates** to merge them into the existing set (see Step 2), or +> - **Finish manual review later** — if you also re-upload the **candidate-pairs `.csv`** you exported earlier (see Step 6), the flagged pairs are restored to the Manual deduplication tab so you can complete the review (see Step 3). + +
+ +
+Adding more sources to a finished review + +> CiteSource lets you grow a review over time. Re-import a previously deduplicated set, upload the new database/search files, and run **Find duplicates** — the new records are deduplicated against both the existing set and each other, while the merges and manual decisions you already made are preserved. You do not have to start over.
@@ -53,6 +66,13 @@ Once complete, a summary card shows: - A per-source record count breakdown - Whether any pairs were flagged for manual review +
+Adding new sources to a re-imported set + +> If you re-imported a previously deduplicated set (Step 1) and then uploaded new citation files, clicking **Find duplicates** merges the new records into the existing set rather than starting from scratch. Prior automatic and manual merge decisions are kept, full record provenance is preserved, and the result is the same as if every source had been deduplicated together. After the merge the upload form clears so the same files can't be added twice — upload another batch any time to keep growing the set. + +
+ --- @@ -77,6 +97,13 @@ The default **Card View** shows each potential pair side-by-side with color-code +
+Finishing manual review later + +> You don't have to complete manual review in one sitting. On the **Export** tab, download both the **Citations** file and the **Candidate Pairs (CSV)** (see Step 6). Later, re-upload the citations file *and* the candidate-pairs file together (Step 1); the flagged pairs reappear here so you can finish reviewing them. When you export again with no pairs left pending, the full `.csv` export is marked as having completed manual deduplication. + +
+ --- @@ -133,7 +160,12 @@ Navigate to the **Tables** tab. Use the sidebar filters to select the subset of Navigate to the **Export** tab. Three sections are available: -**Citations** — download the full deduplicated dataset as `.csv`, `.ris`, or `.bib`. Provenance metadata (`cite_source`, `cite_label`, `cite_string`) is embedded in standard bibliographic fields (`.ris` uses C1, C2, C7, C8, DB). Only `.csv` and `.ris` can be re-imported into CiteSource later. +**Citations** — download the full deduplicated dataset as `.csv`, `.ris`, or `.bib`. Provenance metadata (`cite_source`, `cite_label`, `cite_string`) is embedded in standard bibliographic fields (`.ris` uses C1, C2, C7, C8, DB). Only `.csv` and `.ris` can be re-imported into CiteSource later. The full `.csv` also records whether manual deduplication has been completed, so a re-imported set knows whether review is still pending. + +This section also provides two supporting downloads: + +- **Dedup Log (CSV)** — every merged duplicate pair, flagged as automated or manual. Useful as a supplementary file documenting your deduplication for a systematic review. +- **Candidate Pairs (CSV)** — the pairs still flagged for manual review. Download this alongside the citations file if you want to pause and finish manual deduplication later (see Step 3). **Plots** — download any of the three visualizations as PNG files. Content reflects your current filter selections on the Visualise tab. diff --git a/man/dedup_citations.Rd b/man/dedup_citations.Rd index f6c07c99..602643bb 100644 --- a/man/dedup_citations.Rd +++ b/man/dedup_citations.Rd @@ -19,7 +19,7 @@ missing, show it as "unknown"? Default FALSE.} When \code{manual = FALSE}: a dataframe of unique citations. 
When \code{manual = TRUE}: a list with \verb{$unique} (unique citations), \verb{$manual_dedup} (potential pairs for review), and \verb{$auto_pairs} -(pairs that were merged automatically — feed to \code{\link[=dedup_log]{dedup_log()}} together +(pairs that were merged automatically - feed to \code{\link[=dedup_log]{dedup_log()}} together with confirmed manual pairs to build a full provenance log). } \description{ diff --git a/man/dedup_citations_add_sources.Rd b/man/dedup_citations_add_sources.Rd new file mode 100644 index 00000000..cf535a5d --- /dev/null +++ b/man/dedup_citations_add_sources.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dedup.R +\name{dedup_citations_add_sources} +\alias{dedup_citations_add_sources} +\title{Add new citations to a previously deduplicated set and re-deduplicate} +\usage{ +dedup_citations_add_sources( + existing_citations, + new_citations, + manual = FALSE, + show_unknown_tags = FALSE +) +} +\arguments{ +\item{existing_citations}{A previously deduplicated set (from +\code{\link[=dedup_citations]{dedup_citations()}}, \code{\link[=reimport_csv]{reimport_csv()}} or \code{\link[=reimport_ris]{reimport_ris()}}) - must contain a +\code{duplicate_id} column.} + +\item{new_citations}{New raw citations to add, as returned by +\code{\link[=read_citations]{read_citations()}} (with \code{cite_source} / \code{cite_label} / \code{cite_string}).} + +\item{manual}{logical. If TRUE, return the full result list including +\verb{$manual_dedup} candidate pairs for review (see \code{\link[=dedup_citations]{dedup_citations()}}). +Default FALSE.} + +\item{show_unknown_tags}{When a label, source, or other merged field is +missing, show it as "unknown"? Default FALSE.} +} +\value{ +When \code{manual = FALSE}: a dataframe of unique citations across both +sets. 
When \code{manual = TRUE}: a list with \verb{$unique}, \verb{$manual_dedup} and +\verb{$auto_pairs} (as in \code{\link[=dedup_citations]{dedup_citations()}}). In both cases \code{record_ids} +retains the original record IDs behind every merged record. +} +\description{ +Adds further citations (e.g. an additional database search) to a set that was +already deduplicated, and deduplicates the new records against both the +existing set and each other - without discarding the work already done. Each +existing unique record enters as a single row, so prior automatic and manual +merge decisions are preserved; the new records are integrated and full +provenance (the original \code{record_ids} behind every merged record) is carried +through. +} +\details{ +This is the incremental counterpart to running \code{\link[=dedup_citations]{dedup_citations()}} on all +sources from scratch and, for the same data, produces the same unique set. +} +\examples{ +if (interactive()) { + existing <- dedup_citations(read_citations(old_files, cite_sources = old_srcs)) + new_raw <- read_citations(new_files, cite_sources = new_srcs) + combined <- dedup_citations_add_sources(existing, new_raw) +} +} +\seealso{ +\code{\link[=dedup_citations]{dedup_citations()}}, \code{\link[=dedup_citations_add_manual]{dedup_citations_add_manual()}} +} diff --git a/man/dedup_log.Rd b/man/dedup_log.Rd index 75331acf..228d6ab6 100644 --- a/man/dedup_log.Rd +++ b/man/dedup_log.Rd @@ -23,7 +23,7 @@ common bibliographic fields (\code{title1/2}, \code{author1/2}, \code{year1/2}, \description{ Combines automatically merged pairs and user-confirmed manual pairs into a single tibble with a \code{method} column (\code{"auto"} / \code{"manual"}). Useful for -reporting and auditing — e.g. as supplementary material for a systematic +reporting and auditing - e.g. as supplementary material for a systematic review. 
} \examples{ diff --git a/man/export_csv.Rd b/man/export_csv.Rd index 93fbf04e..a6aebb9e 100644 --- a/man/export_csv.Rd +++ b/man/export_csv.Rd @@ -9,7 +9,8 @@ export_csv( filename, fields = "full", separate = NULL, - trim_abstracts = 32000 + trim_abstracts = 32000, + manual_dedup_complete = FALSE ) } \arguments{ @@ -28,6 +29,13 @@ selection. Note that exports other than \code{"full"} cannot be reimported into \item{trim_abstracts}{Some databases may return full-text that is misidentified as an abstract. This inflates file size and may lead to issues with Excel, which cannot deal with more than 32,000 characters per field. Therefore, the default is to trim very long abstracts to 32,000 characters. Set a lower number to reduce file size, or NULL to retain abstracts as they are.} + +\item{manual_dedup_complete}{Logical. Records, in a \code{manual_dedup_complete} +column, whether manual deduplication has been completed for this set +(default \code{FALSE}). Set \code{TRUE} after confirming manual pairs with +\code{\link[=dedup_citations_add_manual]{dedup_citations_add_manual()}}. This flag is read back by \code{\link[=reimport_csv]{reimport_csv()}} +and lets later steps know whether candidate pairs still need review. Only +written when \code{fields = "full"}.} } \value{ No return value, called for side effects. Saves the deduplicated citations as a 'CSV' file to the specified location. diff --git a/man/export_dedup_candidates.Rd b/man/export_dedup_candidates.Rd new file mode 100644 index 00000000..253e7d2b --- /dev/null +++ b/man/export_dedup_candidates.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/export.R +\name{export_dedup_candidates} +\alias{export_dedup_candidates} +\title{Export manual-review candidate pairs to a CSV file} +\usage{ +export_dedup_candidates(manual_dedup, filename) +} +\arguments{ +\item{manual_dedup}{Data frame of candidate pairs, i.e. 
the \verb{$manual_dedup} +element of \code{dedup_citations(manual = TRUE)}.} + +\item{filename}{Name (and path) of file, should end in .csv} +} +\value{ +No return value, called for side effects. Saves the candidate pairs +as a 'CSV' file to the specified location. +} +\description{ +Saves the candidate duplicate pairs returned as the \verb{$manual_dedup} element +of \code{dedup_citations(manual = TRUE)} so that manual review can be completed +later. Combine with \code{\link[=export_csv]{export_csv()}} to defer manual deduplication: export the +automatically deduplicated unique citations \emph{and} these candidate pairs now, +then re-import both later with \code{\link[=reimport_csv]{reimport_csv()}} and +\code{\link[=reimport_dedup_candidates]{reimport_dedup_candidates()}} to finish the review. Note that \emph{existing files +are overwritten without warning.} +} +\examples{ +if (interactive()) { + examplecitations_path <- system.file("extdata", "examplecitations.rds", package = "CiteSource") + examplecitations <- readRDS(examplecitations_path) + dedup_results <- dedup_citations(examplecitations, manual = TRUE) + export_dedup_candidates(dedup_results$manual_dedup, tempfile(fileext = ".csv")) +} +} +\seealso{ +\code{\link[=reimport_dedup_candidates]{reimport_dedup_candidates()}}, \code{\link[=dedup_citations_add_manual]{dedup_citations_add_manual()}} +} diff --git a/man/reimport_dedup_candidates.Rd b/man/reimport_dedup_candidates.Rd new file mode 100644 index 00000000..03c5aa13 --- /dev/null +++ b/man/reimport_dedup_candidates.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/reimport.R +\name{reimport_dedup_candidates} +\alias{reimport_dedup_candidates} +\title{Reimport manual-review candidate pairs exported from CiteSource} +\usage{ +reimport_dedup_candidates(filename) +} +\arguments{ +\item{filename}{Name (and path) of the candidate-pairs CSV, should end in .csv} +} +\value{ +A data frame of candidate pairs with 
\code{duplicate_id.x} / \code{duplicate_id.y} +read as character (matching the unique citations from \code{\link[=reimport_csv]{reimport_csv()}}), ready +for review and \code{\link[=dedup_citations_add_manual]{dedup_citations_add_manual()}}. +} +\description{ +Reads a CSV of candidate duplicate pairs previously written by +\code{\link[=export_dedup_candidates]{export_dedup_candidates()}} (i.e. the \verb{$manual_dedup} element of +\code{dedup_citations(manual = TRUE)}). This supports a deferred workflow: run +automatic deduplication now, export both the unique citations and the +candidate pairs, and complete the manual review later after re-importing. +} +\details{ +After review, set the \code{result} column to \code{"match"} for confirmed duplicates +and pass the result, together with the reimported unique citations, to +\code{\link[=dedup_citations_add_manual]{dedup_citations_add_manual()}}. +} +\examples{ +if (interactive()) { + candidates <- reimport_dedup_candidates("path/to/candidates.csv") + # mark confirmed duplicates, then merge into the reimported unique set + candidates$result <- ifelse(candidates$result == "match", "match", "no_match") + final <- dedup_citations_add_manual(reimport_csv("unique.csv"), candidates) +} +} +\seealso{ +\code{\link[=export_dedup_candidates]{export_dedup_candidates()}}, \code{\link[=dedup_citations_add_manual]{dedup_citations_add_manual()}} +} diff --git a/tests/testthat/test-add-sources.R b/tests/testthat/test-add-sources.R new file mode 100644 index 00000000..495416f1 --- /dev/null +++ b/tests/testthat/test-add-sources.R @@ -0,0 +1,80 @@ +# Tests for dedup_citations_add_sources(): adding new citations to a +# previously deduplicated set (Goal 2 — incremental deduplication). + +# Build a tiny raw set with controllable duplicates across sources. 
+make_raw <- function() { + base <- data.frame( + title = c("Alpha study of fish", "Beta review of coral", "Gamma trial of kelp", + "Delta survey of crabs", "Epsilon report on eels"), + author = c("Smith J", "Jones A", "Lee K", "Brown R", "Davis M"), + year = c("2010", "2011", "2012", "2013", "2014"), + journal = c("Mar Biol", "Coral J", "Kelp Sci", "Crab Rev", "Eel Rep"), + abstract = c("aaa fish abundance", "bbb coral cover", "ccc kelp density", + "ddd crab counts", "eee eel migration"), + doi = c("10.1/a", "10.1/b", "10.1/c", "10.1/d", "10.1/e"), + pages = c("1-10", "11-20", "21-30", "31-40", "41-50"), + volume = c("1", "2", "3", "4", "5"), + number = c("1", "1", "1", "1", "1"), + isbn = c("111", "222", "333", "444", "555"), + stringsAsFactors = FALSE + ) + base +} + +test_that("add_sources errors without a duplicate_id column", { + raw <- make_raw() + expect_error( + dedup_citations_add_sources(raw, raw), + "duplicate_id" + ) +}) + +test_that("incremental dedup matches from-scratch and preserves provenance", { + # Source A = records 1-4, Source B = records 3-5 (records 3,4 overlap A/B). 
+ a <- make_raw()[1:4, ]; a$cite_source <- "A"; a$cite_label <- ""; a$cite_string <- "" + b <- make_raw()[3:5, ]; b$cite_source <- "B"; b$cite_label <- ""; b$cite_string <- "" + + existing <- suppressWarnings(suppressMessages(dedup_citations(a))) + combined <- suppressWarnings(suppressMessages(dedup_citations_add_sources(existing, b))) + scratch <- suppressWarnings(suppressMessages( + dedup_citations(dplyr::bind_rows(a, b)) + )) + + # Same number of unique records as deduping everything at once + expect_equal(nrow(combined), nrow(scratch)) + + # All five distinct titles are represented + expect_setequal(unique(combined$title), unique(make_raw()$title)) + + # Both sources present after the merge + srcs <- unique(trimws(unlist(strsplit(paste(combined$cite_source, collapse = ", "), ",\\s*")))) + expect_true(all(c("A", "B") %in% srcs)) + + # Output is reimport-shaped + expect_true(all(c("duplicate_id", "record_ids", "cite_source") %in% names(combined))) +}) + +test_that("works on a reimported (all-character) existing set and in manual mode", { + a <- make_raw()[1:4, ]; a$cite_source <- "A"; a$cite_label <- ""; a$cite_string <- "" + b <- make_raw()[3:5, ]; b$cite_source <- "B"; b$cite_label <- ""; b$cite_string <- "" + + existing <- suppressWarnings(suppressMessages(dedup_citations(a))) + f <- tempfile(fileext = ".csv") + export_csv(existing, f) + existing_re <- reimport_csv(f) + + res <- suppressWarnings(suppressMessages( + dedup_citations_add_sources(existing_re, b, manual = TRUE) + )) + expect_type(res, "list") + expect_true(all(c("unique", "manual_dedup", "auto_pairs") %in% names(res))) + expect_true(all(c("duplicate_id", "record_ids") %in% names(res$unique))) + + # Candidate pairs reference duplicate_ids present in the unique output, so the + # set is ready for dedup_citations_add_manual() / the Shiny manual review tab. 
+ if (nrow(res$manual_dedup) > 0) { + ids <- as.character(res$unique$duplicate_id) + expect_true(all(as.character(res$manual_dedup$duplicate_id.x) %in% ids)) + expect_true(all(as.character(res$manual_dedup$duplicate_id.y) %in% ids)) + } +}) diff --git a/tests/testthat/test-reimport.R b/tests/testthat/test-reimport.R new file mode 100644 index 00000000..742c6ec6 --- /dev/null +++ b/tests/testthat/test-reimport.R @@ -0,0 +1,74 @@ +# Tests for the deferred manual-deduplication workflow: +# auto-dedup now -> export -> reimport -> complete manual review later. + +example_citations <- function() { + readRDS(system.file("extdata", "examplecitations.rds", package = "CiteSource")) +} + +test_that("reimport_csv round-trips all columns as character", { + auto <- suppressWarnings(suppressMessages(dedup_citations(example_citations()))) + f <- tempfile(fileext = ".csv") + export_csv(auto, f) + re <- reimport_csv(f) + + expect_true(all(c("cite_source", "cite_label", "cite_string", + "duplicate_id", "record_ids") %in% names(re))) + # duplicate_id must stay character (read.csv would otherwise infer integer), + # which is what dedup_citations_add_manual() / re-dedup require. 
+ expect_type(re$duplicate_id, "character") + expect_type(re$year, "character") +}) + +test_that("export_csv writes the manual_dedup_complete flag on full exports", { + auto <- suppressWarnings(suppressMessages(dedup_citations(example_citations()))) + + f1 <- tempfile(fileext = ".csv") + export_csv(auto, f1) # default FALSE + expect_equal(unique(reimport_csv(f1)$manual_dedup_complete), "FALSE") + + f2 <- tempfile(fileext = ".csv") + export_csv(auto, f2, manual_dedup_complete = TRUE) + expect_equal(unique(reimport_csv(f2)$manual_dedup_complete), "TRUE") + + # Not written for non-reimportable (standard) exports + f3 <- tempfile(fileext = ".csv") + suppressWarnings(export_csv(auto, f3, fields = "standard")) + expect_false("manual_dedup_complete" %in% names(utils::read.csv(f3))) +}) + +test_that("candidate pairs round-trip and seed a result column", { + pairs <- data.frame( + duplicate_id.x = c("1001", "1005"), + duplicate_id.y = c("1002", "1009"), + title1 = c("A", "B"), title2 = c("A", "B"), + stringsAsFactors = FALSE + ) + f <- tempfile(fileext = ".csv") + export_dedup_candidates(pairs, f) + back <- reimport_dedup_candidates(f) + + expect_true(all(c("duplicate_id.x", "duplicate_id.y", "result") %in% names(back))) + expect_type(back$duplicate_id.x, "character") + expect_equal(nrow(back), 2) + + expect_error(export_dedup_candidates(data.frame(a = 1), f), "duplicate_id") + bad <- tempfile(fileext = ".csv") + utils::write.csv(data.frame(a = 1), bad, row.names = FALSE) + expect_error(reimport_dedup_candidates(bad), "duplicate_id") +}) + +test_that("manual pairs can be merged into a reimported (auto-deduped) set", { + auto <- suppressWarnings(suppressMessages(dedup_citations(example_citations()))) + f <- tempfile(fileext = ".csv") + export_csv(auto, f) + re <- reimport_csv(f) + + # Force two real records to be a confirmed manual duplicate + ids <- as.character(re$duplicate_id) + pair <- data.frame(duplicate_id.x = ids[1], duplicate_id.y = ids[2], + result = "match", 
stringsAsFactors = FALSE) + + final <- suppressWarnings(suppressMessages(dedup_citations_add_manual(re, pair))) + expect_equal(nrow(final), nrow(re) - 1) + expect_true(all(c("cite_source", "record_ids", "duplicate_id") %in% names(final))) +})