From 11e2ce9303a6fededa5959972d2640b94e1ec584 Mon Sep 17 00:00:00 2001 From: aaronmoondewey Date: Mon, 6 Apr 2026 15:53:07 -0700 Subject: [PATCH 1/4] Update README.md --- README.md | 322 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 266 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index 757df6e..1ef2c6b 100644 --- a/README.md +++ b/README.md @@ -1,130 +1,292 @@ -# deweyr DeweyHex2 +# Working with R +This page provides a collection of practical, ready-to-use R examples to help you load, explore, filter, and analyze data on the Dewey platform. Whether you're new to R or looking for quick reference patterns, you'll find clear snippets, common workflows, and best-practice tips to support fast, reproducible research. -> An R interface to [deweypy](https://github.com/dewey-data/deweypy) for downloading files from the Dewey file management system. - +## Overview +Dewey makes it easy to work with large datasets in R using familiar, flexible tools like tidyverse, arrow, and DuckDB. These examples show how to load data efficiently, inspect schemas, filter rows before download, and quickly summarize large files without requiring heavy local setup. +With just a few lines of code, you can connect to your files, run performant queries, and start analyzing immediately — no complex configuration required. +## Downloading Data -## Overview +### Using deweyr (Recommended) -`deweyr` provides a simple way to download files from Dewey folders directly from R, without requiring manual Python environment setup. The package offers two download methods: +The [`deweyr`](https://github.com/Dewey-Data/deweyr) package provides a simple way to download files from Dewey projects directly from R. 
The package offers two download methods: -- **`dewey_download()`** - Recommended method using [UV](https://docs.astral.sh/uv/) (automatic Python environment management) -- **`dewey_download_py()`** - Traditional method using an existing Python installation +- **`download_dewey()`** — Recommended method using [UV](https://docs.astral.sh/uv/) (automatic Python environment management, no Python installation required) +- **`download_dewey_py()`** — Traditional method using an existing Python installation with [deweypy](https://github.com/dewey-data/deweypy) -## Installation +#### Installation Install the development version from GitHub: + ```r # install.packages("devtools") -devtools::install_github("Coxabc/deweyr") +devtools::install_github("Dewey-Data/deweyr") ``` -## Quick Start +#### Basic Download -### Method 1: Using UV (Recommended) - -The easiest way to get started - no Python installation required: ```r library(deweyr) +# Download to default location (./dewey-downloads) download_dewey( api_key = "your-api-key", - folder_id = "your-folder-id" + project_id = "abc123" ) ``` > **First-time setup:** If UV isn't installed, `deweyr` will install it automatically. You may see a message recommending you restart your terminal for optimal performance in future runs. 
+#### Custom Download Location - -### Method 2: Using Existing Python - -If you already have Python and deweypy installed: -```r -library(deweyr) - -download_dewey_py( - api_key = "your-api-key", - folder_id = "your-folder-id" -) -``` - -## Usage Examples - -### Basic Download -```r -# Download to default location (./dewey-downloads) -download_dewey( - api_key = "your-api-key", - folder_id = "abc123" -) -``` - -### Custom Download Location ```r -# Specify where to save files download_dewey( api_key = "your-api-key", - folder_id = "abc123", + project_id = "abc123", download_path = "C:/Users/YourName/Documents/data" ) ``` -### Download from URL +#### Download from URL You can use either a folder ID or the full Dewey URL: + ```r download_dewey( api_key = "your-api-key", - folder_id = "https://api.deweydata.io/api/v1/external/data/abc123" + project_id = "https://api.deweydata.io/api/v1/external/data/abc123" ) ``` -## Advanced Options -### Multi-threaded Downloads +#### Multi-threaded Downloads Adjust the number of workers for faster downloads (default is 8): + ```r download_dewey( api_key = "your-api-key", - folder_id = "abc123", + project_id = "abc123", num_workers = 16 # Use 16 parallel workers ) ``` -### Date-Partitioned Datasets +#### Date-Partitioned Datasets For datasets partitioned by date, you can filter which partitions to download: + ```r # Download only data from 2024 onwards download_dewey( api_key = "your-api-key", - folder_id = "abc123", + project_id = "abc123", partition_key_after = "2024-01-01" ) # Download only data up to a certain date download_dewey( api_key = "your-api-key", - folder_id = "abc123", + project_id = "abc123", partition_key_before = "2023-12-31" ) # Download a specific date range download_dewey( api_key = "your-api-key", - folder_id = "abc123", + project_id = "abc123", partition_key_after = "2024-01-01", partition_key_before = "2024-03-31" ) ``` -## Duck DB Options +### Using the Dewey Client + +Alternatively, you can use the 
[Quickstart: Dewey Client](https://deweydata.io) to download data to your local machine, and then load the downloaded files into R using the examples below. + +## Loading Data into R(Studio) + +### Handling Different Data Types + +Datasets downloaded from Dewey may come in different file formats depending on storage requirements or download preferences. This section provides guidance on how to load your data into R, whether the files are in `.parquet` or `.csv.gz` format. -### Download from Duck DB +### Parquet Files + +Many of Dewey's datasets are provided as `.parquet` files due to their efficient storage and query performance. To load `.parquet` files into R, you can use the `arrow` or `duckdb` packages, depending on whether you need to filter the data before bringing it into R. + +#### Arrow + +`arrow` provides a fast and memory-efficient way to read `.parquet` files into R. It supports loading single files or entire directories of `.parquet` files and returns the result as a tidy, in-memory dataset. `arrow` is ideal when you want to load the full dataset directly without applying filters first. + +```r +# ---------------------------------------------------------------------------------- +# Optional: Install packages +# Remove the "#" on the line below to install the arrow package (only needed once). +# ---------------------------------------------------------------------------------- +# install.packages("arrow") + +# ------------------------- +# Load required libraries +# ------------------------- +library(arrow) # For working with Parquet datasets efficiently (no full in-memory load required) + +# ------------------------------------------------------------- +# Point to the local folder that contains Dewey Parquet files +# ------------------------------------------------------------- +# This folder should contain one or more .parquet files downloaded from Dewey. 
+path <- "YOUR FILEPATH" + +# Example: +# path <- "C:/Users/user1/Documents/dewey-downloads/mydata" + +# ------------------------------------------------ +# Create an Arrow Dataset from the Parquet files +# ------------------------------------------------ +# open_dataset() creates a lazy Arrow Dataset that can be queried without immediately loading everything into memory. +lazy_data <- open_dataset(path, format = "parquet") + +# -------------------------------------------------------------- +# Materialize the full dataset into R as a data.frame / tibble +# -------------------------------------------------------------- +# collect() pulls the data from disk (or remote storage) into R memory. +# For very large Dewey datasets, consider filtering or selecting columns before calling collect(). +data <- collect(lazy_data) + +# View the first six rows of your dataset +head(data) +``` + +### CSV Files + +Some of Dewey's datasets are delivered in CSV format and are provided as compressed CSV files (`.csv.gz`) to reduce file size and improve download performance. These files can be loaded directly into R using the `readr` package, which efficiently reads and combines multiple compressed CSVs into a single dataset. + +The DuckDB examples in this guide query and filter `.parquet` files. To use them with `.csv.gz` data, convert the files to `.parquet` first; this can be done with a simple workflow within R. Note that the code box below only loads `.csv.gz` files into R; it does not include the `.csv.gz`-to-`.parquet` conversion. 
+ +```r +# ---------------------------------------------------------------------------------- +# Optional: Install packages +# Remove the "#" below to install readr (only needed once) +# ---------------------------------------------------------------------------------- +# install.packages("readr") + +# ------------------------- +# Load required libraries +# ------------------------- +library(readr) # For fast, tidy reading of CSV and CSV.GZ files + +# ----------------------------------------------------------------------- +# Point to the local folder that contains Dewey compressed CSV (.csv.gz) +# ----------------------------------------------------------------------- +path <- "YOUR FILEPATH" + +# Example: +# path <- "C:/Users/user1/Documents/dewey-downloads/mydata" + +# ---------------------------------------------------------- +# Load all .csv.gz files in the folder into a single dataset +# ---------------------------------------------------------- +# read_csv() automatically decompresses .gz files. +files <- list.files(path, pattern = "\\.csv\\.gz$", full.names = TRUE) + +data <- do.call(dplyr::bind_rows, lapply(files, read_csv)) + +# -------------------------------------------------------------- +# View the first rows of your dataset to inspect the content +# -------------------------------------------------------------- +head(data) +``` + +## Filter Data + +### DuckDB + +If you want to filter Dewey datasets in R, you can use DuckDB after the files have been downloaded to your local machine. + +The best workflow is: + +1. Download the dataset locally first (using `deweyr` or the Dewey Client). +2. Then use DuckDB in R to filter, reshape, and query the data before loading it into memory. + +This gives you the full power of DuckDB, just after download rather than before. 
+ +#### Filtering .parquet Files + +```r +# ---------------------------------------------------------------------------------- +# Optional: Install packages +# Remove the "#" on the line below to install the duckdb package (only needed once). +# ---------------------------------------------------------------------------------- +# install.packages("duckdb") +# install.packages("DBI") + +# ------------------------- +# Load required libraries +# ------------------------- +library(DBI) # For database connections +library(duckdb) # For querying Parquet efficiently using SQL (filter before load) + +# ------------------------------------------------------------- +# Point to the local folder that contains Dewey Parquet files +# ------------------------------------------------------------- +# This folder should contain one or more .parquet files downloaded from Dewey. +path <- "YOUR FILEPATH" + +# Example: +# path <- "C:/Users/user1/Documents/dewey-downloads/mydata" + +# ----------------------------------------------- +# Create a DuckDB connection (in-memory database) +# ----------------------------------------------- +con <- dbConnect(duckdb(), dbdir = ":memory:") + +#--------------------------------------------------------------------------------- +# Preview five rows from the Parquet files +# This helps view the data and see a sample of the column names and table values +#--------------------------------------------------------------------------------- +sample_query <- paste0(" + SELECT * + FROM read_parquet('", path, "/*.parquet') + LIMIT 5 +") + +sample_preview <- dbGetQuery(con, sample_query) + +head(sample_preview) + +# ------------------------------------------------------------------------- +# Query and FILTER the Parquet files BEFORE loading them into R +# ------------------------------------------------------------------------- +# Replace the WHERE clause with your desired filters. +# DuckDB reads only the necessary row groups and columns from disk. 
+query <- paste0(" + SELECT * + FROM read_parquet('", path, "/*.parquet') + -- Example filters (remove the -- from the lines below to activate filters): + -- WHERE state = 'WA' + -- AND naics_code = '448120' +") + +# -------------------------------------------------------------- +# Materialize the filtered data into R as a data.frame / tibble +# -------------------------------------------------------------- +# dbGetQuery() runs the SQL query and returns only the filtered rows. +data <- dbGetQuery(con, query) + +# View the first six rows of your filtered dataset +head(data) + +# ------------------------------- +# Disconnect DuckDB when finished +# ------------------------------- +dbDisconnect(con, shutdown = TRUE) +``` + +### DuckDB Options via deweyr + +The `deweyr` package also provides convenience functions for working with DuckDB directly against Dewey datasets. + +#### Download via DuckDB ```r download_dewey_duck( @@ -133,11 +295,11 @@ download_dewey_duck( partition = "column-name-to-partition-by", where = NULL, select = NULL, - overwrite=FALSE + overwrite = FALSE ) ``` -### Read Using Duck DB +#### Read Using DuckDB ```r read_dewey_duck( @@ -146,7 +308,7 @@ read_dewey_duck( ) ``` -### Get Dewey URL +#### Get Dewey URLs ```r get_dewey_urls_duck( @@ -156,7 +318,7 @@ get_dewey_urls_duck( ) ``` -### Preview with Duck DB +#### Preview with DuckDB ```r preview_dewey_duck( @@ -167,6 +329,54 @@ preview_dewey_duck( ) ``` +## Data Exploration & Visualization + +The following section provides a set of quick, practical Exploratory Data Analysis (EDA) tools you can run immediately after loading a Dewey dataset into R. These commands help you validate the structure of the dataset, check for missing values, understand column types, and identify potential issues before running deeper analysis. You'll generate summary statistics, inspect unique values, measure correlations between numeric fields, and visualize distributions across variables. 
This workflow is designed to give you a fast, high-level understanding of your dataset's shape, quality, and behavior so you can confidently move into more advanced filtering, modeling, or visualization steps. + +```r +# ------------------------- +# Load required libraries +# ------------------------- +library(dplyr) +library(ggplot2) +library(reshape2) +library(tidyr) + +# ------------------------------------------------ +# Check dataset dimensions (rows x columns) +# ------------------------------------------------ +dim(data) + +# ------------------------------------------------ +# View structure, column types, and sample values +# ------------------------------------------------ +str(data) + +# ------------------------------------------------ +# Get summary statistics for each column +# ------------------------------------------------ +summary(data) + +# ------------------------------------------------ +# Count missing values in each column +# ------------------------------------------------ +colSums(is.na(data)) + +# ------------------------------------------------ +# Count unique values per column +# ------------------------------------------------ +sapply(data, function(x) length(unique(x))) + +# ------------------------------------------------ +# Select numeric columns for correlation analysis +# ------------------------------------------------ +num <- data %>% select(where(is.numeric)) + +# ------------------------------------------------ +# Compute correlation matrix +# ------------------------------------------------ +corr <- cor(num, use = "pairwise.complete.obs") +``` --- From 08fee46e5409e61a0967f0c59053cf33c35c9cc3 Mon Sep 17 00:00:00 2001 From: aaronmoondewey Date: Mon, 27 Apr 2026 09:44:49 -0700 Subject: [PATCH 2/4] Update README.md --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 1ef2c6b..7674db5 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ library(deweyr) # 
Download to default location (./dewey-downloads) download_dewey( api_key = "your-api-key", - project_id = "abc123" + folder_id = "abc123" ) ``` @@ -45,7 +45,7 @@ download_dewey( ```r download_dewey( api_key = "your-api-key", - project_id = "abc123", + folder_id = "abc123", download_path = "C:/Users/YourName/Documents/data" ) ``` @@ -57,7 +57,7 @@ You can use either a folder ID or the full Dewey URL: ```r download_dewey( api_key = "your-api-key", - project_id = "https://api.deweydata.io/api/v1/external/data/abc123" + folder_id = "https://api.deweydata.io/api/v1/external/data/abc123" ) ``` @@ -68,7 +68,7 @@ Adjust the number of workers for faster downloads (default is 8): ```r download_dewey( api_key = "your-api-key", - project_id = "abc123", + folder_id = "abc123", num_workers = 16 # Use 16 parallel workers ) ``` @@ -81,21 +81,21 @@ For datasets partitioned by date, you can filter which partitions to download: # Download only data from 2024 onwards download_dewey( api_key = "your-api-key", - project_id = "abc123", + folder_id = "abc123", partition_key_after = "2024-01-01" ) # Download only data up to a certain date download_dewey( api_key = "your-api-key", - project_id = "abc123", + folder_id = "abc123", partition_key_before = "2023-12-31" ) # Download a specific date range download_dewey( api_key = "your-api-key", - project_id = "abc123", + folder_id = "abc123", partition_key_after = "2024-01-01", partition_key_before = "2024-03-31" ) From c59b798531651228ef14911f33c62a4315f62bdd Mon Sep 17 00:00:00 2001 From: Aaron Moon Date: Mon, 4 May 2026 11:33:33 -0700 Subject: [PATCH 3/4] Fix preview hang on large datasets; add partition_key date filter to download_dewey_duck MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Preview path no longer paginates the full file manifest — hits page 1 of /v1/external/data//files directly via deweypy.api_request, unblocking preview_dewey_duck() on Visits-scale datasets where the manifest hang 
made it unusable. download_dewey_duck() now accepts partition_key_after and partition_key_before, threaded through to deweypy.get_dataset_files() so the manifest is server-side filtered before download — the missing piece researchers needed to pull a date-bounded slice on storage-limited setups. Hardens the R/Python boundary: structured JSON errors on stderr with non-zero exit, R surfaces them as stop() with the original message; empty-stdout treated as failure (covers system2 corner cases like uv crash before output); collision-proof __DEWEYR_NULL__ sentinel replaces "None" so user values like literal "None" round-trip safely; cols fallback now peeks the filtered URL set instead of an unfiltered preview. Adds 25 testthat tests (32 total, 0 failures) covering argv contract, partition_key validator gates, forwarding via local_mocked_bindings, and sentinel-collision regression. Co-Authored-By: Claude Opus 4.7 --- DESCRIPTION | 2 +- R/duckdb.r | 164 ++++++++++++++++-- inst/python/get_dewey_urls.py | 109 ++++++++++-- man/download_dewey_duck.Rd | 17 +- .../testthat/test-build_get_dewey_urls_args.R | 93 ++++++++++ tests/testthat/test-download_dewey_duck.R | 131 ++++++++++++++ tests/testthat/test-validate_partition_key.R | 66 +++++++ 7 files changed, 551 insertions(+), 31 deletions(-) create mode 100644 tests/testthat/test-build_get_dewey_urls_args.R create mode 100644 tests/testthat/test-download_dewey_duck.R create mode 100644 tests/testthat/test-validate_partition_key.R diff --git a/DESCRIPTION b/DESCRIPTION index cdd9337..eeace07 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,7 +25,7 @@ Suggests: dplyr, magrittr, readr, - testthat (>= 3.0.0), + testthat (>= 3.2.0), knitr, rmarkdown Config/testthat/edition: 3 diff --git a/R/duckdb.r b/R/duckdb.r index 179f0a6..cf28ab2 100644 --- a/R/duckdb.r +++ b/R/duckdb.r @@ -1,3 +1,71 @@ +#' Build the argv vector for the get_dewey_urls.py subprocess +#' +#' Pure helper extracted so the R/Python boundary contract is testable +#' 
without spawning a subprocess. Order of positional args here MUST match +#' \code{inst/python/get_dewey_urls.py}. +#' +#' @keywords internal +#' @noRd +# Peek column names for a single URL via DuckDB. Used as a fallback when the +# Python script's schema discovery fails. Stays in lockstep with the URL set +# the caller is about to download. +# +# @keywords internal +# @noRd +peek_cols_from_url <- function(url, file_extension) { + read_fn <- ifelse(file_extension == ".snappy.parquet", "read_parquet", "read_csv") + con <- DBI::dbConnect(duckdb::duckdb()) + on.exit(DBI::dbDisconnect(con), add = TRUE) + DBI::dbExecute(con, "INSTALL httpfs; LOAD httpfs;") + df <- DBI::dbGetQuery(con, glue::glue( + "SELECT * FROM {read_fn}(['{url}']) LIMIT 0" + )) + colnames(df) +} + +# Sentinel string for "no value provided". MUST match NONE_SENTINEL in +# inst/python/get_dewey_urls.py. Chosen so it cannot collide with any +# user-supplied value (api_key, partition_key, file_name). +DEWEYR_NULL_SENTINEL <- "__DEWEYR_NULL__" + +build_get_dewey_urls_args <- function(script, api_key, data_id, file_name, + preview, partition_key_after, + partition_key_before, + python_version = "3.13") { + none <- DEWEYR_NULL_SENTINEL + c( + "run", "--python", python_version, script, + api_key, + data_id, + if (is.null(file_name)) none else as.character(file_name), + tolower(as.character(preview)), + if (is.null(partition_key_after)) none else as.character(partition_key_after), + if (is.null(partition_key_before)) none else as.character(partition_key_before) + ) +} + +#' Validate a partition_key argument +#' +#' Cheap sanity-check on user-supplied partition_key values before they go +#' across the R/Python boundary. Catches the common mistakes (empty string, +#' multi-element, embedded newline) while leaving date format up to the API. 
+#' +#' @keywords internal +#' @noRd +validate_partition_key <- function(value, arg_name) { + if (is.null(value)) return(invisible(NULL)) + if (!is.character(value) || length(value) != 1 || is.na(value)) { + stop(arg_name, " must be a single character string or NULL.") + } + if (!nzchar(value)) { + stop(arg_name, " must not be an empty string. Pass NULL to skip.") + } + if (grepl("[\r\n]", value)) { + stop(arg_name, " must not contain newline characters.") + } + invisible(NULL) +} + #' Get Dewey dataset file metadata #' #' Calls the Dewey API via a Python script to retrieve download URLs and @@ -10,6 +78,10 @@ #' @param preview If \code{TRUE}, returns only the first file URL instead of #' paginating the full dataset manifest. Used internally by \code{preview_dewey()}. #' Defaults to \code{FALSE}. +#' @param partition_key_after Optional partition key lower bound (inclusive). +#' Forwarded to deweypy. Ignored when \code{preview = TRUE}. +#' @param partition_key_before Optional partition key upper bound (inclusive). +#' Forwarded to deweypy. Ignored when \code{preview = TRUE}. 
#' #' @return A list with the following fields: #' \describe{ @@ -23,23 +95,61 @@ #' #' @keywords internal #' @noRd -get_dewey_urls <- function(api_key, data_id, file_name = NULL, preview = FALSE) { +get_dewey_urls <- function(api_key, data_id, file_name = NULL, preview = FALSE, + partition_key_after = NULL, partition_key_before = NULL) { if (!check_uv()) { install_uv() message("Restarting the terminal will increase speed of future runs") } data_id <- parse_url(data_id) script <- system.file("python/get_dewey_urls.py", package = "deweyr") - args <- c( - "run", "--python", "3.13", script, - api_key, - data_id, - ifelse(is.null(file_name), "None", file_name), - tolower(as.character(preview)) + args <- build_get_dewey_urls_args( + script = script, + api_key = api_key, + data_id = data_id, + file_name = file_name, + preview = preview, + partition_key_after = partition_key_after, + partition_key_before = partition_key_before ) - result_raw <- system2("uv", args = args, stdout = TRUE, stderr = "stderr.txt") - cat(readLines("stderr.txt"), sep = "\n") - jsonlite::fromJSON(paste(result_raw, collapse = "")) + stderr_path <- tempfile("deweyr_stderr_", fileext = ".txt") + on.exit(unlink(stderr_path), add = TRUE) + result_raw <- system2("uv", args = args, stdout = TRUE, stderr = stderr_path) + exit_status <- attr(result_raw, "status") + err_lines <- if (file.exists(stderr_path)) readLines(stderr_path, warn = FALSE) else character() + stdout_str <- paste(result_raw, collapse = "") + + # Treat empty stdout as failure — `system2(..., stdout = TRUE)` does not + # always set the `status` attribute when the command crashes early + # (e.g. uv not found on PATH after install_uv). The Python script always + # prints non-empty JSON on success. 
+ failed <- (!is.null(exit_status) && exit_status != 0) || !nzchar(trimws(stdout_str)) + + if (failed) { + err_msg <- paste(err_lines, collapse = "\n") + parsed_err <- tryCatch(jsonlite::fromJSON(err_msg), error = function(e) NULL) + if (is.list(parsed_err) && !is.null(parsed_err$error)) { + stop("get_dewey_urls failed: ", parsed_err$error) + } + stop( + "get_dewey_urls failed", + if (!is.null(exit_status)) paste0(" (exit ", exit_status, ")") else "", + ". Stderr:\n", + if (nzchar(err_msg)) err_msg else "(empty)" + ) + } + if (length(err_lines) > 0) cat(err_lines, sep = "\n") + + parsed <- tryCatch( + jsonlite::fromJSON(stdout_str), + error = function(e) { + stop( + "Failed to parse response from get_dewey_urls.py. ", + "Raw output: ", paste(result_raw, collapse = "\n") + ) + } + ) + parsed } #' Preview a Dewey dataset @@ -118,6 +228,11 @@ preview_dewey_duck <- function(api_key, data_id, limit = 10) { #' @param select Optional vector of column indices, ranges, or names to download. #' Accepts mixed input e.g. \code{c(1:3, 7, "CARRIER_NAME")}. The partition #' column will always be added automatically if missing. +#' @param partition_key_after Optional character string (typically YYYY-MM-DD). +#' Pre-filters the file manifest to partitions on/after this key — drastically +#' reduces manifest size on large datasets like SafeGraph Visits. +#' @param partition_key_before Optional character string (typically YYYY-MM-DD). +#' Pre-filters the file manifest to partitions on/before this key. #' #' @return The path to the downloaded dataset folder, invisibly. Pipe into #' \code{read_dewey()} to read immediately after downloading. 
@@ -147,19 +262,40 @@ preview_dewey_duck <- function(api_key, data_id, limit = 10) { #' select = c(1:3, "TOTAL") #' ) #' +#' # Date-bounded download (huge speedup on Visits-scale datasets) +#' download_dewey_duck(api_key, data_id, +#' partition_key_after = "2024-01-01", +#' partition_key_before = "2024-02-01" +#' ) +#' #' # Download and read in one step #' df <- download_dewey_duck(api_key, data_id, partition = "MONTH_DATE_PARSED") |> #' read_dewey() #' } #' #' @export -download_dewey_duck <- function(api_key, data_id, output_dir = get_download_dir(), partition, overwrite = FALSE, file_name = NULL, where = NULL, select = NULL) { - result <- get_dewey_urls(api_key, data_id, file_name = file_name) +download_dewey_duck <- function(api_key, data_id, output_dir = get_download_dir(), + partition, overwrite = FALSE, file_name = NULL, + where = NULL, select = NULL, + partition_key_after = NULL, + partition_key_before = NULL) { + validate_partition_key(partition_key_after, "partition_key_after") + validate_partition_key(partition_key_before, "partition_key_before") + + result <- get_dewey_urls( + api_key, data_id, + file_name = file_name, + partition_key_after = partition_key_after, + partition_key_before = partition_key_before + ) cols <- result$cols # ✅ no second call needed - # fallback in case cols came back empty + # Fallback if Python's DuckDB schema-peek failed: query the FIRST URL we + # already have. This stays within the partition_key range — going through + # preview_dewey_duck() would peek at an unfiltered file and could mismatch + # the actual download set. 
if (length(cols) == 0) { - cols <- colnames(preview_dewey_duck(api_key, data_id, limit = 0)) + cols <- peek_cols_from_url(result$urls[[1]], result$file_extension) } if (missing(partition)) { diff --git a/inst/python/get_dewey_urls.py b/inst/python/get_dewey_urls.py index 70e83e9..a3d99cb 100644 --- a/inst/python/get_dewey_urls.py +++ b/inst/python/get_dewey_urls.py @@ -4,30 +4,109 @@ import sys import json import re +import traceback import duckdb from deweypy.auth import set_api_key -from deweypy.download.synchronous import get_dataset_files +from deweypy.download.synchronous import api_request, get_dataset_files -api_key = sys.argv[1] -data_id = sys.argv[2] -file_name = sys.argv[3] if sys.argv[3].lower() != "none" else None -preview = sys.argv[4].lower() == "true" +# Sentinel string for "argument not provided". Chosen to be wildly unlikely +# to collide with any real partition_key, file_name, or other user value. +# Must stay in lockstep with R/duckdb.r build_get_dewey_urls_args(). +NONE_SENTINEL = "__DEWEYR_NULL__" + + +def _arg(i): + if i >= len(sys.argv): + return None + v = sys.argv[i] + return None if v == NONE_SENTINEL else v + + +def fail(message, **extra): + """Emit a structured JSON error to stderr and exit non-zero.""" + payload = {"error": message} + payload.update(extra) + print(json.dumps(payload), file=sys.stderr) + sys.exit(1) + + +try: + api_key = sys.argv[1] + data_id = sys.argv[2] + file_name = _arg(3) + preview = sys.argv[4].lower() == "true" + partition_key_after = _arg(5) + partition_key_before = _arg(6) +except IndexError: + fail( + "Insufficient arguments. Expected: api_key data_id file_name preview " + "partition_key_after partition_key_before" + ) set_api_key(api_key) -files = get_dataset_files(data_id) -urls = files[0]["link"] if preview else [f["link"] for f in files] +try: + if preview: + # Single-page hit avoids paginating the full manifest, which hangs on + # huge datasets like SafeGraph Visits. 
+ params = {"page": 1} + if partition_key_after: + params["partition_key_after"] = partition_key_after + if partition_key_before: + params["partition_key_before"] = partition_key_before + resp = api_request( + "GET", + f"/v1/external/data/{data_id}/files", + params=params, + ).json() + files = resp.get("download_links", []) + else: + files = get_dataset_files( + data_id, + partition_key_after=partition_key_after, + partition_key_before=partition_key_before, + ) +except Exception as e: + fail( + f"Dewey API request failed: {type(e).__name__}: {e}", + traceback=traceback.format_exc(), + ) + +if not files: + fail( + "No files matched the given partition_key range", + partition_key_after=partition_key_after, + partition_key_before=partition_key_before, + preview=preview, + ) + +try: + first = files[0] + if preview: + urls = first["link"] + else: + urls = [f["link"] for f in files] + file_extension = first["file_extension"] + raw_file_name = first["file_name"] +except KeyError as e: + fail(f"Malformed file entry from Dewey API (missing key {e!s})") if not file_name: - file_name = files[0]["file_name"] - parent_folder = re.sub(r"[-_]\d.*$", "", file_name) + parent_folder = re.sub(r"[-_]\d.*$", "", raw_file_name) parent_folder = re.sub(r"-data$", "", parent_folder) + "-duckdb" else: parent_folder = file_name -file_extension = files[0]["file_extension"] +# In preview mode, files only represents page 1 — total bytes would be +# misleading. Report None so callers don't treat it as a true total. 
+if preview: + file_size_bytes = None +else: + try: + file_size_bytes = sum(f.get("file_size_bytes", 0) for f in files) + except Exception: + file_size_bytes = None -# ✅ Get column names in the same subprocess cols = [] if not preview: try: @@ -41,7 +120,7 @@ .columns.tolist() ) con.close() - except Exception as e: + except Exception: cols = [] # R will fall back gracefully print( @@ -50,9 +129,9 @@ "urls": urls, "parent_folder": parent_folder, "file_extension": file_extension, - "partition_key": files[0]["partition_key"], - "file_size_bytes": sum(f["file_size_bytes"] for f in files), - "cols": cols, # ✅ new field + "partition_key": first.get("partition_key"), + "file_size_bytes": file_size_bytes, + "cols": cols, } ) ) diff --git a/man/download_dewey_duck.Rd b/man/download_dewey_duck.Rd index 98562f7..9713c0e 100644 --- a/man/download_dewey_duck.Rd +++ b/man/download_dewey_duck.Rd @@ -12,7 +12,9 @@ download_dewey_duck( overwrite = FALSE, file_name = NULL, where = NULL, - select = NULL + select = NULL, + partition_key_after = NULL, + partition_key_before = NULL ) } \arguments{ @@ -37,6 +39,13 @@ Example: \code{where = "CARRIER_GROUP = 'Major'"}} \item{select}{Optional vector of column indices, ranges, or names to download. Accepts mixed input e.g. \code{c(1:3, 7, "CARRIER_NAME")}. The partition column will always be added automatically if missing.} + +\item{partition_key_after}{Optional character string (typically YYYY-MM-DD). +Pre-filters the file manifest to partitions on/after this key — drastically +reduces manifest size on large datasets like SafeGraph Visits.} + +\item{partition_key_before}{Optional character string (typically YYYY-MM-DD). +Pre-filters the file manifest to partitions on/before this key.} } \value{ The path to the downloaded dataset folder, invisibly. 
Pipe into @@ -69,6 +78,12 @@ download_dewey_duck(api_key, data_id, base_dir, select = c(1:3, "TOTAL") ) +# Date-bounded download (huge speedup on Visits-scale datasets) +download_dewey_duck(api_key, data_id, + partition_key_after = "2024-01-01", + partition_key_before = "2024-02-01" +) + # Download and read in one step df <- download_dewey_duck(api_key, data_id, partition = "MONTH_DATE_PARSED") |> read_dewey() diff --git a/tests/testthat/test-build_get_dewey_urls_args.R b/tests/testthat/test-build_get_dewey_urls_args.R new file mode 100644 index 0000000..1527bb5 --- /dev/null +++ b/tests/testthat/test-build_get_dewey_urls_args.R @@ -0,0 +1,93 @@ +# Tests for the R/Python argv contract used by get_dewey_urls.py. +# Order of positional args MUST stay in lockstep with that script. + +NONE <- deweyr:::DEWEYR_NULL_SENTINEL + +test_that("argv defaults: NULL file_name and partition keys collapse to the sentinel", { + args <- deweyr:::build_get_dewey_urls_args( + script = "/tmp/get_dewey_urls.py", + api_key = "key", + data_id = "prj_x__fldr_y", + file_name = NULL, + preview = FALSE, + partition_key_after = NULL, + partition_key_before = NULL + ) + expect_equal(args[1:4], c("run", "--python", "3.13", "/tmp/get_dewey_urls.py")) + expect_equal(args[5], "key") + expect_equal(args[6], "prj_x__fldr_y") + expect_equal(args[7], NONE) # file_name + expect_equal(args[8], "false") # preview lowercased + expect_equal(args[9], NONE) # partition_key_after + expect_equal(args[10], NONE) # partition_key_before + expect_length(args, 10) +}) + +test_that("argv: preview = TRUE serializes as lowercase 'true'", { + args <- deweyr:::build_get_dewey_urls_args( + script = "s.py", api_key = "k", data_id = "d", + file_name = NULL, preview = TRUE, + partition_key_after = NULL, partition_key_before = NULL + ) + expect_equal(args[8], "true") +}) + +test_that("argv: partition keys are forwarded verbatim", { + args <- deweyr:::build_get_dewey_urls_args( + script = "s.py", api_key = "k", data_id = "d", + 
file_name = NULL, preview = FALSE, + partition_key_after = "2024-01-01", + partition_key_before = "2024-02-01" + ) + expect_equal(args[9], "2024-01-01") + expect_equal(args[10], "2024-02-01") +}) + +test_that("argv: file_name is forwarded when provided", { + args <- deweyr:::build_get_dewey_urls_args( + script = "s.py", api_key = "k", data_id = "d", + file_name = "custom_folder", preview = FALSE, + partition_key_after = NULL, partition_key_before = NULL + ) + expect_equal(args[7], "custom_folder") +}) + +test_that("argv: python_version is overridable", { + args <- deweyr:::build_get_dewey_urls_args( + script = "s.py", api_key = "k", data_id = "d", + file_name = NULL, preview = FALSE, + partition_key_after = NULL, partition_key_before = NULL, + python_version = "3.12" + ) + expect_equal(args[3], "3.12") +}) + +test_that("argv: numeric partition_key is coerced to character", { + # Defensive — users may pass an integer year accidentally. + args <- deweyr:::build_get_dewey_urls_args( + script = "s.py", api_key = "k", data_id = "d", + file_name = NULL, preview = FALSE, + partition_key_after = 2024L, partition_key_before = NULL + ) + expect_type(args[9], "character") + expect_equal(args[9], "2024") +}) + +test_that("argv: sentinel is unlikely to collide with user input", { + # The sentinel must contain characters that would never appear in + # api_keys, partition keys (dates), or filenames. + expect_match(NONE, "DEWEYR") + expect_false(grepl("^[A-Za-z0-9-]+$", NONE)) # contains underscores +}) + +test_that("argv: a user partition_key that looks like the literal 'None' is forwarded as data, not sentinel", { + # The old sentinel was "None" — a real risk of collision. Confirm new + # sentinel doesn't collide with the obvious string a user might pass. 
+ args <- deweyr:::build_get_dewey_urls_args( + script = "s.py", api_key = "k", data_id = "d", + file_name = NULL, preview = FALSE, + partition_key_after = "None", partition_key_before = NULL + ) + expect_equal(args[9], "None") # forwarded as a literal string + expect_false(args[9] == NONE) # NOT the sentinel +}) diff --git a/tests/testthat/test-download_dewey_duck.R b/tests/testthat/test-download_dewey_duck.R new file mode 100644 index 0000000..2977c53 --- /dev/null +++ b/tests/testthat/test-download_dewey_duck.R @@ -0,0 +1,131 @@ +# Tests for download_dewey_duck argument validation and forwarding. +# +# Strategy: download_dewey_duck calls validate_partition_key() FIRST, so we +# can hit the validator gates without any mocking. For "did the partition +# key reach get_dewey_urls?" we use local_mocked_bindings (with explicit +# .package = "deweyr" — without it the mock can silently no-op when run +# outside the package test harness, producing false-positive passes) to +# capture the call and short-circuit before DuckDB tries to download. 
+ +# ---- validator gates (no mocking needed) ------------------------------------- + +test_that("download_dewey_duck rejects empty partition_key_after", { + expect_error( + download_dewey_duck("k", "prj_x__fldr_y", partition_key_after = ""), + "partition_key_after" + ) +}) + +test_that("download_dewey_duck rejects multi-element partition_key_before", { + expect_error( + download_dewey_duck( + "k", "prj_x__fldr_y", + partition_key_before = c("2024-01-01", "2024-02-01") + ), + "partition_key_before" + ) +}) + +test_that("download_dewey_duck rejects NA partition_key_after", { + expect_error( + download_dewey_duck("k", "prj_x__fldr_y", partition_key_after = NA_character_), + "partition_key_after" + ) +}) + +test_that("download_dewey_duck rejects newline injection in partition_key", { + expect_error( + download_dewey_duck( + "k", "prj_x__fldr_y", + partition_key_after = "2024-01-01\nmalicious" + ), + "newline" + ) +}) + +# ---- forwarding: every relevant arg reaches get_dewey_urls ------------------- + +test_that("download_dewey_duck forwards every relevant arg to get_dewey_urls", { + captured <- new.env() + fake_get_dewey_urls <- function(api_key, data_id, file_name = NULL, + preview = FALSE, + partition_key_after = NULL, + partition_key_before = NULL) { + captured$api_key <- api_key + captured$data_id <- data_id + captured$file_name <- file_name + captured$preview <- preview + captured$partition_key_after <- partition_key_after + captured$partition_key_before <- partition_key_before + stop("__captured__") + } + + testthat::local_mocked_bindings( + get_dewey_urls = fake_get_dewey_urls, + .package = "deweyr" + ) + + expect_error( + download_dewey_duck( + api_key = "secret", + data_id = "prj_x__fldr_y", + file_name = "my-folder", + partition_key_after = "2024-01-01", + partition_key_before = "2024-02-01" + ), + "__captured__" + ) + + expect_equal(captured$api_key, "secret") + expect_equal(captured$data_id, "prj_x__fldr_y") + expect_equal(captured$file_name, "my-folder") 
+ expect_equal(captured$partition_key_after, "2024-01-01") + expect_equal(captured$partition_key_before, "2024-02-01") + # download_dewey_duck never sets preview=TRUE; default should be FALSE. + expect_false(isTRUE(captured$preview)) +}) + +test_that("download_dewey_duck forwards NULL partition keys when not supplied", { + captured <- new.env() + captured$seen <- FALSE + fake_get_dewey_urls <- function(api_key, data_id, file_name = NULL, + preview = FALSE, + partition_key_after = NULL, + partition_key_before = NULL) { + captured$seen <- TRUE + captured$partition_key_after <- partition_key_after + captured$partition_key_before <- partition_key_before + stop("__captured__") + } + + testthat::local_mocked_bindings( + get_dewey_urls = fake_get_dewey_urls, + .package = "deweyr" + ) + + expect_error(download_dewey_duck("k", "prj_x__fldr_y"), "__captured__") + + expect_true(captured$seen) + expect_null(captured$partition_key_after) + expect_null(captured$partition_key_before) +}) + +# ---- regression: the validator runs BEFORE get_dewey_urls -------------------- +# If someone moves the validate_partition_key calls below get_dewey_urls, +# the mock would be hit before the validator, and we'd get __captured__ +# instead of the validator's error. Lock the order in. + +test_that("partition_key validator fires before any subprocess call", { + fake_get_dewey_urls <- function(...) 
stop("__captured__") + testthat::local_mocked_bindings( + get_dewey_urls = fake_get_dewey_urls, + .package = "deweyr" + ) + + err <- tryCatch( + download_dewey_duck("k", "prj_x__fldr_y", partition_key_after = ""), + error = function(e) conditionMessage(e) + ) + expect_match(err, "partition_key_after", fixed = TRUE) + expect_false(grepl("__captured__", err, fixed = TRUE)) +}) diff --git a/tests/testthat/test-validate_partition_key.R b/tests/testthat/test-validate_partition_key.R new file mode 100644 index 0000000..e6bd726 --- /dev/null +++ b/tests/testthat/test-validate_partition_key.R @@ -0,0 +1,66 @@ +# Tests for validate_partition_key — cheap sanity gate before R/Python boundary. + +test_that("validate_partition_key: NULL is allowed (means 'no filter')", { + expect_invisible(deweyr:::validate_partition_key(NULL, "partition_key_after")) + expect_silent(deweyr:::validate_partition_key(NULL, "partition_key_after")) +}) + +test_that("validate_partition_key: valid date string passes", { + expect_silent(deweyr:::validate_partition_key("2024-01-01", "partition_key_after")) +}) + +test_that("validate_partition_key: empty string is rejected", { + expect_error( + deweyr:::validate_partition_key("", "partition_key_after"), + "must not be an empty string" + ) +}) + +test_that("validate_partition_key: NA character is rejected", { + expect_error( + deweyr:::validate_partition_key(NA_character_, "partition_key_after"), + "must be a single character string" + ) +}) + +test_that("validate_partition_key: numeric is rejected", { + expect_error( + deweyr:::validate_partition_key(20240101, "partition_key_after"), + "must be a single character string" + ) +}) + +test_that("validate_partition_key: multi-element character is rejected", { + expect_error( + deweyr:::validate_partition_key(c("2024-01-01", "2024-02-01"), "partition_key_after"), + "must be a single character string" + ) +}) + +test_that("validate_partition_key: zero-length character is rejected", { + expect_error( + 
deweyr:::validate_partition_key(character(0), "partition_key_after"), + "must be a single character string" + ) +}) + +test_that("validate_partition_key: embedded newline is rejected", { + expect_error( + deweyr:::validate_partition_key("2024-01-01\nrm -rf /", "partition_key_after"), + "must not contain newline" + ) +}) + +test_that("validate_partition_key: embedded carriage return is rejected", { + expect_error( + deweyr:::validate_partition_key("2024-01-01\r", "partition_key_after"), + "must not contain newline" + ) +}) + +test_that("validate_partition_key: arg_name is included in the error message", { + expect_error( + deweyr:::validate_partition_key("", "partition_key_before"), + "partition_key_before" + ) +}) From 68dc389b85c465979c85490bbe561279e9bfeb21 Mon Sep 17 00:00:00 2001 From: Aaron Moon Date: Mon, 4 May 2026 13:08:19 -0700 Subject: [PATCH 4/4] Validate api_key and data_id before shelling out to uv system2() silently drops empty-string elements from args, which shifts every positional argv on the Python side. With Sys.getenv("DEWEY_API_KEY") returning "" (e.g. an R session that didn't load .Renviron), this caused data_id to be read as the sentinel string and produced a confusing 401 on data/__DEWEYR_NULL__/files instead of a clear "set your API key" message. Reject empty / NA / non-character api_key and data_id at the gate, with an actionable hint pointing to readRenviron(). Co-Authored-By: Claude Opus 4.7 --- R/duckdb.r | 17 +++++++++++++++++ tests/testthat/test-download_dewey_duck.R | 22 ++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/R/duckdb.r b/R/duckdb.r index cf28ab2..e00b101 100644 --- a/R/duckdb.r +++ b/R/duckdb.r @@ -44,6 +44,21 @@ build_get_dewey_urls_args <- function(script, api_key, data_id, file_name, ) } +# Reject an empty / NA / missing api_key BEFORE shelling out to uv. 
+# system2() drops empty-string args, which silently shifts every positional +# argv on the Python side and produces a confusing 401-on-the-wrong-URL. +validate_required_string <- function(value, arg_name) { + if (is.null(value) || !is.character(value) || length(value) != 1 || + is.na(value) || !nzchar(value)) { + stop(arg_name, " must be a non-empty character string. ", + "Got: ", deparse(value), + if (identical(arg_name, "api_key")) + " (hint: Sys.getenv(\"DEWEY_API_KEY\") may be empty if .Renviron didn't load — try readRenviron(\"~/.Renviron\"))" + else "") + } + invisible(NULL) +} + #' Validate a partition_key argument #' #' Cheap sanity-check on user-supplied partition_key values before they go @@ -97,6 +112,8 @@ validate_partition_key <- function(value, arg_name) { #' @noRd get_dewey_urls <- function(api_key, data_id, file_name = NULL, preview = FALSE, partition_key_after = NULL, partition_key_before = NULL) { + validate_required_string(api_key, "api_key") + validate_required_string(data_id, "data_id") if (!check_uv()) { install_uv() message("Restarting the terminal will increase speed of future runs") diff --git a/tests/testthat/test-download_dewey_duck.R b/tests/testthat/test-download_dewey_duck.R index 2977c53..ccc1d19 100644 --- a/tests/testthat/test-download_dewey_duck.R +++ b/tests/testthat/test-download_dewey_duck.R @@ -9,6 +9,28 @@ # ---- validator gates (no mocking needed) ------------------------------------- +# ---- regression: empty api_key / data_id error before shell out -------------- +# Empty strings are silently dropped by system2(), shifting Python's positional +# argv. Without this gate the user gets a confusing 401 on a malformed URL +# (data/__DEWEYR_NULL__/files) instead of a clear "set your API key" message. 
+ +test_that("preview_dewey_duck rejects empty api_key with a useful hint", { + err <- tryCatch( + preview_dewey_duck("", "prj_x__fldr_y", limit = 0), + error = function(e) conditionMessage(e) + ) + expect_match(err, "api_key", fixed = TRUE) + expect_match(err, "non-empty", fixed = TRUE) + expect_match(err, "DEWEY_API_KEY", fixed = TRUE) # the hint +}) + +test_that("download_dewey_duck rejects NA data_id", { + expect_error( + download_dewey_duck("k", NA_character_), + "data_id" + ) +}) + test_that("download_dewey_duck rejects empty partition_key_after", { expect_error( download_dewey_duck("k", "prj_x__fldr_y", partition_key_after = ""),