From a43e44c8233724352b9cb9afc0a7b7ba0e51dd1e Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Mon, 30 Mar 2026 16:31:33 +0300 Subject: [PATCH 01/16] initial commit --- MetalogDB/R/build_tse.R | 0 MetalogDB/R/fetch_with_cache.R | 0 MetalogDB/R/filter_samples.R | 0 MetalogDB/R/inject_provenance.R | 0 MetalogDB/R/read_metalog_data.R | 0 MetalogDB/R/resolve_metalog_url.R | 0 MetalogDB/R/validate_inputs.R | 0 MetalogDB/README.md | 0 MetalogDB/fetchMetalogTSE.R | 45 +++++++++++++++++ MetalogDB/spec.md | 80 +++++++++++++++++++++++++++++++ 10 files changed, 125 insertions(+) create mode 100644 MetalogDB/R/build_tse.R create mode 100644 MetalogDB/R/fetch_with_cache.R create mode 100644 MetalogDB/R/filter_samples.R create mode 100644 MetalogDB/R/inject_provenance.R create mode 100644 MetalogDB/R/read_metalog_data.R create mode 100644 MetalogDB/R/resolve_metalog_url.R create mode 100644 MetalogDB/R/validate_inputs.R create mode 100644 MetalogDB/README.md create mode 100644 MetalogDB/fetchMetalogTSE.R create mode 100644 MetalogDB/spec.md diff --git a/MetalogDB/R/build_tse.R b/MetalogDB/R/build_tse.R new file mode 100644 index 0000000..e69de29 diff --git a/MetalogDB/R/fetch_with_cache.R b/MetalogDB/R/fetch_with_cache.R new file mode 100644 index 0000000..e69de29 diff --git a/MetalogDB/R/filter_samples.R b/MetalogDB/R/filter_samples.R new file mode 100644 index 0000000..e69de29 diff --git a/MetalogDB/R/inject_provenance.R b/MetalogDB/R/inject_provenance.R new file mode 100644 index 0000000..e69de29 diff --git a/MetalogDB/R/read_metalog_data.R b/MetalogDB/R/read_metalog_data.R new file mode 100644 index 0000000..e69de29 diff --git a/MetalogDB/R/resolve_metalog_url.R b/MetalogDB/R/resolve_metalog_url.R new file mode 100644 index 0000000..e69de29 diff --git a/MetalogDB/R/validate_inputs.R b/MetalogDB/R/validate_inputs.R new file mode 100644 index 0000000..e69de29 diff --git a/MetalogDB/README.md b/MetalogDB/README.md new file mode 100644 index 0000000..e69de29 diff --git a/MetalogDB/fetchMetalogTSE.R b/MetalogDB/fetchMetalogTSE.R new file mode 100644 index 0000000..55485d3 --- /dev/null +++ b/MetalogDB/fetchMetalogTSE.R @@ -0,0 +1,45 @@ +# This function downloads metaphlan profiles and associated metadata +# from the Metalog database. The option exists to filter to a subset +# if a samplelist is provided, downloaded from the Metalog webUI. +# +# Author: Rasmus Hindström +# Date: --- + +# Libraries +library(mia) +library(data.table) +library(httr2) + +# Source helpers +sapply( + list.files(path = "R/", pattern = "\\.R$", full.names = TRUE), + source +) + +# ------------- +# Main function +# ------------- + +fetchMetalogTSE <- function( + collection, # One of "human", "animal", "ocean_water", "other_environ" + metadata = "core", # One of "core", "partially_harmonized", "all" + samplelist = NULL, + cache = TRUE +) { + # Validate inputs + + # Consruct download URLs, download and cache + + # Data ingest + + # Filtering + + # TSE construction + + # License injection + + # Explicit return of tse object + +} + +# Export function to namespace? diff --git a/MetalogDB/spec.md b/MetalogDB/spec.md new file mode 100644 index 0000000..5935656 --- /dev/null +++ b/MetalogDB/spec.md @@ -0,0 +1,80 @@ + +# Software Specification: Metalog to TSE Fetcher + +*Generated by Gemini3 to guide development* + +## 1. Overview +The goal of this package is to provide an interface between the open metalog database and the R/Bioconductor ecosystem. The package dynamically downloads specified microbiome collections and metadata, subsets the data if requested, and constructs a strictly validated `TreeSummarizedExperiment` (TSE) object. + +## 2. Dependencies +* **Core:** `TreeSummarizedExperiment`, `SummarizedExperiment`, `S4Vectors` +* **Data Import/Download:** `httr2` or `curl` (for robust API/download requests), `data.table` or `vroom` (for fast reading of large abundance/metadata tables) +* **Caching (Highly Recommended):** `BiocFileCache` or `rappdirs` (to prevent re-downloading massive datasets if the user runs the function multiple times). + +## 3. User-Facing API + +### Main Function Signature +```r +fetchMetalogTSE(collection, metadata_resolution = "core", samplelist = NULL, cache = TRUE) +``` + +### Arguments Definition +* **`collection`** *(character)*: The target metalog collection. + * *Allowed values:* `"human"`, `"animal"`, `"ocean_water"`, `"other_environmental"`. +* **`metadata_resolution`** *(character)*: The depth of metadata to attach to the samples. + * *Allowed values:* `"core"`, `"partially_harmonized"`, `"all_study_specific"`. +* **`samplelist`** *(character vector or NULL)*: A vector of specific sample IDs to retain. + * *Default:* `NULL` (downloads and retains all samples in the chosen collection). +* **`cache`** *(logical)*: Whether to use local caching for downloaded files to speed up subsequent requests. + +### Output +* Returns a `TreeSummarizedExperiment` object. + +## 4. Internal Architecture & Pipeline + +The execution of `fetchMetalogTSE` should follow a strict internal pipeline divided into modular, internal helper functions. + +### Step 1: Input Validation (`.validate_inputs()`) +* Check if `collection` and `metadata_resolution` match the allowed strict choices. Stop and throw an informative error if not. +* If `samplelist` is provided, ensure it is a valid character vector. + +### Step 2: URL Construction & Caching (`.resolve_metalog_url()`, `.fetch_with_cache()`) +* Map the combination of `collection` and `metadata_resolution` to the specific metalog download URLs or API endpoints. +* Check the local cache. If the file exists and is up-to-date, load from the cache. If not, initiate the download. +* *Action Required:* Print a console message notifying the user: `"Downloading/loading data from metalog (License: ODbL v1.0)..."` + +### Step 3: Data Ingestion (`.read_metalog_data()`) +* Load the raw data files into memory. Because microbiome datasets can be large, use fast parsers like `data.table::fread()` or `vroom::vroom()`. +* Data typically consists of: + 1. **Feature Table:** (Taxa/ASVs/OTUs x Samples) + 2. **Taxonomy Table:** (Taxa x Taxonomic Ranks) + 3. **Metadata Table:** (Samples x Metadata Variables) + 4. **Phylogenetic Tree:** (Optional, if metalog provides Newick files) + +### Step 4: Filtering (`.filter_samples()`) +* *Condition:* If `samplelist` is NOT `NULL`. +* Subset the Metadata Table to include only rows matching `samplelist`. +* Subset the Feature Table columns (or rows, depending on orientation) to match `samplelist`. +* *Optimization:* If the files are overwhelmingly large, attempt to filter the data *during* the read step (e.g., selecting specific columns) to save RAM, if the parser allows it. + +### Step 5: TSE Construction (`.build_tse()`) +Map the ingested, filtered R objects to the standard TSE slots: +* **`assays`**: A list containing the Feature Table (e.g., `list(counts = feature_matrix)`). Ensure it is a standard `matrix` or a sparse matrix (`dgCMatrix`) to save memory. +* **`colData`**: A `DataFrame` containing the chosen metadata table. Row names must exactly match the column names of the `assays` matrix. +* **`rowData`**: A `DataFrame` containing the taxonomy table. Row names must exactly match the row names of the `assays` matrix. +* **`rowTree`**: The phylogenetic tree object (typically `phylo` class), if applicable. + +### Step 6: License & Metadata Injection (`.inject_provenance()`) +* Append provenance data to the `metadata` slot of the TSE object to fulfill ODbL licensing attribution requirements. +```r +metadata(tse)$source <- "metalog database" +metadata(tse)$license <- "Open Database License (ODbL) v1.0" +metadata(tse)$collection <- collection +metadata(tse)$metadata_resolution <- metadata_resolution +metadata(tse)$download_date <- Sys.Date() +``` + +## 5. Error Handling & Edge Cases +* **Missing Samples:** If a user passes a `samplelist` but some of those IDs do not exist in the requested `collection`, throw a warning listing the missing IDs (or a truncated list if there are many) and proceed with the ones that were found. +* **Network Timeouts:** Wrap the download step in `tryCatch()` to gracefully handle poor internet connections or metalog server downtime. Suggest the user check their connection or try again later. +* **Empty Result:** If the `samplelist` filtering results in 0 matching samples, stop and throw an error rather than building an empty, useless TSE object. From b980bc2f559ccabc632d8aa590baaecf40e0ddad Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Tue, 31 Mar 2026 13:16:14 +0300 Subject: [PATCH 02/16] add scaffolds --- MetalogDB/R/resolve_metalog_url.R | 7 ++++++ MetalogDB/R/validate_inputs.R | 36 +++++++++++++++++++++++++++++++ MetalogDB/fetchMetalogTSE.R | 7 ++++++ 3 files changed, 50 insertions(+) diff --git a/MetalogDB/R/resolve_metalog_url.R b/MetalogDB/R/resolve_metalog_url.R index e69de29..62616a9 100644 --- a/MetalogDB/R/resolve_metalog_url.R +++ b/MetalogDB/R/resolve_metalog_url.R @@ -0,0 +1,7 @@ +# Function constructs download URL for requested metalog data from DB, +# If cache, checks if files already exist in datadir, and downloads if not. +# Otherwise downloads files to datadir. + +.resolve_metalog_url <- function(collection, metadata, cache) { + +} diff --git a/MetalogDB/R/validate_inputs.R b/MetalogDB/R/validate_inputs.R index e69de29..e370630 100644 --- a/MetalogDB/R/validate_inputs.R +++ b/MetalogDB/R/validate_inputs.R @@ -0,0 +1,36 @@ +# Function to check that inputs to fetchMetalogTSE + +.validate_inputs <- function(collection, metadata, samplelist, cache) { + allowed_collections <- c("human", "animal", "ocean_water", "other_environmental") + if (missing(collection) || !collection %in% allowed_collections) { + stop( + "Validation Error: 'collection' must be one of: ", + paste(paste0('""', allowed_collections, '""'), collapse = ", ") + ) + } + + allowed_metadata <- c("core", "partially_harmonized", "all") + if (!metadata %in% allowed_metadata) { + stop( + "Validation Error: 'metadata' must be one of: ", + paste(paste0('""', allowed_metadata, '""'), collapse = ", ") + ) + } + + if (!is.null(samplelist)) { + ext <- tolower(tools::file_ext(samplelist)) + allowed_exts <- c("csv", "tsv", "txt", "json") + if (!ext %in% allowed_exts) { + stop( + "Validation Error: samplelist file type must be one of: ", + paste(allowed_exts, collapse = ", ") + ) + } + } + + if (!is.logical(cache) || length(cache) != 1 || is.na(cache)) { + stop("Validation Error: 'cache' must be a single logical value (TRUE or FALSE)") + } + + invisible(NULL) +} diff --git a/MetalogDB/fetchMetalogTSE.R b/MetalogDB/fetchMetalogTSE.R index 55485d3..71e7d66 100644 --- a/MetalogDB/fetchMetalogTSE.R +++ b/MetalogDB/fetchMetalogTSE.R @@ -26,7 +26,14 @@ fetchMetalogTSE <- function( samplelist = NULL, cache = TRUE ) { + # Validate inputs + .validate_inputs( + collection = collection, + metadata = metadata, + samplelist = samplelist, + cache = cache + ) # Consruct download URLs, download and cache From 866804e1dee88b79b832f948c15ef0828926f40b Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Tue, 31 Mar 2026 13:35:32 +0300 Subject: [PATCH 03/16] update argument name --- MetalogDB/R/resolve_metalog_url.R | 2 +- MetalogDB/R/validate_inputs.R | 6 +++--- MetalogDB/fetchMetalogTSE.R | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/MetalogDB/R/resolve_metalog_url.R b/MetalogDB/R/resolve_metalog_url.R index 62616a9..ab3c106 100644 --- a/MetalogDB/R/resolve_metalog_url.R +++ b/MetalogDB/R/resolve_metalog_url.R @@ -2,6 +2,6 @@ # If cache, checks if files already exist in datadir, and downloads if not. # Otherwise downloads files to datadir. -.resolve_metalog_url <- function(collection, metadata, cache) { +.resolve_metalog_url <- function(collection, metadata, use_cache) { } diff --git a/MetalogDB/R/validate_inputs.R b/MetalogDB/R/validate_inputs.R index e370630..d17fe0a 100644 --- a/MetalogDB/R/validate_inputs.R +++ b/MetalogDB/R/validate_inputs.R @@ -1,6 +1,6 @@ # Function to check that inputs to fetchMetalogTSE -.validate_inputs <- function(collection, metadata, samplelist, cache) { +.validate_inputs <- function(collection, metadata, samplelist, use_cache) { allowed_collections <- c("human", "animal", "ocean_water", "other_environmental") if (missing(collection) || !collection %in% allowed_collections) { stop( @@ -28,8 +28,8 @@ } } - if (!is.logical(cache) || length(cache) != 1 || is.na(cache)) { - stop("Validation Error: 'cache' must be a single logical value (TRUE or FALSE)") + if (!is.logical(use_cache) || length(use_cache) != 1 || is.na(use_cache)) { + stop("Validation Error: 'use_cache' must be a single logical value (TRUE or FALSE)") } invisible(NULL) diff --git a/MetalogDB/fetchMetalogTSE.R b/MetalogDB/fetchMetalogTSE.R index 71e7d66..9ae5f74 100644 --- a/MetalogDB/fetchMetalogTSE.R +++ b/MetalogDB/fetchMetalogTSE.R @@ -24,7 +24,7 @@ fetchMetalogTSE <- function( collection, # One of "human", "animal", "ocean_water", "other_environ" metadata = "core", # One of "core", "partially_harmonized", "all" samplelist = NULL, - cache = TRUE + use_cache = TRUE ) { # Validate inputs @@ -32,7 +32,7 @@ fetchMetalogTSE <- function( collection = collection, metadata = metadata, samplelist = samplelist, - cache = cache + use_cache = use_cache ) # Consruct download URLs, download and cache From 626e5bee9ba2f38ebc0ac9ee5cd284b75ae0b6bf Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Tue, 31 Mar 2026 13:39:13 +0300 Subject: [PATCH 04/16] create data_cache dir --- MetalogDB/fetchMetalogTSE.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MetalogDB/fetchMetalogTSE.R b/MetalogDB/fetchMetalogTSE.R index 9ae5f74..462cba7 100644 --- a/MetalogDB/fetchMetalogTSE.R +++ b/MetalogDB/fetchMetalogTSE.R @@ -27,6 +27,8 @@ fetchMetalogTSE <- function( use_cache = TRUE ) { + dir.create(".data_cache", recursive = TRUE) + # Validate inputs .validate_inputs( collection = collection, @@ -34,7 +36,7 @@ fetchMetalogTSE <- function( samplelist = samplelist, use_cache = use_cache ) - + # Consruct download URLs, download and cache # Data ingest From d17489d1233f090b9bb5fccf2ac9aee22e4f023d Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Tue, 31 Mar 2026 14:59:59 +0300 Subject: [PATCH 05/16] Download added --- MetalogDB/R/.download_if_missing.R | 39 +++++++++++++++++++++++++++++ MetalogDB/R/resolve_metalog_url.R | 40 ++++++++++++++++++++++++++++++ MetalogDB/fetchMetalogTSE.R | 10 ++++---- 3 files changed, 84 insertions(+), 5 deletions(-) create mode 100644 MetalogDB/R/.download_if_missing.R diff --git a/MetalogDB/R/.download_if_missing.R b/MetalogDB/R/.download_if_missing.R new file mode 100644 index 0000000..e0d1866 --- /dev/null +++ b/MetalogDB/R/.download_if_missing.R @@ -0,0 +1,39 @@ +# Adapted from Metalog's example script + +.download_if_missing <- function(target_url, download_dir = ".data_cache", use_cache = TRUE) { + + base_filename <- basename(target_url) + + # Caching Logic + if (use_cache) { + # Replace "latest" with a date regex pattern (YYYY-MM-DD) + pattern <- sub("latest", "[0-9]{4}-[0-9]{2}-[0-9]{2}", base_filename) + matching_files <- list.files(download_dir, pattern = pattern, full.names = TRUE) + + if (length(matching_files) > 0) { + latest_file <- max(matching_files) + message("Loaded cached file: ", latest_file) + return(latest_file) + } + } else { + message("Skipping cache. Forcing download for: ", base_filename) + } + + # We make a simple GET request first just to see where "latest" redirects us + message("Fetching file from Metalog...") + response <- httr::GET(base_url, httr::config(followlocation = TRUE)) + + if (httr::status_code(response) != 200) { + stop("Error fetching the file! Status code: ", httr::status_code(response)) + } + + # Extract the final URL and save + url_with_date <- response$url + filename <- basename(url_with_date) + destfile <- file.path(download_dir, filename) + + message("Downloading to: ", destfile) + httr::GET(url_with_date, httr::write_disk(destfile, overwrite = TRUE)) + + return(destfile) +} diff --git a/MetalogDB/R/resolve_metalog_url.R b/MetalogDB/R/resolve_metalog_url.R index ab3c106..3a6dbc6 100644 --- a/MetalogDB/R/resolve_metalog_url.R +++ b/MetalogDB/R/resolve_metalog_url.R @@ -4,4 +4,44 @@ .resolve_metalog_url <- function(collection, metadata, use_cache) { + cache_dir <- ".data_cache" + if (!dir.exists(cache_dir)) { + dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) + } + + # Construct download URL's into target list + base_url <- "https://metalog.embl.de/static/download" + + profile <- dplyr::case_when( + collection == "human" ~ "human", + collection == "animal" ~ "animal", + collection == "ocean" ~ "ocean", + collection == "other_environment" ~ "environmental" + ) + assay_url <- sprintf("%s/profiles/%s_metaphlan4_latest.tsv.gz", base_url, profile) + + md_type <- dplyr::case_when( + metadata == "core" ~ "core", + metadata == "partially_harmonized" ~ "extended", + metadata == "all" ~ "all" + ) + md_url <- sprintf("%s/metadata/%s_%s_long_latest.tsv.gz", base_url, profile, md_type) + + # Get files + assay_file <- .download_if_missing( + target_url = assay_url, + download_dir = cache_dir, + use_cache = use_cache + ) + + md_file <- .download_if_missing( + target_url = md_url, + download_dir = cache_dir, + use_cache = use_cache + ) + + return(list( + assay = assay_file, + md = md_file + )) } diff --git a/MetalogDB/fetchMetalogTSE.R b/MetalogDB/fetchMetalogTSE.R index 462cba7..25b2f2d 100644 --- a/MetalogDB/fetchMetalogTSE.R +++ b/MetalogDB/fetchMetalogTSE.R @@ -8,7 +8,8 @@ # Libraries library(mia) library(data.table) -library(httr2) +library(dplyr) +library(httr) # Source helpers sapply( @@ -21,14 +22,12 @@ sapply( # ------------- fetchMetalogTSE <- function( - collection, # One of "human", "animal", "ocean_water", "other_environ" + collection, # One of "human", "animal", "ocean", "other_environment" metadata = "core", # One of "core", "partially_harmonized", "all" samplelist = NULL, use_cache = TRUE ) { - dir.create(".data_cache", recursive = TRUE) - # Validate inputs .validate_inputs( collection = collection, @@ -37,7 +36,8 @@ fetchMetalogTSE <- function( use_cache = use_cache ) - # Consruct download URLs, download and cache + # Construct download URLs, download and cache + data_files <- .resolve_metalog_url(collection, metadata, use_cache) # Data ingest From 993279dcbbd6d5066959a1d4cffb140721c9090d Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Tue, 31 Mar 2026 15:16:19 +0300 Subject: [PATCH 06/16] move helpers to main script --- ...oad_if_missing.R => download_if_missing.R} | 0 MetalogDB/fetchMetalogTSE.R | 137 +++++++++++++++++- 2 files changed, 131 insertions(+), 6 deletions(-) rename MetalogDB/R/{.download_if_missing.R => download_if_missing.R} (100%) diff --git a/MetalogDB/R/.download_if_missing.R b/MetalogDB/R/download_if_missing.R similarity index 100% rename from MetalogDB/R/.download_if_missing.R rename to MetalogDB/R/download_if_missing.R diff --git a/MetalogDB/fetchMetalogTSE.R b/MetalogDB/fetchMetalogTSE.R index 25b2f2d..c42bcfc 100644 --- a/MetalogDB/fetchMetalogTSE.R +++ b/MetalogDB/fetchMetalogTSE.R @@ -11,11 +11,135 @@ library(data.table) library(dplyr) library(httr) -# Source helpers -sapply( - list.files(path = "R/", pattern = "\\.R$", full.names = TRUE), - source -) +# ------------- +# Helpers FUNs +# ------------- + +# Function to check that inputs to fetchMetalogTSE +.validate_inputs <- function(collection, metadata, samplelist, use_cache) { + allowed_collections <- c("human", "animal", "ocean_water", "other_environmental") + if (missing(collection) || !collection %in% allowed_collections) { + stop( + "Validation Error: 'collection' must be one of: ", + paste(paste0('""', allowed_collections, '""'), collapse = ", ") + ) + } + + allowed_metadata <- c("core", "partially_harmonized", "all") + if (!metadata %in% allowed_metadata) { + stop( + "Validation Error: 'metadata' must be one of: ", + paste(paste0('""', allowed_metadata, '""'), collapse = ", ") + ) + } + + if (!is.null(samplelist)) { + ext <- tolower(tools::file_ext(samplelist)) + allowed_exts <- c("csv", "tsv", "txt", "json") + if (!ext %in% allowed_exts) { + stop( + "Validation Error: samplelist file type must be one of: ", + paste(allowed_exts, collapse = ", ") + ) + } + } + + if (!is.logical(use_cache) || length(use_cache) != 1 || is.na(use_cache)) { + stop("Validation Error: 'use_cache' must be a single logical value (TRUE or FALSE)") + } + + invisible(NULL) +} + +# Function to download datafiles, adapted from Metalog's example script +.download_if_missing <- function( + target_url, + download_dir = ".data_cache", + use_cache = TRUE + ) { + + base_filename <- basename(target_url) + + # Caching Logic + if (use_cache) { + # Replace "latest" with a date regex pattern (YYYY-MM-DD) + pattern <- sub("latest", "[0-9]{4}-[0-9]{2}-[0-9]{2}", base_filename) + matching_files <- list.files(download_dir, pattern = pattern, full.names = TRUE) + + if (length(matching_files) > 0) { + latest_file <- max(matching_files) + message("Loaded cached file: ", latest_file) + return(latest_file) + } + } else { + message("Skipping cache. Forcing download for: ", base_filename) + } + + # We make a simple GET request first just to see where "latest" redirects us + message("Fetching file from Metalog...") + response <- httr::GET(target_url, httr::config(followlocation = TRUE)) + + if (httr::status_code(response) != 200) { + stop("Error fetching the file! Status code: ", httr::status_code(response)) + } + + # Extract the final URL and save + url_with_date <- response$url + filename <- basename(url_with_date) + destfile <- file.path(download_dir, filename) + + message("Downloading to: ", destfile) + httr::GET(url_with_date, httr::write_disk(destfile, overwrite = TRUE)) + + return(destfile) +} + +# Function constructs download URL for requested metalog data from DB, +# If cache, checks if files already exist in datadir, and downloads if not. +# Otherwise downloads files to datadir. +.resolve_metalog_url <- function(collection, metadata, use_cache) { + + cache_dir <- ".data_cache" + if (!dir.exists(cache_dir)) { + dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) + } + + # Construct download URL's into target list + base_url <- "https://metalog.embl.de/static/download" + + profile <- dplyr::case_when( + collection == "human" ~ "human", + collection == "animal" ~ "animal", + collection == "ocean" ~ "ocean", + collection == "other_environment" ~ "environmental" + ) + assay_url <- sprintf("%s/profiles/%s_metaphlan4_latest.tsv.gz", base_url, profile) + + md_type <- dplyr::case_when( + metadata == "core" ~ "core", + metadata == "partially_harmonized" ~ "extended", + metadata == "all" ~ "all" + ) + md_url <- sprintf("%s/metadata/%s_%s_long_latest.tsv.gz", base_url, profile, md_type) + + # Get files + assay_file <- .download_if_missing( + target_url = assay_url, + download_dir = cache_dir, + use_cache = use_cache + ) + + md_file <- .download_if_missing( + target_url = md_url, + download_dir = cache_dir, + use_cache = use_cache + ) + + return(list( + assay = assay_file, + md = md_file + )) +} # ------------- # Main function @@ -39,7 +163,8 @@ fetchMetalogTSE <- function( # Construct download URLs, download and cache data_files <- .resolve_metalog_url(collection, metadata, use_cache) - # Data ingest + # Injest data + # Filtering From b9d313d173c9518b008f84b61d3377485cff89ed Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Tue, 31 Mar 2026 16:44:34 +0300 Subject: [PATCH 07/16] working implementation --- MetalogDB/fetchMetalogTSE.R | 155 ++++++++++++++++++++++++++++++++++-- 1 file changed, 147 insertions(+), 8 deletions(-) diff --git a/MetalogDB/fetchMetalogTSE.R b/MetalogDB/fetchMetalogTSE.R index c42bcfc..3e2c6d0 100644 --- a/MetalogDB/fetchMetalogTSE.R +++ b/MetalogDB/fetchMetalogTSE.R @@ -6,10 +6,12 @@ # Date: --- # Libraries -library(mia) +library(Matrix) +library(TreeSummarizedExperiment) library(data.table) library(dplyr) library(httr) +library(mia) # ------------- # Helpers FUNs @@ -141,6 +143,129 @@ library(httr) )) } +# Function loads assay profile as sparse matrix +.load_assay <- function(path, sep = "\t") { + # Read and subset to SGB only + dt <- fread(path, sep = sep) + dt <- dt[startsWith(clade_name, "t__SGB"), ] + + # Ensure expected names and types + setnames(dt, c("sample_alias", "clade_name", "rel_abund")) + dt[, rel_abund := as.numeric(rel_abund)] + dt <- dt[!is.na(rel_abund) & rel_abund != 0] + + # Aggregate duplicates (taxon, sample) -> sum(rel_abund) + setkey(dt, clade_name, sample_alias) + dt <- dt[, .(rel_abund = sum(rel_abund)), by = .(clade_name, sample_alias)] + # Map taxa and samples + taxa <- sort(unique(dt$clade_name)) + samples <- sort(unique(dt$sample_alias)) + i <- match(dt$clade_name, taxa) + j <- match(dt$sample_alias, samples) + x <- dt$rel_abund + + # Build sparse matrix (rows = taxa, cols = samples) + X <- sparseMatrix( + i = i, j = j, x = x, + dims = c(length(taxa), length(samples)), + dimnames = list(taxa, samples) + ) + assay <- list(assay = X, taxa = taxa, samples = samples) + return(assay) +} + +# Loads the metadata and filteres to samples found in assay +.load_metadata <- function(meta_df, samples, sep = "\t") { + dt <- fread(meta_df, sep = sep, na.strings = c("", "NA")) + + # Pivot to wide + wide <- dcast( + dt, + sample_alias ~ metadata_item, + value.var = "value", + fill = NA_character_ + ) + + meta_df <- as.data.frame(wide) + rownames(meta_df) <- meta_df$sample_alias + + # Output Dataframe subset to samples + meta_df <- meta_df[samples, ] + return(meta_df) +} + +# Maps full lineage names levels +.construct_taxmap <- function(database, taxa) { + taxmap <- fread(database, sep = "\t", header = TRUE) + taxmap <- taxmap[startsWith(clade_name, "t__SGB")] + + taxmap <- taxmap %>% + distinct(clade_name, .keep_all = TRUE) %>% + filter(clade_name %in% taxa) + + idx <- match(taxa, taxmap$clade_name) + taxmap <- taxmap[idx, ] + + rownames(taxmap) <- taxmap$clade_name + + taxmap <- taxmap %>% + filter(clade_name %in% taxa) %>% + tidyr::separate( + col = lineage, + into = c( + "Kingdom", "Phylum", "Class", "Order", + "Family", "Genus", "Species", "SGB" + ), + sep = "\\|", + fill = "right" + ) %>% + select(-NCBI_taxids, -clade_name) %>% + as.data.frame() + + # Subset to taxa in present + taxmap <- taxmap[taxa, ] + return(taxmap) +} + +# Function filters profile data down to provided samples +.filter_datasets <- function(assay_list, samplelist) { + + # Resolve the samplelist into a vector of target IDs + ext <- tolower(tools::file_ext(samplelist)) + + # Read file based on extension + if (ext %in% c("csv", "tsv")) { + sl_df <- data.table::fread(samplelist) + } else if (ext == "json") { + if (!requireNamespace("jsonlite", quietly = TRUE)) { + stop("The 'jsonlite' package is required to read JSON sample lists.") + } + sl_df <- as.data.frame(jsonlite::fromJSON(samplelist)) + } + # Grab samples + target_samples <- sl_df[["sample_alias"]] + + # Intersect requested samples with available samples + available_samples <- assay_list[["samples"]] + keep_samples <- intersect(target_samples, available_samples) + + if (length(keep_samples) == 0) { + stop("Filtering Error: None of the provided samples were found in the dataset.") + } + + # Subset the sparse matrix (columns = samples) + assay_list$assay <- assay_list$assay[, keep_samples, drop = FALSE] + assay_list$samples <- keep_samples + + # Drop taxa that now have 0 abundance across all remaining samples + row_sums <- Matrix::rowSums(assay_list$assay) + keep_taxa <- names(row_sums[row_sums > 0]) + assay_list$assay <- assay_list$assay[keep_taxa, , drop = FALSE] + assay_list$taxa <- keep_taxa + + return(assay_list) +} + # ------------- # Main function # ------------- @@ -162,18 +287,32 @@ fetchMetalogTSE <- function( # Construct download URLs, download and cache data_files <- .resolve_metalog_url(collection, metadata, use_cache) + mapping_db <- .download_if_missing( + "https://metalog.embl.de/static/download/profiles/metaphlan4_clades.tsv.gz", + use_cache = use_cache + ) # Injest data - + assay_list <- .load_assay(data_files[["assay"]]) # Filtering + if (!is.null(samplelist)) { + message("Filtering datasets to include only the requested samples...") + assay_list <- .filter_datasets(assay_list, samplelist) + } + md_dt <- .load_metadata(data_files[["md"]], assay_list[["samples"]]) - # TSE construction + # Construct TSE with full lineage mappings + tax <- .construct_taxmap(mapping_db, assay_list[["taxa"]]) + + tse <- TreeSummarizedExperiment( + assays = SimpleList("relabundance" = assay_list[["assay"]]), + colData = DataFrame(md_dt), + rowData = DataFrame(tax) + ) # License injection - - # Explicit return of tse object - + metadata(tse)$license <- "https://metalog.embl.de/ - Open Database License (ODbL) v1.0" + + return(tse) } - -# Export function to namespace? From 42efdbd348bf8e3ebfe32c69aa8ad08b71d999ac Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Tue, 31 Mar 2026 16:49:04 +0300 Subject: [PATCH 08/16] auto format file --- MetalogDB/fetchMetalogTSE.R | 190 ++++++++++++++++++------------------ 1 file changed, 93 insertions(+), 97 deletions(-) diff --git a/MetalogDB/fetchMetalogTSE.R b/MetalogDB/fetchMetalogTSE.R index 3e2c6d0..f6550d8 100644 --- a/MetalogDB/fetchMetalogTSE.R +++ b/MetalogDB/fetchMetalogTSE.R @@ -1,7 +1,7 @@ # This function downloads metaphlan profiles and associated metadata # from the Metalog database. The option exists to filter to a subset -# if a samplelist is provided, downloaded from the Metalog webUI. -# +# if a samplelist is provided, downloaded from the Metalog webUI. +# # Author: Rasmus Hindström # Date: --- @@ -19,38 +19,38 @@ library(mia) # Function to check that inputs to fetchMetalogTSE .validate_inputs <- function(collection, metadata, samplelist, use_cache) { - allowed_collections <- c("human", "animal", "ocean_water", "other_environmental") - if (missing(collection) || !collection %in% allowed_collections) { - stop( - "Validation Error: 'collection' must be one of: ", - paste(paste0('""', allowed_collections, '""'), collapse = ", ") - ) - } - - allowed_metadata <- c("core", "partially_harmonized", "all") - if (!metadata %in% allowed_metadata) { - stop( - "Validation Error: 'metadata' must be one of: ", - paste(paste0('""', allowed_metadata, '""'), collapse = ", ") - ) - } - - if (!is.null(samplelist)) { - ext <- tolower(tools::file_ext(samplelist)) - allowed_exts <- c("csv", "tsv", "txt", "json") - if (!ext %in% allowed_exts) { - stop( - "Validation Error: samplelist file type must be one of: ", - paste(allowed_exts, collapse = ", ") - ) - } - } - - if (!is.logical(use_cache) || length(use_cache) != 1 || is.na(use_cache)) { - stop("Validation Error: 'use_cache' must be a single logical value (TRUE or FALSE)") - } - - invisible(NULL) + allowed_collections <- c("human", "animal", "ocean_water", "other_environmental") + if (missing(collection) || !collection %in% allowed_collections) { + stop( + "Validation Error: 'collection' must be one of: ", + paste(paste0('""', allowed_collections, '""'), collapse = ", ") + ) + } + + allowed_metadata <- c("core", "partially_harmonized", "all") + if (!metadata %in% allowed_metadata) { + stop( + "Validation Error: 'metadata' must be one of: ", + paste(paste0('""', allowed_metadata, '""'), collapse = ", ") + ) + } + + if (!is.null(samplelist)) { + ext <- tolower(tools::file_ext(samplelist)) + allowed_exts <- c("csv", "tsv", "txt", "json") + if (!ext %in% allowed_exts) { + stop( + "Validation Error: samplelist file type must be one of: ", + paste(allowed_exts, collapse = ", ") + ) + } + } + + if (!is.logical(use_cache) || length(use_cache) != 1 || is.na(use_cache)) { + stop("Validation Error: 'use_cache' must be a single logical value (TRUE or FALSE)") + } + + invisible(NULL) } # Function to download datafiles, adapted from Metalog's example script @@ -58,16 +58,15 @@ library(mia) target_url, download_dir = ".data_cache", use_cache = TRUE - ) { - +) { base_filename <- basename(target_url) - + # Caching Logic if (use_cache) { # Replace "latest" with a date regex pattern (YYYY-MM-DD) pattern <- sub("latest", "[0-9]{4}-[0-9]{2}-[0-9]{2}", base_filename) matching_files <- list.files(download_dir, pattern = pattern, full.names = TRUE) - + if (length(matching_files) > 0) { latest_file <- max(matching_files) message("Loaded cached file: ", latest_file) @@ -76,23 +75,23 @@ library(mia) } else { message("Skipping cache. Forcing download for: ", base_filename) } - + # We make a simple GET request first just to see where "latest" redirects us message("Fetching file from Metalog...") response <- httr::GET(target_url, httr::config(followlocation = TRUE)) - + if (httr::status_code(response) != 200) { stop("Error fetching the file! Status code: ", httr::status_code(response)) } - + # Extract the final URL and save url_with_date <- response$url filename <- basename(url_with_date) destfile <- file.path(download_dir, filename) - + message("Downloading to: ", destfile) httr::GET(url_with_date, httr::write_disk(destfile, overwrite = TRUE)) - + return(destfile) } @@ -100,47 +99,46 @@ library(mia) # If cache, checks if files already exist in datadir, and downloads if not. # Otherwise downloads files to datadir. .resolve_metalog_url <- function(collection, metadata, use_cache) { + cache_dir <- ".data_cache" + if (!dir.exists(cache_dir)) { + dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) + } + + # Construct download URL's into target list + base_url <- "https://metalog.embl.de/static/download" - cache_dir <- ".data_cache" - if (!dir.exists(cache_dir)) { - dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) - } - - # Construct download URL's into target list - base_url <- "https://metalog.embl.de/static/download" - - profile <- dplyr::case_when( - collection == "human" ~ "human", - collection == "animal" ~ "animal", - collection == "ocean" ~ "ocean", - collection == "other_environment" ~ "environmental" - ) - assay_url <- sprintf("%s/profiles/%s_metaphlan4_latest.tsv.gz", base_url, profile) - - md_type <- dplyr::case_when( - metadata == "core" ~ "core", - metadata == "partially_harmonized" ~ "extended", - metadata == "all" ~ "all" - ) - md_url <- sprintf("%s/metadata/%s_%s_long_latest.tsv.gz", base_url, profile, md_type) - - # Get files - assay_file <- .download_if_missing( - target_url = assay_url, - download_dir = cache_dir, - use_cache = use_cache - ) - - md_file <- .download_if_missing( - target_url = md_url, - download_dir = cache_dir, - use_cache = use_cache - ) - - return(list( - assay = assay_file, - md = md_file - )) + profile <- dplyr::case_when( + collection == "human" ~ "human", + collection == "animal" ~ "animal", + collection == "ocean" ~ "ocean", + collection == "other_environment" ~ "environmental" + ) + assay_url <- sprintf("%s/profiles/%s_metaphlan4_latest.tsv.gz", base_url, profile) + + md_type <- dplyr::case_when( + metadata == "core" ~ "core", + metadata == "partially_harmonized" ~ "extended", + metadata == "all" ~ "all" + ) + md_url <- sprintf("%s/metadata/%s_%s_long_latest.tsv.gz", base_url, profile, md_type) + + # Get files + assay_file <- .download_if_missing( + target_url = assay_url, + download_dir = cache_dir, + use_cache = use_cache + ) + + md_file <- .download_if_missing( + target_url = md_url, + download_dir = cache_dir, + use_cache = use_cache + ) + + return(list( + assay = assay_file, + md = md_file + )) } # Function loads assay profile as sparse matrix @@ -205,7 +203,7 @@ library(mia) idx <- match(taxa, taxmap$clade_name) taxmap <- taxmap[idx, ] - + rownames(taxmap) <- taxmap$clade_name taxmap <- taxmap %>% @@ -229,10 +227,9 @@ library(mia) # Function filters profile data down to provided samples .filter_datasets <- function(assay_list, samplelist) { - # Resolve the samplelist into a vector of target IDs ext <- tolower(tools::file_ext(samplelist)) - + # Read file based on extension if (ext %in% c("csv", "tsv")) { sl_df <- data.table::fread(samplelist) @@ -244,25 +241,25 @@ library(mia) } # Grab samples target_samples <- sl_df[["sample_alias"]] - + # Intersect requested samples with available samples available_samples <- assay_list[["samples"]] keep_samples <- intersect(target_samples, available_samples) - + if (length(keep_samples) == 0) { stop("Filtering Error: None of the provided samples were found in the dataset.") } - + # Subset the sparse matrix (columns = samples) assay_list$assay <- assay_list$assay[, keep_samples, drop = FALSE] assay_list$samples <- keep_samples - + # Drop taxa that now have 0 abundance across all remaining samples row_sums <- Matrix::rowSums(assay_list$assay) keep_taxa <- names(row_sums[row_sums > 0]) assay_list$assay <- assay_list$assay[keep_taxa, , drop = FALSE] assay_list$taxa <- keep_taxa - + return(assay_list) } @@ -271,12 +268,11 @@ library(mia) # ------------- fetchMetalogTSE <- function( - collection, # One of "human", "animal", "ocean", "other_environment" - metadata = "core", # One of "core", "partially_harmonized", "all" + collection, # One of "human", "animal", "ocean", "other_environment" + metadata = "core", # One of "core", "partially_harmonized", "all" samplelist = NULL, use_cache = TRUE ) { - # Validate inputs .validate_inputs( collection = collection, @@ -303,8 +299,8 @@ fetchMetalogTSE <- function( md_dt <- .load_metadata(data_files[["md"]], assay_list[["samples"]]) # Construct TSE with full lineage mappings - tax <- .construct_taxmap(mapping_db, assay_list[["taxa"]]) - + tax <- .construct_taxmap(mapping_db, assay_list[["taxa"]]) + tse <- TreeSummarizedExperiment( assays = SimpleList("relabundance" = assay_list[["assay"]]), colData = DataFrame(md_dt), @@ -313,6 +309,6 @@ fetchMetalogTSE <- function( # License injection metadata(tse)$license <- "https://metalog.embl.de/ - Open Database License (ODbL) v1.0" - + return(tse) } From 538901387c00fcc1f56484068614103342924a38 Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Tue, 31 Mar 2026 16:51:07 +0300 Subject: [PATCH 09/16] Code review from Gemini3 --- MetalogDB/gemini3-codereview.md | 38 +++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 MetalogDB/gemini3-codereview.md diff --git a/MetalogDB/gemini3-codereview.md b/MetalogDB/gemini3-codereview.md new file mode 100644 index 0000000..f5b9b37 --- /dev/null +++ b/MetalogDB/gemini3-codereview.md @@ -0,0 +1,38 @@ +Here is a code review of your `fetchMetalogTSE.R` script. + +Overall, this is a very well-structured script. You've done an excellent job modularizing the code into logical helper functions, utilizing `data.table` for rapid file reading, and applying sparse matrices (`Matrix::sparseMatrix`) to keep memory usage low when handling large microbiome datasets. + +There are, however, a few critical bugs (specifically regarding input validation and missing dependencies) and some areas where you can improve robustness and best practices. + +### **1. Critical Bugs** + +* **Mismatch in Allowed `collection` Values:** There is a direct conflict between your validation function and your execution logic. + In `.validate_inputs`, you enforce: `c("human", "animal", "ocean_water", "other_environmental")`. + However, the `fetchMetalogTSE` signature and `.resolve_metalog_url` expect: `"ocean"` and `"other_environment"`. If a user passes `"ocean"`, `.validate_inputs` will throw an error. You need to standardize these names across the script. +* **Missing Dependency for `tidyr`:** In `.construct_taxmap`, you use `tidyr::separate(...)`. However, `library(tidyr)` is missing from your setup block at the top of the file. If a user runs this script without having `tidyr` loaded, it will crash. +* **Missing Namespace for `SimpleList` and `DataFrame`:** + In your main function, you use `SimpleList` and `DataFrame`. These belong to `S4Vectors`. While they are often attached when loading `TreeSummarizedExperiment`, it is safer to either load `S4Vectors` explicitly or use `S4Vectors::SimpleList` and `S4Vectors::DataFrame` to prevent environment-specific errors. + +### **2. Robustness and Defensive Programming** + +* **Hardcoded Cache Directory:** You are creating a `.data_cache` folder directly in the user's current working directory. If you plan to bundle this into an R package, this violates CRAN policies (packages cannot alter the user's working directory). + *Recommendation:* Use `tools::R_user_dir("YourPackageName", which = "cache")` or the `rappdirs` package to find the OS-appropriate, centralized cache directory. +* **Unsafe Column Subsetting in `.filter_datasets`:** + You extract target samples using `target_samples <- sl_df[["sample_alias"]]`. If the uploaded sample list doesn't contain a column explicitly named `"sample_alias"`, `target_samples` becomes `NULL`. The subsequent `intersect` will fail, or worse, silently empty your dataset. + *Recommendation:* Add a check: `if (!"sample_alias" %in% names(sl_df)) stop(...)`. +* **Inconsistent Namespace Usage (`::`):** + You use `httr::GET` and `dplyr::case_when`, but you use `fread`, `setnames`, and `dcast` without `data.table::`. If you are writing a script, `library()` calls are fine. But if this is destined for an R package, it's best practice to use `::` consistently for imported functions or strictly manage your `@import` tags in roxygen. +* **Unused Library:** + You load `library(mia)` at the top, but you never actually call any `mia` functions (e.g., `makeTreeSummarizedExperimentFromPhyloseq` or similar). Unless it's needed for a side-effect, you should remove it to minimize dependencies. + +### **3. Performance & Memory** + +* **Reading Massive TSVs:** In `.load_assay`, `dt <- fread(path, sep = sep)` reads the *entire* file into RAM before filtering it down with `dt[startsWith(clade_name, "t__SGB"), ]`. Metaphlan profiles can be massive. + *Recommendation:* If you are on a Unix-like system, you could use the `cmd` argument in `fread` to pre-filter via bash before it hits R memory (e.g., `fread(cmd = paste0("grep 't__SGB' ", path))`). If you need to remain cross-platform, what you have is fine, just be aware of the peak memory spike. +* **Duplicate Conversion:** + `dt[, rel_abund := as.numeric(rel_abund)]` is fine, but if the dataset is properly formatted as TSV, `fread` should automatically detect it as numeric. Forcing the conversion creates a brief, unnecessary copy in memory. + +### **4. Minor Style & Formatting Notes** + +* **Silent `NULL` Return:** In `.validate_inputs`, you end with `invisible(NULL)`. It is generally cleaner to just use `return(invisible(TRUE))` to indicate the validation passed successfully. +* **License Injection:** `metadata(tse)$license <- "..."` is a nice touch! It ensures data provenance travels with the object. From 74c38f4e5d182be11bec364d09155d94a891b46b Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Wed, 1 Apr 2026 11:05:49 +0300 Subject: [PATCH 10/16] nix stuff and small adjustment --- MetalogDB/.Rprofile | 54 ++++++++++++++++++++++++++++++++ MetalogDB/.envrc | 1 + MetalogDB/default.nix | 62 +++++++++++++++++++++++++++++++++++++ MetalogDB/fetchMetalogTSE.R | 5 +-- MetalogDB/generate_env.R | 23 ++++++++++++++ 5 files changed, 143 insertions(+), 2 deletions(-) create mode 100644 MetalogDB/.Rprofile create mode 100644 MetalogDB/.envrc create mode 100644 MetalogDB/default.nix create mode 100644 MetalogDB/generate_env.R diff --git a/MetalogDB/.Rprofile b/MetalogDB/.Rprofile new file mode 100644 index 0000000..91774d0 --- /dev/null +++ b/MetalogDB/.Rprofile @@ -0,0 +1,54 @@ +### File generated by `rix::rix_init()` ### +# 1. Currently, system RStudio does not inherit environmental variables +# defined in `$HOME/.zshrc`, `$HOME/.bashrc` and alike. This is workaround to +# make the path of the nix store and hence basic nix commands available +# in an RStudio session +# 2. For nix-R session, remove `R_LIBS_USER`, system's R user library.`. +# This guarantees no user libraries from the system are loaded and only +# R packages in the Nix store are used. This makes Nix-R behave in pure manner +# at run-time. +{ + is_rstudio <- Sys.getenv("RSTUDIO") == "1" + is_nix_r <- nzchar(Sys.getenv("NIX_STORE")) + is_code <- Sys.getenv("TERM_PROGRAM") == "vscode" + is_positron <- Sys.getenv("POSITRON") == "1" + if (isFALSE(is_nix_r) && isTRUE(is_rstudio)) { + cat("{rix} detected RStudio R session") + old_path <- Sys.getenv("PATH") + nix_path <- "/nix/var/nix/profiles/default/bin" + has_nix_path <- any(grepl(nix_path, old_path)) + if (isFALSE(has_nix_path)) { + Sys.setenv(PATH = paste(old_path, nix_path, sep = ":")) + } + rm(old_path, nix_path) + } + if (isTRUE(is_nix_r)) { + install.packages <- function(...) { + stop("You are currently in an R session running from Nix.\n", "Don't install packages using install.packages(),\nadd them to ", "the default.nix file instead.") + } + update.packages <- function(...) { + stop("You are currently in an R session running from Nix.\n", "Don't update packages using update.packages(),\n", "generate a new default.nix with a more recent version of R. ", "If you need bleeding edge packages, read the", "'Understanding the rPackages set release cycle and using ", "bleeding edge packages' vignette.") + } + remove.packages <- function(...) { + stop("You are currently in an R session running from Nix.\n", "Don't remove packages using `remove.packages()``,\ndelete them ", "from the default.nix file instead.") + } + current_paths <- .libPaths() + userlib_paths <- Sys.getenv("R_LIBS_USER") + user_dir <- grep(paste(userlib_paths, collapse = "|"), current_paths, fixed = TRUE) + new_paths <- current_paths[-user_dir] + .libPaths(new_paths) + rm(current_paths, userlib_paths, user_dir, new_paths) + } + if (isTRUE(is_code) && interactive() && isFALSE(is_rstudio) && isFALSE(is_positron)) { + vscode_r_init <- file.path(Sys.getenv(if (.Platform$OS.type == "windows") + "USERPROFILE" + else "HOME"), ".vscode-R", "init.R") + if (file.exists(vscode_r_init)) { + source(vscode_r_init) + } + else { + message("No .vscode-R/init.R file found. If you want to use VSCode-R, you need to source it in your .Rprofile or start vscode from within nix-shell") + } + } + rm(is_rstudio, is_nix_r, is_code, is_positron) +} diff --git a/MetalogDB/.envrc b/MetalogDB/.envrc new file mode 100644 index 0000000..1d953f4 --- /dev/null +++ b/MetalogDB/.envrc @@ -0,0 +1 @@ +use nix diff --git a/MetalogDB/default.nix b/MetalogDB/default.nix new file mode 100644 index 0000000..5dbc01b --- /dev/null +++ b/MetalogDB/default.nix @@ -0,0 +1,62 @@ +# This file was generated by the {rix} R package v0.18.2 on 2026-04-01 +# with following call: +# >rix(r_ver = "98d89b5d2a42d5f965c30ef32eee726a11357cfd", +# > r_pkgs = c("Matrix", +# > "TreeSummarizedExperiment", +# > "data.table", +# > "dplyr", +# > "httr", +# > "jsonlite", +# > "mia", +# > "rix"), +# > system_pkgs = NULL, +# > git_pkgs = NULL, +# > ide = "radian", +# > project_path = path_default_nix, +# > overwrite = TRUE, +# > print = FALSE) +# It uses upstream nixpkgs' revision 98d89b5d2a42d5f965c30ef32eee726a11357cfd for reproducibility purposes +# which will install R version latest-upstream. +# Report any issues to https://github.com/ropensci/rix +let + pkgs = import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/98d89b5d2a42d5f965c30ef32eee726a11357cfd.tar.gz") {}; + + rpkgs = builtins.attrValues { + inherit (pkgs.rPackages) + data_table + dplyr + httr + jsonlite + Matrix + mia + rix + TreeSummarizedExperiment; + }; + + system_packages = builtins.attrValues { + inherit (pkgs) + glibcLocales + nix + R; + }; + + wrapped_pkgs = pkgs.radianWrapper.override { + packages = [ rpkgs ]; + }; + + shell = pkgs.mkShell { + LOCALE_ARCHIVE = if pkgs.stdenv.hostPlatform.system == "x86_64-linux" then "${pkgs.glibcLocales}/lib/locale/locale-archive" else ""; + LANG = "en_US.UTF-8"; + LC_ALL = "en_US.UTF-8"; + LC_TIME = "en_US.UTF-8"; + LC_MONETARY = "en_US.UTF-8"; + LC_PAPER = "en_US.UTF-8"; + LC_MEASUREMENT = "en_US.UTF-8"; + + buildInputs = [ rpkgs system_packages wrapped_pkgs ]; + + }; +in + { + inherit pkgs shell; + } diff --git a/MetalogDB/fetchMetalogTSE.R b/MetalogDB/fetchMetalogTSE.R index f6550d8..b9e4de0 100644 --- a/MetalogDB/fetchMetalogTSE.R +++ b/MetalogDB/fetchMetalogTSE.R @@ -77,7 +77,7 @@ library(mia) } # We make a simple GET request first just to see where "latest" redirects us - message("Fetching file from Metalog...") + message(paste0("Fetching file from Metalog...", target_url)) response <- httr::GET(target_url, httr::config(followlocation = TRUE)) if (httr::status_code(response) != 200) { @@ -283,6 +283,7 @@ fetchMetalogTSE <- function( # Construct download URLs, download and cache data_files <- .resolve_metalog_url(collection, metadata, use_cache) + # Lastest database file for tax mapping mapping_db <- .download_if_missing( "https://metalog.embl.de/static/download/profiles/metaphlan4_clades.tsv.gz", use_cache = use_cache @@ -298,7 +299,7 @@ fetchMetalogTSE <- function( } md_dt <- .load_metadata(data_files[["md"]], assay_list[["samples"]]) - # Construct TSE with full lineage mappings + # Map SGB's to full lineage tax <- .construct_taxmap(mapping_db, assay_list[["taxa"]]) tse <- TreeSummarizedExperiment( diff --git a/MetalogDB/generate_env.R b/MetalogDB/generate_env.R new file mode 100644 index 0000000..3d022a6 --- /dev/null +++ b/MetalogDB/generate_env.R @@ -0,0 +1,23 @@ +library(rix) + +path_default_nix <- "." + +rix( + r_ver = "latest-upstream", + r_pkgs = c( + "Matrix", + "TreeSummarizedExperiment", + "data.table", + "dplyr", + "httr", + "jsonlite", + "mia", # Not strictly needed, just useful to test + "rix" # Can be removed from final + ), + system_pkgs = NULL, + git_pkgs = NULL, + ide = "radian", + project_path = path_default_nix, + overwrite = TRUE, + print = FALSE +) From 9eb7940be0105e491e5764e797db34d7174bf2fe Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Wed, 1 Apr 2026 11:57:58 +0300 Subject: [PATCH 11/16] sidestep https to http downgrade on server --- MetalogDB/fetchMetalogTSE.R | 73 +++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/MetalogDB/fetchMetalogTSE.R b/MetalogDB/fetchMetalogTSE.R index b9e4de0..5c89511 100644 --- a/MetalogDB/fetchMetalogTSE.R +++ b/MetalogDB/fetchMetalogTSE.R @@ -19,7 +19,7 @@ library(mia) # Function to check that inputs to fetchMetalogTSE .validate_inputs <- function(collection, metadata, samplelist, use_cache) { - allowed_collections <- c("human", "animal", "ocean_water", "other_environmental") + allowed_collections <- c("human", "animal", "ocean", "environmental") if (missing(collection) || !collection %in% allowed_collections) { stop( "Validation Error: 'collection' must be one of: ", @@ -27,7 +27,7 @@ library(mia) ) } - allowed_metadata <- c("core", "partially_harmonized", "all") + allowed_metadata <- c("core", "extended", "all") if (!metadata %in% allowed_metadata) { stop( "Validation Error: 'metadata' must be one of: ", @@ -54,6 +54,7 @@ library(mia) } # Function to download datafiles, adapted from Metalog's example script +# Function to download datafiles, adapted to handle broken HTTP redirects .download_if_missing <- function( target_url, download_dir = ".data_cache", @@ -63,7 +64,6 @@ library(mia) # Caching Logic if (use_cache) { - # Replace "latest" with a date regex pattern (YYYY-MM-DD) pattern <- sub("latest", "[0-9]{4}-[0-9]{2}-[0-9]{2}", base_filename) matching_files <- list.files(download_dir, pattern = pattern, full.names = TRUE) @@ -72,25 +72,53 @@ library(mia) message("Loaded cached file: ", latest_file) return(latest_file) } - } else { - message("Skipping cache. Forcing download for: ", base_filename) } - # We make a simple GET request first just to see where "latest" redirects us - message(paste0("Fetching file from Metalog...", target_url)) - response <- httr::GET(target_url, httr::config(followlocation = TRUE)) + if (!dir.exists(download_dir)) { + dir.create(download_dir, recursive = TRUE, showWarnings = FALSE) + } - if (httr::status_code(response) != 200) { - stop("Error fetching the file! Status code: ", httr::status_code(response)) + message("Fetching file from: ", target_url) + + ### Masquarade as a browser, in case of anti-scraping measures + ua <- httr::user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") + + ### Metalog server downgrades protocol from https >> http, + ### Since 1st of April 2026. This janky solution is needed to force https + + # Catch the redirect + initial_req <- httr::GET(target_url, ua, httr::config(followlocation = FALSE)) + + # Check if we got a redirect to HTTP + if (initial_req$status_code >= 300 && initial_req$status_code < 400) { + # Extract the redirected URL + final_url <- initial_req$headers$location + + # Force HTTPS protocol + final_url <- sub("^http://", "https://", final_url) + message("Intercepted redirect. Forcing HTTPS: ", final_url) + + } else if (initial_req$status_code == 200) { + final_url <- target_url # No redirect occurred + } else { + stop("Initial request failed with status: ", initial_req$status_code) } - # Extract the final URL and save - url_with_date <- response$url - filename <- basename(url_with_date) + # Download the actual file from the corrected URL + filename <- basename(final_url) destfile <- file.path(download_dir, filename) - + message("Downloading to: ", destfile) - httr::GET(url_with_date, httr::write_disk(destfile, overwrite = TRUE)) + final_req <- httr::GET( + final_url, + ua, + httr::write_disk(destfile, overwrite = TRUE) + ) + + if (httr::status_code(final_req) != 200) { + if (file.exists(destfile)) file.remove(destfile) + stop("Error downloading the file! Status code: ", httr::status_code(final_req)) + } return(destfile) } @@ -106,20 +134,11 @@ library(mia) # Construct download URL's into target list base_url <- "https://metalog.embl.de/static/download" - - profile <- dplyr::case_when( - collection == "human" ~ "human", - collection == "animal" ~ "animal", - collection == "ocean" ~ "ocean", - collection == "other_environment" ~ "environmental" - ) + + profile <- collection assay_url <- sprintf("%s/profiles/%s_metaphlan4_latest.tsv.gz", base_url, profile) - md_type <- dplyr::case_when( - metadata == "core" ~ "core", - metadata == "partially_harmonized" ~ "extended", - metadata == "all" ~ "all" - ) + md_type <- metadata md_url <- sprintf("%s/metadata/%s_%s_long_latest.tsv.gz", base_url, profile, md_type) # Get files From 2386c4b85bdf3e8d14aa8b67c7ec5d13e6c8e42b Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Wed, 1 Apr 2026 11:58:12 +0300 Subject: [PATCH 12/16] flesh out readme --- MetalogDB/README.md | 37 +++++++++++++++ MetalogDB/gemini3-codereview.md | 38 ---------------- MetalogDB/spec.md | 80 --------------------------------- 3 files changed, 37 insertions(+), 118 deletions(-) delete mode 100644 MetalogDB/gemini3-codereview.md delete mode 100644 MetalogDB/spec.md diff --git a/MetalogDB/README.md b/MetalogDB/README.md index e69de29..605e861 100644 --- a/MetalogDB/README.md +++ b/MetalogDB/README.md @@ -0,0 +1,37 @@ +# Usage + +To use this function download the directory and source the `fetchMetalogTSE.R` +script. You should now have the function available in your R session. + +The function will create a `.data_cache` dir in your working directory if one +is not present. + +To build a `tse` -object you can call the function like so; + +`tse <- fetchMetalogTSE("human", "core")` + +It will attempt to download the latest versions of datafiles from the Metalog +Database and compile them into a TSE. + +By default it will use the cache on subsequent calls. + +Available options are; + +"human", "animal", "ocean", and "environmental" + +It will download the metaphlan4 profiles for these collections. + +For metadata options are; + +"core", "extended", "all" + +Core metadata is available for most, if not all, samples. Extended +is the partially harmonized metadata set, while "all" downloads all the metadata +available down to study specific variables. + +You can disable cache with the `use_cache = FALSE` option, this will force +a download of the latest available datasets. + +Additionally you can use the metalog webUI to explore the samples. This allows +you to download a samplelist file that you can pass to the function, and +it will subset the tse to the selected samples upon creation. \ No newline at end of file diff --git a/MetalogDB/gemini3-codereview.md b/MetalogDB/gemini3-codereview.md deleted file mode 100644 index f5b9b37..0000000 --- a/MetalogDB/gemini3-codereview.md +++ /dev/null @@ -1,38 +0,0 @@ -Here is a code review of your `fetchMetalogTSE.R` script. - -Overall, this is a very well-structured script. You've done an excellent job modularizing the code into logical helper functions, utilizing `data.table` for rapid file reading, and applying sparse matrices (`Matrix::sparseMatrix`) to keep memory usage low when handling large microbiome datasets. - -There are, however, a few critical bugs (specifically regarding input validation and missing dependencies) and some areas where you can improve robustness and best practices. - -### **1. Critical Bugs** - -* **Mismatch in Allowed `collection` Values:** There is a direct conflict between your validation function and your execution logic. - In `.validate_inputs`, you enforce: `c("human", "animal", "ocean_water", "other_environmental")`. - However, the `fetchMetalogTSE` signature and `.resolve_metalog_url` expect: `"ocean"` and `"other_environment"`. If a user passes `"ocean"`, `.validate_inputs` will throw an error. You need to standardize these names across the script. -* **Missing Dependency for `tidyr`:** In `.construct_taxmap`, you use `tidyr::separate(...)`. However, `library(tidyr)` is missing from your setup block at the top of the file. If a user runs this script without having `tidyr` loaded, it will crash. -* **Missing Namespace for `SimpleList` and `DataFrame`:** - In your main function, you use `SimpleList` and `DataFrame`. These belong to `S4Vectors`. While they are often attached when loading `TreeSummarizedExperiment`, it is safer to either load `S4Vectors` explicitly or use `S4Vectors::SimpleList` and `S4Vectors::DataFrame` to prevent environment-specific errors. - -### **2. Robustness and Defensive Programming** - -* **Hardcoded Cache Directory:** You are creating a `.data_cache` folder directly in the user's current working directory. If you plan to bundle this into an R package, this violates CRAN policies (packages cannot alter the user's working directory). - *Recommendation:* Use `tools::R_user_dir("YourPackageName", which = "cache")` or the `rappdirs` package to find the OS-appropriate, centralized cache directory. -* **Unsafe Column Subsetting in `.filter_datasets`:** - You extract target samples using `target_samples <- sl_df[["sample_alias"]]`. If the uploaded sample list doesn't contain a column explicitly named `"sample_alias"`, `target_samples` becomes `NULL`. The subsequent `intersect` will fail, or worse, silently empty your dataset. - *Recommendation:* Add a check: `if (!"sample_alias" %in% names(sl_df)) stop(...)`. -* **Inconsistent Namespace Usage (`::`):** - You use `httr::GET` and `dplyr::case_when`, but you use `fread`, `setnames`, and `dcast` without `data.table::`. If you are writing a script, `library()` calls are fine. But if this is destined for an R package, it's best practice to use `::` consistently for imported functions or strictly manage your `@import` tags in roxygen. -* **Unused Library:** - You load `library(mia)` at the top, but you never actually call any `mia` functions (e.g., `makeTreeSummarizedExperimentFromPhyloseq` or similar). Unless it's needed for a side-effect, you should remove it to minimize dependencies. - -### **3. Performance & Memory** - -* **Reading Massive TSVs:** In `.load_assay`, `dt <- fread(path, sep = sep)` reads the *entire* file into RAM before filtering it down with `dt[startsWith(clade_name, "t__SGB"), ]`. Metaphlan profiles can be massive. - *Recommendation:* If you are on a Unix-like system, you could use the `cmd` argument in `fread` to pre-filter via bash before it hits R memory (e.g., `fread(cmd = paste0("grep 't__SGB' ", path))`). If you need to remain cross-platform, what you have is fine, just be aware of the peak memory spike. -* **Duplicate Conversion:** - `dt[, rel_abund := as.numeric(rel_abund)]` is fine, but if the dataset is properly formatted as TSV, `fread` should automatically detect it as numeric. Forcing the conversion creates a brief, unnecessary copy in memory. - -### **4. Minor Style & Formatting Notes** - -* **Silent `NULL` Return:** In `.validate_inputs`, you end with `invisible(NULL)`. It is generally cleaner to just use `return(invisible(TRUE))` to indicate the validation passed successfully. -* **License Injection:** `metadata(tse)$license <- "..."` is a nice touch! It ensures data provenance travels with the object. diff --git a/MetalogDB/spec.md b/MetalogDB/spec.md deleted file mode 100644 index 5935656..0000000 --- a/MetalogDB/spec.md +++ /dev/null @@ -1,80 +0,0 @@ - -# Software Specification: Metalog to TSE Fetcher - -*Generated by Gemini3 to guide development* - -## 1. Overview -The goal of this package is to provide an interface between the open metalog database and the R/Bioconductor ecosystem. The package dynamically downloads specified microbiome collections and metadata, subsets the data if requested, and constructs a strictly validated `TreeSummarizedExperiment` (TSE) object. - -## 2. Dependencies -* **Core:** `TreeSummarizedExperiment`, `SummarizedExperiment`, `S4Vectors` -* **Data Import/Download:** `httr2` or `curl` (for robust API/download requests), `data.table` or `vroom` (for fast reading of large abundance/metadata tables) -* **Caching (Highly Recommended):** `BiocFileCache` or `rappdirs` (to prevent re-downloading massive datasets if the user runs the function multiple times). - -## 3. User-Facing API - -### Main Function Signature -```r -fetchMetalogTSE(collection, metadata_resolution = "core", samplelist = NULL, cache = TRUE) -``` - -### Arguments Definition -* **`collection`** *(character)*: The target metalog collection. - * *Allowed values:* `"human"`, `"animal"`, `"ocean_water"`, `"other_environmental"`. -* **`metadata_resolution`** *(character)*: The depth of metadata to attach to the samples. - * *Allowed values:* `"core"`, `"partially_harmonized"`, `"all_study_specific"`. -* **`samplelist`** *(character vector or NULL)*: A vector of specific sample IDs to retain. - * *Default:* `NULL` (downloads and retains all samples in the chosen collection). -* **`cache`** *(logical)*: Whether to use local caching for downloaded files to speed up subsequent requests. - -### Output -* Returns a `TreeSummarizedExperiment` object. - -## 4. Internal Architecture & Pipeline - -The execution of `fetchMetalogTSE` should follow a strict internal pipeline divided into modular, internal helper functions. - -### Step 1: Input Validation (`.validate_inputs()`) -* Check if `collection` and `metadata_resolution` match the allowed strict choices. Stop and throw an informative error if not. -* If `samplelist` is provided, ensure it is a valid character vector. - -### Step 2: URL Construction & Caching (`.resolve_metalog_url()`, `.fetch_with_cache()`) -* Map the combination of `collection` and `metadata_resolution` to the specific metalog download URLs or API endpoints. -* Check the local cache. If the file exists and is up-to-date, load from the cache. If not, initiate the download. -* *Action Required:* Print a console message notifying the user: `"Downloading/loading data from metalog (License: ODbL v1.0)..."` - -### Step 3: Data Ingestion (`.read_metalog_data()`) -* Load the raw data files into memory. Because microbiome datasets can be large, use fast parsers like `data.table::fread()` or `vroom::vroom()`. -* Data typically consists of: - 1. **Feature Table:** (Taxa/ASVs/OTUs x Samples) - 2. **Taxonomy Table:** (Taxa x Taxonomic Ranks) - 3. **Metadata Table:** (Samples x Metadata Variables) - 4. **Phylogenetic Tree:** (Optional, if metalog provides Newick files) - -### Step 4: Filtering (`.filter_samples()`) -* *Condition:* If `samplelist` is NOT `NULL`. -* Subset the Metadata Table to include only rows matching `samplelist`. -* Subset the Feature Table columns (or rows, depending on orientation) to match `samplelist`. -* *Optimization:* If the files are overwhelmingly large, attempt to filter the data *during* the read step (e.g., selecting specific columns) to save RAM, if the parser allows it. - -### Step 5: TSE Construction (`.build_tse()`) -Map the ingested, filtered R objects to the standard TSE slots: -* **`assays`**: A list containing the Feature Table (e.g., `list(counts = feature_matrix)`). Ensure it is a standard `matrix` or a sparse matrix (`dgCMatrix`) to save memory. -* **`colData`**: A `DataFrame` containing the chosen metadata table. Row names must exactly match the column names of the `assays` matrix. -* **`rowData`**: A `DataFrame` containing the taxonomy table. Row names must exactly match the row names of the `assays` matrix. -* **`rowTree`**: The phylogenetic tree object (typically `phylo` class), if applicable. - -### Step 6: License & Metadata Injection (`.inject_provenance()`) -* Append provenance data to the `metadata` slot of the TSE object to fulfill ODbL licensing attribution requirements. -```r -metadata(tse)$source <- "metalog database" -metadata(tse)$license <- "Open Database License (ODbL) v1.0" -metadata(tse)$collection <- collection -metadata(tse)$metadata_resolution <- metadata_resolution -metadata(tse)$download_date <- Sys.Date() -``` - -## 5. Error Handling & Edge Cases -* **Missing Samples:** If a user passes a `samplelist` but some of those IDs do not exist in the requested `collection`, throw a warning listing the missing IDs (or a truncated list if there are many) and proceed with the ones that were found. -* **Network Timeouts:** Wrap the download step in `tryCatch()` to gracefully handle poor internet connections or metalog server downtime. Suggest the user check their connection or try again later. -* **Empty Result:** If the `samplelist` filtering results in 0 matching samples, stop and throw an error rather than building an empty, useless TSE object. From 0dfde66cc3fdbc5d68a637a49b3a85ff68d7f73e Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Wed, 1 Apr 2026 12:13:04 +0300 Subject: [PATCH 13/16] add helpers direct to function --- MetalogDB/R/build_tse.R | 0 MetalogDB/R/download_if_missing.R | 39 ------------------------- MetalogDB/R/fetch_with_cache.R | 0 MetalogDB/R/filter_samples.R | 0 MetalogDB/R/inject_provenance.R | 0 MetalogDB/R/read_metalog_data.R | 0 MetalogDB/R/resolve_metalog_url.R | 47 ------------------------------- MetalogDB/R/validate_inputs.R | 36 ----------------------- 8 files changed, 122 deletions(-) delete mode 100644 MetalogDB/R/build_tse.R delete mode 100644 MetalogDB/R/download_if_missing.R delete mode 100644 MetalogDB/R/fetch_with_cache.R delete mode 100644 MetalogDB/R/filter_samples.R delete mode 100644 MetalogDB/R/inject_provenance.R delete mode 100644 MetalogDB/R/read_metalog_data.R delete mode 100644 MetalogDB/R/resolve_metalog_url.R delete mode 100644 MetalogDB/R/validate_inputs.R diff --git a/MetalogDB/R/build_tse.R b/MetalogDB/R/build_tse.R deleted file mode 100644 index e69de29..0000000 diff --git a/MetalogDB/R/download_if_missing.R b/MetalogDB/R/download_if_missing.R deleted file mode 100644 index e0d1866..0000000 --- a/MetalogDB/R/download_if_missing.R +++ /dev/null @@ -1,39 +0,0 @@ -# Adapted from Metalog's example script - -.download_if_missing <- function(target_url, download_dir = ".data_cache", use_cache = TRUE) { - - base_filename <- basename(target_url) - - # Caching Logic - if (use_cache) { - # Replace "latest" with a date regex pattern (YYYY-MM-DD) - pattern <- sub("latest", "[0-9]{4}-[0-9]{2}-[0-9]{2}", base_filename) - matching_files <- list.files(download_dir, pattern = pattern, full.names = TRUE) - - if (length(matching_files) > 0) { - latest_file <- max(matching_files) - message("Loaded cached file: ", latest_file) - return(latest_file) - } - } else { - message("Skipping cache. Forcing download for: ", base_filename) - } - - # We make a simple GET request first just to see where "latest" redirects us - message("Fetching file from Metalog...") - response <- httr::GET(base_url, httr::config(followlocation = TRUE)) - - if (httr::status_code(response) != 200) { - stop("Error fetching the file! Status code: ", httr::status_code(response)) - } - - # Extract the final URL and save - url_with_date <- response$url - filename <- basename(url_with_date) - destfile <- file.path(download_dir, filename) - - message("Downloading to: ", destfile) - httr::GET(url_with_date, httr::write_disk(destfile, overwrite = TRUE)) - - return(destfile) -} diff --git a/MetalogDB/R/fetch_with_cache.R b/MetalogDB/R/fetch_with_cache.R deleted file mode 100644 index e69de29..0000000 diff --git a/MetalogDB/R/filter_samples.R b/MetalogDB/R/filter_samples.R deleted file mode 100644 index e69de29..0000000 diff --git a/MetalogDB/R/inject_provenance.R b/MetalogDB/R/inject_provenance.R deleted file mode 100644 index e69de29..0000000 diff --git a/MetalogDB/R/read_metalog_data.R b/MetalogDB/R/read_metalog_data.R deleted file mode 100644 index e69de29..0000000 diff --git a/MetalogDB/R/resolve_metalog_url.R b/MetalogDB/R/resolve_metalog_url.R deleted file mode 100644 index 3a6dbc6..0000000 --- a/MetalogDB/R/resolve_metalog_url.R +++ /dev/null @@ -1,47 +0,0 @@ -# Function constructs download URL for requested metalog data from DB, -# If cache, checks if files already exist in datadir, and downloads if not. -# Otherwise downloads files to datadir. - -.resolve_metalog_url <- function(collection, metadata, use_cache) { - - cache_dir <- ".data_cache" - if (!dir.exists(cache_dir)) { - dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) - } - - # Construct download URL's into target list - base_url <- "https://metalog.embl.de/static/download" - - profile <- dplyr::case_when( - collection == "human" ~ "human", - collection == "animal" ~ "animal", - collection == "ocean" ~ "ocean", - collection == "other_environment" ~ "environmental" - ) - assay_url <- sprintf("%s/profiles/%s_metaphlan4_latest.tsv.gz", base_url, profile) - - md_type <- dplyr::case_when( - metadata == "core" ~ "core", - metadata == "partially_harmonized" ~ "extended", - metadata == "all" ~ "all" - ) - md_url <- sprintf("%s/metadata/%s_%s_long_latest.tsv.gz", base_url, profile, md_type) - - # Get files - assay_file <- .download_if_missing( - target_url = assay_url, - download_dir = cache_dir, - use_cache = use_cache - ) - - md_file <- .download_if_missing( - target_url = md_url, - download_dir = cache_dir, - use_cache = use_cache - ) - - return(list( - assay = assay_file, - md = md_file - )) -} diff --git a/MetalogDB/R/validate_inputs.R b/MetalogDB/R/validate_inputs.R deleted file mode 100644 index d17fe0a..0000000 --- a/MetalogDB/R/validate_inputs.R +++ /dev/null @@ -1,36 +0,0 @@ -# Function to check that inputs to fetchMetalogTSE - -.validate_inputs <- function(collection, metadata, samplelist, use_cache) { - allowed_collections <- c("human", "animal", "ocean_water", "other_environmental") - if (missing(collection) || !collection %in% allowed_collections) { - stop( - "Validation Error: 'collection' must be one of: ", - paste(paste0('""', allowed_collections, '""'), collapse = ", ") - ) - } - - allowed_metadata <- c("core", "partially_harmonized", "all") - if (!metadata %in% allowed_metadata) { - stop( - "Validation Error: 'metadata' must be one of: ", - paste(paste0('""', allowed_metadata, '""'), collapse = ", ") - ) - } - - if (!is.null(samplelist)) { - ext <- tolower(tools::file_ext(samplelist)) - allowed_exts <- c("csv", "tsv", "txt", "json") - if (!ext %in% allowed_exts) { - stop( - "Validation Error: samplelist file type must be one of: ", - paste(allowed_exts, collapse = ", ") - ) - } - } - - if (!is.logical(use_cache) || length(use_cache) != 1 || is.na(use_cache)) { - stop("Validation Error: 'use_cache' must be a single logical value (TRUE or FALSE)") - } - - invisible(NULL) -} From b90f122a89fca3e77d558095c317e4c21404e2a8 Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Wed, 1 Apr 2026 12:14:37 +0300 Subject: [PATCH 14/16] add nix stuff to gitignore --- MetalogDB/.gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 MetalogDB/.gitignore diff --git a/MetalogDB/.gitignore b/MetalogDB/.gitignore new file mode 100644 index 0000000..7259285 --- /dev/null +++ b/MetalogDB/.gitignore @@ -0,0 +1,4 @@ +/.data_cache +*.nix +.envrc +/.direnv From 59f9647cdd33329fa4382bfe80c8f0dabe90d38d Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Wed, 1 Apr 2026 12:17:17 +0300 Subject: [PATCH 15/16] remove nix stuff before PR --- MetalogDB/.Rprofile | 54 ---------------------------------- MetalogDB/.envrc | 1 - MetalogDB/.gitignore | 4 --- MetalogDB/default.nix | 62 ---------------------------------------- MetalogDB/generate_env.R | 23 --------------- 5 files changed, 144 deletions(-) delete mode 100644 MetalogDB/.Rprofile delete mode 100644 MetalogDB/.envrc delete mode 100644 MetalogDB/.gitignore delete mode 100644 MetalogDB/default.nix delete mode 100644 MetalogDB/generate_env.R diff --git a/MetalogDB/.Rprofile b/MetalogDB/.Rprofile deleted file mode 100644 index 91774d0..0000000 --- a/MetalogDB/.Rprofile +++ /dev/null @@ -1,54 +0,0 @@ -### File generated by `rix::rix_init()` ### -# 1. Currently, system RStudio does not inherit environmental variables -# defined in `$HOME/.zshrc`, `$HOME/.bashrc` and alike. This is workaround to -# make the path of the nix store and hence basic nix commands available -# in an RStudio session -# 2. For nix-R session, remove `R_LIBS_USER`, system's R user library.`. -# This guarantees no user libraries from the system are loaded and only -# R packages in the Nix store are used. This makes Nix-R behave in pure manner -# at run-time. -{ - is_rstudio <- Sys.getenv("RSTUDIO") == "1" - is_nix_r <- nzchar(Sys.getenv("NIX_STORE")) - is_code <- Sys.getenv("TERM_PROGRAM") == "vscode" - is_positron <- Sys.getenv("POSITRON") == "1" - if (isFALSE(is_nix_r) && isTRUE(is_rstudio)) { - cat("{rix} detected RStudio R session") - old_path <- Sys.getenv("PATH") - nix_path <- "/nix/var/nix/profiles/default/bin" - has_nix_path <- any(grepl(nix_path, old_path)) - if (isFALSE(has_nix_path)) { - Sys.setenv(PATH = paste(old_path, nix_path, sep = ":")) - } - rm(old_path, nix_path) - } - if (isTRUE(is_nix_r)) { - install.packages <- function(...) { - stop("You are currently in an R session running from Nix.\n", "Don't install packages using install.packages(),\nadd them to ", "the default.nix file instead.") - } - update.packages <- function(...) { - stop("You are currently in an R session running from Nix.\n", "Don't update packages using update.packages(),\n", "generate a new default.nix with a more recent version of R. ", "If you need bleeding edge packages, read the", "'Understanding the rPackages set release cycle and using ", "bleeding edge packages' vignette.") - } - remove.packages <- function(...) { - stop("You are currently in an R session running from Nix.\n", "Don't remove packages using `remove.packages()``,\ndelete them ", "from the default.nix file instead.") - } - current_paths <- .libPaths() - userlib_paths <- Sys.getenv("R_LIBS_USER") - user_dir <- grep(paste(userlib_paths, collapse = "|"), current_paths, fixed = TRUE) - new_paths <- current_paths[-user_dir] - .libPaths(new_paths) - rm(current_paths, userlib_paths, user_dir, new_paths) - } - if (isTRUE(is_code) && interactive() && isFALSE(is_rstudio) && isFALSE(is_positron)) { - vscode_r_init <- file.path(Sys.getenv(if (.Platform$OS.type == "windows") - "USERPROFILE" - else "HOME"), ".vscode-R", "init.R") - if (file.exists(vscode_r_init)) { - source(vscode_r_init) - } - else { - message("No .vscode-R/init.R file found. If you want to use VSCode-R, you need to source it in your .Rprofile or start vscode from within nix-shell") - } - } - rm(is_rstudio, is_nix_r, is_code, is_positron) -} diff --git a/MetalogDB/.envrc b/MetalogDB/.envrc deleted file mode 100644 index 1d953f4..0000000 --- a/MetalogDB/.envrc +++ /dev/null @@ -1 +0,0 @@ -use nix diff --git a/MetalogDB/.gitignore b/MetalogDB/.gitignore deleted file mode 100644 index 7259285..0000000 --- a/MetalogDB/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -/.data_cache -*.nix -.envrc -/.direnv diff --git a/MetalogDB/default.nix b/MetalogDB/default.nix deleted file mode 100644 index 5dbc01b..0000000 --- a/MetalogDB/default.nix +++ /dev/null @@ -1,62 +0,0 @@ -# This file was generated by the {rix} R package v0.18.2 on 2026-04-01 -# with following call: -# >rix(r_ver = "98d89b5d2a42d5f965c30ef32eee726a11357cfd", -# > r_pkgs = c("Matrix", -# > "TreeSummarizedExperiment", -# > "data.table", -# > "dplyr", -# > "httr", -# > "jsonlite", -# > "mia", -# > "rix"), -# > system_pkgs = NULL, -# > git_pkgs = NULL, -# > ide = "radian", -# > project_path = path_default_nix, -# > overwrite = TRUE, -# > print = FALSE) -# It uses upstream nixpkgs' revision 98d89b5d2a42d5f965c30ef32eee726a11357cfd for reproducibility purposes -# which will install R version latest-upstream. -# Report any issues to https://github.com/ropensci/rix -let - pkgs = import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/98d89b5d2a42d5f965c30ef32eee726a11357cfd.tar.gz") {}; - - rpkgs = builtins.attrValues { - inherit (pkgs.rPackages) - data_table - dplyr - httr - jsonlite - Matrix - mia - rix - TreeSummarizedExperiment; - }; - - system_packages = builtins.attrValues { - inherit (pkgs) - glibcLocales - nix - R; - }; - - wrapped_pkgs = pkgs.radianWrapper.override { - packages = [ rpkgs ]; - }; - - shell = pkgs.mkShell { - LOCALE_ARCHIVE = if pkgs.stdenv.hostPlatform.system == "x86_64-linux" then "${pkgs.glibcLocales}/lib/locale/locale-archive" else ""; - LANG = "en_US.UTF-8"; - LC_ALL = "en_US.UTF-8"; - LC_TIME = "en_US.UTF-8"; - LC_MONETARY = "en_US.UTF-8"; - LC_PAPER = "en_US.UTF-8"; - LC_MEASUREMENT = "en_US.UTF-8"; - - buildInputs = [ rpkgs system_packages wrapped_pkgs ]; - - }; -in - { - inherit pkgs shell; - } diff --git a/MetalogDB/generate_env.R b/MetalogDB/generate_env.R deleted file mode 100644 index 3d022a6..0000000 --- a/MetalogDB/generate_env.R +++ /dev/null @@ -1,23 +0,0 @@ -library(rix) - -path_default_nix <- "." - -rix( - r_ver = "latest-upstream", - r_pkgs = c( - "Matrix", - "TreeSummarizedExperiment", - "data.table", - "dplyr", - "httr", - "jsonlite", - "mia", # Not strictly needed, just useful to test - "rix" # Can be removed from final - ), - system_pkgs = NULL, - git_pkgs = NULL, - ide = "radian", - project_path = path_default_nix, - overwrite = TRUE, - print = FALSE -) From e3b5989dc95a25dd4a02f6a51bb72895b6383bdc Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Wed, 1 Apr 2026 12:20:35 +0300 Subject: [PATCH 16/16] Mia not needed --- MetalogDB/fetchMetalogTSE.R | 1 - 1 file changed, 1 deletion(-) diff --git a/MetalogDB/fetchMetalogTSE.R b/MetalogDB/fetchMetalogTSE.R index 5c89511..5bc698d 100644 --- a/MetalogDB/fetchMetalogTSE.R +++ b/MetalogDB/fetchMetalogTSE.R @@ -11,7 +11,6 @@ library(TreeSummarizedExperiment) library(data.table) library(dplyr) library(httr) -library(mia) # ------------- # Helpers FUNs