From 9a65ba55d8424284904d85ded91f85a95e041253 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Fri, 5 Jun 2026 15:18:16 +0200 Subject: [PATCH 1/2] Implement new private_api endpoint that computes the unique peptides for a taxon --- Cargo.lock | 27 ++++ api/Cargo.toml | 1 + api/src/controllers/private_api/mod.rs | 1 + .../private_api/unique_peptides.rs | 149 ++++++++++++++++++ api/src/routes.rs | 6 +- database/src/lib.rs | 75 +++++++++ 6 files changed, 257 insertions(+), 2 deletions(-) create mode 100644 api/src/controllers/private_api/unique_peptides.rs diff --git a/Cargo.lock b/Cargo.lock index a86b9cb..4fd915d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -250,6 +250,21 @@ dependencies = [ "serde", ] +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitarray" version = "0.1.0" @@ -634,6 +649,17 @@ name = "fa-compression" version = "0.1.0" source = "git+https://github.com/unipept/unipept-index.git#2ff35f06ab808a44f1542ddf4ff86e94f6c70f91" +[[package]] +name = "fancy-regex" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -2540,6 +2566,7 @@ dependencies = [ "criterion", "database", "datastore", + "fancy-regex", "http", "http-body-util", "index", diff --git a/api/Cargo.toml b/api/Cargo.toml index 7a2651a..e0f71ab 100644 --- a/api/Cargo.toml +++ b/api/Cargo.toml @@ -27,6 +27,7 @@ tower-layer = "0.3.2" tower-service = "0.3.2" itertools = "0.13.0" reqwest = { version = "0.12.8", features = [ "json" ] } +fancy-regex = "0.14" [dev-dependencies] criterion = "0.5.1" diff --git a/api/src/controllers/private_api/mod.rs b/api/src/controllers/private_api/mod.rs index 3544aba..8b6ef6d 100644 --- a/api/src/controllers/private_api/mod.rs +++ b/api/src/controllers/private_api/mod.rs @@ -1,3 +1,4 @@ +pub mod unique_peptides; pub mod ecnumbers; pub mod goterms; pub mod interpros; diff --git a/api/src/controllers/private_api/unique_peptides.rs b/api/src/controllers/private_api/unique_peptides.rs new file mode 100644 index 0000000..904af7c --- /dev/null +++ b/api/src/controllers/private_api/unique_peptides.rs @@ -0,0 +1,149 @@ +use axum::{extract::State, Json}; +use datastore::LineageRank; +use fancy_regex::Regex; +use serde::{Deserialize, Serialize}; + +use crate::{ + controllers::generate_handlers, + errors::ApiError, + AppState, +}; +use database::get_proteins_for_taxon; + +fn default_cleavage_regex() -> String { + String::from("[KR](?!P)") +} + +fn default_min_length() -> usize { + 5 +} + +#[derive(Deserialize)] +pub struct Parameters { + taxon_id: u32, + #[serde(default = "default_cleavage_regex")] + cleavage_regex: String, + #[serde(default = "default_min_length")] + min_length: usize, +} + +#[derive(Serialize)] +pub struct UniquePeptidesResult { + unique_peptides: Vec, + total_peptides: usize, + total_unique_peptides: usize, +} + +async fn handler( + State(AppState { index, datastore, database, .. }): State, + Parameters { taxon_id, cleavage_regex, min_length }: Parameters, +) -> Result { + let re = Regex::new(&cleavage_regex) + .map_err(|e| ApiError::UnknownRankError(format!("Invalid cleavage_regex: {}", e)))?; + + let taxon_info = datastore.taxon_store().get(taxon_id) + .ok_or_else(|| ApiError::UnknownRankError(format!("Taxon {} not found", taxon_id)))?; + + let rank = &taxon_info.1; + if *rank != LineageRank::Species && *rank != LineageRank::Strain { + return Err(ApiError::UnknownRankError(format!( + "Taxon {} is at rank '{}', but must be at species or strain level", + taxon_id, rank + ))); + } + + let proteins = get_proteins_for_taxon(database.get_conn(), taxon_id).await?; + + let mut peptides: Vec = proteins.iter().flat_map(|protein| { + let sequence = &protein.protein; + let mut fragments = Vec::new(); + let mut start = 0; + for m in re.find_iter(sequence).flatten() { + let end = m.end(); + if end > start { + fragments.push(sequence[start..end].to_string()); + } + start = end; + } + if start < sequence.len() { + fragments.push(sequence[start..].to_string()); + } + fragments + }).collect(); + + peptides.retain(|p| p.len() >= min_length); + peptides.sort(); + peptides.dedup(); + + let total_peptides = peptides.len(); + + let (peptides, results) = tokio::task::spawn_blocking(move || { + let results = index.analyse(&peptides, false, false, Some(10_000)); + (peptides, results) + }).await?; + + let unique_peptides: Vec = peptides.into_iter() + .zip(results.into_iter()) + .filter_map(|(peptide, result)| { + if result.cutoff_used || result.proteins.is_empty() { + return None; + } + if result.proteins.iter().all(|p| p.taxon == taxon_id) { + Some(peptide) + } else { + None + } + }) + .collect(); + + let total_unique_peptides = unique_peptides.len(); + + Ok(UniquePeptidesResult { + unique_peptides, + total_peptides, + total_unique_peptides, + }) +} + +generate_handlers!( + async fn json_handler( + state => State, + params => Parameters + ) -> Result, ApiError> { + Ok(Json(handler(state, params).await?)) + } +); + +#[cfg(test)] +mod tests { + use fancy_regex::Regex; + + fn cleave(sequence: &str, pattern: &str) -> Vec { + let re = Regex::new(pattern).unwrap(); + let mut fragments = Vec::new(); + let mut start = 0; + for m in re.find_iter(sequence).flatten() { + let end = m.end(); + if end > start { + fragments.push(sequence[start..end].to_string()); + } + start = end; + } + if start < sequence.len() { + fragments.push(sequence[start..].to_string()); + } + fragments + } + + #[test] + fn tryptic_cleavage_splits_after_k_and_r_not_before_p() { + // K at pos 1 not followed by P → split after K; R at end of string → split after R + assert_eq!(cleave("MKVTLPGAR", "[KR](?!P)"), vec!["MK", "VTLPGAR"]); + } + + #[test] + fn tryptic_cleavage_skips_k_before_p() { + // K at pos 3 is followed by P → no split; R at end of string → split after R + assert_eq!(cleave("ACTKPDEFR", "[KR](?!P)"), vec!["ACTKPDEFR"]); + } +} diff --git a/api/src/routes.rs b/api/src/routes.rs index 4efae4e..96dcda1 100644 --- a/api/src/routes.rs +++ b/api/src/routes.rs @@ -19,7 +19,7 @@ use crate::{ }, datasets::sampledata, mpa::{pept2data}, - private_api::{ecnumbers, goterms, interpros, metadata, proteins, proteins_filter, reference_proteomes, reference_proteomes_filter, taxa, taxa_filter} + private_api::{ecnumbers, goterms, interpros, metadata, proteins, proteins_filter, reference_proteomes, reference_proteomes_filter, taxa, taxa_filter, unique_peptides} }, middleware::{ cors::create_cors_layer, @@ -146,6 +146,8 @@ fn create_private_api_routes() -> Router { "/taxa/count", get(taxa_filter::get_json_count_handler).post(taxa_filter::post_json_count_handler), "/taxa/filter", - get(taxa_filter::get_json_filter_handler).post(taxa_filter::post_json_filter_handler) + get(taxa_filter::get_json_filter_handler).post(taxa_filter::post_json_filter_handler), + "/taxa/unique_peptides", + get(unique_peptides::get_json_handler).post(unique_peptides::post_json_handler) ) } diff --git a/database/src/lib.rs b/database/src/lib.rs index cf35fd4..d5f1ff8 100644 --- a/database/src/lib.rs +++ b/database/src/lib.rs @@ -217,6 +217,81 @@ pub async fn get_accessions_count_by_filter( .unwrap_or(0) as u32) } +/// Retrieves all proteins from the database that belong to the given taxon ID +/// +/// # Arguments +/// * `client` - OpenSearch connection handle +/// * `taxon_id` - NCBI taxon ID to retrieve proteins for +/// +/// # Returns +/// * Vector of `UniprotEntry` records for all proteins associated with the taxon +/// * `DatabaseError` if the database operation fails +/// +/// Uses `search_after` pagination to handle taxa with more than 10,000 proteins. +pub async fn get_proteins_for_taxon( + client: &OpenSearch, + taxon_id: u32, +) -> Result, DatabaseError> { + const PAGE_SIZE: usize = 1000; + let mut all_proteins: Vec = Vec::new(); + let mut search_after: Option = None; + + loop { + let body = if let Some(ref last_accession) = search_after { + json!({ + "query": { "term": { "taxon_id": taxon_id } }, + "size": PAGE_SIZE, + "sort": [{ "uniprot_accession_number": "asc" }], + "search_after": [last_accession] + }) + } else { + json!({ + "query": { "term": { "taxon_id": taxon_id } }, + "size": PAGE_SIZE, + "sort": [{ "uniprot_accession_number": "asc" }] + }) + }; + + let response = client + .search(SearchParts::Index(&["uniprot_entries"])) + .body(body) + .send() + .await?; + + if !response.status_code().is_success() { + return Err(GeneralError(response.text().await?)); + } + + let response_body: serde_json::Value = response.json().await?; + + let hits = match response_body["hits"]["hits"].as_array() { + Some(h) => h, + None => break, + }; + + let hit_count = hits.len(); + + for hit in hits { + if let Ok(entry) = serde_json::from_value::(hit["_source"].clone()) { + all_proteins.push(entry); + } + } + + if hit_count < PAGE_SIZE { + break; + } + + if let Some(last_hit) = hits.last() { + match last_hit["sort"][0].as_str() { + Some(sort_val) => search_after = Some(sort_val.to_string()), + None => break, + } + } + } + + Ok(all_proteins) +} + /// Gets UniProt accession IDs from the database that match the given filter criteria /// /// # Arguments From e554b6d97a799dbcdcfeb8172a2e1c4e8ae8b6d7 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Fri, 5 Jun 2026 15:29:57 +0200 Subject: [PATCH 2/2] Cleanup code --- .../private_api/unique_peptides.rs | 80 ++++++++----------- database/src/lib.rs | 24 +++--- datastore/src/taxon_store.rs | 4 + 3 files changed, 45 insertions(+), 63 deletions(-) diff --git a/api/src/controllers/private_api/unique_peptides.rs b/api/src/controllers/private_api/unique_peptides.rs index 904af7c..e1a5494 100644 --- a/api/src/controllers/private_api/unique_peptides.rs +++ b/api/src/controllers/private_api/unique_peptides.rs @@ -34,6 +34,22 @@ pub struct UniquePeptidesResult { total_unique_peptides: usize, } +fn cleave_sequence(sequence: &str, re: &Regex) -> Vec { + let mut fragments = Vec::new(); + let mut start = 0; + for m in re.find_iter(sequence).flatten() { + let end = m.end(); + if end > start { + fragments.push(sequence[start..end].to_string()); + } + start = end; + } + if start < sequence.len() { + fragments.push(sequence[start..].to_string()); + } + fragments +} + async fn handler( State(AppState { index, datastore, database, .. }): State, Parameters { taxon_id, cleavage_regex, min_length }: Parameters, @@ -41,10 +57,9 @@ async fn handler( let re = Regex::new(&cleavage_regex) .map_err(|e| ApiError::UnknownRankError(format!("Invalid cleavage_regex: {}", e)))?; - let taxon_info = datastore.taxon_store().get(taxon_id) + let rank = datastore.taxon_store().get_rank(taxon_id) .ok_or_else(|| ApiError::UnknownRankError(format!("Taxon {} not found", taxon_id)))?; - let rank = &taxon_info.1; if *rank != LineageRank::Species && *rank != LineageRank::Strain { return Err(ApiError::UnknownRankError(format!( "Taxon {} is at rank '{}', but must be at species or strain level", @@ -54,25 +69,12 @@ async fn handler( let proteins = get_proteins_for_taxon(database.get_conn(), taxon_id).await?; - let mut peptides: Vec = proteins.iter().flat_map(|protein| { - let sequence = &protein.protein; - let mut fragments = Vec::new(); - let mut start = 0; - for m in re.find_iter(sequence).flatten() { - let end = m.end(); - if end > start { - fragments.push(sequence[start..end].to_string()); - } - start = end; - } - if start < sequence.len() { - fragments.push(sequence[start..].to_string()); - } - fragments - }).collect(); + let mut peptides: Vec = proteins.iter() + .flat_map(|protein| cleave_sequence(&protein.protein, &re)) + .filter(|f| f.len() >= min_length) + .collect(); - peptides.retain(|p| p.len() >= min_length); - peptides.sort(); + peptides.sort_unstable(); peptides.dedup(); let total_peptides = peptides.len(); @@ -83,16 +85,12 @@ async fn handler( }).await?; let unique_peptides: Vec = peptides.into_iter() - .zip(results.into_iter()) + .zip(results) .filter_map(|(peptide, result)| { - if result.cutoff_used || result.proteins.is_empty() { - return None; - } - if result.proteins.iter().all(|p| p.taxon == taxon_id) { - Some(peptide) - } else { - None - } + (!result.cutoff_used + && !result.proteins.is_empty() + && result.proteins.iter().all(|p| p.taxon == taxon_id)) + .then_some(peptide) }) .collect(); @@ -116,34 +114,20 @@ generate_handlers!( #[cfg(test)] mod tests { + use super::cleave_sequence; use fancy_regex::Regex; - fn cleave(sequence: &str, pattern: &str) -> Vec { - let re = Regex::new(pattern).unwrap(); - let mut fragments = Vec::new(); - let mut start = 0; - for m in re.find_iter(sequence).flatten() { - let end = m.end(); - if end > start { - fragments.push(sequence[start..end].to_string()); - } - start = end; - } - if start < sequence.len() { - fragments.push(sequence[start..].to_string()); - } - fragments - } - #[test] fn tryptic_cleavage_splits_after_k_and_r_not_before_p() { + let re = Regex::new("[KR](?!P)").unwrap(); // K at pos 1 not followed by P → split after K; R at end of string → split after R - assert_eq!(cleave("MKVTLPGAR", "[KR](?!P)"), vec!["MK", "VTLPGAR"]); + assert_eq!(cleave_sequence("MKVTLPGAR", &re), vec!["MK", "VTLPGAR"]); } #[test] fn tryptic_cleavage_skips_k_before_p() { + let re = Regex::new("[KR](?!P)").unwrap(); // K at pos 3 is followed by P → no split; R at end of string → split after R - assert_eq!(cleave("ACTKPDEFR", "[KR](?!P)"), vec!["ACTKPDEFR"]); + assert_eq!(cleave_sequence("ACTKPDEFR", &re), vec!["ACTKPDEFR"]); } } diff --git a/database/src/lib.rs b/database/src/lib.rs index d5f1ff8..dbeaab3 100644 --- a/database/src/lib.rs +++ b/database/src/lib.rs @@ -233,24 +233,18 @@ pub async fn get_proteins_for_taxon( taxon_id: u32, ) -> Result, DatabaseError> { const PAGE_SIZE: usize = 1000; - let mut all_proteins: Vec = Vec::new(); + let mut all_proteins: Vec = Vec::with_capacity(PAGE_SIZE); let mut search_after: Option = None; loop { - let body = if let Some(ref last_accession) = search_after { - json!({ - "query": { "term": { "taxon_id": taxon_id } }, - "size": PAGE_SIZE, - "sort": [{ "uniprot_accession_number": "asc" }], - "search_after": [last_accession] - }) - } else { - json!({ - "query": { "term": { "taxon_id": taxon_id } }, - "size": PAGE_SIZE, - "sort": [{ "uniprot_accession_number": "asc" }] - }) - }; + let mut body = json!({ + "query": { "term": { "taxon_id": taxon_id } }, + "size": PAGE_SIZE, + "sort": [{ "uniprot_accession_number": "asc" }] + }); + if let Some(ref last_accession) = search_after { + body["search_after"] = json!([last_accession]); + } let response = client .search(SearchParts::Index(&["uniprot_entries"])) diff --git a/datastore/src/taxon_store.rs b/datastore/src/taxon_store.rs index 57620b6..71f8327 100644 --- a/datastore/src/taxon_store.rs +++ b/datastore/src/taxon_store.rs @@ -85,6 +85,10 @@ impl TaxonStore { self.mapper.get(&key).map(|(name, _, _)| name) } + pub fn get_rank(&self, key: u32) -> Option<&LineageRank> { + self.mapper.get(&key).map(|(_, rank, _)| rank) + } + pub fn is_valid(&self, key: u32) -> bool { self.mapper.contains_key(&key) && self.mapper[&key].2 }