diff --git a/Cargo.lock b/Cargo.lock
index a86b9cb..4fd915d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -250,6 +250,21 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "bit-set"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
+dependencies = [
+ "bit-vec",
+]
+
+[[package]]
+name = "bit-vec"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
+
 [[package]]
 name = "bitarray"
 version = "0.1.0"
@@ -634,6 +649,17 @@ name = "fa-compression"
 version = "0.1.0"
 source = "git+https://github.com/unipept/unipept-index.git#2ff35f06ab808a44f1542ddf4ff86e94f6c70f91"
 
+[[package]]
+name = "fancy-regex"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298"
+dependencies = [
+ "bit-set",
+ "regex-automata",
+ "regex-syntax",
+]
+
 [[package]]
 name = "fastrand"
 version = "2.3.0"
@@ -2540,6 +2566,7 @@ dependencies = [
  "criterion",
  "database",
  "datastore",
+ "fancy-regex",
  "http",
  "http-body-util",
  "index",
diff --git a/api/Cargo.toml b/api/Cargo.toml
index 7a2651a..e0f71ab 100644
--- a/api/Cargo.toml
+++ b/api/Cargo.toml
@@ -27,6 +27,7 @@ tower-layer = "0.3.2"
 tower-service = "0.3.2"
 itertools = "0.13.0"
 reqwest = { version = "0.12.8", features = [ "json" ] }
+fancy-regex = "0.14"
 
 [dev-dependencies]
 criterion = "0.5.1"
diff --git a/api/src/controllers/private_api/mod.rs b/api/src/controllers/private_api/mod.rs
index 3544aba..8b6ef6d 100644
--- a/api/src/controllers/private_api/mod.rs
+++ b/api/src/controllers/private_api/mod.rs
@@ -1,3 +1,4 @@
+pub mod unique_peptides;
 pub mod ecnumbers;
 pub mod goterms;
 pub mod interpros;
diff --git a/api/src/controllers/private_api/unique_peptides.rs b/api/src/controllers/private_api/unique_peptides.rs
new file mode 100644
index 0000000..e1a5494
--- /dev/null
+++ b/api/src/controllers/private_api/unique_peptides.rs
@@ -0,0 +1,176 @@
+use axum::{extract::State, Json};
+use datastore::LineageRank;
+use fancy_regex::Regex;
+use serde::{Deserialize, Serialize};
+
+use crate::{
+    controllers::generate_handlers,
+    errors::ApiError,
+    AppState,
+};
+use database::get_proteins_for_taxon;
+
+/// Default cleavage pattern: a `K` or `R` that is not directly followed by `P`.
+fn default_cleavage_regex() -> String {
+    String::from("[KR](?!P)")
+}
+
+/// Default minimum fragment length, in bytes of the sequence string.
+fn default_min_length() -> usize {
+    5
+}
+
+/// Request parameters of the unique-peptides endpoint.
+#[derive(Deserialize)]
+pub struct Parameters {
+    /// Taxon whose proteins are cleaved; the handler only accepts species or strain rank.
+    taxon_id: u32,
+    /// Pattern whose match ends mark the cleavage positions.
+    #[serde(default = "default_cleavage_regex")]
+    cleavage_regex: String,
+    /// Fragments shorter than this many bytes are discarded.
+    #[serde(default = "default_min_length")]
+    min_length: usize,
+}
+
+/// Response body of the unique-peptides endpoint.
+#[derive(Serialize)]
+pub struct UniquePeptidesResult {
+    /// Peptides whose index matches all belong to the requested taxon.
+    unique_peptides: Vec<String>,
+    /// Number of distinct fragments (after length filtering) that were looked up.
+    total_peptides: usize,
+    /// Length of `unique_peptides`.
+    total_unique_peptides: usize,
+}
+
+/// Splits `sequence` into fragments, cutting directly after every match of `re`.
+///
+/// Each fragment runs from the end of the previous match up to and including the
+/// current match; whatever follows the last match becomes the final fragment.
+/// Matches that would produce an empty fragment are skipped.
+fn cleave_sequence(sequence: &str, re: &Regex) -> Vec<String> {
+    let mut fragments = Vec::new();
+    let mut start = 0;
+    // `flatten` drops `Err` items, so a failure while matching is not reported here.
+    for m in re.find_iter(sequence).flatten() {
+        let end = m.end();
+        if end > start {
+            fragments.push(sequence[start..end].to_string());
+        }
+        start = end;
+    }
+    if start < sequence.len() {
+        fragments.push(sequence[start..].to_string());
+    }
+    fragments
+}
+
+/// Computes the peptides that are unique to `taxon_id`.
+///
+/// Validates the cleavage pattern and the taxon rank, fetches the taxon's proteins,
+/// cleaves them, keeps the distinct fragments of at least `min_length` bytes and looks
+/// those up in the index. A peptide counts as unique when its lookup did not use the
+/// cutoff, matched at least one protein, and every matched protein has this taxon.
+///
+/// # Errors
+/// Returns `ApiError::UnknownRankError` when the pattern does not compile, the taxon is
+/// not in the taxon store, or its rank is neither species nor strain. Errors from the
+/// protein query and from joining the blocking index task are propagated.
+async fn handler(
+    State(AppState { index, datastore, database, .. }): State<AppState>,
+    Parameters { taxon_id, cleavage_regex, min_length }: Parameters,
+) -> Result<UniquePeptidesResult, ApiError> {
+    // NOTE(review): an invalid pattern and an unknown taxon are both reported through
+    // `UnknownRankError`; consider a dedicated variant if `ApiError` offers one.
+    let re = Regex::new(&cleavage_regex)
+        .map_err(|e| ApiError::UnknownRankError(format!("Invalid cleavage_regex: {}", e)))?;
+
+    let rank = datastore.taxon_store().get_rank(taxon_id)
+        .ok_or_else(|| ApiError::UnknownRankError(format!("Taxon {} not found", taxon_id)))?;
+
+    if *rank != LineageRank::Species && *rank != LineageRank::Strain {
+        return Err(ApiError::UnknownRankError(format!(
+            "Taxon {} is at rank '{}', but must be at species or strain level",
+            taxon_id, rank
+        )));
+    }
+
+    let proteins = get_proteins_for_taxon(database.get_conn(), taxon_id).await?;
+
+    let mut peptides: Vec<String> = proteins.iter()
+        .flat_map(|protein| cleave_sequence(&protein.protein, &re))
+        .filter(|f| f.len() >= min_length)
+        .collect();
+
+    peptides.sort_unstable();
+    peptides.dedup();
+
+    let total_peptides = peptides.len();
+
+    // The index lookup is a synchronous call, so it runs on the blocking thread pool; the
+    // peptides are moved into the task and handed back alongside the results.
+    let (peptides, results) = tokio::task::spawn_blocking(move || {
+        // NOTE(review): `Some(10_000)` is presumably the cutoff behind `cutoff_used`;
+        // confirm against `analyse`.
+        let results = index.analyse(&peptides, false, false, Some(10_000));
+        (peptides, results)
+    }).await?;
+
+    // Assumes `results` is ordered like `peptides` — TODO confirm against `analyse`.
+    let unique_peptides: Vec<String> = peptides.into_iter()
+        .zip(results)
+        .filter_map(|(peptide, result)| {
+            (!result.cutoff_used
+                && !result.proteins.is_empty()
+                && result.proteins.iter().all(|p| p.taxon == taxon_id))
+            .then_some(peptide)
+        })
+        .collect();
+
+    let total_unique_peptides = unique_peptides.len();
+
+    Ok(UniquePeptidesResult {
+        unique_peptides,
+        total_peptides,
+        total_unique_peptides,
+    })
+}
+
+// `routes.rs` refers to `get_json_handler` and `post_json_handler` of this module;
+// presumably both are generated from `json_handler` by this macro — verify in `generate_handlers`.
+generate_handlers!(
+    async fn json_handler(
+        state => State<AppState>,
+        params => Parameters
+    ) -> Result<Json<UniquePeptidesResult>, ApiError> {
+        Ok(Json(handler(state, params).await?))
+    }
+);
+
+#[cfg(test)]
+mod tests {
+    use super::cleave_sequence;
+    use fancy_regex::Regex;
+
+    #[test]
+    fn tryptic_cleavage_splits_after_k_and_r_not_before_p() {
+        let re = Regex::new("[KR](?!P)").unwrap();
+        // K at pos 1 not followed by P → split after K; R at end of string → split after R
+        assert_eq!(cleave_sequence("MKVTLPGAR", &re), vec!["MK", "VTLPGAR"]);
+    }
+
+    #[test]
+    fn tryptic_cleavage_skips_k_before_p() {
+        let re = Regex::new("[KR](?!P)").unwrap();
+        // K at pos 3 is followed by P → no split; R at end of string → split after R
+        assert_eq!(cleave_sequence("ACTKPDEFR", &re), vec!["ACTKPDEFR"]);
+    }
+
+    #[test]
+    fn sequence_without_cleavage_site_is_kept_whole() {
+        let re = Regex::new("[KR](?!P)").unwrap();
+        assert_eq!(cleave_sequence("ACDEF", &re), vec!["ACDEF"]);
+        assert!(cleave_sequence("", &re).is_empty());
+    }
+}
diff --git a/api/src/routes.rs b/api/src/routes.rs
index 4efae4e..96dcda1 100644
--- a/api/src/routes.rs
+++ b/api/src/routes.rs
@@ -19,7 +19,7 @@ use crate::{
         },
         datasets::sampledata,
         mpa::{pept2data},
-        private_api::{ecnumbers, goterms, interpros, metadata, proteins, proteins_filter, reference_proteomes, reference_proteomes_filter, taxa, taxa_filter}
+        private_api::{ecnumbers, goterms, interpros, metadata, proteins, proteins_filter, reference_proteomes, reference_proteomes_filter, taxa, taxa_filter, unique_peptides}
     },
     middleware::{
         cors::create_cors_layer,
@@ -146,6 +146,8 @@ fn create_private_api_routes() -> Router {
         "/taxa/count",
         get(taxa_filter::get_json_count_handler).post(taxa_filter::post_json_count_handler),
         "/taxa/filter",
-        get(taxa_filter::get_json_filter_handler).post(taxa_filter::post_json_filter_handler)
+        get(taxa_filter::get_json_filter_handler).post(taxa_filter::post_json_filter_handler),
+        "/taxa/unique_peptides",
+        get(unique_peptides::get_json_handler).post(unique_peptides::post_json_handler)
     )
 }
diff --git a/database/src/lib.rs b/database/src/lib.rs
index cf35fd4..dbeaab3 100644
--- a/database/src/lib.rs
+++ b/database/src/lib.rs
@@ -217,6 +217,107 @@ pub async fn get_accessions_count_by_filter(
         .unwrap_or(0) as u32)
 }
 
+/// Retrieves all proteins from the database that belong to the given taxon ID
+///
+/// # Arguments
+/// * `client` - OpenSearch connection handle
+/// * `taxon_id` - NCBI taxon ID to retrieve proteins for
+///
+/// # Returns
+/// * Vector of `UniprotEntry` records for all proteins associated with the taxon
+/// * `DatabaseError` if the request fails, OpenSearch answers with a non-success status,
+///   or the response is malformed (no hit list, an entry that cannot be decoded, or a
+///   full page without a string sort key to continue from)
+///
+/// Pages through the results with `search_after`, `PAGE_SIZE` hits per request ordered by
+/// `uniprot_accession_number`. A malformed response is reported as an error instead of
+/// silently returning a partial protein set.
+pub async fn get_proteins_for_taxon(
+    client: &OpenSearch,
+    taxon_id: u32,
+) -> Result<Vec<UniprotEntry>, DatabaseError> {
+    const PAGE_SIZE: usize = 1000;
+    let mut all_proteins: Vec<UniprotEntry> = Vec::with_capacity(PAGE_SIZE);
+    let mut search_after: Option<String> = None;
+
+    loop {
+        let mut body = json!({
+            "query": { "term": { "taxon_id": taxon_id } },
+            "size": PAGE_SIZE,
+            "sort": [{ "uniprot_accession_number": "asc" }]
+        });
+        if let Some(ref last_accession) = search_after {
+            body["search_after"] = json!([last_accession]);
+        }
+
+        let response = client
+            .search(SearchParts::Index(&["uniprot_entries"]))
+            .body(body)
+            .send()
+            .await?;
+
+        if !response.status_code().is_success() {
+            return Err(GeneralError(response.text().await?));
+        }
+
+        let mut response_body: serde_json::Value = response.json().await?;
+
+        // Take the hit list out of the response so every `_source` can be moved
+        // into the deserializer instead of being cloned.
+        let hits = match response_body
+            .get_mut("hits")
+            .and_then(|outer| outer.get_mut("hits"))
+            .map(serde_json::Value::take)
+        {
+            Some(serde_json::Value::Array(hits)) => hits,
+            _ => {
+                return Err(GeneralError(format!(
+                    "Malformed search response for taxon {}: missing hits array",
+                    taxon_id
+                )));
+            }
+        };
+
+        let hit_count = hits.len();
+
+        // Sort key of the last hit: the `search_after` cursor for the next page.
+        let next_cursor = hits
+            .last()
+            .and_then(|hit| hit["sort"][0].as_str())
+            .map(str::to_owned);
+
+        for mut hit in hits {
+            let source = hit
+                .get_mut("_source")
+                .map(serde_json::Value::take)
+                .unwrap_or_default();
+            let entry = serde_json::from_value::<UniprotEntry>(source).map_err(|e| {
+                GeneralError(format!(
+                    "Could not decode a protein of taxon {}: {}",
+                    taxon_id, e
+                ))
+            })?;
+            all_proteins.push(entry);
+        }
+
+        // A short page is the last page.
+        if hit_count < PAGE_SIZE {
+            break;
+        }
+
+        // Without a cursor the remaining pages are unreachable; fail rather than
+        // return a truncated result.
+        search_after = Some(next_cursor.ok_or_else(|| {
+            GeneralError(format!(
+                "Search response for taxon {} has no sort key to paginate from",
+                taxon_id
+            ))
+        })?);
+    }
+
+    Ok(all_proteins)
+}
+
 /// Gets UniProt accession IDs from the database that match the given filter criteria
 ///
 /// # Arguments
diff --git a/datastore/src/taxon_store.rs b/datastore/src/taxon_store.rs
index 57620b6..71f8327 100644
--- a/datastore/src/taxon_store.rs
+++ b/datastore/src/taxon_store.rs
@@ -85,6 +85,11 @@ impl TaxonStore {
         self.mapper.get(&key).map(|(name, _, _)| name)
     }
 
+    /// Returns the rank stored for taxon `key`, or `None` when the key is not in the mapper.
+    pub fn get_rank(&self, key: u32) -> Option<&LineageRank> {
+        self.mapper.get(&key).map(|(_, rank, _)| rank)
+    }
+
     pub fn is_valid(&self, key: u32) -> bool {
         self.mapper.contains_key(&key) && self.mapper[&key].2
     }