Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions api/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ tower-layer = "0.3.2"
tower-service = "0.3.2"
itertools = "0.13.0"
reqwest = { version = "0.12.8", features = [ "json" ] }
fancy-regex = "0.14"

[dev-dependencies]
criterion = "0.5.1"
Expand Down
1 change: 1 addition & 0 deletions api/src/controllers/private_api/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub mod unique_peptides;
pub mod ecnumbers;
pub mod goterms;
pub mod interpros;
Expand Down
133 changes: 133 additions & 0 deletions api/src/controllers/private_api/unique_peptides.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
use axum::{extract::State, Json};
use datastore::LineageRank;
use fancy_regex::Regex;
use serde::{Deserialize, Serialize};

use crate::{
controllers::generate_handlers,
errors::ApiError,
AppState,
};
use database::get_proteins_for_taxon;

/// Serde default for `Parameters::cleavage_regex`.
///
/// The pattern matches a `K` or `R` that is not immediately followed by `P`.
fn default_cleavage_regex() -> String {
    "[KR](?!P)".to_owned()
}

/// Serde default for `Parameters::min_length`.
fn default_min_length() -> usize {
    const DEFAULT_MIN_LENGTH: usize = 5;
    DEFAULT_MIN_LENGTH
}

/// Request parameters accepted by the unique-peptides endpoint.
#[derive(Deserialize)]
pub struct Parameters {
/// Taxon whose proteins are digested; the handler rejects taxa whose rank is
/// neither species nor strain.
taxon_id: u32,
/// Pattern compiled with `fancy_regex`; each protein sequence is cut right
/// after every match. Falls back to `default_cleavage_regex()` when omitted.
#[serde(default = "default_cleavage_regex")]
cleavage_regex: String,
/// Fragments whose byte length is below this value are discarded. Falls back
/// to `default_min_length()` when omitted.
#[serde(default = "default_min_length")]
min_length: usize,
}

/// Response payload of the unique-peptides endpoint.
#[derive(Serialize)]
pub struct UniquePeptidesResult {
/// Peptides whose index matches all carry the requested taxon ID (and for
/// which the analysis did not report a cutoff).
unique_peptides: Vec<String>,
/// Number of distinct peptides that were analysed, i.e. the count after the
/// length filter and de-duplication.
total_peptides: usize,
/// Length of `unique_peptides`.
total_unique_peptides: usize,
}

/// Splits `sequence` into consecutive fragments, cutting right after the end
/// of every match of `re`.
///
/// Zero-length pieces are never emitted, and whatever follows the last cut is
/// returned as the final fragment. Match attempts that yield an error are
/// skipped rather than reported.
// NOTE(review): because errors are dropped, a regex that fails at runtime
// (e.g. by hitting a backtracking limit) produces fewer cuts without any
// signal to the caller — confirm this best-effort behaviour is intended.
fn cleave_sequence(sequence: &str, re: &Regex) -> Vec<String> {
    let mut pieces: Vec<String> = Vec::new();
    let mut cursor = 0;

    let cut_points = re
        .find_iter(sequence)
        .filter_map(Result::ok)
        .map(|hit| hit.end());

    for cut in cut_points {
        // An empty match at the cursor would give an empty piece; skip it.
        if cut > cursor {
            pieces.push(sequence[cursor..cut].to_owned());
        }
        cursor = cut;
    }

    let tail = &sequence[cursor..];
    if !tail.is_empty() {
        pieces.push(tail.to_owned());
    }

    pieces
}

/// Computes the peptides that are unique to a species- or strain-level taxon.
///
/// Steps:
/// 1. compile the cleavage regex and check the taxon's rank,
/// 2. fetch every protein of the taxon and cleave it into peptides,
/// 3. keep peptides of at least `min_length` bytes, sorted and de-duplicated,
/// 4. look all peptides up in the index on a blocking thread,
/// 5. keep the peptides whose matches all belong to `taxon_id`.
///
/// # Errors
/// Returns `ApiError::UnknownRankError` when the regex does not compile, the
/// taxon is unknown, or its rank is neither species nor strain. Errors from
/// the database lookup and from joining the blocking task are propagated.
async fn handler(
    State(AppState { index, datastore, database, .. }): State<AppState>,
    Parameters { taxon_id, cleavage_regex, min_length }: Parameters,
) -> Result<UniquePeptidesResult, ApiError> {
    let re = match Regex::new(&cleavage_regex) {
        Ok(compiled) => compiled,
        Err(e) => {
            return Err(ApiError::UnknownRankError(format!("Invalid cleavage_regex: {}", e)));
        }
    };

    let rank = match datastore.taxon_store().get_rank(taxon_id) {
        Some(found) => found,
        None => {
            return Err(ApiError::UnknownRankError(format!("Taxon {} not found", taxon_id)));
        }
    };

    let rank_is_allowed = *rank == LineageRank::Species || *rank == LineageRank::Strain;
    if !rank_is_allowed {
        return Err(ApiError::UnknownRankError(format!(
            "Taxon {} is at rank '{}', but must be at species or strain level",
            taxon_id, rank
        )));
    }

    let proteins = get_proteins_for_taxon(database.get_conn(), taxon_id).await?;

    // Digest every protein and keep only the fragments that are long enough.
    let mut peptides: Vec<String> = Vec::new();
    for protein in &proteins {
        let fragments = cleave_sequence(&protein.protein, &re);
        peptides.extend(
            fragments
                .into_iter()
                .filter(|fragment| fragment.len() >= min_length),
        );
    }

    // Sorting first makes `dedup` remove every duplicate, not just adjacent ones.
    peptides.sort_unstable();
    peptides.dedup();

    let total_peptides = peptides.len();

    // The index lookup runs on a blocking thread; the peptide list is moved
    // into the task and handed back so it can be zipped with the results.
    // `Some(10_000)` is presumably a per-peptide match cutoff (see the
    // `cutoff_used` check below) — confirm against the index API.
    let analysis_task = tokio::task::spawn_blocking(move || {
        let results = index.analyse(&peptides, false, false, Some(10_000));
        (peptides, results)
    });
    let (peptides, results) = analysis_task.await?;

    let mut unique_peptides: Vec<String> = Vec::new();
    for (peptide, result) in peptides.into_iter().zip(results) {
        // A result that hit the cutoff or has no matches can never qualify.
        if result.cutoff_used || result.proteins.is_empty() {
            continue;
        }
        if result.proteins.iter().all(|p| p.taxon == taxon_id) {
            unique_peptides.push(peptide);
        }
    }

    let total_unique_peptides = unique_peptides.len();

    Ok(UniquePeptidesResult {
        unique_peptides,
        total_peptides,
        total_unique_peptides,
    })
}

// Wraps `handler` so that its result is returned as a JSON response body.
// NOTE(review): `generate_handlers!` is defined elsewhere; routes.rs refers to
// `unique_peptides::get_json_handler` and `unique_peptides::post_json_handler`,
// so the macro presumably expands this definition into those two variants —
// confirm against the macro definition.
generate_handlers!(
async fn json_handler(
state => State<AppState>,
params => Parameters
) -> Result<Json<UniquePeptidesResult>, ApiError> {
Ok(Json(handler(state, params).await?))
}
);

#[cfg(test)]
mod tests {
    use super::cleave_sequence;
    use fancy_regex::Regex;

    /// Same pattern as the endpoint's default cleavage regex.
    const TRYPTIC_PATTERN: &str = "[KR](?!P)";

    /// Cleaves `sequence` with the tryptic pattern.
    fn tryptic_fragments(sequence: &str) -> Vec<String> {
        let re = Regex::new(TRYPTIC_PATTERN).unwrap();
        cleave_sequence(sequence, &re)
    }

    #[test]
    fn tryptic_cleavage_splits_after_k_and_r_not_before_p() {
        // The K right after the leading M is not followed by P, so the
        // sequence is cut after it; the trailing R matches too, but nothing
        // is left after it to form another fragment.
        let fragments = tryptic_fragments("MKVTLPGAR");
        assert_eq!(fragments, vec!["MK", "VTLPGAR"]);
    }

    #[test]
    fn tryptic_cleavage_skips_k_before_p() {
        // The only K is directly followed by P, so no cut happens there; the
        // trailing R is the only match and leaves the sequence in one piece.
        let fragments = tryptic_fragments("ACTKPDEFR");
        assert_eq!(fragments, vec!["ACTKPDEFR"]);
    }
}
6 changes: 4 additions & 2 deletions api/src/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use crate::{
},
datasets::sampledata,
mpa::{pept2data},
private_api::{ecnumbers, goterms, interpros, metadata, proteins, proteins_filter, reference_proteomes, reference_proteomes_filter, taxa, taxa_filter}
private_api::{ecnumbers, goterms, interpros, metadata, proteins, proteins_filter, reference_proteomes, reference_proteomes_filter, taxa, taxa_filter, unique_peptides}
},
middleware::{
cors::create_cors_layer,
Expand Down Expand Up @@ -146,6 +146,8 @@ fn create_private_api_routes() -> Router<AppState> {
"/taxa/count",
get(taxa_filter::get_json_count_handler).post(taxa_filter::post_json_count_handler),
"/taxa/filter",
get(taxa_filter::get_json_filter_handler).post(taxa_filter::post_json_filter_handler)
get(taxa_filter::get_json_filter_handler).post(taxa_filter::post_json_filter_handler),
"/taxa/unique_peptides",
get(unique_peptides::get_json_handler).post(unique_peptides::post_json_handler)
)
}
69 changes: 69 additions & 0 deletions database/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,75 @@ pub async fn get_accessions_count_by_filter(
.unwrap_or(0) as u32)
}

/// Retrieves all proteins from the database that belong to the given taxon ID
///
/// # Arguments
/// * `client` - OpenSearch connection handle
/// * `taxon_id` - NCBI taxon ID to retrieve proteins for
///
/// # Returns
/// * Vector of `UniprotEntry` records for all proteins associated with the taxon
/// * `DatabaseError` if the request fails, the server answers with a
///   non-success status, the response has no `hits.hits` array, or a full page
///   comes back without sort values to continue from
///
/// Uses `search_after` pagination (pages of `PAGE_SIZE` hits, sorted by
/// `uniprot_accession_number`) to handle taxa with more than 10,000 proteins.
/// A malformed page is reported as an error instead of silently ending the
/// loop, so a successful return never carries a truncated protein list caused
/// by broken pagination.
///
/// Hits whose `_source` cannot be deserialized into `UniprotEntry` are skipped.
pub async fn get_proteins_for_taxon(
    client: &OpenSearch,
    taxon_id: u32,
) -> Result<Vec<UniprotEntry>, DatabaseError> {
    const PAGE_SIZE: usize = 1000;
    let mut all_proteins: Vec<UniprotEntry> = Vec::with_capacity(PAGE_SIZE);
    // Sort values of the previous page's last hit, forwarded verbatim as the
    // `search_after` cursor of the next request.
    let mut search_after: Option<serde_json::Value> = None;

    loop {
        let mut body = json!({
            "query": { "term": { "taxon_id": taxon_id } },
            "size": PAGE_SIZE,
            "sort": [{ "uniprot_accession_number": "asc" }]
        });
        if let Some(cursor) = search_after.take() {
            body["search_after"] = cursor;
        }

        let response = client
            .search(SearchParts::Index(&["uniprot_entries"]))
            .body(body)
            .send()
            .await?;

        if !response.status_code().is_success() {
            return Err(GeneralError(response.text().await?));
        }

        let response_body: serde_json::Value = response.json().await?;

        let hits = response_body["hits"]["hits"].as_array().ok_or_else(|| {
            GeneralError(format!(
                "Malformed search response while fetching proteins for taxon {}: missing `hits.hits` array",
                taxon_id
            ))
        })?;

        // Best effort: a hit that does not deserialize is dropped, the rest of
        // the page is still used.
        all_proteins.extend(hits.iter().filter_map(|hit| {
            serde_json::from_value::<UniprotEntry>(hit["_source"].clone()).ok()
        }));

        // A short page means the result set is exhausted.
        if hits.len() < PAGE_SIZE {
            break;
        }

        // The page was full, so more hits may follow; without the last hit's
        // sort values there is no way to request them.
        let next_cursor = hits
            .last()
            .and_then(|hit| hit.get("sort"))
            .filter(|sort| sort.as_array().map_or(false, |values| !values.is_empty()))
            .cloned()
            .ok_or_else(|| {
                GeneralError(format!(
                    "Malformed search response while fetching proteins for taxon {}: last hit has no sort values",
                    taxon_id
                ))
            })?;
        search_after = Some(next_cursor);
    }

    Ok(all_proteins)
}

/// Gets UniProt accession IDs from the database that match the given filter criteria
///
/// # Arguments
Expand Down
4 changes: 4 additions & 0 deletions datastore/src/taxon_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ impl TaxonStore {
self.mapper.get(&key).map(|(name, _, _)| name)
}

/// Returns the rank stored for taxon `key`, or `None` when the taxon is not
/// present in the store.
pub fn get_rank(&self, key: u32) -> Option<&LineageRank> {
    let (_, rank, _) = self.mapper.get(&key)?;
    Some(rank)
}

/// Returns `true` only when taxon `key` is present in the store and the
/// boolean flag stored alongside it is set.
pub fn is_valid(&self, key: u32) -> bool {
    self.mapper.get(&key).map_or(false, |entry| entry.2)
}
Expand Down
Loading