diff --git a/README.md b/README.md index 0db8d28..1e111f4 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,9 @@ result = ( # Passes BEFORE this point apply to the parent rearrangement; # passes AFTER apply per-descendant. So each clone shares the # same V(D)J recombination but accumulates its own SHM + errors. + # NOTE: expand_clones is the legacy fixed-size *star* model. For + # real clonal trees and repertoires, see "Clonal lineages & + # repertoires" below (clonal_lineage / clonal_repertoire). .expand_clones(n_clones=50, per_clone=20) # 3. Somatic hypermutation per descendant — S5F context-dependent # model at 5% per-base rate (matches memory-B-cell SHM). @@ -145,6 +148,41 @@ result[0]["experiment_id"], result[0]["tissue"] # ('exp001', 'peripher result.to_tsv("repertoire.tsv") ``` +## Clonal lineages & repertoires + +GenAIRR models clonal structure three ways. Pick by what you need: + +| Method | Model | Use for | +|---|---|---| +| **`clonal_lineage`** | BCR affinity-maturation **trees** — generation-synchronous birth–death, per-division S5F SHM, optional affinity selection, sample + genotype-collapse | B-cell lineage trees with **ground truth** (Newick/FASTA per clone), benchmarking lineage/clone inference, ML training data | +| **`clonal_repertoire`** | Non-tree **abundance** repertoires — heavy-tailed clone sizes (power-law/lognormal) + unexpanded-singleton fraction, reads through library-prep, collapsed to `duplicate_count` | **TCR** repertoires (no SHM) and flat-BCR abundance; clone-calling benchmarks | +| `expand_clones` *(deprecated)* | Fixed-size **star**: one founder + `per_clone` independent descendants | legacy; kept working, but prefer the two above | + +```python +import GenAIRR as ga + +# BCR clonal lineage trees — affinity maturation, with ground truth +bcr = (ga.Experiment.on("human_igh").recombine() + .clonal_lineage(n_clones=50, max_generations=6, n_sample=30, + rate=0.01, selection_strength=10.0) # 0 = neutral + .sequencing_errors(rate=0.001) + .run_records(seed=0)) +bcr.records # per-cell AIRR records: clone_id, lineage_node_id, lineage_affinity, ... +bcr.lineage_trees[0].to_newick() # ground-truth tree (branch length = per-edge mutations) + +# TCR clone-size repertoire — proliferated rearrangements, no SHM, abundance +tcr = (ga.Experiment.on("human_tcrb").allow_curatable_refdata().recombine() + .clonal_repertoire(n_clones=200, size_distribution="power_law", + exponent=2.0, max_size=500, unexpanded_fraction=0.5) + .sequencing_errors(rate=0.005) + .run_records(seed=0)) +tcr.records # AIRR records: clone_id (membership) + duplicate_count (abundance) +``` + +`clonal_lineage` is BCR-only (it applies S5F SHM and rejects TCR loci); `clonal_repertoire` covers TCR and flat BCR. See the guides: +[Clonal lineage trees](https://mutejester.github.io/GenAIRR/guides/clonal-lineage.html) · +[Clonal repertoires (TCR & abundance)](https://mutejester.github.io/GenAIRR/guides/clonal-repertoire.html). + Other feature flags worth knowing: | Step | What it does | @@ -620,7 +658,7 @@ The full documentation site is at **[mutejester.github.io/GenAIRR](https://mutej - **Learn** — [Lesson 1: V(D)J recombination](https://mutejester.github.io/GenAIRR/lesson-1.html) · [Lesson 2: Pipeline scrubber](https://mutejester.github.io/GenAIRR/lesson-2.html) · [Lesson 3: S5F SHM](https://mutejester.github.io/GenAIRR/lesson-3.html) · [Lesson 4: Sequencing artifacts](https://mutejester.github.io/GenAIRR/lesson-4.html) · [Lesson 5: Ground-truth payoff](https://mutejester.github.io/GenAIRR/lesson-5.html) - **Concepts** — [Simulation Pipeline](https://mutejester.github.io/GenAIRR/concept-pipeline.html) · [Persistent IR](https://mutejester.github.io/GenAIRR/concept-persistent-ir.html) · [Contracts](https://mutejester.github.io/GenAIRR/concept-contracts.html) · [AIRR Record](https://mutejester.github.io/GenAIRR/concept-airr-record.html) · [Live Calls](https://mutejester.github.io/GenAIRR/concept-live-call.html) -- **Guides** — [Build a config](https://mutejester.github.io/GenAIRR/guide-build-config.html) · [Productive sampling](https://mutejester.github.io/GenAIRR/guide-productive.html) · [Clonal families](https://mutejester.github.io/GenAIRR/guide-clonal-families.html) · [Export](https://mutejester.github.io/GenAIRR/guide-export.html) · [Replay](https://mutejester.github.io/GenAIRR/guide-replay.html) · [Reproduce a dataset](https://mutejester.github.io/GenAIRR/guide-reproduce.html) · [Compare SHM models](https://mutejester.github.io/GenAIRR/guide-compare-shm.html) · [Tune corruption](https://mutejester.github.io/GenAIRR/guide-tune-corruption.html) +- **Guides** — [Build a config](https://mutejester.github.io/GenAIRR/guide-build-config.html) · [Productive sampling](https://mutejester.github.io/GenAIRR/guide-productive.html) · [Clonal lineage trees](https://mutejester.github.io/GenAIRR/guides/clonal-lineage.html) · [Clonal repertoires](https://mutejester.github.io/GenAIRR/guides/clonal-repertoire.html) · [Export](https://mutejester.github.io/GenAIRR/guide-export.html) · [Replay](https://mutejester.github.io/GenAIRR/guide-replay.html) · [Reproduce a dataset](https://mutejester.github.io/GenAIRR/guide-reproduce.html) · [Compare SHM models](https://mutejester.github.io/GenAIRR/guide-compare-shm.html) · [Tune corruption](https://mutejester.github.io/GenAIRR/guide-tune-corruption.html) - **Reference** — [API + Configs + AIRR fields](https://mutejester.github.io/GenAIRR/reference.html) --- diff --git a/engine_rs/src/lib.rs b/engine_rs/src/lib.rs index 3a6de57..2c385ed 100644 --- a/engine_rs/src/lib.rs +++ b/engine_rs/src/lib.rs @@ -46,6 +46,7 @@ pub mod event; pub mod feasibility; pub mod ir; pub mod junction; +pub mod lineage; pub mod live_call; pub mod pass; pub mod passes; diff --git a/engine_rs/src/lineage/affinity.rs b/engine_rs/src/lineage/affinity.rs new file mode 100644 index 0000000..f136ed3 --- /dev/null +++ b/engine_rs/src/lineage/affinity.rs @@ -0,0 +1,261 @@ +//! Affinity model for clonal selection: BLOSUM62-weighted, region-weighted +//! amino-acid distance to a target antigen, and target generation. + +use crate::ir::{compute_codon_rail, Simulation}; +use crate::rng::Rng; + +/// 20 standard amino acids in the BLOSUM62 matrix's row/column order. +const AA_ORDER: &[u8; 20] = b"ARNDCQEGHILKMFPSTWYV"; + +/// BLOSUM62 substitution score for two amino-acid bytes (uppercase one-letter +/// codes). Unknown / non-standard residues (including `*` and `X`) score as a +/// strong mismatch so they contribute maximally to distance. +pub fn blosum62(a: u8, b: u8) -> i8 { + fn idx(aa: u8) -> Option { + AA_ORDER.iter().position(|&x| x == aa) + } + // Canonical BLOSUM62, rows/cols in AA_ORDER. + const M: [[i8; 20]; 20] = [ + // A R N D C Q E G H I L K M F P S T W Y V + [ 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0], + [-1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3], + [-2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3], + [-2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3], + [ 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1], + [-1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2], + [-1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2], + [ 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3], + [-2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3], + [-1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3], + [-1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1], + [-1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2], + [-1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1], + [-2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1], + [-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2], + [ 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2], + [ 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0], + [-3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3], + [-2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1], + [ 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4], + ]; + match (idx(a), idx(b)) { + (Some(i), Some(j)) => M[i][j], + _ => -4, + } +} + +/// Per-position substitution cost from BLOSUM62: `S(a,a) - S(a,b)`, which is 0 +/// when `b == a` and grows as the substitution becomes less favorable. +fn substitution_cost(a: u8, b: u8) -> f64 { + (blosum62(a, a) as f64 - blosum62(a, b) as f64).max(0.0) +} + +/// Region-weighted BLOSUM62 amino-acid distance between two aa sequences. +/// `weights` is per amino-acid position; positions beyond the shorter sequence +/// are ignored, and weights shorter than the sequences default to 1.0. +pub fn weighted_aa_distance(a: &[u8], b: &[u8], weights: &[f64]) -> f64 { + let n = a.len().min(b.len()); + let mut d = 0.0; + for i in 0..n { + let w = weights.get(i).copied().unwrap_or(1.0); + d += w * substitution_cost(a[i], b[i]); + } + d +} + +/// Generate a "mature" target: the naive aa sequence with up to `m` random +/// single-residue substitutions to a different standard amino acid. +/// Deterministic for the given `rng` state. +pub fn make_mature_target(naive_aa: &[u8], m: u32, rng: &mut Rng) -> Vec { + let mut target = naive_aa.to_vec(); + if target.is_empty() { + return target; + } + for _ in 0..m { + // Unbiased uniform draws (Lemire `range_u32`), matching the engine's + // RNG convention rather than `% len` modulo bias. + let pos = rng.range_u32(target.len() as u32) as usize; + let cur = target[pos]; + // Pick a standard amino acid uniformly among the 19 != cur (no skew). + let pick = match AA_ORDER.iter().position(|&x| x == cur) { + Some(ci) => { + let r = rng.range_u32(19) as usize; + AA_ORDER[if r >= ci { r + 1 } else { r }] + } + None => AA_ORDER[rng.range_u32(20) as usize], + }; + target[pos] = pick; + } + target +} + +/// Translate a `Simulation` to its amino-acid sequence by concatenating the +/// in-frame translation of each region (respecting each region's frame phase), +/// in biological order. Used to score a cell's affinity to a target antigen. +pub fn sim_to_aa(sim: &Simulation) -> Vec { + let mut aa = Vec::new(); + for region in sim.sequence.regions.iter() { + let rail = compute_codon_rail(region, &sim.pool); + aa.extend_from_slice(&rail.amino_acids); + } + aa +} + +/// Affinity-selection model: maps a cell's amino-acid sequence to an affinity in +/// (0, 1] (1 = identical to target) and to a fitness multiplier for its offspring +/// rate. `selection_strength == 0` makes fitness identically 1 (neutral). +#[derive(Clone, Debug)] +pub struct AffinityModel { + target_aa: Vec, + aa_weights: Vec, + beta: f64, + selection_strength: f64, + /// Affinity of the founder; fitness is measured relative to this baseline so + /// the founder has fitness ~1 and cells that improve on it exceed 1. + founder_affinity: f64, +} + +impl AffinityModel { + /// Build a model. `founder_aa` is the naive founder's aa sequence; its + /// affinity becomes the fitness baseline. + pub fn new( + target_aa: Vec, + aa_weights: Vec, + beta: f64, + selection_strength: f64, + founder_aa: &[u8], + ) -> Self { + let founder_affinity = + (-beta * weighted_aa_distance(founder_aa, &target_aa, &aa_weights)).exp(); + Self { + target_aa, + aa_weights, + beta, + selection_strength, + founder_affinity, + } + } + + /// Affinity in (0, 1]: `exp(-beta * region-weighted BLOSUM distance to target)`. + pub fn affinity_value(&self, aa: &[u8]) -> f64 { + (-self.beta * weighted_aa_distance(aa, &self.target_aa, &self.aa_weights)).exp() + } + + /// Fitness from an already-computed affinity value (avoids re-translation). + pub fn fitness_from_affinity(&self, affinity_value: f64) -> f64 { + (1.0 + self.selection_strength * (affinity_value - self.founder_affinity)).max(0.0) + } + + /// Fitness multiplier for offspring rate: `1 + strength * (affinity - founder_affinity)`, + /// clamped at 0. `selection_strength == 0` ⇒ always 1 (neutral). + pub fn fitness(&self, aa: &[u8]) -> f64 { + self.fitness_from_affinity(self.affinity_value(aa)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ir::{Nucleotide, NucHandle, Region, Segment, Simulation}; + use crate::rng::Rng; + + fn aa_founder(seq: &[u8]) -> Simulation { + let mut sim = Simulation::new(); + for (i, b) in seq.iter().enumerate() { + let (next, _) = sim.with_nucleotide_pushed( + Nucleotide::germline(*b, i as u16, Segment::V)); + sim = next; + } + sim.with_region_added(Region::new(Segment::V, NucHandle::new(0), NucHandle::new(seq.len() as u32))) + } + + #[test] + fn sim_to_aa_translates_in_frame() { + // 8 'A' bases -> codons AAA, AAA (last 2 ignored) -> "KK" (Lys, Lys) + let sim = aa_founder(b"AAAAAAAA"); + assert_eq!(sim_to_aa(&sim), b"KK".to_vec()); + } + + #[test] + fn affinity_value_is_one_at_target() { + let m = AffinityModel::new(b"KK".to_vec(), vec![1.0; 2], 1.0, 1.0, b"KK"); + assert!((m.affinity_value(b"KK") - 1.0).abs() < 1e-9); + } + + #[test] + fn affinity_value_decreases_with_distance() { + let m = AffinityModel::new(b"KK".to_vec(), vec![1.0; 2], 1.0, 1.0, b"KK"); + let near = m.affinity_value(b"KK"); + let far = m.affinity_value(b"WW"); + assert!(far < near, "far {far} should be < near {near}"); + assert!(far > 0.0); + } + + #[test] + fn fitness_is_neutral_when_strength_zero() { + // selection_strength = 0 => fitness == 1.0 for ANY sequence + let m = AffinityModel::new(b"KK".to_vec(), vec![1.0; 2], 1.0, 0.0, b"NN"); + assert!((m.fitness(b"KK") - 1.0).abs() < 1e-9); + assert!((m.fitness(b"WW") - 1.0).abs() < 1e-9); + assert!((m.fitness(b"NN") - 1.0).abs() < 1e-9); + } + + #[test] + fn fitness_rewards_closer_than_founder_penalizes_farther() { + // founder = "NN" (far from target "KK"); strength 1. + let m = AffinityModel::new(b"KK".to_vec(), vec![1.0; 2], 1.0, 1.0, b"NN"); + // a cell AT the target is fitter than the founder baseline (fitness > 1) + assert!(m.fitness(b"KK") > 1.0); + // a cell equal to the founder has fitness exactly 1 (baseline) + assert!((m.fitness(b"NN") - 1.0).abs() < 1e-9); + // fitness never goes negative + assert!(m.fitness(b"WWWWWWWW") >= 0.0); + } + + #[test] + fn blosum62_known_entries() { + assert_eq!(blosum62(b'A', b'A'), 4); + assert_eq!(blosum62(b'W', b'W'), 11); + assert_eq!(blosum62(b'C', b'C'), 9); + assert_eq!(blosum62(b'A', b'R'), -1); + assert_eq!(blosum62(b'W', b'C'), -2); + assert_eq!(blosum62(b'D', b'E'), 2); + assert_eq!(blosum62(b'R', b'A'), blosum62(b'A', b'R')); + let _ = blosum62(b'*', b'A'); + let _ = blosum62(b'X', b'X'); + } + + #[test] + fn weighted_distance_zero_for_identical() { + let a = b"ACDEW"; + let w = vec![1.0; 5]; + assert!(weighted_aa_distance(a, a, &w).abs() < 1e-9); + } + + #[test] + fn weighted_distance_increases_with_substitutions_and_respects_weights() { + let a = b"AAAAA"; + let b = b"AAAAW"; + let flat = vec![1.0; 5]; + let d_flat = weighted_aa_distance(a, b, &flat); + assert!(d_flat > 0.0); + let mut heavy = vec![1.0; 5]; + heavy[4] = 10.0; + assert!(weighted_aa_distance(a, b, &heavy) > d_flat); + let mut zero = vec![1.0; 5]; + zero[4] = 0.0; + assert!(weighted_aa_distance(a, b, &zero).abs() < 1e-9); + } + + #[test] + fn mature_target_applies_m_substitutions_deterministically() { + let naive = b"ACDEFGHIKLMNPQRSTVWY".to_vec(); + let mut rng = Rng::new(42); + let t = make_mature_target(&naive, 3, &mut rng); + assert_eq!(t.len(), naive.len()); + let diffs = naive.iter().zip(&t).filter(|(a, b)| a != b).count(); + assert!(diffs >= 1 && diffs <= 3, "expected up to 3 aa substitutions, got {diffs}"); + let mut rng2 = Rng::new(42); + assert_eq!(make_mature_target(&naive, 3, &mut rng2), t); + } +} diff --git a/engine_rs/src/lineage/branching.rs b/engine_rs/src/lineage/branching.rs new file mode 100644 index 0000000..6c4c0a6 --- /dev/null +++ b/engine_rs/src/lineage/branching.rs @@ -0,0 +1,424 @@ +//! Generation-synchronous clonal branching loop. + +use crate::ir::Simulation; +use crate::pass::{Pass, PassContext}; +use crate::rng::Rng; +use crate::trace::Trace; + +use super::affinity::{sim_to_aa, AffinityModel}; +use super::poisson::poisson_sample; +use super::tree::{LineageNode, LineageTree}; + +/// Parameters controlling one clonal family's growth (neutral mode). +#[derive(Clone, Debug)] +pub struct BranchingParams { + /// Base expected offspring per cell per generation (neutral λ). + pub lambda_base: f64, + /// Expected mutations per cell division. NOTE: currently inert — per-division + /// mutation count is driven by the injected `Pass` mutator; this field is + /// reserved for a later task and is not read yet. + pub lambda_mut: f64, + /// Maximum number of generations to grow. + pub max_generations: u32, + /// Carrying capacity: the live population per generation is capped at this, + /// and it is the pivot for logistic offspring-rate damping (the effective + /// rate falls to zero as the live set approaches `n_max`). + pub n_max: u32, + /// Number of cells to sample at the end (used in a later task). + pub n_sample: u32, + /// Master seed for the whole family (deterministic). + pub seed: u64, +} + +/// Extract a node's genotype (pool bases) from its `Simulation`. +fn genotype_of(sim: &Simulation) -> Vec { + sim.pool.as_slice().iter().map(|n| n.base).collect() +} + +/// Apply one division's mutations: run `mutator` against `parent` with a fresh, +/// deterministic per-division RNG seeded by `child_seed`. Returns the mutated child. +fn mutate_child(parent: &Simulation, mutator: &dyn Pass, child_seed: u64) -> Simulation { + let mut trace = Trace::new(); + let mut rng = Rng::new(child_seed); + let mut ctx = PassContext { + replay_cursor: None, + trace: &mut trace, + rng: &mut rng, + pass_index: 0, + refdata: None, + contracts: None, + feasibility: None, + reference_index: None, + event_log_sink: None, + }; + mutator.execute(parent, &mut ctx) +} + +/// Core generation-synchronous growth. With `mutator = None`, children are exact +/// clones (and NO extra RNG is consumed). With `Some(m)`, each division draws a +/// deterministic sub-seed and applies `m`. With `affinity = Some(model)`, each +/// cell's offspring rate is modulated by the model's fitness; `None` leaves the +/// rate unchanged (byte-identical to the pre-affinity path). Returns +/// (tree, peak_live_population, sims_arena, final_live) where sims_arena[node.id] +/// is the Simulation for that node, and `final_live` is the set of node ids that +/// are ALIVE when growth stopped — the cells produced in the final completed +/// generation (every cell is replaced by its offspring each generation, so the +/// living population at sampling time is exactly this set). It is empty when the +/// family went extinct (a cell drew 0 offspring and left no progeny). +fn grow_core( + founder: &Simulation, + params: &BranchingParams, + mutator: Option<&dyn Pass>, + affinity: Option<&AffinityModel>, +) -> (LineageTree, usize, Vec, Vec) { + let mut nodes: Vec = Vec::new(); + let mut sims: Vec = Vec::new(); + let mut rng = Rng::new(params.seed); + + let root_affinity = affinity + .map(|m| m.affinity_value(&sim_to_aa(founder))) + .unwrap_or(0.0); + + nodes.push(LineageNode { + id: 0, + parent_id: None, + generation: 0, + genotype: genotype_of(founder), + mutations_from_parent: 0, + affinity: root_affinity, + abundance: 0, + observed: false, + }); + sims.push(founder.clone()); + + let mut live: Vec = vec![0]; + let mut next_id: u32 = 1; + let mut peak_live: usize = live.len(); + + for gen in 1..=params.max_generations { + if live.is_empty() { + break; // lineage went extinct (every cell drew 0 offspring) + } + // Logistic carrying-capacity damping: as the live population approaches + // n_max, the effective offspring rate falls toward zero (plateau). + let live_frac = (live.len() as f64) / (params.n_max as f64); + let eff_lambda = params.lambda_base * (1.0 - live_frac).max(0.0); + + let mut next_live: Vec = Vec::new(); + 'generation: for &parent_id in &live { + let cell_lambda = match affinity { + Some(m) => eff_lambda * m.fitness_from_affinity(nodes[parent_id as usize].affinity), + None => eff_lambda, + }; + let k = poisson_sample(&mut rng, cell_lambda); + for _ in 0..k { + // Hard cap: a Poisson draw can still overshoot near saturation. + if next_live.len() >= params.n_max as usize { + break 'generation; + } + let parent_mut_count = sims[parent_id as usize].mutation_count; + let child_sim = match mutator { + Some(m) => { + let child_seed = rng.next_u64(); + mutate_child(&sims[parent_id as usize], m, child_seed) + } + None => sims[parent_id as usize].clone(), + }; + let muts = child_sim + .mutation_count + .saturating_sub(parent_mut_count); + let child_affinity = affinity + .map(|m| m.affinity_value(&sim_to_aa(&child_sim))) + .unwrap_or(0.0); + nodes.push(LineageNode { + id: next_id, + parent_id: Some(parent_id), + generation: gen, + genotype: genotype_of(&child_sim), + mutations_from_parent: muts, + affinity: child_affinity, + abundance: 0, + observed: false, + }); + sims.push(child_sim); + next_live.push(next_id); + next_id += 1; + } + } + live = next_live; + if live.len() > peak_live { + peak_live = live.len(); + } + } + + // `live` now holds the cells alive when growth stopped: either the last + // completed generation's offspring (ran to max_generations) or empty (the + // population went extinct — every live cell drew 0 offspring). This is the + // living population sampling must draw from. + (LineageTree { nodes }, peak_live, sims, live) +} + +/// Grow a full clonal lineage with per-division mutation via `mutator`. +/// Deterministic for `params.seed`. +pub fn grow_lineage( + founder: &Simulation, + params: &BranchingParams, + mutator: &dyn Pass, +) -> LineageTree { + grow_core(founder, params, Some(mutator), None).0 +} + +/// Grow the lineage TOPOLOGY only (children are exact clones of their parent +/// `Simulation`; no mutation — a later task layers mutation on top). +pub fn grow_topology(founder: &Simulation, params: &BranchingParams) -> LineageTree { + grow_core(founder, params, None, None).0 +} + +/// Like `grow_topology`, but also returns the final living node ids (cells alive +/// when growth stopped). Empty when the family went extinct. +#[cfg(test)] +pub fn grow_topology_with_live( + founder: &Simulation, + params: &BranchingParams, +) -> (LineageTree, Vec) { + let (tree, _peak, _sims, live) = grow_core(founder, params, None, None); + (tree, live) +} + +/// Grow a clonal lineage with per-division mutation AND affinity selection. +/// Deterministic for `params.seed`. +pub fn grow_lineage_with_affinity( + founder: &Simulation, + params: &BranchingParams, + mutator: &dyn Pass, + model: &AffinityModel, +) -> LineageTree { + grow_core(founder, params, Some(mutator), Some(model)).0 +} + +/// Grow + mutate a lineage and ALSO return the per-node `Simulation` arena +/// (index == node id) and the final living node ids (cells alive when growth +/// stopped; empty when the family went extinct), for building per-node AIRR +/// `Outcome`s and sampling the living population. +pub fn grow_lineage_retaining_sims( + founder: &Simulation, + params: &BranchingParams, + mutator: &dyn Pass, + affinity: Option<&AffinityModel>, +) -> (LineageTree, Vec, Vec) { + let (tree, _peak, sims, live) = grow_core(founder, params, Some(mutator), affinity); + (tree, sims, live) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dist::{EmpiricalLengthDist, UniformBase}; + use crate::ir::{Nucleotide, NucHandle, Region, Segment, Simulation}; + use crate::passes::UniformMutationPass; + + /// Minimal founder: a 4-base V region "AAAA". + fn founder() -> Simulation { + let mut sim = Simulation::new(); + for (i, b) in b"AAAA".iter().enumerate() { + let (next, _) = sim.with_nucleotide_pushed( + Nucleotide::germline(*b, i as u16, Segment::V), + ); + sim = next; + } + sim.with_region_added(Region::new(Segment::V, NucHandle::new(0), NucHandle::new(4))) + } + + fn neutral_params() -> BranchingParams { + BranchingParams { + lambda_base: 1.5, + lambda_mut: 0.0, + max_generations: 5, + n_max: 1000, + n_sample: 10, + seed: 42, + } + } + + #[test] + fn grows_root_and_children_with_monotonic_generations() { + let tree = grow_topology(&founder(), &neutral_params()); + assert_eq!(tree.nodes.iter().filter(|n| n.parent_id.is_none()).count(), 1); + let root = tree.root(); + assert_eq!(root.id, 0); + assert_eq!(root.generation, 0); + for n in &tree.nodes { + if let Some(pid) = n.parent_id { + let parent = tree.get(pid).unwrap(); + assert_eq!(n.generation, parent.generation + 1); + } + } + assert!(tree.len() > 1, "tree did not grow: {} nodes", tree.len()); + } + + #[test] + fn growth_is_deterministic_for_a_seed() { + let a = grow_topology(&founder(), &neutral_params()); + let b = grow_topology(&founder(), &neutral_params()); + assert_eq!(a.len(), b.len()); + for (x, y) in a.nodes.iter().zip(b.nodes.iter()) { + assert_eq!(x.id, y.id); + assert_eq!(x.parent_id, y.parent_id); + assert_eq!(x.generation, y.generation); + } + } + + #[test] + fn carrying_capacity_bounds_live_population() { + let params = BranchingParams { + lambda_base: 4.0, // would explode unbounded + lambda_mut: 0.0, + max_generations: 40, + n_max: 200, + n_sample: 10, + seed: 7, + }; + let (_tree, peak_live, _sims, _live) = grow_core(&founder(), ¶ms, None, None); + // hard cap: live population never exceeds n_max + assert!(peak_live <= params.n_max as usize, + "peak live {peak_live} exceeded n_max {}", params.n_max); + // damping plateau (equilibrium ~ (1 - 1/lambda_base) * n_max = 150) is + // comfortably above half capacity — confirms it grew, not died early + assert!(peak_live > (params.n_max as usize) / 2, + "peak live {peak_live} did not approach capacity"); + } + + #[test] + fn final_live_is_empty_when_founder_never_divides() { + // lambda_base = 0 => the founder draws 0 offspring => after generation 1 + // the live set is empty and the loop breaks. The final living population + // is empty (the clone went extinct). + let params = BranchingParams { lambda_base: 0.0, ..neutral_params() }; + let (_tree, live) = grow_topology_with_live(&founder(), ¶ms); + assert!(live.is_empty(), "expected extinct (empty) final-live set, got {:?}", live); + } + + #[test] + fn final_live_is_the_max_generation_set() { + // In a grown family, every node in the final-live set is at the maximum + // generation reached, and the set is non-empty. A single founder has a + // real chance of immediate extinction at lambda_base ~1.5, so scan seeds + // for one that grows (the invariant under test is about non-extinct runs). + let mut grew = false; + for seed in 0..50u64 { + let params = BranchingParams { seed, ..neutral_params() }; + let (tree, live) = grow_topology_with_live(&founder(), ¶ms); + if live.is_empty() { + continue; // extinct for this seed; try another + } + grew = true; + let max_gen = tree.nodes.iter().map(|n| n.generation).max().unwrap(); + for &id in &live { + assert_eq!(tree.get(id).unwrap().generation, max_gen, + "final-live node {id} not at max generation {max_gen} (seed {seed})"); + } + } + assert!(grew, "no seed in 0..50 produced a surviving family"); + } + + /// A mutator that applies exactly 2 substitutions per division. + fn two_mut_mutator() -> UniformMutationPass { + UniformMutationPass::new( + Box::new(EmpiricalLengthDist::from_pairs(vec![(2, 1.0)])), + Box::new(UniformBase), + ) + } + + #[test] + fn children_accumulate_mutations_and_branch_lengths() { + let params = BranchingParams { + lambda_base: 1.2, lambda_mut: 0.0, max_generations: 4, + n_max: 500, n_sample: 10, seed: 11, + }; + let mutator = two_mut_mutator(); + let tree = grow_lineage(&founder(), ¶ms, &mutator); + + assert_eq!(tree.root().mutations_from_parent, 0); + + let mut saw_mutated_child = false; + for n in &tree.nodes { + if n.parent_id.is_some() { + // <= 2 (not == 2): on the 4-base "AAAA" founder the two uniform + // substitutions can draw the same position, yielding 1 net change. + assert!(n.mutations_from_parent <= 2); + if n.mutations_from_parent > 0 { + saw_mutated_child = true; + let parent = tree.get(n.parent_id.unwrap()).unwrap(); + assert_ne!(n.genotype, parent.genotype); + } + } + } + assert!(saw_mutated_child, "no child accumulated any mutation"); + } + + #[test] + fn mutated_growth_is_deterministic() { + let params = BranchingParams { + lambda_base: 1.2, lambda_mut: 0.0, max_generations: 4, + n_max: 500, n_sample: 10, seed: 11, + }; + let a = grow_lineage(&founder(), ¶ms, &two_mut_mutator()); + let b = grow_lineage(&founder(), ¶ms, &two_mut_mutator()); + assert_eq!(a.len(), b.len()); + for (x, y) in a.nodes.iter().zip(b.nodes.iter()) { + assert_eq!(x.genotype, y.genotype); + assert_eq!(x.mutations_from_parent, y.mutations_from_parent); + } + } + + #[test] + fn affinity_run_populates_node_affinities_and_root_matches() { + use crate::lineage::affinity::{sim_to_aa, AffinityModel}; + let f = founder(); + let founder_aa = sim_to_aa(&f); + let w = vec![1.0; founder_aa.len().max(1)]; + let model = AffinityModel::new(b"W".to_vec(), w, 1.0, 1.0, &founder_aa); + let params = BranchingParams { lambda_base:1.2, lambda_mut:0.0, max_generations:4, n_max:500, n_sample:10, seed:11 }; + let tree = grow_lineage_with_affinity(&f, ¶ms, &two_mut_mutator(), &model); + assert!((tree.root().affinity - model.affinity_value(&founder_aa)).abs() < 1e-9); + for n in &tree.nodes { + assert!(n.affinity > 0.0 && n.affinity <= 1.0 + 1e-9, "affinity out of range: {}", n.affinity); + } + } + + #[test] + fn affinity_strength_zero_matches_neutral_topology() { + use crate::lineage::affinity::{sim_to_aa, AffinityModel}; + let f = founder(); + let founder_aa = sim_to_aa(&f); + let w = vec![1.0; founder_aa.len().max(1)]; + // selection_strength = 0 => fitness identically 1 => identical topology/genotypes to neutral + let model = AffinityModel::new(b"W".to_vec(), w, 1.0, 0.0, &founder_aa); + let params = BranchingParams { lambda_base:1.2, lambda_mut:0.0, max_generations:4, n_max:500, n_sample:10, seed:11 }; + let with_aff = grow_lineage_with_affinity(&f, ¶ms, &two_mut_mutator(), &model); + let neutral = grow_lineage(&f, ¶ms, &two_mut_mutator()); + assert_eq!(with_aff.len(), neutral.len()); + for (a, b) in with_aff.nodes.iter().zip(neutral.nodes.iter()) { + assert_eq!(a.genotype, b.genotype); + assert_eq!(a.parent_id, b.parent_id); + assert_eq!(a.generation, b.generation); + assert_eq!(a.mutations_from_parent, b.mutations_from_parent); + } + } + + #[test] + fn affinity_growth_is_deterministic() { + use crate::lineage::affinity::{sim_to_aa, AffinityModel}; + let f = founder(); + let founder_aa = sim_to_aa(&f); + let mk = || AffinityModel::new(b"W".to_vec(), vec![1.0; founder_aa.len().max(1)], 1.0, 2.0, &founder_aa); + let params = BranchingParams { lambda_base:1.5, lambda_mut:0.0, max_generations:5, n_max:500, n_sample:10, seed:21 }; + let a = grow_lineage_with_affinity(&f, ¶ms, &two_mut_mutator(), &mk()); + let b = grow_lineage_with_affinity(&f, ¶ms, &two_mut_mutator(), &mk()); + assert_eq!(a.len(), b.len()); + for (x, y) in a.nodes.iter().zip(b.nodes.iter()) { + assert_eq!(x.genotype, y.genotype); + assert!((x.affinity - y.affinity).abs() < 1e-12); + } + } +} diff --git a/engine_rs/src/lineage/clone_size.rs b/engine_rs/src/lineage/clone_size.rs new file mode 100644 index 0000000..0361b8c --- /dev/null +++ b/engine_rs/src/lineage/clone_size.rs @@ -0,0 +1,170 @@ +//! Clone-size distributions for repertoire composition. T-cell (and the +//! singleton tail of B-cell) repertoires have heavy-tailed clone-size +//! distributions; this module samples integer clone sizes >= 1. + +use crate::rng::Rng; + +/// A heavy-tailed clone-size distribution. Sizes are integers in `[1, x_max]`. +#[derive(Clone, Debug)] +pub enum CloneSizeDist { + /// Truncated discrete power law P(size=k) ∝ k^(-exponent), k in [1, x_max]. + /// `exponent` ~2–3 is typical for TCR repertoires. + PowerLaw { exponent: f64, x_max: u32 }, + /// Log-normal sizes: round(exp(mu + sigma*Z)), clamped to [1, x_max]. + LogNormal { mu: f64, sigma: f64, x_max: u32 }, +} + +impl Default for CloneSizeDist { + fn default() -> Self { + CloneSizeDist::PowerLaw { + exponent: 2.0, + x_max: 100_000, + } + } +} + +/// Standard-normal variate via Box–Muller using two uniforms from `rng`. +fn next_standard_normal(rng: &mut Rng) -> f64 { + let mut u1 = rng.next_f64(); + if u1 < 1e-300 { + u1 = 1e-300; + } + let u2 = rng.next_f64(); + (-2.0 * u1.ln()).sqrt() * (std::f64::consts::TAU * u2).cos() +} + +/// Sample one clone size (>= 1) from `dist`, deterministic for the `rng` state. +pub fn sample_clone_size(rng: &mut Rng, dist: &CloneSizeDist) -> u32 { + match *dist { + CloneSizeDist::PowerLaw { exponent, x_max } => { + let x_max = x_max.max(1) as f64; + let u = rng.next_f64(); + let x = if (exponent - 1.0).abs() < 1e-9 { + x_max.powf(u) + } else { + let a = 1.0 - exponent; + (1.0 + u * (x_max.powf(a) - 1.0)).powf(1.0 / a) + }; + (x.round() as i64).clamp(1, x_max as i64) as u32 + } + CloneSizeDist::LogNormal { mu, sigma, x_max } => { + let z = next_standard_normal(rng); + let x = (mu + sigma * z).exp(); + (x.round() as i64).clamp(1, x_max.max(1) as i64) as u32 + } + } +} + +/// Draw `n_clones` clone sizes from `dist`. A fraction `unexpanded_fraction` +/// (in [0,1]) of the clones are forced to be unexpanded singletons (size 1); +/// the deterministic forced count is `round(n_clones * unexpanded_fraction)`, +/// placed at the front of the returned vector. The remainder are drawn from +/// `dist` (and may themselves be 1, so the realized singleton count can exceed +/// the forced minimum — matching real repertoires). Deterministic for the `rng` +/// state. `unexpanded_fraction` is clamped to [0,1]. +pub fn sample_repertoire_sizes( + rng: &mut Rng, + n_clones: u32, + dist: &CloneSizeDist, + unexpanded_fraction: f64, +) -> Vec { + let frac = unexpanded_fraction.clamp(0.0, 1.0); + let n = n_clones as usize; + let forced_singletons = (((n as f64) * frac).round() as usize).min(n); + let mut sizes = Vec::with_capacity(n); + for _ in 0..forced_singletons { + sizes.push(1u32); + } + for _ in forced_singletons..n { + sizes.push(sample_clone_size(rng, dist)); + } + sizes +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::rng::Rng; + + #[test] + fn sizes_are_at_least_one_and_within_max() { + let mut rng = Rng::new(1); + let d = CloneSizeDist::PowerLaw { exponent: 2.0, x_max: 1000 }; + for _ in 0..1000 { + let s = sample_clone_size(&mut rng, &d); + assert!(s >= 1 && s <= 1000, "size {s} out of [1,1000]"); + } + } + + #[test] + fn power_law_is_deterministic() { + let d = CloneSizeDist::PowerLaw { exponent: 2.0, x_max: 1000 }; + let mut a = Rng::new(42); + let mut b = Rng::new(42); + let xs: Vec = (0..100).map(|_| sample_clone_size(&mut a, &d)).collect(); + let ys: Vec = (0..100).map(|_| sample_clone_size(&mut b, &d)).collect(); + assert_eq!(xs, ys); + } + + #[test] + fn power_law_is_heavy_tailed_mostly_small_some_large() { + let mut rng = Rng::new(7); + let d = CloneSizeDist::PowerLaw { exponent: 2.0, x_max: 100_000 }; + let n = 20_000; + let sizes: Vec = (0..n).map(|_| sample_clone_size(&mut rng, &d)).collect(); + let ones = sizes.iter().filter(|&&s| s == 1).count(); + let big = sizes.iter().filter(|&&s| s >= 100).count(); + assert!(ones > n / 3, "expected many singletons, got {ones}/{n}"); + assert!(big > 0, "expected some large clones in the tail"); + let max = *sizes.iter().max().unwrap(); + assert!(max > 50, "tail did not reach large sizes (max {max})"); + } + + #[test] + fn lognormal_is_deterministic_and_in_range() { + let d = CloneSizeDist::LogNormal { mu: 1.0, sigma: 1.0, x_max: 10_000 }; + let mut a = Rng::new(9); + let mut b = Rng::new(9); + let xs: Vec = (0..100).map(|_| sample_clone_size(&mut a, &d)).collect(); + let ys: Vec = (0..100).map(|_| sample_clone_size(&mut b, &d)).collect(); + assert_eq!(xs, ys); + for &s in &xs { + assert!(s >= 1 && s <= 10_000); + } + } + + #[test] + fn repertoire_sizes_respects_count_and_min_one() { + let mut rng = Rng::new(3); + let d = CloneSizeDist::PowerLaw { exponent: 2.0, x_max: 1000 }; + let sizes = sample_repertoire_sizes(&mut rng, 500, &d, 0.0); + assert_eq!(sizes.len(), 500); + assert!(sizes.iter().all(|&s| s >= 1)); + } + + #[test] + fn unexpanded_fraction_forces_singletons() { + let mut rng = Rng::new(5); + let d = CloneSizeDist::PowerLaw { exponent: 2.0, x_max: 1000 }; + let sizes = sample_repertoire_sizes(&mut rng, 1000, &d, 0.4); + assert_eq!(sizes.len(), 1000); + let ones = sizes.iter().filter(|&&s| s == 1).count(); + assert!(ones >= 400, "expected >=400 singletons, got {ones}"); + } + + #[test] + fn unexpanded_fraction_one_is_all_singletons() { + let mut rng = Rng::new(8); + let d = CloneSizeDist::default(); + let sizes = sample_repertoire_sizes(&mut rng, 100, &d, 1.0); + assert!(sizes.iter().all(|&s| s == 1)); + } + + #[test] + fn repertoire_is_deterministic() { + let d = CloneSizeDist::PowerLaw { exponent: 2.0, x_max: 1000 }; + let a = sample_repertoire_sizes(&mut Rng::new(11), 200, &d, 0.3); + let b = sample_repertoire_sizes(&mut Rng::new(11), 200, &d, 0.3); + assert_eq!(a, b); + } +} diff --git a/engine_rs/src/lineage/export.rs b/engine_rs/src/lineage/export.rs new file mode 100644 index 0000000..f94bce2 --- /dev/null +++ b/engine_rs/src/lineage/export.rs @@ -0,0 +1,203 @@ +//! Ground-truth export for a `LineageTree`: node-table TSV, FASTA of all node +//! (ancestral + observed) sequences, and a Newick tree with per-edge mutation +//! counts as branch lengths. Consumed by lineage-inference benchmark tools. + +use std::fmt::Write as _; + +use super::tree::LineageTree; + +/// Tab-separated node table, one row per node plus a header. `parent_id` is +/// `NA` for the root. Columns: +/// `node_id, parent_id, generation, mutations_from_parent, abundance, observed, affinity, sequence`. +pub fn to_node_table_tsv(tree: &LineageTree) -> String { + let mut out = String::new(); + out.push_str( + "node_id\tparent_id\tgeneration\tmutations_from_parent\tabundance\tobserved\taffinity\tsequence\n", + ); + for n in &tree.nodes { + let parent = match n.parent_id { + Some(p) => p.to_string(), + None => "NA".to_string(), + }; + let seq = String::from_utf8_lossy(&n.genotype); + let _ = writeln!( + out, + "{}\t{}\t{}\t{}\t{}\t{}\t{:.4}\t{}", + n.id, parent, n.generation, n.mutations_from_parent, n.abundance, n.observed, n.affinity, seq + ); + } + out +} + +/// FASTA of every node (ancestral + observed). Header carries node id, +/// generation, abundance, and observed flag so downstream tools can map a +/// record back to its ground-truth node. +pub fn to_fasta(tree: &LineageTree) -> String { + let mut out = String::new(); + for n in &tree.nodes { + let seq = String::from_utf8_lossy(&n.genotype); + let _ = writeln!( + out, + ">node{}|gen={}|abundance={}|observed={}", + n.id, n.generation, n.abundance, n.observed + ); + let _ = writeln!(out, "{}", seq); + } + out +} + +/// Recursively render the subtree rooted at `id` in Newick form, with the edge +/// to this node labelled by its `mutations_from_parent` count as branch length. +/// Recursion depth is bounded by the number of generations (small), not node +/// count, so this is safe from stack overflow at realistic family sizes. +fn newick_subtree(tree: &LineageTree, id: u32) -> String { + let node = tree.get(id).expect("newick: node id out of range"); + let children = tree.children_of(id); + let label = format!("node{id}"); + if children.is_empty() { + format!("{label}:{}", node.mutations_from_parent) + } else { + let inner: Vec = children + .iter() + .map(|c| newick_subtree(tree, c.id)) + .collect(); + format!("({}){label}:{}", inner.join(","), node.mutations_from_parent) + } +} + +/// Newick string for the whole tree. Branch lengths are per-edge mutation +/// counts. The founder is the named Newick root (its children are wrapped in +/// parentheses, its label follows) and carries no branch length, since it is +/// the origin. Always terminated with `;`. This is standard rooted Newick +/// (e.g. `((node3:1)node1:1,node2:1)node0;`) — no phantom outer node — so +/// ete3 / dendropy / Bio.Phylo parse `node0` as the actual root. +/// +/// Precondition: the tree has exactly one root (one node with `parent_id == +/// None`); call [`LineageTree::validate`] first if the tree was hand-built. +pub fn to_newick(tree: &LineageTree) -> String { + let root = tree.root(); + let children = tree.children_of(root.id); + let root_label = format!("node{}", root.id); + let body = if children.is_empty() { + root_label + } else { + let inner: Vec = children + .iter() + .map(|c| newick_subtree(tree, c.id)) + .collect(); + format!("({}){root_label}", inner.join(",")) + }; + format!("{body};") +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dist::{EmpiricalLengthDist, UniformBase}; + use crate::ir::{Nucleotide, NucHandle, Region, Segment, Simulation}; + use crate::lineage::{simulate_family, BranchingParams}; + use crate::lineage::tree::{LineageNode, LineageTree}; + use crate::passes::UniformMutationPass; + + fn grown_founder() -> Simulation { + let mut sim = Simulation::new(); + for (i, b) in b"AAAAAAAA".iter().enumerate() { + let (next, _) = sim.with_nucleotide_pushed( + Nucleotide::germline(*b, i as u16, Segment::V)); + sim = next; + } + sim.with_region_added(Region::new(Segment::V, NucHandle::new(0), NucHandle::new(8))) + } + + #[test] + fn exports_a_grown_family_consistently() { + let params = BranchingParams { + lambda_base: 1.5, lambda_mut: 0.0, max_generations: 6, + n_max: 300, n_sample: 20, seed: 2024, + }; + let mutator = UniformMutationPass::new( + Box::new(EmpiricalLengthDist::from_pairs(vec![(1, 1.0)])), + Box::new(UniformBase), + ); + let tree = simulate_family(&grown_founder(), ¶ms, &mutator); + assert!(tree.validate().is_ok()); + + let tsv = to_node_table_tsv(&tree); + let fasta = to_fasta(&tree); + let nwk = to_newick(&tree); + + assert_eq!(tsv.lines().count(), tree.len() + 1); + assert_eq!(fasta.lines().count(), tree.len() * 2); + assert!(nwk.ends_with(';')); + assert_eq!(nwk.matches('(').count(), nwk.matches(')').count()); + for n in &tree.nodes { + assert!(nwk.contains(&format!("node{}", n.id)), + "newick missing node{}", n.id); + } + } + + // root(0,"AAAA") -> 1("AAAC", abundance 2, observed), 2("AAAG"); 1 -> 3("ATAC", abundance 1, observed) + fn sample_tree() -> LineageTree { + LineageTree { + nodes: vec![ + LineageNode { id: 0, parent_id: None, generation: 0, genotype: b"AAAA".to_vec(), mutations_from_parent: 0, affinity: 0.0, abundance: 0, observed: false }, + LineageNode { id: 1, parent_id: Some(0), generation: 1, genotype: b"AAAC".to_vec(), mutations_from_parent: 1, affinity: 0.0, abundance: 2, observed: true }, + LineageNode { id: 2, parent_id: Some(0), generation: 1, genotype: b"AAAG".to_vec(), mutations_from_parent: 1, affinity: 0.0, abundance: 0, observed: false }, + LineageNode { id: 3, parent_id: Some(1), generation: 2, genotype: b"ATAC".to_vec(), mutations_from_parent: 1, affinity: 0.0, abundance: 1, observed: true }, + ], + } + } + + #[test] + fn fasta_emits_every_node_with_metadata_header() { + let fasta = to_fasta(&sample_tree()); + let lines: Vec<&str> = fasta.lines().collect(); + assert_eq!(lines.len(), 8); // 4 nodes => header+seq each + assert_eq!(lines[0], ">node0|gen=0|abundance=0|observed=false"); + assert_eq!(lines[1], "AAAA"); + assert_eq!(lines[2], ">node1|gen=1|abundance=2|observed=true"); + assert_eq!(lines[3], "AAAC"); + assert!(fasta.contains(">node3|gen=2|abundance=1|observed=true")); + } + + #[test] + fn node_table_tsv_has_header_and_one_row_per_node() { + let tsv = to_node_table_tsv(&sample_tree()); + let lines: Vec<&str> = tsv.lines().collect(); + assert_eq!(lines.len(), 5); // 1 header + 4 nodes + assert_eq!( + lines[0], + "node_id\tparent_id\tgeneration\tmutations_from_parent\tabundance\tobserved\taffinity\tsequence" + ); + assert_eq!(lines[1], "0\tNA\t0\t0\t0\tfalse\t0.0000\tAAAA"); + assert_eq!(lines[2], "1\t0\t1\t1\t2\ttrue\t0.0000\tAAAC"); + assert_eq!(lines[3], "2\t0\t1\t1\t0\tfalse\t0.0000\tAAAG"); + assert_eq!(lines[4], "3\t1\t2\t1\t1\ttrue\t0.0000\tATAC"); + } + + #[test] + fn newick_encodes_topology_and_branch_lengths() { + let nwk = to_newick(&sample_tree()); + assert!(nwk.ends_with(';'), "newick must end with ';': {nwk}"); + let opens = nwk.matches('(').count(); + let closes = nwk.matches(')').count(); + assert_eq!(opens, closes, "unbalanced parens: {nwk}"); + assert!(nwk.contains("node1:1")); + assert!(nwk.contains("node2:1")); + assert!(nwk.contains("node3:1")); + assert!(nwk.contains("node0")); + assert!(!nwk.contains("node0:"), "root must not have a branch length: {nwk}"); + assert_eq!(nwk, "((node3:1)node1:1,node2:1)node0;"); + } + + #[test] + fn newick_single_node_tree() { + let tree = LineageTree { + nodes: vec![LineageNode { + id: 0, parent_id: None, generation: 0, genotype: b"AAAA".to_vec(), + mutations_from_parent: 0, affinity: 0.0, abundance: 1, observed: true, + }], + }; + assert_eq!(to_newick(&tree), "node0;"); + } +} diff --git a/engine_rs/src/lineage/family.rs b/engine_rs/src/lineage/family.rs new file mode 100644 index 0000000..06b61be --- /dev/null +++ b/engine_rs/src/lineage/family.rs @@ -0,0 +1,181 @@ +//! One-call clonal family simulation: grow a lineage and sample it. + +use crate::ir::Simulation; +use crate::pass::Pass; +use crate::rng::Rng; + +use super::affinity::AffinityModel; +use super::branching::{grow_lineage_retaining_sims, BranchingParams}; +use super::sampling::sample_and_collapse; +use super::tree::LineageTree; + +/// Salt mixed into `params.seed` to give sampling an independent RNG stream +/// from growth, while staying deterministic for a given seed. +const SAMPLE_SEED_SALT: u64 = 0x5341_4D50_4C45_0001; // "SAMPLE\0\1" + +/// Grow a clonal family from `founder` and sample observed cells. +/// +/// Deterministic for `params.seed`. Growth and sampling use separate RNG +/// streams (the sampling stream is `params.seed ^ SAMPLE_SEED_SALT`), so the +/// whole family reproduces byte-for-byte across runs. +pub fn simulate_family( + founder: &Simulation, + params: &BranchingParams, + mutator: &dyn Pass, +) -> LineageTree { + let (mut tree, _sims, live) = grow_lineage_retaining_sims(founder, params, mutator, None); + let mut sample_rng = Rng::new(params.seed ^ SAMPLE_SEED_SALT); + sample_and_collapse(&mut tree, &live, params.n_sample, &mut sample_rng); + tree +} + +/// Grow + sample a clonal family with affinity selection. +pub fn simulate_family_with_affinity( + founder: &Simulation, + params: &BranchingParams, + mutator: &dyn Pass, + model: &AffinityModel, +) -> LineageTree { + let (mut tree, _sims, live) = + grow_lineage_retaining_sims(founder, params, mutator, Some(model)); + let mut sample_rng = Rng::new(params.seed ^ SAMPLE_SEED_SALT); + sample_and_collapse(&mut tree, &live, params.n_sample, &mut sample_rng); + tree +} + +/// Grow + sample a family, returning the tree AND the per-node Simulation arena. +/// Index in the arena equals the node id (arena[node.id] is the Simulation for +/// that node). Only observed (sampled) nodes are useful for AIRR projection; the +/// arena is full-tree and never trimmed. +pub fn simulate_family_sims( + founder: &Simulation, + params: &BranchingParams, + mutator: &dyn Pass, + affinity: Option<&AffinityModel>, +) -> (LineageTree, Vec) { + let (mut tree, sims, live) = grow_lineage_retaining_sims(founder, params, mutator, affinity); + let mut sample_rng = Rng::new(params.seed ^ SAMPLE_SEED_SALT); + sample_and_collapse(&mut tree, &live, params.n_sample, &mut sample_rng); + (tree, sims) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dist::{EmpiricalLengthDist, UniformBase}; + use crate::ir::{Nucleotide, NucHandle, Region, Segment, Simulation}; + use crate::lineage::BranchingParams; + use crate::passes::UniformMutationPass; + + fn founder() -> Simulation { + let mut sim = Simulation::new(); + for (i, b) in b"AAAAAAAA".iter().enumerate() { + let (next, _) = sim.with_nucleotide_pushed( + Nucleotide::germline(*b, i as u16, Segment::V)); + sim = next; + } + sim.with_region_added(Region::new(Segment::V, NucHandle::new(0), NucHandle::new(8))) + } + + #[test] + fn simulate_family_returns_valid_sampled_tree() { + let params = BranchingParams { + lambda_base: 1.5, lambda_mut: 0.0, max_generations: 6, + n_max: 300, n_sample: 20, seed: 2024, + }; + let mutator = UniformMutationPass::new( + Box::new(EmpiricalLengthDist::from_pairs(vec![(1, 1.0)])), + Box::new(UniformBase), + ); + let tree = simulate_family(&founder(), ¶ms, &mutator); + + assert!(tree.validate().is_ok(), "invalid tree: {:?}", tree.validate()); + let total_abundance: u32 = tree.nodes.iter().map(|n| n.abundance).sum(); + // abundance equals n_sample unless the family went extinct early + assert!(total_abundance == params.n_sample || total_abundance == 0); + + let tree2 = simulate_family(&founder(), ¶ms, + &UniformMutationPass::new( + Box::new(EmpiricalLengthDist::from_pairs(vec![(1, 1.0)])), + Box::new(UniformBase))); + assert_eq!(tree.len(), tree2.len()); + } + + #[test] + fn extinct_founder_yields_no_observed_cells() { + // lambda_base = 0 => founder never divides => the living final population + // is empty (extinct). An extinct clone must NOT be sampled: no observed node. + let params = BranchingParams { + lambda_base: 0.0, lambda_mut: 0.0, max_generations: 6, + n_max: 300, n_sample: 20, seed: 2024, + }; + let mutator = UniformMutationPass::new( + Box::new(EmpiricalLengthDist::from_pairs(vec![(1, 1.0)])), + Box::new(UniformBase), + ); + let (tree, _sims) = simulate_family_sims(&founder(), ¶ms, &mutator, None); + assert!( + tree.nodes.iter().all(|n| !n.observed && n.abundance == 0), + "extinct clone was sampled: {} observed nodes", + tree.nodes.iter().filter(|n| n.observed).count() + ); + } + + #[test] + fn observed_cells_are_all_at_the_final_living_generation() { + // In a grown family, every OBSERVED node must be a member of the final + // living generation (the maximum generation reached) — never an + // early-terminal/dead cell that drew 0 offspring at a lower generation. + // A single founder can go extinct, so scan seeds for one that grows. + let mutator = || UniformMutationPass::new( + Box::new(EmpiricalLengthDist::from_pairs(vec![(1, 1.0)])), + Box::new(UniformBase), + ); + let mut grew = false; + for seed in 0..50u64 { + let params = BranchingParams { + lambda_base: 1.6, lambda_mut: 0.0, max_generations: 6, + n_max: 300, n_sample: 20, seed, + }; + let (tree, _sims) = simulate_family_sims(&founder(), ¶ms, &mutator(), None); + let observed: Vec<&_> = tree.nodes.iter().filter(|n| n.observed).collect(); + if observed.is_empty() { + continue; // extinct for this seed + } + grew = true; + let max_gen = tree.nodes.iter().map(|n| n.generation).max().unwrap(); + for n in &observed { + assert_eq!( + n.generation, max_gen, + "observed node {} at generation {} is not at the final generation {} (seed {seed})", + n.id, n.generation, max_gen + ); + } + } + assert!(grew, "no seed in 0..50 produced a surviving family"); + } + + #[test] + fn simulate_family_sims_arena_length_matches_tree_and_observed_nodes_have_nonempty_pool() { + let params = BranchingParams { + lambda_base: 1.5, lambda_mut: 0.0, max_generations: 6, + n_max: 300, n_sample: 20, seed: 9999, + }; + let mutator = UniformMutationPass::new( + Box::new(EmpiricalLengthDist::from_pairs(vec![(1, 1.0)])), + Box::new(UniformBase), + ); + let (tree, sims) = simulate_family_sims(&founder(), ¶ms, &mutator, None); + + // Arena length must equal tree length (one entry per node). + assert_eq!(sims.len(), tree.len(), + "arena len {} != tree len {}", sims.len(), tree.len()); + + // Every observed node's sim must have a non-empty pool. + for node in tree.nodes.iter().filter(|n| n.observed) { + let sim = &sims[node.id as usize]; + assert!(!sim.pool.is_empty(), + "observed node {} has empty pool", node.id); + } + } +} diff --git a/engine_rs/src/lineage/mod.rs b/engine_rs/src/lineage/mod.rs new file mode 100644 index 0000000..5f4a753 --- /dev/null +++ b/engine_rs/src/lineage/mod.rs @@ -0,0 +1,41 @@ +//! Clonal lineage simulation: grow a real mutation tree from one founder +//! `Simulation` via a generation-synchronous birth–death process. +//! +//! ```ignore +//! use genairr_engine::lineage::{simulate_family, BranchingParams}; +//! use genairr_engine::passes::UniformMutationPass; +//! // build a founder Simulation + a mutator (S5F in production), then: +//! let params = BranchingParams { +//! lambda_base: 1.5, lambda_mut: 0.0, max_generations: 12, +//! n_max: 1000, n_sample: 60, seed: 42, +//! }; +//! let tree = simulate_family(&founder, ¶ms, &mutator); +//! assert!(tree.validate().is_ok()); +//! ``` +//! +//! The mutator is any `Pass`; production wires the S5F pass, tests use +//! `UniformMutationPass` (no reference cartridge required). Affinity-driven +//! selection (BLOSUM-weighted distance to a target antigen → fitness-modulated +//! offspring) lives in [`affinity`] and is used via [`simulate_family_with_affinity`]. +//! +//! Ground-truth export (Newick / FASTA / node-table TSV) lives in [`export`]. +//! Heavy-tailed clone-size distributions + repertoire composition (the TCR core +//! and the singleton tail of BCR repertoires) live in [`clone_size`]. + +pub mod tree; +pub mod poisson; +pub mod branching; +pub mod sampling; +pub mod family; +pub mod export; +pub mod affinity; +pub mod clone_size; + +pub use clone_size::{sample_clone_size, sample_repertoire_sizes, CloneSizeDist}; + +pub use affinity::{sim_to_aa, AffinityModel}; +pub use tree::{LineageNode, LineageTree}; +pub use branching::{grow_lineage, grow_lineage_retaining_sims, grow_lineage_with_affinity, grow_topology, BranchingParams}; +pub use sampling::sample_and_collapse; +pub use family::{simulate_family, simulate_family_sims, simulate_family_with_affinity}; +pub use export::{to_fasta, to_newick, to_node_table_tsv}; diff --git a/engine_rs/src/lineage/poisson.rs b/engine_rs/src/lineage/poisson.rs new file mode 100644 index 0000000..e428767 --- /dev/null +++ b/engine_rs/src/lineage/poisson.rs @@ -0,0 +1,68 @@ +//! Self-contained Poisson sampler over the engine `Rng` (Knuth's algorithm). + +use crate::rng::Rng; + +/// Draw a Poisson(`lambda`) variate using Knuth's multiplicative method. +/// +/// Deterministic for a given `Rng` state. Non-finite (NaN / ±inf) or +/// non-positive `lambda` always returns 0 — the `is_finite` guard matches the +/// engine's other Poisson sampler (`passes::count_source`) so an infinite +/// lambda cannot drive the loop to a pathological underflow-termination. +/// Suitable for the small lambdas used in clonal branching (offspring ~1.5, +/// mutations ~<1). Not optimized for very large lambda. +pub fn poisson_sample(rng: &mut Rng, lambda: f64) -> u32 { + if !(lambda.is_finite() && lambda > 0.0) { + return 0; + } + let l = (-lambda).exp(); + let mut k: u32 = 0; + let mut p: f64 = 1.0; + loop { + k += 1; + p *= rng.next_f64(); + if p <= l { + return k - 1; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::rng::Rng; + + #[test] + fn poisson_zero_lambda_is_always_zero() { + let mut rng = Rng::new(7); + for _ in 0..100 { + assert_eq!(poisson_sample(&mut rng, 0.0), 0); + } + } + + #[test] + fn poisson_is_deterministic_for_a_seed() { + let mut a = Rng::new(123); + let mut b = Rng::new(123); + let xs: Vec = (0..50).map(|_| poisson_sample(&mut a, 1.5)).collect(); + let ys: Vec = (0..50).map(|_| poisson_sample(&mut b, 1.5)).collect(); + assert_eq!(xs, ys); + } + + #[test] + fn poisson_nonfinite_lambda_is_zero() { + let mut rng = Rng::new(3); + assert_eq!(poisson_sample(&mut rng, f64::NAN), 0); + assert_eq!(poisson_sample(&mut rng, f64::INFINITY), 0); + assert_eq!(poisson_sample(&mut rng, -1.0), 0); + } + + #[test] + fn poisson_mean_is_near_lambda() { + let mut rng = Rng::new(99); + let n = 20_000u32; + let lambda = 1.5; + let total: u64 = (0..n).map(|_| poisson_sample(&mut rng, lambda) as u64).sum(); + let mean = total as f64 / n as f64; + assert!((mean - lambda).abs() < 0.1, "mean {mean} not near {lambda}"); + } +} diff --git a/engine_rs/src/lineage/sampling.rs b/engine_rs/src/lineage/sampling.rs new file mode 100644 index 0000000..a2e1991 --- /dev/null +++ b/engine_rs/src/lineage/sampling.rs @@ -0,0 +1,118 @@ +//! Sampling observed cells from a grown lineage and collapsing identical +//! genotypes into abundance-bearing observed nodes. + +use std::collections::HashMap; + +use crate::rng::Rng; + +use super::tree::LineageTree; + +/// Sample `n_sample` cells uniformly (with replacement) from `sampleable_ids` +/// (the LIVING population at sampling time — the final-generation cells, NOT all +/// tree leaves) and collapse identical genotypes: the first cell *drawn* carrying +/// a given genotype becomes the observed representative and accumulates the +/// abundance; later draws of the same genotype fold into it. Mutates `tree` in +/// place. +/// +/// If `sampleable_ids` is empty (an extinct family — no living cells), nothing is +/// marked observed: an unobserved extinct clone yields zero records. +pub fn sample_and_collapse( + tree: &mut LineageTree, + sampleable_ids: &[u32], + n_sample: u32, + rng: &mut Rng, +) { + if sampleable_ids.is_empty() || n_sample == 0 { + return; + } + + // genotype -> representative node id (first cell drawn with that genotype) + let mut rep_by_genotype: HashMap, u32> = HashMap::new(); + // representative node id -> accumulated abundance + let mut abundance: HashMap = HashMap::new(); + + for _ in 0..n_sample { + // Unbiased uniform index (Lemire), matching the engine's RNG convention; + // avoids the modulo bias of `next_u64() % len`. + let idx = rng.range_u32(sampleable_ids.len() as u32) as usize; + let cell_id = sampleable_ids[idx]; + let genotype = tree.get(cell_id).unwrap().genotype.clone(); + let rep = *rep_by_genotype.entry(genotype).or_insert(cell_id); + *abundance.entry(rep).or_insert(0) += 1; + } + + for (id, count) in abundance { + // Write-back relies on the `node.id == index into nodes` invariant + // (held by construction in the branching loop); guard it in debug builds. + debug_assert_eq!( + tree.nodes[id as usize].id, id, + "id-index invariant violated during sample write-back" + ); + let node = &mut tree.nodes[id as usize]; + node.abundance = count; + node.observed = true; + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::lineage::tree::{LineageNode, LineageTree}; + use crate::rng::Rng; + + // Tree with duplicate genotypes among leaves to exercise collapse. + fn tree_with_dupes() -> LineageTree { + LineageTree { + nodes: vec![ + LineageNode { id: 0, parent_id: None, generation: 0, genotype: b"AAAA".to_vec(), mutations_from_parent: 0, affinity: 0.0, abundance: 0, observed: false }, + LineageNode { id: 1, parent_id: Some(0), generation: 1, genotype: b"AAAC".to_vec(), mutations_from_parent: 1, affinity: 0.0, abundance: 0, observed: false }, + LineageNode { id: 2, parent_id: Some(0), generation: 1, genotype: b"AAAC".to_vec(), mutations_from_parent: 1, affinity: 0.0, abundance: 0, observed: false }, + LineageNode { id: 3, parent_id: Some(1), generation: 2, genotype: b"AAGC".to_vec(), mutations_from_parent: 1, affinity: 0.0, abundance: 0, observed: false }, + ], + } + } + + // Sampleable set with duplicate genotypes among the living cells (nodes 1 & 2 + // share "AAAC") to exercise genotype-collapse. + fn sampleable() -> Vec { + vec![1, 2, 3] + } + + #[test] + fn sampling_sets_abundances_summing_to_n_sample() { + let mut tree = tree_with_dupes(); + let mut rng = Rng::new(5); + let n_sample = 3; + sample_and_collapse(&mut tree, &sampleable(), n_sample, &mut rng); + + let total: u32 = tree.nodes.iter().map(|n| n.abundance).sum(); + assert_eq!(total, n_sample); + + assert!(tree.nodes.iter().any(|n| n.observed)); + for n in &tree.nodes { + assert_eq!(n.observed, n.abundance > 0); + } + } + + #[test] + fn identical_genotypes_collapse_into_one_observed_node() { + let mut tree = tree_with_dupes(); + let mut rng = Rng::new(1); + sample_and_collapse(&mut tree, &sampleable(), 3, &mut rng); + use std::collections::HashSet; + let mut seen: HashSet> = HashSet::new(); + for n in tree.nodes.iter().filter(|n| n.observed) { + assert!(seen.insert(n.genotype.clone()), + "genotype observed in more than one node (collapse failed)"); + } + } + + #[test] + fn empty_sampleable_set_marks_nothing_observed() { + // An extinct family (no living cells) yields zero observed cells. + let mut tree = tree_with_dupes(); + let mut rng = Rng::new(3); + sample_and_collapse(&mut tree, &[], 5, &mut rng); + assert!(tree.nodes.iter().all(|n| !n.observed && n.abundance == 0)); + } +} diff --git a/engine_rs/src/lineage/tree.rs b/engine_rs/src/lineage/tree.rs new file mode 100644 index 0000000..188a3f5 --- /dev/null +++ b/engine_rs/src/lineage/tree.rs @@ -0,0 +1,191 @@ +//! Clonal lineage tree data structures. +//! +//! A `LineageTree` is a flat arena of `LineageNode`s addressed by `id`. The +//! root is the founder (naive rearrangement); every other node is a somatic +//! descendant produced by one cell division. `genotype` is the node's +//! nucleotide sequence (pool bases) captured at creation, used for +//! genotype-collapse and downstream export. + +/// One cell in a clonal lineage. +#[derive(Clone, Debug, PartialEq)] +pub struct LineageNode { + /// Stable index of this node within `LineageTree::nodes` (also its arena position). + pub id: u32, + /// Parent node id; `None` only for the root founder. + pub parent_id: Option, + /// Generation depth from the root (root == 0). + pub generation: u32, + /// Nucleotide bases of this cell's `Simulation` pool at creation time. + pub genotype: Vec, + /// Mutations introduced on the edge from the parent to this node + /// (the number of substitutions on the parent→child edge; 0 for the root). + pub mutations_from_parent: u32, + /// Affinity of this cell to the target antigen (0.0 = neutral / not computed). + pub affinity: f64, + /// Observation count after sampling + genotype-collapse. 0 until sampled. + pub abundance: u32, + /// Whether this node was observed: the representative node a sampled + /// genotype collapses onto. (Internal-ancestor observation is a later concern.) + pub observed: bool, +} + +/// A clonal lineage as a flat arena of nodes (node `id` == index into `nodes`). +#[derive(Clone, Debug, Default)] +pub struct LineageTree { + pub nodes: Vec, +} + +impl LineageTree { + /// Total node count (founder + all descendants). + pub fn len(&self) -> usize { + self.nodes.len() + } + + /// True when the tree has no nodes. + pub fn is_empty(&self) -> bool { + self.nodes.is_empty() + } + + /// The founder node (the unique node with no parent). + /// + /// Panics if the tree has no root — call only on a grown tree. + pub fn root(&self) -> &LineageNode { + self.nodes + .iter() + .find(|n| n.parent_id.is_none()) + .expect("LineageTree has no root node") + } + + /// Node by id, or `None` if out of range. + pub fn get(&self, id: u32) -> Option<&LineageNode> { + self.nodes.get(id as usize) + } + + /// Direct children of `id`, in ascending id order. + pub fn children_of(&self, id: u32) -> Vec<&LineageNode> { + let mut v: Vec<&LineageNode> = self.nodes + .iter() + .filter(|n| n.parent_id == Some(id)) + .collect(); + v.sort_unstable_by_key(|n| n.id); + v + } + + /// Check structural invariants: `node.id == index` for every node; exactly + /// one root; every non-root parent exists; child generation == parent + /// generation + 1 (acyclicity then follows from the strictly-increasing + /// generation along any parent chain, since `get(pid)` resolves ids as + /// arena indices). + pub fn validate(&self) -> Result<(), String> { + if self.nodes.is_empty() { + return Err("empty lineage tree".to_string()); + } + // id == index invariant: relied upon by id-based lookups/write-backs. + for (idx, n) in self.nodes.iter().enumerate() { + if n.id as usize != idx { + return Err(format!( + "node at index {idx} has id {} (id-index mismatch)", + n.id + )); + } + } + let roots = self.nodes.iter().filter(|n| n.parent_id.is_none()).count(); + if roots != 1 { + return Err(format!("expected exactly 1 root, found {roots}")); + } + for n in &self.nodes { + if let Some(pid) = n.parent_id { + let parent = self + .get(pid) + .ok_or_else(|| format!("node {} references missing parent {pid}", n.id))?; + if n.generation != parent.generation + 1 { + return Err(format!( + "node {} generation {} != parent {} generation {} + 1", + n.id, n.generation, parent.id, parent.generation + )); + } + } + } + Ok(()) + } + + /// Nodes with no children (tips), in ascending id order. + /// + /// O(n): a single pass marks which ids are parents (relying on the + /// `id == index` invariant, same as the rest of the module), then filters. + /// Avoids the previous O(n²) of calling `children_of` per node. + pub fn leaves(&self) -> Vec<&LineageNode> { + let mut has_child = vec![false; self.nodes.len()]; + for n in &self.nodes { + if let Some(p) = n.parent_id { + if (p as usize) < has_child.len() { + has_child[p as usize] = true; + } + } + } + // Arena order == ascending id under the id==index invariant. + self.nodes + .iter() + .filter(|n| !has_child.get(n.id as usize).copied().unwrap_or(false)) + .collect() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // Build a tiny tree by hand: root(0) -> {1, 2}; 1 -> {3} + fn hand_tree() -> LineageTree { + LineageTree { + nodes: vec![ + LineageNode { id: 0, parent_id: None, generation: 0, genotype: b"AAAA".to_vec(), mutations_from_parent: 0, affinity: 0.0, abundance: 0, observed: false }, + LineageNode { id: 1, parent_id: Some(0), generation: 1, genotype: b"AAAC".to_vec(), mutations_from_parent: 1, affinity: 0.0, abundance: 0, observed: false }, + LineageNode { id: 2, parent_id: Some(0), generation: 1, genotype: b"AAAG".to_vec(), mutations_from_parent: 1, affinity: 0.0, abundance: 0, observed: false }, + LineageNode { id: 3, parent_id: Some(1), generation: 2, genotype: b"AATC".to_vec(), mutations_from_parent: 1, affinity: 0.0, abundance: 0, observed: false }, + ], + } + } + + #[test] + fn validate_accepts_well_formed_tree() { + assert!(hand_tree().validate().is_ok()); + } + + #[test] + fn validate_rejects_two_roots() { + let mut t = hand_tree(); + t.nodes[1].parent_id = None; // second root + assert!(t.validate().is_err()); + } + + #[test] + fn validate_rejects_nonmonotonic_generation() { + let mut t = hand_tree(); + t.nodes[3].generation = 1; // child not parent.gen + 1 + assert!(t.validate().is_err()); + } + + #[test] + fn validate_rejects_id_index_mismatch() { + let mut t = hand_tree(); + t.nodes[2].id = 99; // id no longer equals its arena index + assert!(t.validate().is_err()); + } + + #[test] + fn tree_accessors_report_structure() { + let t = hand_tree(); + assert_eq!(t.root().id, 0); + assert_eq!(t.len(), 4); + + let root_children: Vec = t.children_of(0).iter().map(|n| n.id).collect(); + assert_eq!(root_children, vec![1, 2]); + + let leaves: Vec = t.leaves().iter().map(|n| n.id).collect(); + assert_eq!(leaves, vec![2, 3]); // nodes with no children + + assert_eq!(t.get(3).unwrap().parent_id, Some(1)); + assert!(t.get(99).is_none()); + } +} diff --git a/engine_rs/src/live_call/mod.rs b/engine_rs/src/live_call/mod.rs index 7b1d535..474d0c0 100644 --- a/engine_rs/src/live_call/mod.rs +++ b/engine_rs/src/live_call/mod.rs @@ -28,7 +28,7 @@ pub use reference_index::{ BaseBitsets, BaseEvidence, BoundaryIndex, IndexedAllele, KmerHit, KmerIndex, ReferenceMatchIndex, SegmentRefIndex, DEFAULT_REFERENCE_KMER_LEN, }; -pub use refresh_hook::LiveCallRefreshHook; +pub use refresh_hook::{refresh_live_calls, LiveCallRefreshHook}; pub use refresh_plan::{LiveCallRefreshPlan, LiveCallRefreshStep}; fn assert_live_segment(segment: Segment) { diff --git a/engine_rs/src/live_call/refresh_hook.rs b/engine_rs/src/live_call/refresh_hook.rs index 64b750c..90fa844 100644 --- a/engine_rs/src/live_call/refresh_hook.rs +++ b/engine_rs/src/live_call/refresh_hook.rs @@ -98,6 +98,26 @@ impl EffectHook for LiveCallRefreshHook { } } +/// Public, unconditional V/D/J live-call refresh for a simulation +/// whose pool was edited outside the compiled runtime (e.g. the +/// clonal-lineage branching loop, which mutates child sims with a +/// bare `Pass::execute` and so never runs `LiveCallRefreshHook`). +/// +/// Unlike [`refresh_segments_for_edit`], this recomputes the call +/// for *every* assignable segment from scratch against the current +/// pool, regardless of the dirty log — the caller may have applied +/// many independent edits whose dirty windows were never threaded. +/// The dirty log is drained so the returned sim is in a clean state. +pub fn refresh_live_calls( + mut sim: Simulation, + reference_index: &ReferenceMatchIndex, +) -> Simulation { + for &segment in Segment::assignable() { + sim = with_assembled_segment_live_call(&sim, reference_index, segment); + } + drain_dirty_windows(sim) +} + /// Refresh V/D/J live calls after a base-edit pass, using dirty /// windows stamped by the pass's `DirtySignalObserver` to skip /// segments whose region doesn't overlap any dirty position. diff --git a/engine_rs/src/python/lineage.rs b/engine_rs/src/python/lineage.rs new file mode 100644 index 0000000..c1c7ba9 --- /dev/null +++ b/engine_rs/src/python/lineage.rs @@ -0,0 +1,612 @@ +//! PyO3 bindings for the clonal lineage engine: `LineageNode`, `LineageTree`, +//! and the `simulate_lineage` entry point. + +use pyo3::prelude::*; +use pyo3::types::PyDict; + +use crate::airr_record::build_airr_record; +use crate::event::{EventRecord, StateSummary, TraceSpan}; +use crate::ir::{NucHandle, SimulationEvent}; +use crate::ir::Simulation; +use crate::lineage::export::{to_fasta, to_newick, to_node_table_tsv}; +use crate::lineage::tree::{LineageNode, LineageTree}; +use crate::lineage::{simulate_family, simulate_family_sims, simulate_family_with_affinity, sim_to_aa, AffinityModel, BranchingParams}; +use crate::lineage::affinity::make_mature_target; +use crate::live_call::{refresh_live_calls, ReferenceMatchIndex}; +use crate::pass::{Outcome, PassCompileEffect}; +use crate::rng::Rng; +use crate::passes::S5FMutationPass; +use crate::s5f::S5FKernel; + +use super::outcome::{PyOutcome, airr_record_to_pydict}; +use super::refdata::PyRefDataConfig; +use super::simulation::PySimulation; + +/// One node of a clonal lineage tree (read-only view). +#[pyclass(name = "LineageNode", module = "GenAIRR._engine", frozen)] +pub struct PyLineageNode { + pub(crate) inner: LineageNode, +} + +#[pymethods] +impl PyLineageNode { + #[getter] + fn id(&self) -> u32 { + self.inner.id + } + #[getter] + fn parent_id(&self) -> Option { + self.inner.parent_id + } + #[getter] + fn generation(&self) -> u32 { + self.inner.generation + } + #[getter] + fn mutations_from_parent(&self) -> u32 { + self.inner.mutations_from_parent + } + #[getter] + fn abundance(&self) -> u32 { + self.inner.abundance + } + #[getter] + fn observed(&self) -> bool { + self.inner.observed + } + #[getter] + fn affinity(&self) -> f64 { + self.inner.affinity + } + /// Nucleotide sequence (pool bases) as a string. + #[getter] + fn sequence(&self) -> String { + String::from_utf8_lossy(&self.inner.genotype).into_owned() + } + + fn __repr__(&self) -> String { + format!( + "LineageNode(id={}, parent_id={:?}, generation={}, abundance={}, observed={})", + self.inner.id, + self.inner.parent_id, + self.inner.generation, + self.inner.abundance, + self.inner.observed + ) + } +} + +/// A clonal lineage tree (read-only view) with ground-truth export. +#[pyclass(name = "LineageTree", module = "GenAIRR._engine", frozen)] +pub struct PyLineageTree { + pub(crate) inner: LineageTree, +} + +impl PyLineageTree { + pub(crate) fn new(inner: LineageTree) -> Self { + Self { inner } + } +} + +#[pymethods] +impl PyLineageTree { + fn __len__(&self) -> usize { + self.inner.len() + } + + /// All nodes (founder + descendants), in arena order (ascending id). + fn nodes(&self) -> Vec { + self.inner + .nodes + .iter() + .cloned() + .map(|inner| PyLineageNode { inner }) + .collect() + } + + /// Validate structural invariants; raises ValueError if malformed. + fn validate(&self) -> PyResult<()> { + self.inner + .validate() + .map_err(pyo3::exceptions::PyValueError::new_err) + } + + /// Standard rooted Newick string (branch length = per-edge mutation count). + fn to_newick(&self) -> String { + to_newick(&self.inner) + } + + /// FASTA of every node (ancestral + observed) sequence. + fn to_fasta(&self) -> String { + to_fasta(&self.inner) + } + + /// Tab-separated node table. + fn to_node_table_tsv(&self) -> String { + to_node_table_tsv(&self.inner) + } +} + +/// Grow + sample a clonal lineage family from `founder` using an S5F mutator +/// built from the supplied kernel tables. Returns the ground-truth tree. +/// +/// `mutability` must have 1024 entries and `substitution` 4096 (the S5F 5-mer +/// kernel); all values must be finite and non-negative. `rate` is the per-base +/// SHM rate in [0, 1]. Determinism is keyed on `seed`. `lambda_mut` is reserved +/// for a future plan and currently has no effect (mutations are driven by the +/// S5F `rate`). +#[pyfunction] +#[pyo3(signature = ( + founder, mutability, substitution, rate, + lambda_base, lambda_mut, max_generations, n_max, n_sample, seed, + selection_strength=0.0, beta=1.0, target_aa=None, mature_substitutions=5 +))] +#[allow(clippy::too_many_arguments)] +pub(crate) fn simulate_lineage( + founder: &PySimulation, + mutability: Vec, + substitution: Vec, + rate: f64, + lambda_base: f64, + lambda_mut: f64, + max_generations: u32, + n_max: u32, + n_sample: u32, + seed: u64, + selection_strength: f64, + beta: f64, + target_aa: Option, + mature_substitutions: u32, +) -> PyResult { + use pyo3::exceptions::PyValueError; + + if mutability.len() != 1024 { + return Err(PyValueError::new_err(format!( + "simulate_lineage: mutability must have 1024 entries, got {}", + mutability.len() + ))); + } + if substitution.len() != 4096 { + return Err(PyValueError::new_err(format!( + "simulate_lineage: substitution must have 4096 entries, got {}", + substitution.len() + ))); + } + if !(rate.is_finite() && (0.0..=1.0).contains(&rate)) { + return Err(PyValueError::new_err(format!( + "simulate_lineage: rate must be in [0.0, 1.0], got {rate}" + ))); + } + if mutability.iter().any(|&m| !m.is_finite() || m < 0.0) { + return Err(PyValueError::new_err( + "simulate_lineage: mutability values must be finite and non-negative", + )); + } + if substitution.iter().any(|&s| !s.is_finite() || s < 0.0) { + return Err(PyValueError::new_err( + "simulate_lineage: substitution values must be finite and non-negative", + )); + } + if n_max == 0 { + return Err(PyValueError::new_err("simulate_lineage: n_max must be > 0")); + } + if n_sample == 0 { + return Err(PyValueError::new_err("simulate_lineage: n_sample must be > 0")); + } + if !(lambda_base.is_finite() && lambda_base >= 0.0) { + return Err(PyValueError::new_err(format!( + "simulate_lineage: lambda_base must be finite and >= 0, got {lambda_base}" + ))); + } + if !(lambda_mut.is_finite() && lambda_mut >= 0.0) { + return Err(PyValueError::new_err(format!( + "simulate_lineage: lambda_mut must be finite and >= 0, got {lambda_mut}" + ))); + } + // Cap generations: `to_newick` recurses to a depth equal to the generation + // count, so an unbounded value could overflow the stack across the FFI + // boundary. 1000 is far beyond any biological germinal-center reaction. + if max_generations > 1000 { + return Err(PyValueError::new_err(format!( + "simulate_lineage: max_generations must be <= 1000, got {max_generations}" + ))); + } + + // Lengths and value ranges are now guaranteed, so S5FKernel::new cannot panic. + let kernel = S5FKernel::new(mutability, substitution); + let mutator = S5FMutationPass::new_rate(kernel, rate); + let params = BranchingParams { + lambda_base, + lambda_mut, + max_generations, + n_max, + n_sample, + seed, + }; + + // Validate affinity params. + if !(beta.is_finite() && beta >= 0.0) { + return Err(PyValueError::new_err(format!( + "simulate_lineage: beta must be finite and >= 0, got {beta}" + ))); + } + if !(selection_strength.is_finite() && selection_strength >= 0.0) { + return Err(PyValueError::new_err(format!( + "simulate_lineage: selection_strength must be finite and >= 0, got {selection_strength}" + ))); + } + + // Neutral fast path: no selection and no explicit target → byte-identical to + // the pre-affinity behavior. + if selection_strength == 0.0 && target_aa.is_none() { + let tree = simulate_family(&founder.inner, ¶ms, &mutator); + return Ok(PyLineageTree::new(tree)); + } + + // Build the affinity model. Uniform per-position weights for v1 (the model + // accepts arbitrary weights; CDR3 region-weighting is a follow-up). + let founder_aa = sim_to_aa(&founder.inner); + let target = match target_aa { + Some(s) => s.into_bytes(), + None => { + // Deterministic auto "mature" target derived from the seed. + let mut trng = Rng::new(seed ^ 0x7461_7267_6574_0001); // "target\0\1" + make_mature_target(&founder_aa, mature_substitutions, &mut trng) + } + }; + let weights = vec![1.0; founder_aa.len()]; + let model = AffinityModel::new(target, weights, beta, selection_strength, &founder_aa); + let tree = simulate_family_with_affinity(&founder.inner, ¶ms, &mutator, &model); + Ok(PyLineageTree::new(tree)) +} + +/// A grown clonal family: the lineage tree plus per-node AIRR-projectable +/// `Outcome`s (only observed nodes carry one). +#[pyclass(name = "FamilyOutcome", module = "GenAIRR._engine")] +pub struct PyFamilyOutcome { + tree: LineageTree, + node_outcomes: Vec>, // index == node id +} + +#[pymethods] +impl PyFamilyOutcome { + /// The ground-truth lineage tree. + fn tree(&self) -> PyLineageTree { + PyLineageTree::new(self.tree.clone()) + } + + /// Per-node Outcomes aligned with `tree().nodes()`; None for unsampled nodes. + fn node_outcomes(&self) -> Vec> { + self.node_outcomes + .iter() + .cloned() + .map(|o: Option| o.map(PyOutcome::new)) + .collect() + } + + /// Observed nodes' Outcomes only (abundance > 0), in node-id order. + fn observed_outcomes(&self) -> Vec { + self.node_outcomes + .iter() + .cloned() + .flatten() + .map(PyOutcome::new) + .collect() + } + + /// Build AIRR Rearrangement record dicts for every observed node, + /// in node-id order (aligned with `observed_outcomes()`). + /// + /// Each node `Outcome` now carries a synthesized `MUTATE_S5F` + /// `EventRecord` encoding every pool position where + /// `base != germline` as a `BaseChanged` event, and + /// `mutation_count` is set to the number of such events. + /// `build_airr_record` therefore produces correct per-segment + /// and V-subregion mutation counts natively, without any manual + /// field overwrite. + fn airr_records<'py>( + &self, + py: Python<'py>, + refdata: &PyRefDataConfig, + ) -> PyResult>> { + let mut results = Vec::new(); + for (node_id, outcome) in self.node_outcomes.iter().enumerate() { + let Some(outcome) = outcome else { continue }; + let sequence_id = format!("node{}", node_id); + let rec = build_airr_record(outcome, refdata.inner(), &sequence_id); + let dict = airr_record_to_pydict(py, &rec)?; + results.push(dict); + } + Ok(results) + } +} + +// ────────────────────────────────────────────────────────────────── +// SHM event synthesis helper +// ────────────────────────────────────────────────────────────────── + +/// Synthesize a single `EventRecord` that encodes all net SHM +/// mutations visible in `sim.pool` (positions where `base != +/// germline`) as `SimulationEvent::BaseChanged` events. +/// +/// The record is tagged with `address::MUTATE_S5F` as its pass +/// name so the AIRR builder's event-loop filter picks it up and +/// the per-segment / per-subregion mutation counts it produces are +/// identical to the pool-scan. The returned count equals the number +/// of synthesized events; callers should set `sim.mutation_count` +/// to this value before wrapping `sim` in an `Outcome`. +fn synthesize_shm_event_record(sim: &Simulation) -> (EventRecord, u32) { + let mut events: Vec = Vec::new(); + for (i, nuc) in sim.pool.as_slice().iter().enumerate() { + if nuc.base == nuc.germline { + continue; + } + events.push(SimulationEvent::BaseChanged { + handle: NucHandle::new(i as u32), + old_base: nuc.germline, + new_base: nuc.base, + segment: nuc.segment, + germline_pos: nuc.germline_pos.get(), + }); + } + let net = events.len() as u32; + let summary = StateSummary::from_simulation(sim); + let record = EventRecord::pass_committed( + 0, + crate::address::MUTATE_S5F, + vec![PassCompileEffect::EditBases], + TraceSpan::new(0, 0), + summary, + summary, + events, + ); + (record, net) +} + + +/// Merge a per-node lineage `Outcome` with a corruption `Outcome` that was +/// run *from* the lineage node's final simulation. +/// +/// `base` is a lineage node `Outcome`: its `trace` is the founder +/// recombination trace, its `events` is the synthesized SHM `EventRecord`, +/// and its final simulation is post-SHM (`mutation_count` = net SHM). +/// +/// `corruption` is the corruption plan run from `base`'s final simulation: +/// its `trace` holds the corruption choices, its `events` holds the +/// corruption `EventRecord`s, and its final simulation is the corrupted +/// molecule (SHM `mutation_count` is preserved by the corruption passes). +/// +/// The merged `Outcome` carries the corrupted final simulation (so sequence +/// + quality bytes reflect the artefacts) but the *concatenation* of both +/// traces and both event ledgers, so the AIRR builder derives BOTH the SHM +/// per-segment counts (from the synthesized SHM event) AND the corruption +/// counters (`n_quality_errors`, `n_pcr_errors`, `n_indels`, …) from the +/// corruption events / trace. +#[pyfunction] +pub(crate) fn merge_lineage_corruption(base: &PyOutcome, corruption: &PyOutcome) -> PyOutcome { + let mut trace = base.inner.trace.clone(); + trace.append_delta(corruption.inner.trace.clone()); + + let mut events = base.inner.events.clone(); + events.extend(corruption.inner.events.iter().cloned()); + + PyOutcome::new(Outcome { + revisions: corruption.inner.revisions.clone(), + pass_names: corruption.inner.pass_names.clone(), + trace, + events, + }) +} + +/// Grow + sample a clonal lineage family from `founder` (a full `Outcome`) using +/// an S5F mutator, returning a `FamilyOutcome` with per-node AIRR-projectable +/// `Outcome`s for every observed (sampled) node. +#[pyfunction] +#[pyo3(signature = ( + founder, refdata, mutability, substitution, rate, + lambda_base, lambda_mut, max_generations, n_max, n_sample, seed, + selection_strength=0.0, beta=1.0, target_aa=None, mature_substitutions=5 +))] +#[allow(clippy::too_many_arguments)] +pub(crate) fn simulate_family_outcomes( + founder: &PyOutcome, + refdata: &PyRefDataConfig, + mutability: Vec, + substitution: Vec, + rate: f64, + lambda_base: f64, + lambda_mut: f64, + max_generations: u32, + n_max: u32, + n_sample: u32, + seed: u64, + selection_strength: f64, + beta: f64, + target_aa: Option, + mature_substitutions: u32, +) -> PyResult { + use pyo3::exceptions::PyValueError; + + if mutability.len() != 1024 { + return Err(PyValueError::new_err(format!( + "simulate_family_outcomes: mutability must have 1024 entries, got {}", + mutability.len() + ))); + } + if substitution.len() != 4096 { + return Err(PyValueError::new_err(format!( + "simulate_family_outcomes: substitution must have 4096 entries, got {}", + substitution.len() + ))); + } + if !(rate.is_finite() && (0.0..=1.0).contains(&rate)) { + return Err(PyValueError::new_err(format!( + "simulate_family_outcomes: rate must be in [0.0, 1.0], got {rate}" + ))); + } + if mutability.iter().any(|&m| !m.is_finite() || m < 0.0) { + return Err(PyValueError::new_err( + "simulate_family_outcomes: mutability values must be finite and non-negative", + )); + } + if substitution.iter().any(|&s| !s.is_finite() || s < 0.0) { + return Err(PyValueError::new_err( + "simulate_family_outcomes: substitution values must be finite and non-negative", + )); + } + if n_max == 0 { + return Err(PyValueError::new_err("simulate_family_outcomes: n_max must be > 0")); + } + if n_sample == 0 { + return Err(PyValueError::new_err("simulate_family_outcomes: n_sample must be > 0")); + } + if !(lambda_base.is_finite() && lambda_base >= 0.0) { + return Err(PyValueError::new_err(format!( + "simulate_family_outcomes: lambda_base must be finite and >= 0, got {lambda_base}" + ))); + } + if !(lambda_mut.is_finite() && lambda_mut >= 0.0) { + return Err(PyValueError::new_err(format!( + "simulate_family_outcomes: lambda_mut must be finite and >= 0, got {lambda_mut}" + ))); + } + if max_generations > 1000 { + return Err(PyValueError::new_err(format!( + "simulate_family_outcomes: max_generations must be <= 1000, got {max_generations}" + ))); + } + if !(beta.is_finite() && beta >= 0.0) { + return Err(PyValueError::new_err(format!( + "simulate_family_outcomes: beta must be finite and >= 0, got {beta}" + ))); + } + if !(selection_strength.is_finite() && selection_strength >= 0.0) { + return Err(PyValueError::new_err(format!( + "simulate_family_outcomes: selection_strength must be finite and >= 0, got {selection_strength}" + ))); + } + + let founder_sim = founder.inner.final_simulation().clone(); + let founder_trace = founder.inner.trace.clone(); + + let kernel = S5FKernel::new(mutability, substitution); + let mutator = S5FMutationPass::new_rate(kernel, rate); + let params = BranchingParams { + lambda_base, + lambda_mut, + max_generations, + n_max, + n_sample, + seed, + }; + + // Optional affinity model (same logic as simulate_lineage's non-neutral path). + let model: Option = if selection_strength == 0.0 && target_aa.is_none() { + None + } else { + let founder_aa = sim_to_aa(&founder_sim); + let target = match target_aa { + Some(s) => s.into_bytes(), + None => { + let mut trng = Rng::new(seed ^ 0x7461_7267_6574_0001); + make_mature_target(&founder_aa, mature_substitutions, &mut trng) + } + }; + let weights = vec![1.0; founder_aa.len()]; + Some(AffinityModel::new(target, weights, beta, selection_strength, &founder_aa)) + }; + + let (tree, sims) = simulate_family_sims(&founder_sim, ¶ms, &mutator, model.as_ref()); + + // The branching loop mutates child sims with a bare `Pass::execute` + // (no `LiveCallRefreshHook`), so each node sim's `segment_calls` + // sidecar still reflects the FOUNDER's alignment, not its own + // mutated pool. Recompute the V/D/J live calls from each observed + // node's pool before building its Outcome; otherwise the + // synthesized record carries stale v/d/j calls and validate_record + // false-fails with AlleleCallTieSetMismatch under heavy SHM. + let reference_index = ReferenceMatchIndex::build(refdata.inner()); + + let node_outcomes: Vec> = tree + .nodes + .iter() + .map(|n| { + if n.observed { + let refreshed = refresh_live_calls(sims[n.id as usize].clone(), &reference_index); + let s = &refreshed; + // Synthesize a MUTATE_S5F EventRecord encoding every + // pool position where base != germline as a + // BaseChanged event. This makes the Outcome + // self-consistent: build_airr_record reads these + // events to derive per-segment / per-subregion + // mutation counts, and the net count is stored in + // mutation_count so the NMutationsMismatch validator + // check passes. + let (event_record, net) = synthesize_shm_event_record(s); + let s2 = s.with_mutation_count(net); + Some(Outcome { + revisions: vec![s2], + pass_names: Vec::new(), + trace: founder_trace.clone(), + events: vec![event_record], + }) + } else { + None + } + }) + .collect(); + + Ok(PyFamilyOutcome { tree, node_outcomes }) +} + +/// Draw `n_clones` clone sizes (>=1) from a heavy-tailed distribution, with a +/// fraction `unexpanded_fraction` forced to size 1. Deterministic for `seed`. +/// `kind` is "power_law" or "lognormal". +#[pyfunction] +#[pyo3(signature = (n_clones, seed, kind="power_law", exponent=2.0, mu=1.0, sigma=1.0, max_size=100_000, unexpanded_fraction=0.0))] +#[allow(clippy::too_many_arguments)] +pub(crate) fn sample_clone_sizes( + n_clones: u32, + seed: u64, + kind: &str, + exponent: f64, + mu: f64, + sigma: f64, + max_size: u32, + unexpanded_fraction: f64, +) -> PyResult> { + use pyo3::exceptions::PyValueError; + if max_size == 0 { + return Err(PyValueError::new_err("sample_clone_sizes: max_size must be > 0")); + } + if !(unexpanded_fraction.is_finite() && (0.0..=1.0).contains(&unexpanded_fraction)) { + return Err(PyValueError::new_err( + "sample_clone_sizes: unexpanded_fraction must be in [0.0, 1.0]", + )); + } + let dist = match kind { + "power_law" => { + if !(exponent.is_finite() && exponent > 0.0) { + return Err(PyValueError::new_err( + "sample_clone_sizes: power_law exponent must be finite and > 0", + )); + } + crate::lineage::CloneSizeDist::PowerLaw { exponent, x_max: max_size } + } + "lognormal" => { + if !(mu.is_finite() && sigma.is_finite() && sigma >= 0.0) { + return Err(PyValueError::new_err( + "sample_clone_sizes: lognormal mu/sigma must be finite and sigma >= 0", + )); + } + crate::lineage::CloneSizeDist::LogNormal { mu, sigma, x_max: max_size } + } + other => { + return Err(PyValueError::new_err(format!( + "sample_clone_sizes: unknown kind {other:?}; use \"power_law\" or \"lognormal\"" + ))); + } + }; + let mut rng = crate::rng::Rng::new(seed); + Ok(crate::lineage::sample_repertoire_sizes(&mut rng, n_clones, &dist, unexpanded_fraction)) +} diff --git a/engine_rs/src/python/mod.rs b/engine_rs/src/python/mod.rs index fbb9e43..8186fdb 100644 --- a/engine_rs/src/python/mod.rs +++ b/engine_rs/src/python/mod.rs @@ -28,6 +28,7 @@ use pyo3::prelude::*; pub mod compiled; pub mod contract; pub mod event; +pub mod lineage; pub mod outcome; pub mod plan; pub mod refdata; @@ -47,6 +48,13 @@ pub(crate) fn register(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_function(pyo3::wrap_pyfunction!(lineage::simulate_lineage, m)?)?; + m.add_function(pyo3::wrap_pyfunction!(lineage::simulate_family_outcomes, m)?)?; + m.add_function(pyo3::wrap_pyfunction!(lineage::merge_lineage_corruption, m)?)?; + m.add_function(pyo3::wrap_pyfunction!(lineage::sample_clone_sizes, m)?)?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/engine_rs/src/python/outcome.rs b/engine_rs/src/python/outcome.rs index 943b62f..34c0e8e 100644 --- a/engine_rs/src/python/outcome.rs +++ b/engine_rs/src/python/outcome.rs @@ -225,7 +225,7 @@ impl PyOutcome { /// `Option` becomes `int | None`, `Option` becomes `float /// | None`, `Option` becomes `bool | None`. Strings and the /// few `bool`/`i64`/`f64` non-optional fields go through directly. -fn airr_record_to_pydict<'py>(py: Python<'py>, rec: &AirrRecord) -> PyResult> { +pub(crate) fn airr_record_to_pydict<'py>(py: Python<'py>, rec: &AirrRecord) -> PyResult> { let dict = PyDict::new_bound(py); // AIRR metadata diff --git a/mkdocs.yml b/mkdocs.yml index f4470f8..d864b06 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -204,7 +204,9 @@ nav: - Biology map: guides/biology-map.md - Recombination + junction biology: guides/recombination-junction.md - D inversion + receptor revision: guides/recombination-editing.md - - Clonal families: guides/clonal-families.md + - Clonal simulation overview: guides/clonal-families.md + - Clonal lineage trees: guides/clonal-lineage.md + - Clonal repertoires (TCR & abundance): guides/clonal-repertoire.md - Junction N/P additions: guides/junction-additions.md - Targeted SHM rates: guides/shm-targeting.md - Corruption + sequencing artefacts: guides/corruption-sequencing.md diff --git a/site_docs/assets/clonal-lineage-detection.png b/site_docs/assets/clonal-lineage-detection.png new file mode 100644 index 0000000..1f07577 Binary files /dev/null and b/site_docs/assets/clonal-lineage-detection.png differ diff --git a/site_docs/concepts/airr-record.md b/site_docs/concepts/airr-record.md index 87eb982..0960592 100644 --- a/site_docs/concepts/airr-record.md +++ b/site_docs/concepts/airr-record.md @@ -59,7 +59,7 @@ The canonical groups, in the order they appear on the record: 11. D-inversion provenance 12. Receptor-revision provenance 13. Read layout (paired-end) -14. Clonal stamping (when expand_clones runs) +14. Clonal stamping (when a clonal workflow runs) 15. Truth columns (when `expose_provenance=True`) The remaining sections walk through each. @@ -127,7 +127,7 @@ load-bearing in two cases: - **Family validation** uses them to confirm every descendant of a clonal family shares the recombination-time identity (see - [Clonal families](../guides/clonal-families.md)). + [Clonal simulation overview](../guides/clonal-families.md)). - **Aligner benchmarking** treats `v_call` as the aligner's prediction and `truth_*_call` as the ground truth, even when both came from the same simulator — the column split makes the @@ -343,20 +343,25 @@ for the quality model and the layout coordinates. ## Clonal fields -Two fields stamped Python-side when `.expand_clones(n_clones=..., -per_clone=...)` runs: - -| Field | Type | Meaning | -|---|---|---| -| `clone_id` | int | Identifier of the clonal family this record belongs to (0-based) | -| `parent_id` | int | Identifier of the ancestor outcome; equals `clone_id` for the records expanded from that ancestor | - -On non-clonal runs (no `expand_clones`), both fields are -**absent** from the record dict. Don't write code that assumes -they're always present; the family validators (`validate_families`, -`validate_families_with_parents`) are safe no-ops on non-clonal -batches because they handle the absent case explicitly. See -[Clonal families](../guides/clonal-families.md). +All modern clonal workflows stamp `clone_id`, but the rest of the +surface depends on which clonal model produced the record: + +| Field | Type | Produced by | Meaning | +|---|---|---|---| +| `clone_id` | int | `clonal_lineage`, `clonal_repertoire`, `expand_clones` | Planted clone / family label (0-based) | +| `duplicate_count` | int | `clonal_lineage`, `clonal_repertoire` | AIRR-standard abundance after genotype collapse | +| `parent_id` | int | legacy `expand_clones` | Identifier of the ancestor `Outcome`; equals `clone_id` for records expanded from that ancestor | +| `lineage_node_id` | int | `clonal_lineage` | Node id of the observed cell in the ground-truth lineage tree | +| `lineage_parent_id` | int | `clonal_lineage` | Parent node id in the lineage tree (`-1` for founder) | +| `lineage_generation` | int | `clonal_lineage` | Generation depth of the observed cell | +| `lineage_abundance` | int | `clonal_lineage` | Observation count after final-cell sampling and genotype collapse | +| `lineage_affinity` | float | `clonal_lineage` | Sequence-distance proxy to the target; 0 only when no affinity model is active | + +On non-clonal runs, these fields are **absent** from the record dict. Don't write +code that assumes they're always present; check with `"clone_id" in rec`. +`validate_families()` is a safe no-op on non-clonal batches because it handles +the absent case explicitly. See +[Clonal simulation overview](../guides/clonal-families.md). ## Validation @@ -369,10 +374,9 @@ The records page composes cleanly with the validation surface: - **`validate_families(refdata=None)`** groups records by `clone_id` and asserts each family agrees on `truth_v_call`, `truth_d_call`, `truth_j_call` (when present). -- **`validate_families_with_parents(refdata)`** compares each - descendant against its actual parent `Outcome` — catches a - whole class of family-stamping bugs the cheaper validator - misses. +- **`validate_families_with_parents(refdata)`** is for legacy + `expand_clones` results with `result.parents`; it compares each + descendant against its actual parent `Outcome`. The full validation picture lives at the [Validation hub](../validation/index.md). @@ -423,10 +427,11 @@ appear only when `expose_provenance=True` is passed to `run_records(...)`. Without the flag, the columns are absent entirely — not `None`-valued, absent. -**Expecting `clone_id` on non-clonal records.** Without -`.expand_clones(...)`, neither `clone_id` nor `parent_id` is -stamped on the record dict at all. Check for presence with -`"clone_id" in rec`, not `rec.get("clone_id") is not None`. +**Expecting `clone_id` on non-clonal records.** Without a clonal +workflow (`clonal_lineage`, `clonal_repertoire`, or legacy +`expand_clones`), clonal fields are not stamped on the record dict at +all. Check for presence with `"clone_id" in rec`, not +`rec.get("clone_id") is not None`. ## Where to go next @@ -438,7 +443,7 @@ stamped on the record dict at all. Check for presence with — the SHM counters in depth. - **[Corruption and sequencing artefacts](../guides/corruption-sequencing.md)** — the artefact counters in depth. -- **[Clonal families](../guides/clonal-families.md)** — the - `clone_id` / `parent_id` surface and family validation. +- **[Clonal simulation overview](../guides/clonal-families.md)** — + `clone_id`, `duplicate_count`, lineage metadata, and family validation. - **[Validation hub](../validation/index.md)** — re-deriving every field from the underlying `Outcome`. diff --git a/site_docs/demo.md b/site_docs/demo.md index 57fc53f..33810ef 100644 --- a/site_docs/demo.md +++ b/site_docs/demo.md @@ -287,10 +287,14 @@ ValueError: replay_from_trace_file: pass plan signature mismatch. --- -## 4. Clonal families +## 4. Legacy fixed-size clonal families One parent recombination can fork into many independently mutated -descendants. +descendants. This demo uses legacy `expand_clones` for the simple +fixed-size star shape; for new clone benchmarks, see +[`clonal_lineage`](guides/clonal-lineage.md) for BCR trees and +[`clonal_repertoire`](guides/clonal-repertoire.md) for TCR / abundance +repertoires.
Code
diff --git a/site_docs/guides/biology-map.md b/site_docs/guides/biology-map.md index 372a0bb..6d4b703 100644 --- a/site_docs/guides/biology-map.md +++ b/site_docs/guides/biology-map.md @@ -37,7 +37,9 @@ better starting point. | **End-loss (5′ / 3′)** | `.end_loss_5prime(length=...)`, `.end_loss_3prime(length=...)` (or `primer_trim_*prime` aliases) | Library / sequencing artefact — descendant | `end_loss_5_length`, `end_loss_3_length` | [Corruption + sequencing artefacts](corruption-sequencing.md) | | **Random strand orientation** | `.random_strand_orientation(prob=0.5)` | Read layout — descendant | `rev_comp` | [Corruption + sequencing artefacts](corruption-sequencing.md) | | **Paired-end layout** | `.paired_end(r1_length=..., insert_size=...)` | Read layout — descendant | `read_layout`, `r1_sequence`, `r2_sequence`, `r1_start`, `r1_end`, `r2_start`, `r2_end`, `insert_size` | [Paired-end reads and FASTQ](paired-end-fastq.md) | -| **Clonal expansion** | `.expand_clones(n_clones=..., per_clone=...)` | Ancestor / descendant fork | `clone_id`, `parent_id` (stamped Python-side) | [Clonal families](clonal-families.md) | +| **BCR lineage trees** | `.clonal_lineage(...)` | BCR affinity-maturation tree | `clone_id`, `lineage_*`, `duplicate_count`, `result.lineage_trees` | [Clonal lineage trees](clonal-lineage.md) | +| **TCR / flat-BCR clone-size repertoires** | `.clonal_repertoire(...)` | Non-tree clonal abundance | `clone_id`, `duplicate_count` | [Clonal repertoires](clonal-repertoire.md) | +| **Legacy fixed-size clonal stars** | `.expand_clones(n_clones=..., per_clone=...)` | Ancestor / descendant fork | `clone_id`, `parent_id`, `result.parents` | [Clonal simulation overview](clonal-families.md) | | **Contamination** | `.contaminate(prob=...)` | Library / sequencing artefact — descendant | `is_contaminant` | [Experiment builder](experiment-builder.md) | | **Sample metadata** | `.with_metadata(**fields)` | Bookkeeping — post-run | Arbitrary user-stamped columns | [Experiment builder](experiment-builder.md) | @@ -56,24 +58,32 @@ insertion. Productivity constraints (`productive_only`, `.receptor_revision()` edit the just-recombined molecule. Each can fire at most once per record. -**3. Ancestor / descendant fork (clonal pipelines only).** -`.expand_clones()` partitions the pipeline. Everything before -the fork runs once per ancestor; everything after fires per -descendant. +**3. Clonal structure (optional).** Choose one clonal surface: +`clonal_lineage()` for BCR trees, `clonal_repertoire()` for TCR / +flat-BCR clone-size repertoires, or legacy `expand_clones()` for a +fixed-size star. For flat forks, everything before the fork runs once +per clone and everything after fires per emitted copy. For +`clonal_lineage`, the tree growth and SHM happen inside the lineage +engine. **4. Biology — descendant phase.** `.mutate(...)` accumulates -biological SHM on top of recombination. On clonal pipelines -this fires *after* `expand_clones`; SHM is per-descendant, not -shared across the family. +biological SHM on top of recombination. On flat clonal pipelines +(`clonal_repertoire` / `expand_clones`) this fires after the fork so +SHM is per copy, not shared across the clone. TCR refdata rejects +`.mutate(...)`. `clonal_lineage` has its own tree-internal SHM rate. **5. Library / sequencing artefacts + read layout — descendant phase.** All corruption passes (`pcr_amplify`, `sequencing_errors`, `ambiguous_base_calls`, `polymerase_indels`, `end_loss_5prime`, `end_loss_3prime`, `random_strand_orientation`, `contaminate`) plus the read-layout projection -(`paired_end`). On clonal pipelines these must come after -`.expand_clones()`; calling any of them *before* the fork raises -`ValueError`. +(`paired_end`). On flat clonal pipelines these must come after the +fork; calling any of them before `clonal_repertoire()` or +`expand_clones()` raises `ValueError`. `clonal_lineage` accepts the +corruption passes after the fork but not `paired_end` yet. With +`clonal_repertoire`, paired-end records remain abundance-collapsed: +`duplicate_count` carries copy number and FASTQ export does not expand +it into repeated read pairs. **Per-batch bookkeeping** (`.with_metadata(...)`) stamps the result after every other stage has run. @@ -84,8 +94,8 @@ The two main ordering invariants: mutation; the corruption passes model the wet lab. Reversing the order would model SHM mutating an already-corrupted sequence, which doesn't match reality. -- **All descendant-phase passes follow `expand_clones`.** That's - what makes them per-descendant. Putting them earlier would +- **All descendant/read-phase passes follow flat clonal forks.** + That's what makes them per emitted copy. Putting them earlier would share their effects across the whole family. ## Cartridge-controlled vs Experiment-controlled @@ -135,7 +145,8 @@ The `Experiment` DSL carries the experimental design: - **Targeting overrides** — `segment_rates`, `v_subregion_rates` on `mutate(...)`; per-experiment `trim(v_3=..., d_5=..., ...)` distributions that override the cartridge defaults. -- **Clonal structure** — `expand_clones(n_clones, per_clone)`. +- **Clonal structure** — `clonal_lineage(...)`, + `clonal_repertoire(...)`, or legacy `expand_clones(...)`. - **Read layout** — `paired_end(...)`, `random_strand_orientation(...)`. - **Run-time flags** — `strict`, `expose_provenance`, @@ -170,10 +181,10 @@ priors. - **Paired-end geometry** — when `read_layout == "paired_end"`, R1/R2 coordinates checked against `insert_size`; reads are consistent with their parent assembled sequence. -- **Family invariants** — `validate_families` and - `validate_families_with_parents` assert each `clone_id` group - agrees on `truth_v_call` / `truth_d_call` / `truth_j_call`. - The parent-aware form additionally compares descendants +- **Family invariants** — `validate_families` groups by `clone_id` + and, when truth columns are present, asserts each group agrees on + `truth_v_call` / `truth_d_call` / `truth_j_call`. The parent-aware + form additionally compares legacy `expand_clones` descendants against their actual parent `Outcome`. **NOT validated:** diff --git a/site_docs/guides/clonal-families.md b/site_docs/guides/clonal-families.md index ba6847a..d1c4693 100644 --- a/site_docs/guides/clonal-families.md +++ b/site_docs/guides/clonal-families.md @@ -1,314 +1,298 @@ -# Generate clonal families - -

A clonal family is one parent recombination plus -many descendants that share the parent's V(D)J truth but diverge -through somatic hypermutation, library prep, and sequencing -artefacts. expand_clones is the DSL marker that turns -a flat pipeline into a clonal one — and once it's in the chain, -the API rejects misordered steps so you can't accidentally collapse -SHM diversity or split a recombination decision across descendants.

- -## What clonal simulation means - -In real biology, a clonal family is the lineage descended from a -single B cell whose V(D)J recombination has happened. Every cell -in that lineage shares the same V/D/J allele choices, the same -trims, the same NP bases, the same D orientation — those are -*recombination-time* decisions, frozen at fork. What diverges -between cousins is somatic hypermutation (each B cell mutates -independently), library prep artefacts (PCR errors don't cross -between reads), and sequencing geometry (R1/R2 windows are per-read). - -GenAIRR models that with a single DSL marker, `expand_clones`. The -methods you append BEFORE it run *once per clone* (and that -outcome is shared across every descendant). The methods you -append AFTER run *once per descendant* (independent draws per -read). - -## A minimal clonal example +# Clonal simulation overview -```python -import GenAIRR as ga +

GenAIRR has three clonal surfaces. Use +clonal_lineage when you need B-cell affinity-maturation trees, +clonal_repertoire when you need TCR or flat-BCR clone-size / +abundance repertoires, and legacy expand_clones only when you +need the older fixed-size star model. All three stamp planted clone labels so +AIRR clone-calling and ML benchmarks can compare inferred groups against the +truth the simulator created.

-result = ( - ga.Experiment.on("human_igh") - .recombine() # ancestor phase - .expand_clones(n_clones=5, per_clone=10) # fork - .mutate(model="s5f", rate=0.02) # descendant phase - .run_records(seed=1) -) +## Choose the right clonal model -print(len(result)) # 50 = 5 clones × 10 descendants -print(result[0]["clone_id"], result[0]["parent_id"]) # 0 0 -print(result[10]["clone_id"]) # 1 -``` +| Use case | DSL | Biology | Output truth | +|---|---|---|---| +| **BCR lineage reconstruction / affinity maturation** | [`clonal_lineage(...)`](clonal-lineage.md) | Generation-synchronous B-cell tree, per-division S5F SHM, optional sequence-distance selection, final live-cell sampling | AIRR records with `clone_id`, `lineage_*`, `duplicate_count`; one `LineageTree` per clone | +| **TCR clone-size / abundance benchmark** | [`clonal_repertoire(...)`](clonal-repertoire.md) | One rearranged T cell copied to a heavy-tailed clone size; no SHM; optional per-read technical noise | AIRR records with `clone_id` and AIRR `duplicate_count` | +| **Flat BCR abundance without genealogy** | [`clonal_repertoire(...)`](clonal-repertoire.md) | One BCR rearrangement copied to a heavy-tailed size; optional flat post-fork SHM per copy | AIRR records with `clone_id` and `duplicate_count` | +| **Legacy fixed-size star** | `expand_clones(...)` | One parent rearrangement and a fixed `per_clone` descendant count | AIRR records with `clone_id`, `parent_id`; parent `Outcome`s on `result.parents` | -`expand_clones(n_clones=5, per_clone=10)` produces exactly 5 × 10 = -50 records. Note that `run_records(seed=1)` doesn't take an `n` -argument — the clonal pipeline computes its own total. You can -pass `n=50` as a consistency check, but any other value (including -`n=100`) raises `ValueError` at run time. The pipeline knows how -many records it produces; you don't override it. +`expand_clones` is still supported for old scripts, but new clone-related +benchmarks should usually start with `clonal_lineage` or `clonal_repertoire`. +Those two encode the distinction AIRR users usually care about: BCR lineages are +mutation trees, while TCR clones are abundance groups with technical read noise. -## Ancestor vs descendant phase +## Shared DSL shape -The DSL partitions steps into two phases: +Clonal workflows all start by creating one founder rearrangement per clone: -**Ancestor-phase** — runs once per clonal parent. Use for any -mechanism whose decision must be inherited by every descendant -of a family. +```python +ga.Experiment.on("human_igh").recombine() +``` -| Pass | Why ancestor | -|---|---| -| `.recombine()` | V/D/J allele choices + trims + NP define the family's identity | -| `.invert_d(prob=...)` | D orientation is a recombination-time event; the family shares it | -| `.receptor_revision(prob=...)` | V replacement happens once during B-cell development; the family shares the post-revision V | +Everything before the clonal fork runs once per clone. Everything after a flat +fork (`clonal_repertoire` or `expand_clones`) runs once per emitted read/copy. +`clonal_lineage` is different: it grows the SHM tree internally, then optional +library-prep / sequencing artefact passes run once per observed cell. -**Descendant-phase** — runs independently per descendant. Use for -mechanisms that should vary within a family. +Only one clonal fork is allowed in a pipeline: -| Pass | Why descendant | -|---|---| -| `.mutate(...)` | Each memory B cell mutates independently | -| `.pcr_amplify(...)` | PCR errors are read-specific | -| `.polymerase_indels(...)` | Same — per-read library artefacts | -| `.ambiguous_base_calls(...)` | Per-read N-injection | -| `.sequencing_errors(...)` | Per-read quality artefacts | -| `.end_loss_5prime(...)`, `.end_loss_3prime(...)` | Per-read 5'/3' adapter loss | -| `.random_strand_orientation(...)` | Per-read strand decision | -| `.paired_end(...)` | R1/R2 windows are per-read | - -The DSL enforces ordering at chain time — not at compile time, not -silently at run time. If you call a descendant-phase method -*before* `expand_clones`, the next call to `expand_clones` raises: - -```text -ValueError: mutate must be called after expand_clones(); -SHM is descendant-specific in GenAIRR's current clonal model. -Move mutate(...) after expand_clones(...). +```python +.clonal_lineage(...) +.clonal_repertoire(...) +.expand_clones(...) ``` -If you call an ancestor-phase method (`invert_d` / `receptor_revision`) -*after* `expand_clones`, the offending method itself raises with -the symmetric message: +are mutually exclusive. -```text -ValueError: invert_d must be called before expand_clones(); -D inversion is a recombination-time decision and must be inherited -by all clone descendants. Move the invert_d(...) call before -expand_clones(...). -``` +`run_records(n=...)` is not the way to set clone output size for modern clonal +models. Use the clonal parameters instead: -A second call to `expand_clones` raises `expand_clones() can only -be called once per pipeline`. - -In practice the rule of thumb is: anything biology fixed once per -cell goes before the fork, anything happening per read goes after. -The DSL catches violations the moment you write them. +| Model | Output-size knobs | +|---|---| +| `clonal_lineage` | `n_clones`, `max_generations`, `n_max`, `n_sample`, extinction/survival, genotype collapse | +| `clonal_repertoire` | `n_clones`, `size_distribution`, `max_size`, `unexpanded_fraction`, genotype collapse | +| `expand_clones` | `n_clones × per_clone` fixed descendants | -## Reading clone IDs and parents +## BCR lineage trees -Every descendant record carries two integer fields that wire it -back to its clonal family: +Use `clonal_lineage` when you need a real B-cell genealogy: ```python -rec = result[0] - -rec["clone_id"] # 0 — family identity; every descendant of clone 0 carries 0 -rec["parent_id"] # 0 — addressing index into result.parents -``` +import GenAIRR as ga -**Today `clone_id == parent_id` by construction** — the two are -stamped separately because they carry distinct semantics -(`clone_id` is the family identity, `parent_id` is the lookup -index into the parents collection), so a future change to the -addressing scheme can move one without retrofitting the other. -Treat them as a pair; address downstream joins by `clone_id`. +result = ( + ga.Experiment.on("human_igh") + .recombine() + .clonal_lineage( + n_clones=20, + max_generations=6, + n_max=300, + n_sample=30, + rate=0.01, + lambda_base=1.6, + selection_strength=10.0, + ) + .sequencing_errors(rate=0.005) + .run_records(seed=0, validate_records=True) +) -The parent outcomes live separately on the result: +rec = result.records[0] +print(rec["clone_id"], rec["lineage_node_id"], rec["lineage_generation"]) +print(rec["lineage_abundance"], rec["duplicate_count"]) -```python -result.parents # list of length n_clones -len(result.parents) # 5 - -parent_outcome = result.parents[rec["parent_id"]] -# Outcome carrying the pre-fork plan's full trace + event ledger -parent_outcome.final_simulation() # post-recombine IR -parent_outcome.trace() # pre-fork addressed-choice trace -parent_outcome.events() # pre-fork event ledger +tree = result.lineage_trees[rec["clone_id"]] +newick = tree.to_newick() +node_table = tree.to_node_table_tsv() ``` -The flat `result.outcomes` list continues to hold exactly one -entry per descendant record (length `n_clones × per_clone`); -parents are exposed on the separate `.parents` collection so -clonal consumers get extra information without changing the -descendant-list shape. +What happens: -!!! info "Non-clonal results" - `result.parents` is `None` when `expand_clones` is NOT in the - pipeline. Record dicts from non-clonal runs also have - `record.get("clone_id") is None` and `record.get("parent_id") - is None`. Code that may receive either shape can safely use - `result.parents or []` and `record.get("clone_id")` without - branching on clonal-ness. +1. `recombine()` creates one naive BCR founder per clone. +2. The Rust lineage engine grows a tree for `max_generations`, with live-cell + carrying capacity `n_max`. +3. Each child receives per-division S5F SHM at `rate`. +4. If selection is enabled, offspring rates are modulated by a BLOSUM62 + sequence-distance proxy, not a physical antigen-binding model. +5. `n_sample` cells are sampled from the living final-generation population and + identical genotypes are collapsed into `lineage_abundance` / + `duplicate_count`. -## Validation +`clonal_lineage` is BCR-only. Calling it on TCR refdata raises `ValueError` +because T cells do not somatically hypermutate. -The validation hub covers the layers in depth; here's the clonal -slice. Three calls compose for the strongest gate: +Deep dive: [Clonal lineage trees](clonal-lineage.md). + +## TCR and flat-BCR abundance repertoires + +Use `clonal_repertoire` when the clone truth is membership and abundance, not a +lineage tree: ```python -# 1. Per-record AIRR consistency. -record_report = result.validate_records(refdata) -assert record_report, record_report.summary() +import GenAIRR as ga -# 2. Within-family invariants (no refdata required). -family_report = result.validate_families() -assert family_report, family_report.summary() +result = ( + ga.Experiment.on("human_tcrb") + .allow_curatable_refdata() + .recombine() + .clonal_repertoire( + n_clones=200, + size_distribution="power_law", + exponent=2.0, + max_size=500, + unexpanded_fraction=0.5, + ) + .sequencing_errors(rate=0.005) + .run_records(seed=0, validate_records=True) +) -# 3. Parent-aware checks: descendant truth fields agree with their parent. -full_report = result.validate_families_with_parents(refdata) -assert full_report, full_report.summary() +for rec in result.records[:5]: + print(rec["clone_id"], rec["duplicate_count"], rec["v_call"], rec["j_call"]) ``` -What each adds: - -- **`validate_families()`** groups records by `clone_id` and asserts - the recombination-time truth fields (`truth_v_call`, `truth_d_call`, - `truth_j_call`) are invariant across every descendant of a clone. - No refdata required; works on records-only results (e.g. - round-tripped from TSV). On a non-clonal batch (no record carries - a non-null `clone_id`) it's a safe no-op and returns ok with - `family_count == 0` — you don't need to branch on whether the - pipeline was clonal. -- **`validate_families_with_parents(refdata)`** adds structural - checks (`ParentsMissing` / `ParentIdMissing` / `ParentIdOutOfRange`) - and value-comparison checks against the actual parent `Outcome` - on `result.parents` (`ParentDInvertedMismatch`, - `ParentOriginalVCallMismatch`, `ParentTruthVCallMismatch`, - etc.). It does NOT re-run the within-family checks - `validate_families` does; it's a sibling validator, not a - superset. - -The runtime opt-in `run_records(..., validate_records=True)` runs -the per-record validator *and* the field-level family checks — but -NOT the parent-aware checks. If you want -`validate_families_with_parents` in CI, call it explicitly: +What happens: -```python -result = exp.run_records(seed=42, validate_records=True) -assert result.validate_families_with_parents(refdata), "parent mismatch" -``` +1. `recombine()` creates one rearrangement per clone. +2. A clone size is drawn from a heavy-tailed distribution: rounded continuous + power-law by default, or log-normal. +3. That many copies pass through post-fork per-read passes such as + `sequencing_errors`, `pcr_amplify`, `polymerase_indels`, `end_loss_*`, + `ambiguous_base_calls`, `random_strand_orientation`, or `paired_end`. +4. Identical output sequences collapse into one AIRR record whose + `duplicate_count` is the represented abundance. -## Common workflows +For TCR, do not add `.mutate(...)`; the API rejects it. TCR within-clone sequence +diversity should come from technical artefact passes only. For flat BCR +abundance, you may add post-fork `.mutate(...)`, but that is independent SHM off +the founder, not a tree. -A few patterns that come up repeatedly with clonal output. +Deep dive: [Clonal repertoires](clonal-repertoire.md). -**Clone-level SHM benchmark.** Hold the parent V(D)J fixed and -let SHM diverge: +## Legacy fixed-size stars + +`expand_clones` remains available for old fixed-size star benchmarks: ```python result = ( ga.Experiment.on("human_igh") .recombine() - .expand_clones(n_clones=100, per_clone=50) - .mutate(model="s5f", rate=0.05) - .run_records(seed=42) + .expand_clones(n_clones=5, per_clone=10) + .mutate(model="s5f", rate=0.02) + .run_records(seed=1) ) -# Per-clone SHM distribution: -import pandas as pd -df = pd.DataFrame(result.records) -df.groupby("clone_id")["n_mutations"].describe() +print(len(result.records)) # 50 +print(result.records[0]["clone_id"]) # 0 +print(result.records[0]["parent_id"]) # 0 +parent = result.parents[result.records[0]["parent_id"]] ``` -**Parent / descendant comparison.** Pull the parent IR for each -descendant and compare against the post-SHM sequence: +`expand_clones` records carry `parent_id` and `result.parents` because the old +star model keeps an explicit parent `Outcome`. Modern `clonal_repertoire` does +not expose `parents`; it collapses abundance into records. `clonal_lineage` +exposes lineage truth through `lineage_*` fields and `result.lineage_trees` +instead. + +## Output fields by model + +| Field / object | `clonal_lineage` | `clonal_repertoire` | `expand_clones` | +|---|---|---|---| +| `clone_id` | Yes: planted family label | Yes: planted clone label | Yes: planted family label | +| `duplicate_count` | Yes: alias of `lineage_abundance` after final-cell sampling | Yes: collapsed abundance | No standard abundance field in the legacy star model | +| `lineage_node_id` / `lineage_parent_id` / `lineage_generation` | Yes | No | No | +| `lineage_abundance` / `lineage_affinity` | Yes | No | No | +| `parent_id` | No; use `lineage_parent_id` for tree parent | No | Yes | +| `result.parents` | No | No | Yes | +| `result.lineage_trees` | Yes | No | No | +| `result.outcomes` | One per observed record | One per emitted/collapsed record | One per descendant record | + +For external clone-calling tools, keep the planted label under a non-AIRR name so +it does not collide with the tool's inferred `clone_id`: ```python -for rec in result.records: - parent = result.parents[rec["parent_id"]] - parent_seq = parent.final_simulation().bases() - # rec["sequence"] is the post-SHM descendant; parent_seq is - # the pre-fork assembled IR. +import pandas as pd + +df = pd.DataFrame(result.records).rename(columns={"clone_id": "true_clone_id"}) +df.to_csv("repertoire.tsv", sep="\t", index=False) ``` -**Export records with clone IDs.** Every export format carries the -`clone_id` and `parent_id` columns as-is — they ship with the -record dict: +`duplicate_count` is the AIRR-standard abundance column consumed by +abundance-aware workflows. `clonal_repertoire` and `clonal_lineage` emit it +directly. + +## Validation + +Use record validation on every clonal workflow: ```python -result.to_tsv("repertoire.tsv") # clone_id + parent_id columns included -df = result.to_dataframe() # same in the DataFrame -result.to_fasta("seqs.fa", prefix="seq") # headers include sequence_id; clone IDs in TSV +result = exp.run_records(seed=42, validate_records=True) ``` -**Paired FASTQ from clonal output.** `paired_end` is descendant- -phase, so it composes with `expand_clones` naturally: +or explicitly: ```python -result = ( - ga.Experiment.on("human_igh") - .recombine() - .expand_clones(n_clones=50, per_clone=20) - .mutate(model="s5f", rate=0.05) - .paired_end(r1_length=150, insert_size=300) - .run_records(seed=1) -) +record_report = result.validate_records(refdata) +assert record_report, record_report.summary() +``` + +Family validation is records-only and works across all clonal models that carry +`clone_id`: -result.to_paired_fastq("reads_R1.fastq", "reads_R2.fastq") -# 1,000 R1 records + 1,000 R2 records, sequence_ids match +```python +family_report = result.validate_families() +assert family_report, family_report.summary() ``` +Currently `validate_families()` checks that every record in a clonal batch has a +`clone_id` and, when `truth_v_call` / `truth_d_call` / `truth_j_call` are present +from `expose_provenance=True`, that those truth calls are invariant within each +clone. It does not validate lineage topology, clone-size priors, or biological +realism. + +`validate_families_with_parents(refdata)` is specific to legacy +`expand_clones`, because it compares descendant records against +`result.parents`. For `clonal_lineage`, validate the tree objects directly with +`tree.validate()` and use the exported Newick/FASTA/node table for lineage-tool +scoring. + +## Ordering rules + +Ancestor-phase steps go before the clonal fork: + +| Step | Why | +|---|---| +| `.recombine()` | Defines the clone's V/D/J, trims, NP sequence, and junction | +| `.invert_d(...)` | Recombination-time D orientation; inherited by the clone | +| `.receptor_revision(...)` | Recombination/development-time V replacement; inherited by the clone | +| `.productive_only()` / `.restrict_alleles(...)` | Constraints on the founder draw | + +Descendant/read-phase steps go after a flat fork (`clonal_repertoire` or +`expand_clones`): + +| Step | Notes | +|---|---| +| `.mutate(...)` | BCR-only flat SHM; not allowed on TCR | +| `.pcr_amplify(...)`, `.sequencing_errors(...)`, `.polymerase_indels(...)` | Per-read technical artefacts | +| `.ambiguous_base_calls(...)`, `.end_loss_*prime(...)`, `.random_strand_orientation(...)` | Per-read observation artefacts | +| `.paired_end(...)` | Supported after legacy `expand_clones`; accepted after `clonal_repertoire` with abundance-collapse caveats; not yet supported after `clonal_lineage` | + +For `clonal_lineage`, do not add `.mutate(...)` afterward: SHM is internal to the +tree and controlled by `clonal_lineage(rate=...)`. Library-prep and sequencing +artefact passes may follow; `paired_end` is still a future addition for lineage +output. + +When `paired_end` follows `clonal_repertoire`, records are still collapsed by +assembled `sequence` and carry `duplicate_count`. TSV/DataFrame output preserves +that abundance. FASTQ exporters do not expand `duplicate_count` back into multiple +read pairs, so use this path for paired fields on collapsed records, not for exact +per-copy paired FASTQ depth. + ## Common mistakes -A handful of issues that show up with clonal pipelines. - -**Calling `invert_d()` after `expand_clones()`.** D orientation is -a recombination-time decision; it must be inherited by every -descendant. `invert_d` rejects this immediately at chain time with -"D inversion is a recombination-time decision and must be -inherited by all clone descendants." Move it before -`expand_clones`. Same goes for `receptor_revision`. - -**Putting `.paired_end()` before `expand_clones()`.** R1/R2 windows -are per-read; placing `paired_end` in the ancestor phase would -share one R1/R2 window across the whole family. `expand_clones` -scans the prior steps when called and rejects descendant-phase -methods that appear before it: "paired_end must be called after -expand_clones(); it is descendant-specific and must be sampled -independently for each clone member." - -**Expecting child traces to include the full parent trace.** A -descendant `Outcome.trace()` carries only the post-fork plan's -addressed choices — SHM substitutions, PCR errors, indels, etc. -The pre-fork plan's trace (recombination choices, NP bases, D -inversion) lives on `result.parents[i].trace()` instead. They're -two separate addressing namespaces because the pre-fork plan ran -once per parent and the post-fork plan ran once per descendant. - -**Expecting mutation-distance aggregation fields today.** The -clonal validator's per-clone mutation-distance distribution and -pre-SHM junction invariance checks are deliberately deferred -(future-slice scope). Today's `validate_families` checks the -recombination-time truth fields are invariant within a clone; it -does NOT verify that descendant SHM counts cluster around a -biologically plausible distribution. If you need that, compute it -yourself from `df.groupby("clone_id")["n_mutations"]`. +**Using `clonal_lineage` for TCR.** TCR clones do not SHM. Use +`clonal_repertoire` for TCR clone-size and abundance benchmarks. + +**Expecting exact discrete Zipf from `clonal_repertoire`.** The default +`power_law` sampler is a rounded continuous inverse-CDF draw. It is heavy-tailed +and Zipf-like, but not an exact discrete Zipf PMF. + +**Expecting `parent_id` on every clonal model.** `parent_id` belongs to legacy +`expand_clones`. Use `lineage_parent_id` for BCR lineage-tree parentage, and use +`clone_id` + `duplicate_count` for `clonal_repertoire`. + +**Using `n=` with modern clonal models.** `clonal_lineage` and +`clonal_repertoire` compute record counts from their own parameters and genotype +collapse. Passing `n` to `run_records` raises. ## Where to go next -- **[Validation & reproducibility](../validation/index.md)** — the - hub explaining the three validation layers and how the runtime - opt-in composes with `validate_families`. -- **[SHM and mutation targeting](shm-targeting.md)** — what runs - per descendant in the SHM pass. -- **[Export the results](../getting-started/export-results.md)** — - TSV / DataFrame / paired FASTQ formats; `clone_id` ships on - every record. -- **[The Experiment builder](experiment-builder.md)** — the full - control-panel page including the canonical ancestor-phase / - descendant-phase rule. -- For the engine-side mechanics of the plan split and the - pre-fork / post-fork compile path, see the contributor audit - [`docs/clonal_plan_split_design.md`](https://github.com/MuteJester/GenAIRR/blob/master/docs/clonal_plan_split_design.md). +- **[Clonal lineage trees](clonal-lineage.md)** — full BCR lineage model, + affinity-selection proxy, tree exporters, and Change-O validation example. +- **[Clonal repertoires](clonal-repertoire.md)** — TCR and flat-BCR abundance + model, clone-size parameters, `duplicate_count`, and tool export notes. +- **[Validation & reproducibility](../validation/index.md)** — record and family + validation layers. +- **[Corruption + sequencing artefacts](corruption-sequencing.md)** — technical + noise passes that compose with clonal workflows. +- **[Paired-end reads and FASTQ](paired-end-fastq.md)** — paired-end output; + currently available for non-lineage clonal workflows. diff --git a/site_docs/guides/clonal-lineage.md b/site_docs/guides/clonal-lineage.md new file mode 100644 index 0000000..98a802d --- /dev/null +++ b/site_docs/guides/clonal-lineage.md @@ -0,0 +1,436 @@ +# Clonal lineage trees (affinity maturation) + +

Where expand_clones produces a star — one founder and many independent descendants — +clonal_lineage grows a real tree: a generation-by-generation +birth–death process in which cells divide, somatically hypermutate, are selected +for antigen affinity, and are finally sampled. The output is a set of +per-cell AIRR records plus the ground-truth lineage tree (topology, +ancestral sequences, abundances) — the kind of object B-cell lineage-inference tools +(GCtree, IgPhyML, dowser, Change-O) are built to reconstruct. This page explains +precisely how it works under the hood; nothing here is a black box.

+ +## Why a tree, not a star + +A clonal family in vivo is the progeny of one naive B cell that has entered a +germinal center. Inside that germinal center the cell **divides**, its B-cell +receptor **somatically hypermutates** a few bases per division, and cells whose +mutated receptor **binds the antigen better** are preferentially expanded +(affinity maturation). The result is a genealogy with internal ancestors, +unequal branch lengths, and selective sweeps — a tree. + +The older `expand_clones` collapses all of that into a star: it takes the founder +recombination and draws `per_clone` independent descendants directly off it. That +is fine for "many reads that share a V(D)J truth", but it has no genealogy, no +generations, no selection, and no ancestral nodes — so it cannot serve as ground +truth for lineage reconstruction. `clonal_lineage` adds the missing biology. + +> **`clonal_lineage` is BCR-only.** T cells do **not** somatically hypermutate, and +> `clonal_lineage` applies S5F SHM, so calling it on a TCR locus raises a clear +> `ValueError`. A TCR "clone" is one rearrangement proliferated to many identical +> copies; the meaningful quantity is the **clone-size distribution**, not a mutation +> tree. For **TCR and flat clonal repertoires**, use +> [`clonal_repertoire`](clonal-repertoire.md) — it draws a heavy-tailed clone size +> per clone and emits `clone_id` + `duplicate_count` (see +> [Clone-size distributions](#clone-size-distributions-tcr-and-repertoire-mix)). + +## Quick start + +```python +import GenAIRR as ga + +result = ( + ga.Experiment.on("human_igh") + .recombine() # the founder: one V(D)J recombination per clone + .clonal_lineage( + n_clones=20, # grow 20 independent families + max_generations=6, # germinal-center rounds + n_max=300, # per-generation living-population carrying capacity + n_sample=30, # cells sampled per family at the end + rate=0.01, # per-base S5F SHM rate, per division + lambda_base=1.6, # mean offspring per cell per generation + selection_strength=10.0, # 0 = neutral; >0 = affinity selection + ) + .run_records(seed=0) +) + +# Per-cell AIRR records, tagged with clone + lineage metadata: +for rec in result.records[:3]: + print(rec["clone_id"], rec["lineage_node_id"], rec["lineage_generation"], + rec["lineage_abundance"], rec["lineage_affinity"], rec["v_call"]) + +# Ground-truth trees, one per clone: +tree = result.lineage_trees[0] +tree.validate() # structural invariants (raises if malformed) +newick = tree.to_newick() # true topology, branch length = per-edge mutations +fasta = tree.to_fasta() # every node's sequence (ancestral + observed) +table = tree.to_node_table_tsv() +``` + +## How it works under the hood + +Each clone is grown independently by a generation-synchronous birth–death process +in the Rust engine. The whole loop is deterministic for a given `seed`. + +```mermaid +flowchart TB + A["Founder: one V(D)J recombination
(the pre-fork plan, run once per clone)"] --> B["Generation g = 1..max_generations"] + B --> C["For each live cell:
offspring k ~ Poisson(λ_eff)"] + C --> D["λ_eff = lambda_base
× carrying-capacity damping
× affinity fitness(cell)"] + C --> E["Each child = parent + per-division S5F mutations"] + E --> F["Child affinity = exp(−β · weighted aa distance to target)"] + F --> B + B --> G["Stop at max_generations / extinction / capacity"] + G --> H["Sample n_sample cells from the final population"] + H --> I["Collapse identical genotypes → abundances
(observed nodes)"] + I --> J["Project each observed cell → AIRR record
+ emit ground-truth tree"] +``` + +### 1. The founder + +`recombine()` (everything before `clonal_lineage` in the chain) is the **per-clone** +phase: it runs once to produce the naive rearrangement — the V/D/J allele picks, +trims, NP bases, junction. That founder `Simulation` (and its recombination trace) +is the root of the tree. Clone *c* uses seed `seed + c × 1_000_000`, so families +are independent and reproducible. + +### 2. Generation-synchronous birth–death + +Growth proceeds in discrete generations. In each generation every currently-live +cell produces a number of offspring drawn from a Poisson distribution: + +``` +k ~ Poisson(λ_eff) +``` + +`k = 0` means the cell leaves no progeny (it becomes a tip); `k ≥ 1` creates `k` +children for the next generation. `λ_eff` is the base offspring rate `lambda_base` +modulated by two factors below. + +### 3. Carrying capacity (logistic damping) + +A germinal center is population-bounded, so the effective rate is damped as the +live population `P` approaches `n_max`: + +``` +λ_eff = lambda_base × max(0, 1 − P / n_max) +``` + +Near saturation `λ_eff → 0` and growth plateaus instead of exploding. A hard cap +also prevents the live set from exceeding `n_max` even on a lucky Poisson draw. + +### 4. Per-division somatic hypermutation (S5F) + +Every child is a clone of its parent's `Simulation` with a fresh round of somatic +hypermutation applied. GenAIRR reuses its **context-sensitive S5F** engine (the +same one behind `mutate(model="s5f")`): mutations are drawn from the 5-mer +mutability/substitution kernel (`s5f_model`, default `"hh_s5f"`) at per-base rate +`rate`, applied one at a time with the sequence context re-evaluated between +mutations. Because SHM only substitutes bases in place, each cell keeps the +founder's V/D/J assignments and region map — its germline ancestry stays intact, +which is what lets every node be projected to a correct AIRR record (below). + +The branch length stored on each edge is the **realized** number of substitutions +introduced on that division. + +### 5. Affinity selection + +This is what turns a neutral tree into affinity maturation. Each cell has an +**affinity** to a target *sequence*: + +``` +affinity = exp(−beta · weighted_aa_distance(cell, target)) +``` + +> **What "affinity" means here — read this.** This is a **sequence-distance +> proxy**, not a physically modeled antigen-binding affinity. It is a BLOSUM62 +> substitution-aware amino-acid distance from the cell's translated receptor to a +> **target amino-acid sequence**, mapped through `exp(−β · distance)`. There is +> **no Kd, no antigen concentration, no biophysical binding model** anywhere in +> the computation. Treat it as a tunable **selection pressure that pulls the +> lineage toward a target sequence** — the closer a cell's receptor gets to the +> target, the higher its "affinity" and the faster it divides. + +`weighted_aa_distance` is a **BLOSUM62 substitution-aware** amino-acid distance +between the cell's translated receptor and the target (region weights let CDRs be +emphasized; v1 uses uniform weights, with CDR3-weighting as a planned refinement). +`affinity` is 1.0 at the target sequence and decays toward 0 as the cell diverges. + +The target is either supplied by you (`target_aa=...`, a target amino-acid +sequence) or auto-generated as a "mature" target — the founder's amino-acid +sequence with `mature_substitutions` random residue changes (the standard +benchmark convention). + +Affinity feeds back into the offspring rate through a **fitness multiplier**: + +``` +fitness = max(0, 1 + selection_strength · (affinity − founder_affinity)) +λ_eff = lambda_base × carrying_capacity_damping × fitness +``` + +`founder_affinity` is the affinity of the naive founder, so the founder has +fitness ≈ 1, cells that improve on it divide faster, and worse cells divide +slower — producing selective sweeps. **`selection_strength = 0` makes `fitness ≡ 1`, +i.e. a neutral tree** (byte-identical to growing with no selection at all). + +> **Calibration note.** `exp(−beta · distance)` can underflow to ~0 for long +> receptor sequences at large `beta`, flattening selection. If you supply a full +> antigen `target_aa`, keep `beta` small (e.g. `1e-3`) or use the auto target. + +### 6. Sampling and genotype collapse + +When growth stops (at `max_generations` or capacity), `n_sample` cells are sampled +from the **LIVING final-generation population** — the cells that are alive when +growth stops. Cells with **identical genotypes** are then **collapsed** into +**observed cells**: the first cell seen for a genotype becomes the observed +representative and accumulates an **abundance** count (surfaced as both +`lineage_abundance` and the AIRR-standard `duplicate_count`) — so abundance-aware +tools (GCtree, Change-O, SCOPer, dowser) get observed cells with multiplicities. +The observed cells are the ones that become AIRR records. + +Because sampling draws from the **living** population, an **extinct clone** — one +whose founder draws 0 offspring — has no living cells and therefore yields **zero +observed cells and zero records**. A single founder at `lambda_base ≈ 1.5` goes +extinct roughly 25 % of the time. By default (`allow_extinction=False`) each +requested clone is **conditioned on survival**: an extinct family is re-grown with +a fresh deterministic sub-seed (up to a bounded number of attempts) so you reliably +get all `n_clones` families back. Determinism is preserved — the same top-level +`seed` always reproduces the same result. Set `allow_extinction=True` to accept +extinction instead: extinct clones are skipped and you get **fewer** than `n_clones` +families. + +The full genealogy — including every unobserved **internal ancestor** — is still +emitted in the `LineageTree` (and its Newick/FASTA), so ancestral-sequence +reconstruction can be scored against truth; note, however, that observed/sampled +nodes are always tips, not internal ancestors (direct sampling of internal +ancestors is a future addition). + +## What you get back + +### Per-cell AIRR records + +`result.records` is a list of full AIRR Rearrangement dicts, one per *observed* +(genotype-collapsed) cell. Each carries the founder's recombination provenance +(`v_call`, `d_call`, `j_call`, junction, …) — correct because the node's `Outcome` +reuses the founder's recombination trace — plus its own mutated `sequence`. Mutation +counts (`n_mutations`, `n_v_mutations`, …, and the IMGT-subregion counters) are +recomputed **from the cell's sequence vs. germline** — these are **net differences +from germline** (accumulated across all divisions from founder to leaf). Because +identical genotypes are collapsed before sampling, the number of records per clone +is ≤ `n_sample`; the `lineage_abundance` field (mirrored by the AIRR-standard +`duplicate_count`) accounts for the collapsed copies. + +> **Branch lengths vs. record `n_mutations`.** Newick branch lengths +> (as returned by `to_newick()`) count the **per-division substitution events** +> along each edge — re-mutations of a site that was already mutated count again. +> The record field `n_mutations` is the **net difference from germline** at the +> leaf (a site mutated and then back-mutated is not counted). As a result, +> summing branch lengths root→leaf will generally exceed a leaf's `n_mutations`. +> Both quantities are standard and correct: branch lengths track evolutionary +> distance along an edge (as tools like GCtree and IgPhyML expect), while +> `n_mutations` tracks the observable deviation from germline (as AIRR requires). + +Consistency check: `n_v + n_d + n_j + n_np == n_mutations` holds for every record. + +Lineage metadata stamped on every record: + +| Field | Meaning | +|---|---| +| `clone_id` | Which family (0 … n_clones−1) — the ground-truth clone label | +| `lineage_node_id` | The cell's node id within its tree | +| `lineage_parent_id` | Parent node id (−1 for the founder) | +| `lineage_generation` | Generation depth (founder = 0) | +| `lineage_abundance` | Observation count after genotype collapse | +| `duplicate_count` | AIRR-standard alias of `lineage_abundance` (read by Change-O / SCOPer / dowser) | +| `lineage_affinity` | Sequence-distance proxy to the target (see [§5](#5-affinity-selection)). `0` **only** when no target is in play — fully neutral mode (`target_aa=None` **and** `selection_strength=0`). If a `target_aa` is supplied (or selection is on), affinities are computed and reported even when `selection_strength=0` | + +### Ground-truth lineage trees + +`result.lineage_trees` is one `LineageTree` per clone. Each tree exposes the full +genealogy (every node, not just observed ones) and three exporters consumed by +inference tools: + +- `to_newick()` — standard rooted Newick; the founder is the root, branch lengths + are per-edge mutation counts. (`((node3:1)node1:1,node2:1)node0;`) +- `to_fasta()` — every node's sequence, ancestral and observed, so + ancestral-sequence-reconstruction can be scored against truth. +- `to_node_table_tsv()` — `node_id, parent_id, generation, mutations_from_parent, + abundance, observed, affinity, sequence`. +- `nodes()` / `validate()` — node access and a structural-invariant check. + +## Library-prep & sequencing artefacts + +Library-prep and sequencing passes can follow `clonal_lineage` — they run +**independently on each observed cell**, so every read picks up its own noise: + +```python +result = (ga.Experiment.on("human_igh").recombine() + .clonal_lineage(n_clones=10, n_sample=30, rate=0.01, selection_strength=10) + .sequencing_errors(rate=0.005) + .pcr_amplify(rate=0.002) + .run_records(seed=0)) +``` + +Each observed cell's post-SHM sequence is passed through the corruption plan with +its own seed, and the resulting artefacts are merged back onto the cell's record. +The founder's recombination provenance (`v_call`, `d_call`, `j_call`, trims, +junction) **and** the per-segment SHM counts are preserved; the record additionally +reports the artefact counters (`n_quality_errors`, `n_pcr_errors`, `n_indels`, …). +Supported passes are the per-read library-prep / sequencing artefact set also +used by `clonal_repertoire` and legacy `expand_clones`: +`sequencing_errors`, `pcr_amplify`, `polymerase_indels`, `end_loss_*`, +`ambiguous_base_calls`, `random_strand_orientation`. + +`mutate` is **not** allowed after `clonal_lineage` — SHM is internal to the lineage +engine (set it via `clonal_lineage(rate=...)`). `paired_end` is **not** allowed yet +either (the read layout is not wired through the per-cell corruption merge — a future +addition). + +Validation works on lineage results too: `run_records(..., validate_records=True)` +runs the per-record postcondition check and the clonal-family consistency check +(by `clone_id`), with or without a corruption pass. `run_records(..., +expose_provenance=True)` adds `truth_v_call` / `truth_d_call` / `truth_j_call` +columns from the founder assignments, and `result.outcomes` carries the per-record +`Outcome` objects index-aligned with `result.records`. + +## Clone-size distributions (TCR and repertoire mix) + +> **For TCR, use [`clonal_repertoire`](clonal-repertoire.md).** `clonal_lineage` +> itself is **BCR-only** — it still rejects TCR loci. The heavy-tailed clone-size +> model described below is now exposed as a fluent DSL workflow via +> [`clonal_repertoire`](clonal-repertoire.md); that is the TCR (and flat-BCR-abundance) +> path. This section explains the model; the dedicated guide is the place to drive it. + +Real repertoires are not uniform: a few clones are huge, most are singletons. +[`clonal_repertoire`](clonal-repertoire.md) draws **clone sizes** from a +heavy-tailed distribution (rounded power-law / Zipf-like by default, log-normal +optional) with a controllable **unexpanded fraction** (size-1, never-expanded +clones). For TCR — +which has no SHM — a clone is simply one rearrangement at copy-number `size`, with +within-clone variation coming only from the post-fork sequencing/PCR-error passes; +identical reads collapse into AIRR records carrying `clone_id` + `duplicate_count`. +That mixes large expanded families with a realistic singleton tail. See the +[Clonal repertoires guide](clonal-repertoire.md) for the full workflow. + +## Determinism + +Everything is keyed on `seed`. Clone *c* grows from `seed + c × 1_000_000`; +within a clone, generations, divisions, mutations, and sampling all draw from +seeded RNG streams (growth and sampling use separate streams so they don't +interfere). Re-running with the same seed reproduces the trees and records +byte-for-byte. + +## Parameters + +| Parameter | Default | Meaning | +|---|---|---| +| `n_clones` | — | Number of independent families to grow | +| `max_generations` | 10 | Germinal-center rounds (≤ 1000) | +| `n_max` | 1000 | **Per-generation LIVING-population carrying capacity** — the live population each generation is capped at this. It is **not** a hard cap on total cells per clone; the tree can contain more total nodes across generations | +| `n_sample` | 50 | Cells sampled per family at the end; records per clone ≤ this (genotype-collapsed) | +| `rate` | 0.05 | Per-base S5F SHM rate, per division | +| `lambda_base` | 1.5 | Mean offspring per cell per generation | +| `selection_strength` | 0.0 | `0` = neutral drift (`fitness ≡ 1`); set `> 0` for affinity selection. Note `0` makes selection neutral but does **not** force `lineage_affinity` to 0 — affinities are still computed/reported whenever a `target_aa` is supplied | +| `beta` | 1.0 | Affinity steepness in `exp(−beta·distance)` | +| `target_aa` | `None` | Target amino-acid sequence (a full translated receptor) used as the selection target (BLOSUM62-weighted distance, position-wise; only the overlapping prefix is scored when lengths differ). A **sequence-distance proxy**, not a biophysical antigen. `None` ⇒ auto "mature" target | +| `mature_substitutions` | 5 | aa substitutions for the auto target | +| `s5f_model` | `"hh_s5f"` | Bundled S5F kernel | +| `allow_extinction` | `False` | `False` ⇒ condition each clone on survival (retry extinct founders with fresh deterministic sub-seeds), so you reliably get `n_clones` families. `True` ⇒ accept extinction and skip extinct clones, producing fewer families | + +## Clone recovery: what we actually ran + +The point of planting ground-truth clones is that **other people's tools can find +them**. Two clusterers were actually run against the planted labels and both +recover them perfectly (adjusted Rand index = 1.0) at realistic SHM: + +1. **Immcantation Change-O `DefineClones`** at its **default** junction-distance + threshold (0.16) recovers the planted clones exactly. +2. An **in-repo, implementation-independent standard-heuristic clusterer** + (V/J + junction-length + single-linkage) recovers them just as cleanly. + +The export formats are **designed to feed** the broader B-cell lineage ecosystem — +tree-based tools like **GCtree, IgPhyML, and dowser** consume the +Newick/FASTA/node-table exports, and abundance-aware clustering tools like +**SCOPer** read the AIRR TSV with `duplicate_count`. Those tools were **not** run +as part of this validation; the claim here is scoped to the two clusterers above +and to format compatibility, not to having executed the full ecosystem. + +![GenAIRR clonal_lineage clones are recovered by Change-O at default settings](../assets/clonal-lineage-detection.png) + +*30 clones simulated with `clonal_lineage` (human IGH, realistic SHM). **(A)** the +repertoire has a realistic spread of clone sizes. **(B)** within-clone junction +distances sit far below Change-O's default 0.16 threshold while between-clone +distances are ~1.0 — the planted clones are cleanly separable. **(C)** Change-O at +its default threshold recovers all 30 planted clones (adjusted Rand index = 1.0, +precision = recall = 1.0).* + +### Reproduce it: run Change-O on a simulated repertoire + +```python +import GenAIRR as ga +import pandas as pd + +result = (ga.Experiment.on("human_igh").recombine() + .clonal_lineage(n_clones=30, max_generations=3, n_max=300, + n_sample=20, rate=0.008, lambda_base=1.6, + selection_strength=0.3) + .run_records(seed=42)) + +# Write an AIRR-format TSV. Keep the ground-truth clone label under a NON-AIRR +# name so it doesn't collide with the tool's inferred `clone_id` column. +df = pd.DataFrame(result.records) +df = df.rename(columns={"clone_id": "true_clone_id"}) +df.to_csv("repertoire.tsv", sep="\t", index=False) +``` + +```bash +# Immcantation Change-O (pip install changeo) infers clones from junctions: +DefineClones.py -d repertoire.tsv --format airr \ + --act set --model ham --norm len --dist 0.16 -o clones.tsv +``` + +Comparing Change-O's inferred `clone_id` against the planted `true_clone_id` +(e.g. with `sklearn.metrics.adjusted_rand_score`) gives **ARI = 1.0** — a perfect +match. + +### What the validation shows across SHM regimes + +- **Perfect precision, always.** Across every tool and threshold tested, two + *different* planted clones are never merged. The per-clone founding + rearrangements are distinct, so the planted signal is unambiguous. +- **Full recovery at a matched threshold.** Detection is a function of how mutated + the lineage is versus the caller's distance cutoff. At realistic SHM, the default + 0.16 cutoff recovers everything; for deeply matured lineages (e.g. `rate=0.05`, + 6 generations → ~21 % SHM) you raise the cutoff (a threshold sweep climbs from + ARI 0.26 at 0.16 → 0.91 at 0.30 → 1.0 at 0.45). This mirrors how these tools + behave on real data and is **not** a property of the simulator. +- **Independent of any one tool.** The same recovery holds for the in-repo + implementation-independent V/J + junction-length + single-linkage clusterer — so + the signal is not an artefact of Change-O's specific model. The exported + Newick/FASTA/AIRR-TSV (with `duplicate_count`) are **designed to feed** tree-based + and abundance-aware methods (GCtree, IgPhyML, dowser, SCOPer) directly; those + downstream tools were not run here. + +In short: the two clusterers we ran recover the planted clones exactly, and the +export formats are built to hand the same ground truth to the wider ecosystem. + +## Relationship to `expand_clones` + +`expand_clones` (the star model) is **deprecated** but still works — it remains +useful for "many reads sharing one V(D)J truth" without a genealogy. + +`clonal_lineage` is **not a drop-in replacement.** It grows real +affinity-maturation trees rather than a flat star, so the surface differs: + +- **Different parameters.** There is no `per_clone`; the number of observed records + depends on `n_sample`, genotype collapse, and selection (not a fixed + `n_clones × per_clone` product). SHM is internal (`rate=...`), not a separate + `mutate` step. +- **Different return shape.** `clonal_lineage` returns a + `SimulationResultWithLineages` with per-clone `.lineage_trees` (Newick / FASTA / + node-table exporters) alongside the per-cell records. + +What *does* carry over: the same per-read library-prep / sequencing passes +(`sequencing_errors`, `pcr_amplify`, …) can follow `clonal_lineage` exactly as they +follow other clonal workflows, applied independently per observed cell (see +[Library-prep & sequencing artefacts](#library-prep-sequencing-artefacts)). And +`run_records(..., validate_records=True)` is supported on lineage results too. diff --git a/site_docs/guides/clonal-repertoire.md b/site_docs/guides/clonal-repertoire.md new file mode 100644 index 0000000..a209d4f --- /dev/null +++ b/site_docs/guides/clonal-repertoire.md @@ -0,0 +1,216 @@ +# Clonal repertoires (TCR & abundance) + +

Where clonal_lineage grows BCR affinity-maturation trees, clonal_repertoire builds +a non-tree clonal repertoire: each clone is one rearrangement proliferated +to a clone size drawn from a heavy-tailed distribution, and those +copies are emitted as reads through the library-prep / sequencing passes. Identical +reads collapse into AIRR records carrying the AIRR-standard duplicate_count. +It is the model for TCR repertoires (T cells don't somatically +hypermutate) and for flat BCR clonal abundance — the modern +replacement for the deprecated expand_clones, with realistic clone +sizes instead of a fixed per-clone count.

+ +## What it is & when to use it + +A real repertoire is a population of clones with wildly uneven sizes: a few huge +expanded clones and a long tail of singletons. `clonal_repertoire` reproduces that +structure. For each of `n_clones` clones it: + +1. runs the pre-fork plan (`recombine()`) **once** to fix the clone's + V/D/J + trim + NP backbone — the single rearrangement that defines the clone; +2. draws a **size** from a heavy-tailed clone-size distribution (rounded + power-law / Zipf-like by default, log-normal optional), with a controllable + **unexpanded-singleton fraction**; +3. emits that many **reads** through the post-fork library-prep / sequencing passes, + so reads diverge only by technical noise; +4. **genotype-collapses** identical reads into AIRR records, each carrying a + `clone_id` (ground-truth clone label) and a `duplicate_count` (abundance). + +Reach for it when you want a repertoire whose **ground truth is clone membership + +abundance** — the input clone-callers and abundance-aware tools expect — rather than +a per-clone mutation genealogy. + +### How it compares + +| | What it models | Ground truth | Loci | +|---|---|---|---| +| **`clonal_repertoire`** | Non-tree clonal abundance; one rearrangement × N copies + technical noise | `clone_id` + `duplicate_count` | **TCR** and flat **BCR** | +| [`clonal_lineage`](clonal-lineage.md) | BCR affinity-maturation **trees** (per-division SHM, selection) | Lineage tree + per-cell records | **BCR only** | +| `expand_clones` *(deprecated)* | Star: fixed `per_clone` count, **no** size distribution | `clone_id` + `parent_id` | BCR / TCR | + +`clonal_repertoire` is the modern replacement for flat clonal expansion: instead of +`expand_clones`' fixed `per_clone` count, every clone draws a realistic heavy-tailed +size. For BCR **lineage trees** (genealogy, ancestral sequences, selection), use +[`clonal_lineage`](clonal-lineage.md) instead. + +## The biology + +A T-cell clone is the progeny of one rearranged T cell proliferated to many +**identical** copies. T cells do **not** somatically hypermutate, so all the +sequence diversity you observe *within* a TCR clone is **technical** — PCR and +sequencing error introduced during library prep — not biological. That is exactly +the shape `clonal_repertoire` produces: one rearrangement per clone, `size` copies, +divergence only through the post-fork sequencing passes. + +Clone **sizes** are empirically heavy-tailed — TCR clone-size distributions are +approximately **power-law** (a handful of enormous clones, a long tail of +singletons). The default `size_distribution="power_law"` (with `exponent≈2`) +captures that, and `unexpanded_fraction` sets the share of clones forced to be +**never-expanded singletons** (size 1) — the resting naive cells that were never +clonally expanded. + +For **BCR** the same flat model is useful when you want clonal abundance without a +genealogy. SHM is optional and applied flat across the clone's copies via a +post-fork `.mutate()` (see below) — there is no mutation tree. + +## Quick start (TCR) + +```python +import GenAIRR as ga + +result = (ga.Experiment.on("human_tcrb").allow_curatable_refdata().recombine() + .clonal_repertoire(n_clones=200, size_distribution="power_law", + exponent=2.0, max_size=500, unexpanded_fraction=0.5) + .sequencing_errors(rate=0.005) # per-read technical noise + .run_records(seed=0)) + +# Each record carries a ground-truth clone label + an abundance count: +for rec in result.records[:5]: + print(rec["clone_id"], rec["duplicate_count"], rec["v_call"], rec["j_call"]) + +# T cells don't hypermutate — every record has zero SHM: +assert all(rec.get("n_mutations", 0) == 0 for rec in result.records) +``` + +`allow_curatable_refdata()` is the usual opt-in for sampling from the bundled TCR +catalogue (which includes pseudogene/ORF alleles). `sequencing_errors(rate=...)` is +the per-read technical-noise pass — `rate` is a per-base error probability in +`[0, 1]` (drawn as `count ~ Poisson(rate × read_len)` per read). `pcr_amplify(rate=...)` +has the same shape; any of the post-fork library-prep passes +(`sequencing_errors`, `pcr_amplify`, `polymerase_indels`, `end_loss_*`, +`ambiguous_base_calls`, `random_strand_orientation`) can follow the fork. + +## Quick start (flat BCR with SHM) + +The same model works for BCR. Optionally add a post-fork `.mutate()` to apply flat +SHM independently to each copy (this is **not** a lineage tree — it's per-read +mutation off the shared founder): + +```python +import GenAIRR as ga + +result = (ga.Experiment.on("human_igh").recombine() + .clonal_repertoire(n_clones=100, max_size=300, unexpanded_fraction=0.3) + .mutate(model="s5f", rate=0.01) # flat SHM on each copy + .sequencing_errors(rate=0.005) + .run_records(seed=0)) +``` + +With **no** post-fork passes at all, every copy of a clone is identical, so the +clone collapses to a **single** record whose `duplicate_count` equals the drawn +size: + +```python +r = (ga.Experiment.on("human_igh").recombine() + .clonal_repertoire(n_clones=10, max_size=100) + .run_records(seed=1)) + +from collections import Counter +per_clone = Counter(rec["clone_id"] for rec in r.records) +assert all(c == 1 for c in per_clone.values()) # one record per clone +assert all(rec["duplicate_count"] >= 1 for rec in r.records) +``` + +> **`mutate` is BCR-only.** A post-fork `.mutate()` on a **TCR** experiment is +> rejected by `mutate`'s own TCR guard — T cells don't hypermutate. On TCR, leave +> SHM out; the post-fork sequencing passes provide all within-clone variation. + +## How it works + +```mermaid +flowchart TB + A["For each clone c = 0..n_clones:
recombine() once → founder rearrangement"] --> B["Draw size_c from the clone-size distribution
(deterministic for seed)"] + B --> C{"Post-fork passes?"} + C -->|"no"| D["1 record, duplicate_count = size_c"] + C -->|"yes"| E["Emit size_c reads through the
library-prep / sequencing passes
(each read its own seed)"] + E --> F["Collapse identical sequences →
records with duplicate_count"] + D --> G["Stamp clone_id = c"] + F --> G +``` + +Per clone, the engine draws the size, runs the founder recombination once, then +plays `size` reads through the post-fork plan (each with its own derived seed) and +collapses by emitted `sequence`. `clone_id` is the ground-truth clone index; +`duplicate_count` is the post-collapse abundance. + +**Determinism.** Everything is keyed on `seed`: clone sizes come from a seeded draw, +clone *c* recombines from `seed + c × 1_000_000`, and each read within a clone draws +from a derived sub-seed. Re-running with the same `seed` reproduces the records and +`duplicate_count`s byte-for-byte. + +**Read-count cost.** When post-fork passes are present, the **total reads simulated +is roughly the sum of the drawn clone sizes** (before collapse). A heavy tail with a +large `max_size` can therefore blow up runtime — keep `max_size` modest (the default +is 1000) and remember a few clones near `max_size` dominate the cost. + +## Parameters + +| Parameter | Default | Meaning | +|---|---|---| +| `n_clones` | — | Number of clones to simulate (positive int). Sets the number of distinct `clone_id`s. | +| `size_distribution` | `"power_law"` | Clone-size law: `"power_law"` (Zipf-like, heavy-tailed) or `"lognormal"`. | +| `exponent` | `2.0` | Power-law exponent (`> 0`), used when `size_distribution="power_law"`. Higher ⇒ steeper tail / more singletons; `~2–3` is typical for TCR. | +| `mu` | `1.0` | Log-normal location parameter, used when `size_distribution="lognormal"`. | +| `sigma` | `1.0` | Log-normal scale (`>= 0`), used when `size_distribution="lognormal"`. Larger ⇒ heavier tail. | +| `max_size` | `1000` | Upper clamp on any clone size. Bounds runtime (total reads ≈ Σ sizes when post-fork passes are present) — keep it modest. | +| `unexpanded_fraction` | `0.0` | Fraction of clones **forced** to size 1 (never-expanded singletons), in `[0, 1]`. The forced count is `round(n_clones × fraction)`. | + +> **Singletons come from two places.** The power-law size is drawn by a **continuous +> inverse-CDF** and then **rounded** to an integer in `[1, max_size]`, so even with +> `unexpanded_fraction=0` a large share of clones round to size 1 (for `exponent=2` +> the singleton share is **~1/3**). `unexpanded_fraction` adds a *forced* floor of +> size-1 clones **on top of** that natural tail, so raise it when you want an even +> larger never-expanded population. Because the power-law is a continuous draw that +> is rounded — not an exact discrete Zipf PMF — the realized distribution is an +> approximation of true discrete Zipf. + +## Ground truth & tooling + +Each record carries the two ground-truth fields the downstream ecosystem consumes: + +- **`clone_id`** — the planted **clone membership** label (which rearrangement this + read descends from). +- **`duplicate_count`** — the **abundance** of the collapsed record (AIRR-standard + field name). + +Write the records to an AIRR TSV (keeping the ground-truth label under a non-AIRR +column name so it doesn't collide with a tool's inferred `clone_id`): + +```python +import pandas as pd +df = pd.DataFrame(result.records).rename(columns={"clone_id": "true_clone_id"}) +df.to_csv("repertoire.tsv", sep="\t", index=False) +``` + +The output format is **what clone-callers and abundance-aware tools consume** — +Change-O `DefineClones` and SCOPer read AIRR TSV with `duplicate_count`, and TCR +tools like tcrdist read V/J + junction. You can score a tool's inferred clusters +against the planted `true_clone_id` (e.g. with `sklearn.metrics.adjusted_rand_score`). +We frame the records as **format-compatible** with these tools; this page does **not** +claim they were validated against `clonal_repertoire` output here. + +## Limitations + +- **No mutation tree.** `clonal_repertoire` is a flat, non-tree model — there are no + ancestral nodes, generations, or selection. For a BCR genealogy (Newick / FASTA / + node table, affinity maturation), use [`clonal_lineage`](clonal-lineage.md). + A post-fork `.mutate()` on BCR applies *flat* SHM per copy, not a lineage. +- **Power-law is continuous-rounded, not exact discrete Zipf.** Sizes come from a + continuous inverse-CDF rounded to integers; this approximates true discrete Zipf + rather than matching its PMF exactly. +- **Cost scales with Σ sizes.** With post-fork passes present, the simulator emits + roughly the sum of the drawn sizes in reads before collapse. Large clones combined + with heavy post-fork passes mean many reads — keep `max_size` modest. +- **One fork per pipeline.** `clonal_repertoire`, `expand_clones`, and + `clonal_lineage` are mutually exclusive in a single chain; descendant-phase steps + (e.g. `.mutate()`, the corruption passes) must come **after** the fork. diff --git a/site_docs/guides/corruption-sequencing.md b/site_docs/guides/corruption-sequencing.md index 20679c8..93c75d8 100644 --- a/site_docs/guides/corruption-sequencing.md +++ b/site_docs/guides/corruption-sequencing.md @@ -253,18 +253,18 @@ A few facts to know: reflect the strand decision automatically. Don't apply a second flip downstream. -## Ordering with clonal families +## Ordering with clonal workflows **All seven corruption passes are descendant-phase.** They model -per-read artefacts — every descendant of a clone gets independent -PCR errors, independent indel events, independent end-loss -draws, etc. +per-read artefacts — every emitted clone member or observed lineage +cell gets independent PCR errors, independent indel events, +independent end-loss draws, etc. ```python result = ( ga.Experiment.on("human_igh") .recombine() - .expand_clones(n_clones=10, per_clone=20) + .clonal_repertoire(n_clones=10, max_size=50) .mutate(model="s5f", rate=0.03) # biology — descendant-phase .pcr_amplify(count=(0, 3)) # corruption — descendant-phase .polymerase_indels(count=(0, 2)) @@ -278,9 +278,12 @@ result = ( ) ``` -Calling any of them *before* `.expand_clones(...)` raises -`ValueError` at chain time. See [Clonal families](clonal-families.md) -for the ancestor / descendant phase discipline in full. +Calling corruption before a flat fork (`clonal_repertoire` or legacy +`expand_clones`) raises `ValueError` at chain time because the artefact would be +shared by the whole clone. Corruption may also follow `clonal_lineage`; in that +case it is applied independently to each observed sampled cell. See +[Clonal simulation overview](clonal-families.md) for the phase discipline in +full. ## Per-platform calibrated profiles @@ -402,12 +405,11 @@ separate passes, separate fields, separate biology stages. The [Recombination + junction biology](recombination-junction.md#trims-vs-end-loss) guide has the side-by-side comparison. -**Putting corruption before `.expand_clones()`.** All seven -corruption passes are descendant-phase — they're per-read -artefacts that need to vary within a clonal family. The DSL -rejects this at chain time with the uniform message: " -must be called after expand_clones(); it is descendant-specific -and must be sampled independently for each clone member." +**Putting corruption before a clonal fork.** All seven corruption +passes are descendant-phase — they're per-read artefacts that need to +vary within a clonal family. The DSL rejects this before +`clonal_repertoire()` or `expand_clones()` with a message naming the +offending method and telling you to move it after the fork. **Reverse-complementing R2 again after random strand orientation.** When `.random_strand_orientation(...)` is in the pipeline, @@ -430,8 +432,8 @@ count them yourself with `rec["sequence"].count("N")`. partition. - **[Paired-end reads and FASTQ](paired-end-fastq.md)** — the read-layout projection that usually sits alongside corruption. -- **[Clonal families](clonal-families.md)** — the - ancestor / descendant phase discipline that gates every +- **[Clonal simulation overview](clonal-families.md)** — the + clonal model chooser and phase discipline that gates every corruption pass. - **[Recombination + junction biology](recombination-junction.md)** — the trim vs end-loss boundary in detail. diff --git a/site_docs/guides/experiment-builder.md b/site_docs/guides/experiment-builder.md index 7bdc1a1..d3d1916 100644 --- a/site_docs/guides/experiment-builder.md +++ b/site_docs/guides/experiment-builder.md @@ -86,7 +86,7 @@ pass actually does: | **Recombination-stage mechanisms** | `.invert_d(prob=...)`, `.receptor_revision(prob=...)` | D in reverse-complement orientation; post-recombine V replacement | | **Constraints** | `.productive_only()`, `.restrict_alleles(v=[...], d=[...], j=[...])` | Constrain the sample space (productive triad; allele subsetting) | | **Biological mutation** | `.mutate(model="s5f"\|"uniform", rate=..., segment_rates={...}, v_subregion_rates={...})` | Somatic hypermutation per descendant | -| **Clonal structure** | `.expand_clones(n_clones=..., per_clone=...)` | Fork: passes before run once per parent, passes after run per descendant | +| **Clonal structure** | `.clonal_lineage(...)`, `.clonal_repertoire(...)`, legacy `.expand_clones(...)` | BCR trees, TCR / flat-BCR abundance repertoires, or fixed-size star families | | **Library / sequencing corruption** | `.pcr_amplify(count=...)`, `.polymerase_indels(count=...)`, `.ambiguous_base_calls(count=...)`, `.sequencing_errors(count=...)`, `.end_loss_5prime(length=...)`, `.end_loss_3prime(length=...)` | Library-prep + sequencer artefacts | | **Read layout** | `.paired_end(r1_length=..., r2_length=..., insert_size=...)`, `.random_strand_orientation(prob=...)` | R1/R2 windows; strand flips | | **Bookkeeping** | `.with_metadata(experiment_id=..., tissue=...)`, `.contaminate(prob=...)` | Stamp user fields onto every record; inject background contaminants | @@ -101,9 +101,16 @@ controls how much N is added and what bases get drawn; the ## Order matters GenAIRR's API rejects pipelines whose steps biologically cannot -compose. The single most important ordering rule is the **clonal -fork**: `expand_clones(...)` partitions the pipeline into an -ancestor phase and a descendant phase. +compose. The clonal methods are the main ordering boundary: + +| Method | Use it for | Post-fork behavior | +|---|---|---| +| `clonal_lineage(...)` | BCR affinity-maturation trees | SHM is internal to the tree; optional library-prep / sequencing artefacts run once per observed cell | +| `clonal_repertoire(...)` | TCR or flat-BCR abundance repertoires | Copies are emitted through post-fork per-read passes and collapsed into `duplicate_count` | +| `expand_clones(...)` | Legacy fixed-size star families | Fixed `n_clones × per_clone` descendants | + +For flat clonal models (`clonal_repertoire` and `expand_clones`), steps before +the fork run once per clone; steps after run once per emitted read/copy. ```python exp = ( @@ -113,7 +120,7 @@ exp = ( .invert_d(prob=0.05) .receptor_revision(prob=0.05) # ──── fork ───────────────────────────────────────────── - .expand_clones(n_clones=50, per_clone=20) + .clonal_repertoire(n_clones=50, max_size=100, unexpanded_fraction=0.3) # ──── descendant phase (runs ONCE per descendant) ────── .mutate(model="s5f", rate=0.05) .pcr_amplify(count=(0, 3)) @@ -122,34 +129,30 @@ exp = ( ) result = exp.run_records(seed=42) -# Output: n_clones × per_clone = 1000 records. Every clone shares -# the same V(D)J recombination + D orientation + receptor revision; -# every descendant within a clone has independent SHM, PCR errors, -# end-loss, and R1/R2 windows. +# Each clone shares the same V(D)J recombination + D orientation + +# receptor revision; each emitted read has independent SHM, PCR errors, +# end-loss, and R1/R2 windows. Identical reads collapse into duplicate_count. ``` -**Ancestor-phase passes** (anything before `expand_clones`) run once -per clonal parent and propagate to every descendant. Use them for -the V(D)J recombination event itself, the recombination-stage -mechanisms (D inversion, receptor revision), and any constraint -that pertains to the parent (`productive_only`, `restrict_alleles`). - -**Descendant-phase passes** (anything after `expand_clones`) run -independently per descendant. Use them for somatic hypermutation -(every memory B cell mutates independently), all library / -sequencer corruption (PCR errors don't share across descendants), -and read layout (R1/R2 windows are per-read). - -Putting an SHM pass *before* `expand_clones` would mean every -descendant of every clone shares the same mutations — biologically -wrong. Putting `invert_d` *after* `expand_clones` would mean -descendants of the same parent see different D orientations — -also wrong. The DSL raises `ValueError` at chain time when these -orderings are violated; you don't have to remember the rules -exhaustively, the API does. - -If `expand_clones` is absent, every pass is "descendant phase" and -runs per-record (since each record is its own one-off lineage). +**Ancestor-phase passes** (anything before a flat clonal fork) run once per +clonal parent and propagate to every emitted copy. Use them for the V(D)J +recombination event itself, recombination-stage mechanisms (D inversion, +receptor revision), and constraints on the founder draw (`productive_only`, +`restrict_alleles`). + +**Descendant/read-phase passes** (anything after a flat clonal fork) run +independently per emitted copy. Use them for BCR flat SHM, all library / +sequencer corruption, and read layout. TCR rejects `.mutate(...)`; T cells do +not SHM. If `paired_end` follows `clonal_repertoire`, remember the result is +still abundance-collapsed by assembled sequence: `duplicate_count` carries copy +number, and FASTQ export does not expand it into multiple read pairs. + +`clonal_lineage` handles its own tree-internal SHM through +`clonal_lineage(rate=...)`; do not add `.mutate(...)` after it. Library-prep and +sequencing artefacts may follow lineage output, but `paired_end` is not wired +through `clonal_lineage` yet. + +If no clonal method is present, every pass runs per record. ## Choosing counts vs rates @@ -260,7 +263,9 @@ seed produces the same draws, which is how golden tests work. | Biology → API surface lookup | [Biology map](biology-map.md) | | Per-segment + per-V-subregion SHM targeting | [Targeted SHM rates](shm-targeting.md) | | R1/R2 windows + insert sizes + FASTQ output | [Paired-end reads and FASTQ](paired-end-fastq.md) | -| Forking into clonal families | [Clonal families](clonal-families.md) | +| Choosing a clonal model | [Clonal simulation overview](clonal-families.md) | +| BCR lineage trees | [Clonal lineage trees](clonal-lineage.md) | +| TCR / flat-BCR abundance repertoires | [Clonal repertoires](clonal-repertoire.md) | | Authoring or tuning a custom reference cartridge | [Reference cartridge](../concepts/reference-cartridge.md) | | Confirming output integrity post-run | [`validate_records`](../validation/validate-records.md) | | Per-record AIRR field catalogue | [Your first AIRR record](../getting-started/first-airr-record.md) | diff --git a/site_docs/guides/index.md b/site_docs/guides/index.md index b17a2d6..1c38e39 100644 --- a/site_docs/guides/index.md +++ b/site_docs/guides/index.md @@ -21,8 +21,14 @@ specific biological mechanisms or pipeline patterns.

- **[SHM and mutation targeting](shm-targeting.md)** — uniform vs S5F, per-segment and per-V-subregion rates, counter partitions. -- **[Clonal families](clonal-families.md)** — `expand_clones`, - the ancestor / descendant phase split, family validation. +- **[Clonal simulation overview](clonal-families.md)** — choose + between BCR lineage trees, TCR / flat-BCR clone-size repertoires, + and legacy fixed-size stars. +- **[Clonal lineage trees](clonal-lineage.md)** — BCR + affinity-maturation trees with SHM, selection, sampling, and + ground-truth exports. +- **[Clonal repertoires](clonal-repertoire.md)** — TCR and flat-BCR + abundance models with heavy-tailed clone sizes and `duplicate_count`. ## Library + sequencing diff --git a/site_docs/guides/paired-end-fastq.md b/site_docs/guides/paired-end-fastq.md index 11608a5..146ee76 100644 --- a/site_docs/guides/paired-end-fastq.md +++ b/site_docs/guides/paired-end-fastq.md @@ -178,31 +178,37 @@ few interactions are worth knowing: at projection time, exactly once; the writer never applies a second flip. -## Clonal families +## Clonal workflows `paired_end` is a descendant-phase pass — R1/R2 windows are -per-read, so each clone member gets its own independent layout -draw. If you put `.paired_end(...)` before `.expand_clones(...)`, -the DSL raises at chain time. The right order: +per-read, so each emitted clone member gets its own independent layout +draw. It is fully supported after legacy `expand_clones(...)`, and +accepted after `clonal_repertoire(...)` with one important caveat: +`clonal_repertoire` collapses identical assembled sequences into one +record with `duplicate_count`, and FASTQ export does not expand that +abundance back into multiple read pairs. `clonal_lineage(...)` does +not support paired-end projection yet. If you put `.paired_end(...)` +before a flat clonal fork, the DSL raises at chain time. The right +order for a collapsed clonal-repertoire record surface: ```python result = ( ga.Experiment.on("human_igh") .recombine() - .expand_clones(n_clones=50, per_clone=20) + .clonal_repertoire(n_clones=50, max_size=100) .mutate(model="s5f", rate=0.05) .paired_end(r1_length=150, insert_size=300) .run_records(seed=1) ) result.to_paired_fastq("reads_R1.fastq", "reads_R2.fastq") -# 1,000 paired records (50 clones × 20 descendants). Each -# descendant has its own R1 / R2 windows, possibly drawn -# from a different insert_size. +# Each emitted read has its own R1 / R2 windows, possibly drawn from +# a different insert_size; identical reads may have collapsed into +# duplicate_count before export. ``` -See [Clonal families](clonal-families.md) for the ancestor / descendant -phase rules in full. +See [Clonal simulation overview](clonal-families.md) for the clonal model +chooser and phase rules in full. ## Validation @@ -242,12 +248,14 @@ discipline. A handful of issues that show up repeatedly with paired-end. -**Calling `.paired_end()` before `.expand_clones()`.** R1/R2 -windows are per-read, so `paired_end` is descendant-phase. The -DSL rejects this at chain time: "paired_end must be called after -expand_clones(); it is descendant-specific and must be sampled -independently for each clone member." Move `.paired_end(...)` -after `.expand_clones(...)`. +**Calling `.paired_end()` before a flat clonal fork.** R1/R2 windows +are per-read, so `paired_end` is descendant-phase. Move +`.paired_end(...)` after `clonal_repertoire(...)` or legacy +`expand_clones(...)`. For exact per-copy paired FASTQ depth today, +use legacy `expand_clones`; `clonal_repertoire` is abundance-collapsed +and exposes copy number through `duplicate_count`. `clonal_lineage(...)` +currently rejects `.paired_end(...)` even after the fork; paired-end +lineage output is a future addition. **Expecting two AIRR rows per molecule.** There's still one row per record — paired-end is a layered projection, not a record @@ -278,8 +286,8 @@ instead. - **[Validation & reproducibility](../validation/index.md)** — the validator that checks paired-end geometry, plus the reproducibility model. -- **[Clonal families](clonal-families.md)** — the ancestor / - descendant phase rules and why `paired_end` must come after - `expand_clones`. +- **[Clonal simulation overview](clonal-families.md)** — the clonal + model chooser and why `paired_end` must come after flat clonal + forks. - **[The Experiment builder](experiment-builder.md)** — where `paired_end` sits in the full DSL pipeline. diff --git a/site_docs/guides/recombination-editing.md b/site_docs/guides/recombination-editing.md index 1ffab07..6a62598 100644 --- a/site_docs/guides/recombination-editing.md +++ b/site_docs/guides/recombination-editing.md @@ -65,7 +65,7 @@ print(sum(1 for r in result.records if r["d_inverted"])) - **Single call per pipeline.** A second `.invert_d(...)` raises `ValueError: invert_d already configured on this experiment; v1 accepts at most one inversion step per pipeline`. -- **Must come before `.expand_clones(...)`** (ancestor-phase). See +- **Must come before clonal forks** (ancestor-phase). See [Clonal placement](#clonal-placement) below. ### The `d_inverted` AIRR field @@ -148,7 +148,7 @@ print(sum(1 for r in result.records if r["receptor_revision_applied"])) - **Single call per pipeline.** A second call raises `ValueError: receptor_revision already configured on this experiment`. -- **Must come before `.expand_clones(...)`** (ancestor-phase). +- **Must come before clonal forks** (ancestor-phase). - The compile path also checks that the cartridge has a V pool in refdata — required because the replacement allele draws from it. @@ -209,24 +209,25 @@ after V assembly). ## Clonal placement -Both methods must be configured BEFORE `.expand_clones(...)` — +Both methods must be configured BEFORE a clonal fork +(`clonal_lineage`, `clonal_repertoire`, or legacy `expand_clones`) — they're recombination-time decisions that the family inherits. The DSL enforces this at chain time with two symmetric guards: ```python -# WRONG — invert_d after expand_clones raises ValueError +# WRONG — invert_d after a clonal fork raises ValueError ga.Experiment.on("human_igh") \ .recombine() \ - .expand_clones(n_clones=10, per_clone=20) \ + .clonal_repertoire(n_clones=10, max_size=20) \ .invert_d(prob=0.05) -# ValueError: invert_d must be called before expand_clones(); +# ValueError: invert_d must be called before the clonal fork; # D inversion is a recombination-time decision and must be # inherited by all clone descendants. Move the invert_d(...) -# call before expand_clones(...). +# call before clonal_lineage(...), clonal_repertoire(...), or expand_clones(...). ``` Symmetric message for `receptor_revision` placed after -`expand_clones`. The right order is: +a clonal fork. The right order is: ```python result = ( @@ -234,7 +235,7 @@ result = ( .recombine() .invert_d(prob=0.05) # ancestor phase .receptor_revision(prob=0.02) # ancestor phase - .expand_clones(n_clones=10, per_clone=20) + .clonal_repertoire(n_clones=10, max_size=20) .mutate(model="s5f", rate=0.03) # descendant phase .run_records(seed=1) ) @@ -248,7 +249,7 @@ checks (via the `truth_*_call` fields when provenance exposure is on; the parent-aware validator also compares `d_inverted` and `original_v_call` against the parent Outcome). -See [Clonal families](clonal-families.md) for the full +See [Clonal simulation overview](clonal-families.md) for the full ancestor-vs-descendant phase discipline. ## Validation and replay @@ -329,8 +330,8 @@ re-RCs the bytes before comparing, do it on `rec["sequence"][d_sequence_start:d_sequence_end]` — don't remap the coordinates. -**Putting `.invert_d()` or `.receptor_revision()` after -`.expand_clones()`.** The DSL rejects this immediately at chain +**Putting `.invert_d()` or `.receptor_revision()` after a clonal +fork.** The DSL rejects this immediately at chain time with the symmetric message above. Both decisions must be shared across every descendant of a clonal family; the API will not let you accidentally split them. @@ -340,7 +341,7 @@ not let you accidentally split them. - **[Recombination and junction biology](recombination-junction.md)** — what `.recombine()` produces in the first place, before either of these mechanisms run. -- **[Clonal families](clonal-families.md)** — the ancestor-vs- +- **[Clonal simulation overview](clonal-families.md)** — the ancestor-vs- descendant phase discipline both mechanisms participate in. - **[Validation & reproducibility](../validation/index.md)** — the validator's issue catalogue (including the three issue diff --git a/site_docs/guides/recombination-junction.md b/site_docs/guides/recombination-junction.md index 0cb4193..146ee32 100644 --- a/site_docs/guides/recombination-junction.md +++ b/site_docs/guides/recombination-junction.md @@ -23,10 +23,11 @@ analysis would call "the receptor": | **Assembled regions** | The final structural regions (`V`, `NP1`, `D`, `NP2`, `J`) carrying their coordinates on the record | | **Junction** | The canonical V Cys → J W/F + 3 window exposed on `junction` / `junction_aa` / `junction_length` | -Everything before `.mutate()`, `.expand_clones()`, or any -corruption pass is recombination's responsibility. After the -pass, the molecule is "finished" in the recombination-biology -sense — SHM and library prep happen on top of it. +Everything before `.mutate()`, a clonal fork (`clonal_lineage`, +`clonal_repertoire`, legacy `expand_clones`), or any corruption pass is +recombination's responsibility. After the pass, the molecule is "finished" in +the recombination-biology sense — SHM, clonal expansion, and library prep happen +on top of it. ## A minimal recombination diff --git a/site_docs/index.md b/site_docs/index.md index 96037ac..c6c28fd 100644 --- a/site_docs/index.md +++ b/site_docs/index.md @@ -81,9 +81,9 @@ an aligner.

// Lineage · 03

-

Expand clones

-

Parent rearrangement forks N descendants. SHM accumulates - per-descendant after the fork.

+

Simulate clones

+

BCR lineage trees, TCR clone-size repertoires, and flat + abundance benchmarks with planted clone IDs.

@@ -112,17 +112,18 @@ an aligner.

```python import GenAIRR as ga +# Grow real BCR clonal lineage trees — affinity maturation, with ground truth result = ( ga.Experiment.on("human_igh") .recombine() - .expand_clones(n_clones=50, per_clone=20) - .mutate(rate=0.05) - .pcr_amplify(count=(0, 3)) - .ambiguous_base_calls(count=(0, 2)) - .productive_only().run_records(seed=42) + .clonal_lineage(n_clones=50, max_generations=6, n_sample=30, + rate=0.01, selection_strength=10.0) + .sequencing_errors(rate=0.001) + .run_records(seed=42) ) -result.to_tsv("repertoire.tsv") +result.to_tsv("repertoire.tsv") # per-cell AIRR records (clone_id, lineage_*) +newick = result.lineage_trees[0].to_newick() # ground-truth lineage tree per clone ``` ## Install. One command. No compiler. @@ -145,10 +146,10 @@ your task. | If you want to ... | Start here | |---|---| -| **Simulate sequences** | [Quick start](getting-started/quick-start.md) → [The Experiment builder](guides/experiment-builder.md) | +| **Simulate sequences** | [Quick start](getting-started/quick-start.md) → [The Experiment builder](guides/experiment-builder.md) → [API reference](reference/index.md) | | **Build a reference cartridge** | [Reference cartridge concept](concepts/reference-cartridge.md) → [Build a reference cartridge](guides/build-reference-cartridge.md) | -| **Get reproducible / validated output** | [Validation hub](validation/index.md) → [Trace, replay, reproducibility](guides/trace-replay.md) | -| **Understand the biological mechanisms** | [Recombination + junction biology](guides/recombination-junction.md), [SHM and mutation targeting](guides/shm-targeting.md), [Corruption + sequencing artefacts](guides/corruption-sequencing.md), [Clonal families](guides/clonal-families.md) | +| **Get reproducible / validated output** | [Validation hub](validation/index.md) → [Validate AIRR records](validation/validate-records.md) → [Trace, replay, reproducibility](guides/trace-replay.md) | +| **Understand the biological mechanisms** | [Recombination + junction biology](guides/recombination-junction.md), [SHM and mutation targeting](guides/shm-targeting.md), [Corruption + sequencing artefacts](guides/corruption-sequencing.md), [Clonal simulation overview](guides/clonal-families.md) | | **Contribute to GenAIRR** | [Architecture (Contributor)](architecture/index.md) | The **[Choose your path](learn.md)** page expands each of these diff --git a/site_docs/learn.md b/site_docs/learn.md index bfc7f5b..9e5536a 100644 --- a/site_docs/learn.md +++ b/site_docs/learn.md @@ -73,10 +73,16 @@ where the v1 boundary sits. 4. **[SHM and mutation targeting](guides/shm-targeting.md)** — uniform vs S5F, per-segment and per-V-subregion rates, counter partitions. -5. **[Clonal families](guides/clonal-families.md)** — ancestor / - descendant phase discipline, `clone_id` / `parent_id`, family - validation. -6. **[Corruption + sequencing artefacts](guides/corruption-sequencing.md)** +5. **[Clonal simulation overview](guides/clonal-families.md)** — + choose `clonal_lineage` for BCR trees, `clonal_repertoire` for + TCR / abundance repertoires, or legacy `expand_clones`. +6. **[Clonal lineage trees](guides/clonal-lineage.md)** — BCR SHM + trees, selection, final-cell sampling, lineage metadata, and + tree exports. +7. **[Clonal repertoires](guides/clonal-repertoire.md)** — TCR and + flat-BCR clone sizes, `duplicate_count`, and AIRR clone-caller + export. +8. **[Corruption + sequencing artefacts](guides/corruption-sequencing.md)** — the observation-stage mechanisms (PCR, sequencing errors, indels, end-loss, N corruption, strand). diff --git a/site_docs/reference/experiment.md b/site_docs/reference/experiment.md index 6578b25..074eb33 100644 --- a/site_docs/reference/experiment.md +++ b/site_docs/reference/experiment.md @@ -5,21 +5,23 @@ builder. Every method returns the same Experiment extended by one pipeline stage; the pipeline runs when you call .run_records(...), .run(...), or .compile().run(...). For the conceptual walk-through — -which methods are ancestor-phase vs descendant-phase, how clonal -expansion partitions the pipeline, how compile reuse works — see +which methods are ancestor-phase vs descendant-phase, which clonal +model to choose, how compile reuse works — see the Experiment builder guide. The reference below catalogues the public surface.

## Common methods -The six methods you'll reach for in 90 % of real pipelines: +The methods you'll reach for in most real pipelines: | Method | Purpose | |---|---| | `.recombine(...)` | Add the V(D)J recombination pass — the foundational ancestor-phase mechanism | | `.productive_only()` | Constrain sampling so only productive rearrangements survive | | `.mutate(...)` | Apply biological SHM (uniform or S5F) on top of recombination | -| `.expand_clones(...)` | Partition the pipeline into ancestor + descendant phases for clonal families | +| `.clonal_lineage(...)` | Grow BCR affinity-maturation lineage trees | +| `.clonal_repertoire(...)` | Generate TCR / flat-BCR abundance repertoires with clone sizes and `duplicate_count` | +| `.expand_clones(...)` | Legacy fixed-size clonal star model | | `.paired_end(...)` | Project assembled sequences as paired R1 / R2 reads | | `.run_records(...)` | Compile + run + return a `SimulationResult` | @@ -43,6 +45,8 @@ pilot for the wider generated-reference effort. - recombine - productive_only - mutate + - clonal_lineage + - clonal_repertoire - expand_clones - paired_end - run_records diff --git a/site_docs/reference/index.md b/site_docs/reference/index.md index 17883a7..c666399 100644 --- a/site_docs/reference/index.md +++ b/site_docs/reference/index.md @@ -86,9 +86,9 @@ access). | Symbol | Purpose | Guide | |---|---|---| | `ValidationReport` | Per-record AIRR-validator output | [`validate_records`](../validation/validate-records.md) | -| `FamilyValidationReport` | Family-validator output | [Clonal families](../guides/clonal-families.md) | +| `FamilyValidationReport` | Family-validator output | [Clonal simulation overview](../guides/clonal-families.md) | | `RecordValidationFailedError` | Raised when strict record validation fails | [`validate_records`](../validation/validate-records.md) | -| `FamilyValidationFailedError` | Raised when strict family validation fails | [Clonal families](../guides/clonal-families.md) | +| `FamilyValidationFailedError` | Raised when strict family validation fails | [Clonal simulation overview](../guides/clonal-families.md) | | `StrictSamplingError` | Raised under `strict=True` on empty admissible support | [Validation hub](../validation/index.md#strict-vs-permissive) | | `productive` | Convenience contract bundle for productive sampling | [Recombination biology](../guides/recombination-junction.md#productivity) | @@ -105,7 +105,7 @@ is called. | **Recombination** | `.recombine()`, `.invert_d(prob=...)`, `.receptor_revision(prob=...)` | Ancestor-phase mechanisms | | **Constraints** | `.productive_only()`, `.restrict_alleles(v=..., d=..., j=...)` | Constrain the sample space | | **Biological mutation** | `.mutate(model="s5f"\|"uniform", rate=..., count=..., segment_rates=..., v_subregion_rates=...)` | The only pass that increments `n_mutations` | -| **Clonal structure** | `.expand_clones(n_clones=..., per_clone=...)` | Marks the ancestor/descendant fork | +| **Clonal structure** | `.clonal_lineage(...)`, `.clonal_repertoire(...)`, legacy `.expand_clones(...)` | BCR trees, TCR / flat-BCR abundance repertoires, or fixed-size star families | | **Trims override** | `.trim(v_3=..., d_5=..., d_3=..., j_5=..., enabled=...)` | Per-experiment trim distribution overrides | | **Library / sequencer artefacts** | `.pcr_amplify(...)`, `.polymerase_indels(...)`, `.ambiguous_base_calls(...)`, `.sequencing_errors(...)`, `.end_loss_5prime(...)`, `.end_loss_3prime(...)` | All descendant-phase | | **Read layout** | `.paired_end(r1_length=..., r2_length=..., insert_size=...)`, `.random_strand_orientation(prob=...)` | Per-read projection | @@ -125,7 +125,7 @@ record dicts plus the underlying `Outcome` objects. | Surface | Methods / properties | Notes | |---|---|---| | **List-like access** | `len(result)`, `result[i]`, `result[a:b]`, `for rec in result:` | One AIRR dict per element | -| **Underlying state** | `.records`, `.outcomes`, `.parents` | `outcomes` is `None` when built from records only; `parents` is `None` on non-clonal results | +| **Underlying state** | `.records`, `.outcomes`, `.parents`, `.lineage_trees` | `outcomes` is `None` when built from records only; `parents` exists only for legacy `expand_clones`; `lineage_trees` exists on lineage results | | **Validation** | `.validate_records(refdata)`, `.validate_families()`, `.validate_families_with_parents(refdata)` | See [Validation hub](../validation/index.md) | | **Export** | `.to_tsv(path, *, airr_strict=False)`, `.to_csv(path, *, airr_strict=False)`, `.to_fasta(path, *, prefix="seq")`, `.to_fastq(path, *, quality="illumina", **kw)`, `.to_paired_fastq(r1, r2, *, quality="illumina", overwrite=False, **kw)`, `.to_dataframe(*, airr_strict=False)` | See [Export the results](../getting-started/export-results.md) | | **Construction** | `SimulationResult.from_outcomes(outcomes, refdata, *, id_prefix="seq", expose_provenance=False)` | Build a result from Rust `Outcome` objects + refdata; `expose_provenance=True` injects `truth_*_call` columns | diff --git a/site_docs/reference/simulation-result.md b/site_docs/reference/simulation-result.md index 789fffd..3ca7dd6 100644 --- a/site_docs/reference/simulation-result.md +++ b/site_docs/reference/simulation-result.md @@ -3,8 +3,10 @@

SimulationResult is the output wrapper returned by Experiment.run_records(...). It holds the list of AIRR record dicts, the underlying engine Outcome -objects (for trace / replay / validation), and the parent -Outcomes when the pipeline produced clonal families. +objects (for trace / replay / validation), legacy parent +Outcomes when expand_clones produced fixed-size +families, and lineage trees when clonal_lineage produced BCR +tree output. Treat it as a list-like view of records plus the typed validators and export helpers below.

@@ -23,6 +25,11 @@ The eight methods you'll reach for in real pipelines: | `.to_fasta(path, *, prefix="seq")` | Write assembled sequences as FASTA | | `.to_fastq(...)` / `.to_paired_fastq(...)` | Write FASTQ; paired-end requires `read_layout="paired_end"` | +`clonal_repertoire` returns ordinary `SimulationResult` records with +`clone_id` and `duplicate_count`. `clonal_lineage` returns +`SimulationResultWithLineages`, a subclass that adds `.lineage_trees`. +Legacy `expand_clones` returns `SimulationResult` with `.parents`. + ### FASTQ exports (prose only) The two FASTQ-emitting methods are documented in diff --git a/site_docs/validation/index.md b/site_docs/validation/index.md index 15358b4..1332fbf 100644 --- a/site_docs/validation/index.md +++ b/site_docs/validation/index.md @@ -61,8 +61,9 @@ one-liner. ### Family validation -For workloads using `expand_clones(...)`, two sibling validators -check family-level invariants: +For workloads that stamp `clone_id` (`clonal_lineage(...)`, +`clonal_repertoire(...)`, or legacy `expand_clones(...)`), family +validation checks clone-level record consistency: ```python family_report = result.validate_families() @@ -73,16 +74,19 @@ full_report = result.validate_families_with_parents(refdata) assert full_report, full_report.summary() ``` -`validate_families` checks within-family invariants alone — every -descendant of a clone shares its V(D)J recombination, every -descendant's `clone_id` matches the parent's, no descendant has a -SHM count below the parent's, and the per-clone size matches what -`expand_clones` was asked for. No refdata required. +`validate_families` is records-only. It checks that a clonal batch is +not mixed with non-clonal records and, when `truth_v_call`, +`truth_d_call`, and `truth_j_call` are present, that those +recombination-time truth calls are invariant within each `clone_id` +group. No refdata required. -`validate_families_with_parents(refdata)` adds full per-record -validation on every parent before checking family invariants — -it's the strongest gate. Use it in release-tier CI when the -pipeline ships clonal output. +`validate_families_with_parents(refdata)` is the stronger +legacy-star diagnostic for `expand_clones(...)` outputs, where +`result.parents` exists. It compares descendant records against the +actual parent `Outcome`. Modern `clonal_repertoire` and +`clonal_lineage` do not expose `result.parents`; validate their +records with `validate_records`, use `validate_families` for +`clone_id` grouping, and validate lineage trees with `tree.validate()`. ### Runtime opt-in @@ -108,8 +112,8 @@ hot loops. | Allele calls (`v_call` / `d_call` / `j_call` matches an independent walker) | `validate_records` | | Junction + productive triad (junction content, `vj_in_frame`, `stop_codon`, `productive` predicate identification) | `validate_records` | | Paired-end geometry (R1/R2 windows, R2 reverse-complement, `insert_size`) | `validate_records` (when `paired_end()` is in the pipeline) | -| Family size, parent/descendant `clone_id` match, descendant `n_mutations ≥ parent's` | `validate_families` | -| Each family's *parent* validates as a standalone record | `validate_families_with_parents` | +| Clonal batch is consistently stamped with `clone_id`; truth V/D/J calls are invariant within clone when truth columns are present | `validate_families` | +| Legacy `expand_clones` descendants agree with their parent `Outcome` | `validate_families_with_parents` | The three layers are independent — `validate_records` doesn't require a clonal structure; `validate_families` doesn't require @@ -308,24 +312,25 @@ the `failures` list is JSON-serialisable for downstream tooling. ### Clonal output -Stack record + family validation when the pipeline ships clonal -families: +Stack record + family validation when the pipeline ships clonal records: ```python result = ( exp .recombine() - .expand_clones(n_clones=50, per_clone=20) - .mutate(model="s5f", rate=0.05) - .run_records(n=1000, seed=42) + .clonal_repertoire(n_clones=50, max_size=100) + .sequencing_errors(rate=0.001) + .run_records(seed=42, expose_provenance=True) ) assert result.validate_records(refdata), "AIRR record divergence" -assert result.validate_families_with_parents(refdata), "Family invariant divergence" +assert result.validate_families(), "Family invariant divergence" ``` -The two layers catch different things and don't substitute for -each other. +For legacy `expand_clones(...)`, add +`result.validate_families_with_parents(refdata)`. For +`clonal_lineage(...)`, also call `tree.validate()` on each +`result.lineage_trees` entry when you need topology checks. ## When validation is not enough diff --git a/site_docs/validation/validate-records.md b/site_docs/validation/validate-records.md index aadce33..09b1113 100644 --- a/site_docs/validation/validate-records.md +++ b/site_docs/validation/validate-records.md @@ -182,39 +182,40 @@ and the insert size matches. ## Family validation -For workloads using `expand_clones(...)` to generate clonal -families, two sibling validators check family-level consistency: +For workloads that stamp `clone_id` (`clonal_lineage(...)`, +`clonal_repertoire(...)`, or legacy `expand_clones(...)`), family +validation checks clone-level consistency: ```python result = ( ga.Experiment.on("human_igh") .recombine() - .expand_clones(n_clones=50, per_clone=20) - .mutate(rate=0.05) - .run_records(n=1000, seed=42) + .clonal_repertoire(n_clones=50, max_size=100) + .mutate(rate=0.01) + .run_records(seed=42, expose_provenance=True) ) -family_report = result.validate_families(refdata) +family_report = result.validate_families() assert family_report, family_report.summary() ``` -`validate_families` checks within-family invariants: every -descendant of a clone shares its V(D)J recombination, every -descendant's `clone_id` matches the parent, no descendant has a -SHM count below the parent's, and the per-clone size matches what -`expand_clones` was asked for. +`validate_families` is records-only. It checks that a clonal batch is +not mixed with non-clonal records and, when `truth_v_call`, +`truth_d_call`, and `truth_j_call` are present, that those +recombination-time truth calls are invariant within each `clone_id` +group. -When you also want to check that each family's *parent* record -itself validates against the cartridge (in addition to family -invariants), use the parent-aware variant: +For legacy `expand_clones(...)`, you can also compare each +descendant against its actual parent `Outcome`: ```python report = result.validate_families_with_parents(refdata) ``` -This is the most expensive of the three — it runs the per-record -validator on every parent before checking family invariants — but -it's the strongest gate. Use it in release-tier CI. +`clonal_repertoire` and `clonal_lineage` do not expose +`result.parents`, so `validate_families_with_parents` is not their +primary validator. For `clonal_lineage`, validate the tree objects +directly with `tree.validate()` when topology matters. ## What `validate_records` does NOT do diff --git a/src/GenAIRR/_compiled.py b/src/GenAIRR/_compiled.py index e6f8bf8..9c53d6d 100644 --- a/src/GenAIRR/_compiled.py +++ b/src/GenAIRR/_compiled.py @@ -24,7 +24,7 @@ _describe_step_sequence, _format_active_contracts, ) -from ._pipeline_ir import _ClonalForkStep +from ._pipeline_ir import _ClonalForkStep, _LineageForkStep, _RepertoireForkStep if TYPE_CHECKING: from .dataconfig import DataConfig @@ -500,3 +500,383 @@ def __repr__(self) -> str: f"" ) + + +class CompiledLineageExperiment: + """A compiled experiment that grows BCR affinity-maturation lineage trees. + + Wraps a pre-fork :class:`GenAIRR._engine.CompiledSimulator` (founder + recombination) and a :class:`~GenAIRR._pipeline_ir._LineageForkStep` + (lineage parameters). :meth:`run_records` grows one lineage tree per + clone via the Rust ``simulate_family_outcomes`` kernel and returns a + :class:`~GenAIRR.result.SimulationResultWithLineages` whose records are + per-observed-node AIRR dicts with lineage metadata. + """ + + __slots__ = ( + "_pre", + "_step", + "_refdata", + "_post", + "_post_steps", + "_dataconfig", + "_metadata", + ) + + def __init__( + self, + pre_simulator: "_engine.CompiledSimulator", + step: "_LineageForkStep", + refdata: "_engine.RefDataConfig", + post_simulator: Optional["_engine.CompiledSimulator"] = None, + post_steps: Sequence[Any] = (), + dataconfig: Optional["DataConfig"] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: + self._pre = pre_simulator + self._step = step + self._refdata = refdata + # Per-observed-cell library-prep / sequencing corruption plan. + # ``None`` keeps the pristine-read path (no artefacts applied). + self._post = post_simulator + self._post_steps: Tuple[Any, ...] = tuple(post_steps) + self._dataconfig = dataconfig + self._metadata = dict(metadata) if metadata else {} + + @property + def refdata(self) -> "_engine.RefDataConfig": + return self._refdata + + def run_records( + self, + *, + seed: int = 0, + strict: bool = False, + expose_provenance: bool = False, + validate_records: bool = False, + ) -> "SimulationResultWithLineages": + """Grow lineage trees and return per-observed-node AIRR records. + + Each clone is seeded at ``seed + clone_idx * 1_000_000`` so + independent clones are reproducible and non-overlapping. + + The returned :class:`SimulationResultWithLineages` carries the + per-record ``Outcome`` that produced each AIRR record on its + ``.outcomes`` attribute (index-aligned with ``.records``). On the + pristine-read path these are the per-observed-node SHM + ``Outcome`` objects (``fam.observed_outcomes()``); on the + corruption path they are the per-node merged + recombination+SHM+artefact ``Outcome`` objects. Since each node + ``Outcome`` is self-consistent, ``result.validate_records(refdata)`` + passes. + + ``expose_provenance=True`` appends ``truth_v_call`` / + ``truth_d_call`` / ``truth_j_call`` columns to each record from + the founder allele assignments carried on the per-record + ``Outcome``. + + ``validate_records=True`` runs + :meth:`SimulationResult.validate_records` (per-record + postcondition) and :meth:`SimulationResult.validate_families` + (clonal-family consistency by ``clone_id``) on the freshly built + batch, raising the matching validation error on any failure. + """ + from GenAIRR import _engine as _eng + from ._airr_record import outcome_to_airr_record + from ._s5f_loader import load_builtin_s5f_kernel + from .result import SimulationResultWithLineages, _inject_truth_columns + + step = self._step + mutability, substitution = load_builtin_s5f_kernel(step.s5f_model) + + records: List[Dict[str, Any]] = [] + # Per-record source ``Outcome`` objects, index-aligned with + # ``records`` so ``result.validate_records`` can re-derive each + # record from the engine state that produced it. + outcomes: List["_engine.Outcome"] = [] + lineage_trees = [] + + # Bound on founder-survival retries. Sampling draws from the LIVING + # final-generation population, so an extinct founder (drew 0 offspring) + # yields zero observed cells. With ``allow_extinction=False`` (default) + # we condition each clone on survival by re-growing the family with a + # fresh deterministic sub-seed until it survives or we exhaust the + # bound. The sub-seed offset (a prime times the attempt index) keeps the + # retries deterministic — same top-level seed => same result. + _MAX_SURVIVAL_ATTEMPTS = 50 + _SUBSEED_STRIDE = 7_919 # prime; keeps retry sub-seeds well-separated + allow_extinction = getattr(step, "allow_extinction", False) + + for clone_idx in range(step.n_clones): + clone_seed = int(seed) + clone_idx * 1_000_000 + fam = None + tree = None + for attempt in range(_MAX_SURVIVAL_ATTEMPTS): + family_seed = clone_seed + attempt * _SUBSEED_STRIDE + # _pre.run() returns a single Outcome (not a list) — mirror + # CompiledClonalExperiment which calls self._pre.run(seed=...). + founder = self._pre.run(seed=family_seed, strict=strict) + candidate = _eng.simulate_family_outcomes( + founder, + self._refdata, + mutability, + substitution, + step.rate, + step.lambda_base, + 0.0, # lambda_mut: positional slot 6 (inert; hardcoded) + step.max_generations, + step.n_max, + step.n_sample, + family_seed, + selection_strength=step.selection_strength, + beta=step.beta, + target_aa=step.target_aa, + mature_substitutions=step.mature_substitutions, + ) + survived = len(candidate.observed_outcomes()) > 0 + if survived or allow_extinction: + fam = candidate + tree = candidate.tree() + break + # extinct + survival required => retry with next sub-seed + else: + # Exhausted the retry bound without a surviving family. + raise ValueError( + f"clonal_lineage: clone {clone_idx} went extinct on every " + f"one of {_MAX_SURVIVAL_ATTEMPTS} survival attempts (each " + "founder drew 0 offspring). Increase lambda_base (offspring " + "Poisson mean) and/or max_generations so families reliably " + "survive, or pass allow_extinction=True to accept extinct " + "clones (producing fewer than n_clones families)." + ) + + if fam is None: + # allow_extinction=True and the final attempt was extinct: + # skip this clone (it contributes no observed cells/records). + continue + + lineage_trees.append(tree) + if len(fam.observed_outcomes()) == 0: + # allow_extinction=True and this family is extinct: keep its + # (empty) tree for export parity but emit no records. + continue + + if self._post is None: + # Pristine-read path: project each observed node's + # synthesized SHM Outcome straight to an AIRR record. + # ``observed_outcomes()`` is index-aligned with both the + # observed-node list and ``airr_records()``. + observed_nodes = [n for n in tree.nodes() if n.observed] + recs = fam.airr_records(self._refdata) + observed_outcomes = fam.observed_outcomes() + for node, rec, node_outcome in zip( + observed_nodes, recs, observed_outcomes + ): + self._stamp_lineage_metadata(rec, clone_idx, node) + if expose_provenance and node_outcome is not None: + _inject_truth_columns(node_outcome, self._refdata, rec) + records.append(rec) + outcomes.append(node_outcome) + continue + + # Corruption path: per observed node, run the library-prep / + # sequencing corruption plan FROM the node's post-SHM + # simulation, then merge the founder-recombination + SHM + # provenance with the corruption trace / events so the AIRR + # record reports trims, v/d/j, SHM counts AND artefact + # counters (n_quality_errors, n_pcr_errors, n_indels, …). + node_outcomes = fam.node_outcomes() + for node, base_outcome in zip(tree.nodes(), node_outcomes): + if base_outcome is None: + continue + node_seed = clone_seed + 1 + node.id + corruption_outcome = self._post.run_from( + base_outcome.final_simulation(), node_seed, strict=strict + ) + merged = _eng.merge_lineage_corruption( + base_outcome, corruption_outcome + ) + rec = outcome_to_airr_record( + merged, + self._refdata, + sequence_id=f"clone{clone_idx}_node{node.id}", + ) + self._stamp_lineage_metadata(rec, clone_idx, node) + if expose_provenance: + _inject_truth_columns(merged, self._refdata, rec) + records.append(rec) + outcomes.append(merged) + + result = SimulationResultWithLineages( + records, outcomes=outcomes, lineage_trees=lineage_trees + ) + if validate_records: + from ._validation import ( + _raise_on_family_validation_failure, + _raise_on_validation_failure, + ) + + _raise_on_validation_failure(result.validate_records(self._refdata)) + _raise_on_family_validation_failure(result.validate_families()) + return result + + @staticmethod + def _stamp_lineage_metadata(rec: Dict[str, Any], clone_idx: int, node) -> None: + """Stamp clone + lineage-node provenance onto an AIRR record.""" + rec["clone_id"] = clone_idx + rec["lineage_node_id"] = node.id + rec["lineage_parent_id"] = ( + node.parent_id if node.parent_id is not None else -1 + ) + rec["lineage_generation"] = node.generation + rec["lineage_abundance"] = node.abundance + # AIRR-standard abundance field that abundance-aware tools (Change-O / + # SCOPer / dowser) read. Mirror lineage_abundance so the genotype- + # collapsed observed cell carries its represented count both ways. + rec["duplicate_count"] = node.abundance + rec["lineage_affinity"] = node.affinity + rec["sequence_id"] = f"clone{clone_idx}_node{node.id}" + + def __repr__(self) -> str: + return ( + f"" + ) + + +class CompiledRepertoireExperiment: + """A compiled non-tree clonal-repertoire experiment. + + Wraps a pre-fork :class:`GenAIRR._engine.CompiledSimulator` (the + founder recombination, run once per clone) and an optional + post-fork simulator (the per-read library-prep / sequencing passes). + Per clone a size is drawn from a heavy-tailed distribution via + ``_engine.sample_clone_sizes``; that many reads are emitted through + the post-fork passes and identical reads are genotype-collapsed into + AIRR records carrying a standard ``duplicate_count``. + + When there are no post-fork passes every read of a clone is + identical, so the clone collapses to a single record whose + ``duplicate_count`` equals the drawn size (the no-corruption + shortcut). Every record carries an integer ``clone_id``. + """ + + __slots__ = ( + "_pre", + "_post", + "_step", + "_refdata", + "_dataconfig", + "_metadata", + ) + + def __init__( + self, + pre_simulator: "_engine.CompiledSimulator", + post_simulator: Optional["_engine.CompiledSimulator"], + step: "_RepertoireForkStep", + refdata: "_engine.RefDataConfig", + dataconfig: Optional["DataConfig"] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: + self._pre = pre_simulator + self._post = post_simulator + self._step = step + self._refdata = refdata + self._dataconfig = dataconfig + self._metadata = dict(metadata) if metadata else {} + + @property + def refdata(self) -> "_engine.RefDataConfig": + return self._refdata + + def run_records( + self, + *, + seed: int = 0, + strict: bool = False, + validate_records: bool = False, + expose_provenance: bool = False, + ) -> "SimulationResult": + """Draw per-clone sizes, emit reads through the post-fork passes, + collapse identical reads, and return a :class:`SimulationResult` + whose record dicts carry ``clone_id`` and ``duplicate_count``. + """ + import GenAIRR._engine as _engine + + from ._airr_record import outcome_to_airr_record + from .result import SimulationResult, _inject_truth_columns + + step = self._step + sizes = _engine.sample_clone_sizes( + step.n_clones, + int(seed), + kind=step.size_distribution, + exponent=step.exponent, + mu=step.mu, + sigma=step.sigma, + max_size=step.max_size, + unexpanded_fraction=step.unexpanded_fraction, + ) + records: List[Dict[str, Any]] = [] + outcomes: List["_engine.Outcome"] = [] + for clone_idx, size in enumerate(sizes): + clone_seed = int(seed) + clone_idx * 1_000_000 + founder = self._pre.run(seed=clone_seed, strict=strict) + founder_sim = founder.final_simulation() + if self._post is None: + # No post-fork passes: all ``size`` copies are + # identical -> one record with duplicate_count = size. + rec = outcome_to_airr_record( + founder, + self._refdata, + sequence_id=f"clone{clone_idx}_read0", + ) + rec["clone_id"] = clone_idx + rec["duplicate_count"] = int(size) + if expose_provenance: + _inject_truth_columns(founder, self._refdata, rec) + records.append(rec) + outcomes.append(founder) + continue + # Post-fork passes present: simulate ``size`` reads and + # collapse by emitted sequence. + by_seq: Dict[str, list] = {} + for read_idx in range(int(size)): + desc = self._post.run_from( + founder_sim, clone_seed + 1 + read_idx, strict=strict + ) + rec = outcome_to_airr_record( + desc, + self._refdata, + sequence_id=f"clone{clone_idx}_read{read_idx}", + ) + key = rec["sequence"] + if key in by_seq: + by_seq[key][0] += 1 + else: + by_seq[key] = [1, desc, rec] + for count, desc, rec in by_seq.values(): + rec["clone_id"] = clone_idx + rec["duplicate_count"] = count + if expose_provenance: + _inject_truth_columns(desc, self._refdata, rec) + records.append(rec) + outcomes.append(desc) + result = SimulationResult(records, outcomes=outcomes) + if validate_records: + from ._validation import ( + _raise_on_family_validation_failure, + _raise_on_validation_failure, + ) + + _raise_on_validation_failure(result.validate_records(self._refdata)) + _raise_on_family_validation_failure(result.validate_families()) + return result + + def __repr__(self) -> str: + return ( + f"" + ) diff --git a/src/GenAIRR/_pipeline_ir.py b/src/GenAIRR/_pipeline_ir.py index 0a69856..ac7b6c7 100644 --- a/src/GenAIRR/_pipeline_ir.py +++ b/src/GenAIRR/_pipeline_ir.py @@ -188,6 +188,54 @@ class _ClonalForkStep: size: int +@dataclass(frozen=True) +class _RepertoireForkStep: + """Marks a non-tree clonal-repertoire fork. + + Generalizes :class:`_ClonalForkStep`: instead of a fixed + ``per_clone`` size, each clone draws a size from a heavy-tailed + distribution (with an unexpanded-singleton fraction). That many + reads pass through the post-fork library-prep / sequencing passes + and identical reads collapse into AIRR records carrying a standard + ``duplicate_count``. + + Causes :meth:`Experiment.compile` to return a + :class:`~GenAIRR._compiled.CompiledRepertoireExperiment`. + """ + + n_clones: int + size_distribution: str + exponent: float + mu: float + sigma: float + max_size: int + unexpanded_fraction: float + + +@dataclass(frozen=True) +class _LineageForkStep: + """Marks an affinity-maturation lineage fork. + + Causes :meth:`Experiment.compile` to return a + :class:`~GenAIRR._compiled.CompiledLineageExperiment` that grows BCR + clonal lineage trees via the Rust ``simulate_family_outcomes`` kernel + and returns per-observed-node AIRR records with lineage metadata. + """ + + n_clones: int + max_generations: int + n_max: int + n_sample: int + rate: float + lambda_base: float + selection_strength: float + beta: float + target_aa: Optional[str] + mature_substitutions: int + s5f_model: str + allow_extinction: bool = False + + @dataclass(frozen=True) class _PairedEndStep: """One ``paired_end(r1_length=…, r2_length=…, insert_size=…)`` diff --git a/src/GenAIRR/experiment.py b/src/GenAIRR/experiment.py index eb34e74..88cee7a 100644 --- a/src/GenAIRR/experiment.py +++ b/src/GenAIRR/experiment.py @@ -49,7 +49,12 @@ _lower_recombine, lower_step, ) -from ._compiled import CompiledClonalExperiment, CompiledExperiment +from ._compiled import ( + CompiledClonalExperiment, + CompiledExperiment, + CompiledLineageExperiment, + CompiledRepertoireExperiment, +) from ._refdata_resolver import ( _CONFIG_ALIASES, ExperimentInput, @@ -69,10 +74,12 @@ _ClonalForkStep, _CorruptStep, _InvertDStep, + _LineageForkStep, _MutateStep, _PairedEndStep, - _ReceptorRevisionStep, _RecombineStep, + _ReceptorRevisionStep, + _RepertoireForkStep, ) @@ -330,8 +337,8 @@ def _coerce(label_for_error: str, value: object) -> float: # collapses descendant diversity (every clone member shares an # identical effect because the pass ran once on the parent IR). # -# :meth:`Experiment.expand_clones` scans the already-appended step -# list against this table; the first match is rejected with a +# The clonal fork methods scan the already-appended step list against +# this table; the first match is rejected with a # message naming the offending DSL method and the canonical fix. # # Each entry is ``(predicate, dsl_method_name)`` where the predicate @@ -343,9 +350,9 @@ def _descendant_phase_step_classifier(step): """Return the DSL method name a descendant-phase ``step`` came from, or ``None`` if ``step`` is not a descendant-phase step. - Single source of truth for the unified guard in - :meth:`Experiment.expand_clones`. Adding a new descendant-phase - DSL method means appending a clause here (and adding the + Single source of truth for the unified guard in the flat clonal + fork methods. Adding a new descendant-phase DSL method means + appending a clause here (and adding the companion spec test in ``tests/test_clonal_descendant_phase_guards.py``). """ @@ -968,6 +975,13 @@ def expand_clones( records automatically. Passing ``n`` is allowed only when ``n == n_clones * per_clone``. + .. deprecated:: + Use :meth:`clonal_lineage` for BCR affinity-maturation + trees, or :meth:`clonal_repertoire` for TCR / flat-BCR + abundance repertoires with clone-size distributions. + ``expand_clones`` remains supported for fixed-size flat + star expansion. + Constraints: - Both ``n_clones`` and ``per_clone`` must be positive ints. - At most one expansion per pipeline; calling this method @@ -980,15 +994,28 @@ def expand_clones( shares the same recombination provenance (V allele, trim, NP bases) and only diverges through the post-fork passes. """ + warnings.warn( + "Experiment.expand_clones() is deprecated. Use " + "Experiment.clonal_lineage() for BCR affinity-maturation trees " + "(internal SHM, lineage_trees, no per_clone) or " + "Experiment.clonal_repertoire() for TCR / flat-BCR abundance " + "repertoires with clone-size distributions and duplicate_count. " + "Neither is a drop-in replacement for fixed per_clone output; " + "expand_clones() remains supported for legacy fixed-size flat " + "star expansion.", + DeprecationWarning, + stacklevel=2, + ) if not isinstance(n_clones, int) or isinstance(n_clones, bool) or n_clones < 1: raise ValueError( f"n_clones must be a positive int, got {n_clones!r}" ) if not isinstance(per_clone, int) or isinstance(per_clone, bool) or per_clone < 1: raise ValueError(f"per_clone must be a positive int, got {per_clone!r}") - if any(isinstance(s, _ClonalForkStep) for s in self._steps): + if self._has_clonal_fork(): raise ValueError( - "expand_clones() can only be called once per pipeline" + "expand_clones() / clonal_lineage() / clonal_repertoire() " + "can only be called once per pipeline" ) # Unified descendant-phase ordering guard. Scan the appended # step list for any step that came from a descendant-phase @@ -1023,6 +1050,373 @@ def expand_clones( self._steps.append(_ClonalForkStep(n_clones=n_clones, size=per_clone)) return self + def clonal_repertoire( + self, + *, + n_clones: int, + size_distribution: str = "power_law", + exponent: float = 2.0, + mu: float = 1.0, + sigma: float = 1.0, + max_size: int = 1000, + unexpanded_fraction: float = 0.0, + ) -> "Experiment": + """Expand the pipeline into a non-tree clonal **repertoire**. + + This is the non-tree clonal model (contrast with + :meth:`clonal_lineage`, which grows real affinity-maturation + lineage *trees*). It generalizes the deprecated + :meth:`expand_clones`: instead of a fixed ``per_clone`` size, + each clone draws a size from a heavy-tailed distribution (with + an unexpanded-singleton fraction). That many reads pass through + the post-fork library-prep / sequencing passes, and identical + reads are genotype-collapsed into AIRR records carrying a + standard ``duplicate_count`` that records abundance. + + Like :meth:`expand_clones`, this call marks the per-clone / + per-read boundary: steps appended *before* it run **once per + clone** (typically just :meth:`recombine`, establishing the + clonal V/D/J + trim + NP backbone); steps appended *after* it + run **once per read** (the library-prep / sequencing passes + that introduce per-read divergence). + + Concrete shape:: + + exp = (Experiment.on("human_igh") + .recombine() + .clonal_repertoire(n_clones=200, max_size=500, + unexpanded_fraction=0.3) + .sequencing_errors(rate=0.005)) + result = exp.run_records(seed=0) + # Each record carries a `clone_id` and a `duplicate_count`. + + Parameters: + - ``n_clones`` — number of clones (positive int). + - ``size_distribution`` — ``"power_law"`` or ``"lognormal"``. + - ``exponent`` — power-law exponent (``> 0``; used when + ``size_distribution="power_law"``). + - ``mu`` / ``sigma`` — lognormal parameters (``sigma >= 0``; + used when ``size_distribution="lognormal"``). + - ``max_size`` — clamps the largest clone. Because the total + number of reads simulated is roughly the **sum** of the drawn + sizes when post-fork passes are present, keep ``max_size`` + modest to bound runtime. + - ``unexpanded_fraction`` — fraction of clones forced to size 1 + (unexpanded singletons), in ``[0, 1]``. + + TCR works out of the box (no SHM): the reads diverge only + through the post-fork sequencing passes. Note that ``mutate`` + after this call is rejected on TCR by :meth:`mutate`'s own TCR + guard; on BCR an optional post-fork ``mutate`` adds flat SHM. + + **No-corruption shortcut:** a clone with no post-fork passes + emits identical copies, so it collapses to a single record whose + ``duplicate_count`` equals the drawn size. + + Constraints: + - At most one fork per pipeline — calling this when an + :meth:`expand_clones`, :meth:`clonal_lineage`, or + :meth:`clonal_repertoire` fork is already present raises + ``ValueError``. + - The same descendant-phase ordering guard as + :meth:`expand_clones` applies: a descendant-phase step (e.g. + ``.mutate()``) appended *before* this call is rejected; + :meth:`recombine` is fine. + """ + if not isinstance(n_clones, int) or isinstance(n_clones, bool) or n_clones < 1: + raise ValueError( + f"n_clones must be a positive int, got {n_clones!r}" + ) + if size_distribution not in ("power_law", "lognormal"): + raise ValueError( + "size_distribution must be 'power_law' or 'lognormal', got " + f"{size_distribution!r}" + ) + if not (isinstance(exponent, (int, float)) and exponent > 0): + raise ValueError(f"exponent must be > 0, got {exponent!r}") + if not (isinstance(sigma, (int, float)) and sigma >= 0): + raise ValueError(f"sigma must be >= 0, got {sigma!r}") + if not isinstance(max_size, int) or isinstance(max_size, bool) or max_size < 1: + raise ValueError(f"max_size must be a positive int, got {max_size!r}") + if not ( + isinstance(unexpanded_fraction, (int, float)) + and 0.0 <= unexpanded_fraction <= 1.0 + ): + raise ValueError( + "unexpanded_fraction must be in [0, 1], got " + f"{unexpanded_fraction!r}" + ) + if any( + isinstance(s, (_ClonalForkStep, _RepertoireForkStep, _LineageForkStep)) + for s in self._steps + ): + raise ValueError( + "clonal_repertoire() / expand_clones() / clonal_lineage() " + "can only be called once per pipeline" + ) + # Same descendant-phase ordering guard as expand_clones: a + # descendant-phase step (mutate / corruption / paired_end) + # appended before the fork is rejected. + for step in self._steps: + offending_method = _descendant_phase_step_classifier(step) + if offending_method is None: + continue + if offending_method == "mutate": + detail = ( + "SHM is descendant-specific in GenAIRR's current " + "clonal model" + ) + else: + detail = ( + "it is descendant-specific and must be sampled " + "independently for each read" + ) + raise ValueError( + f"{offending_method} must be called after " + f"clonal_repertoire(); {detail}. Move " + f"{offending_method}(...) after clonal_repertoire(...)." + ) + self._steps.append( + _RepertoireForkStep( + n_clones=n_clones, + size_distribution=size_distribution, + exponent=float(exponent), + mu=float(mu), + sigma=float(sigma), + max_size=max_size, + unexpanded_fraction=float(unexpanded_fraction), + ) + ) + return self + + def clonal_lineage( + self, + *, + n_clones: int, + max_generations: int = 10, + n_max: int = 1000, + n_sample: int = 50, + rate: float = 0.05, + lambda_base: float = 1.5, + selection_strength: float = 0.0, + beta: float = 1.0, + target_aa: Optional[str] = None, + mature_substitutions: int = 5, + s5f_model: str = "hh_s5f", + allow_extinction: bool = False, + ) -> "Experiment": + """Grow BCR lineage trees (neutral by default; set ``selection_strength > 0`` + and optionally ``target_aa`` to enable affinity maturation). + + Each clone gets its own lineage tree produced by the Rust + ``simulate_family_outcomes`` kernel. The returned + :class:`~GenAIRR.result.SimulationResultWithLineages` carries: + + - ``.records`` — one AIRR dict per *observed* (genotype-collapsed) cell, + tagged with ``clone_id``, ``lineage_node_id``, ``lineage_parent_id``, + ``lineage_generation``, ``lineage_abundance``, and + ``lineage_affinity``. Because identical genotypes are collapsed before + sampling, the number of records per clone is ≤ ``n_sample``; the + ``lineage_abundance`` field accounts for how many sampled cells were + represented by each observed record. Mutation counts (``n_mutations``, + ``n_v_mutations``, …) are pool-derived and self-consistent. + - ``.lineage_trees`` — one :class:`~GenAIRR._engine.LineageTree` + per clone for ground-truth export (Newick, FASTA, node table TSV). + + Parameters + ---------- + n_clones: + Number of independent clonal lineages to grow. + max_generations: + Maximum depth of the lineage tree (≤ 1000). + n_max: + Per-generation LIVING-population carrying capacity: the live + population per generation is capped at this (the tree can contain + more total nodes across generations). It is NOT a hard cap on the + total number of cells per clone. + n_sample: + Number of cells to sample as observed leaves. Records returned + per clone are ≤ ``n_sample`` because identical genotypes are + collapsed (duplicates are counted in ``lineage_abundance``). + rate: + Per-base SHM rate for within-lineage mutations. + lambda_base: + Poisson mean for offspring count at affinity 0. + selection_strength: + Selection pressure; ``0.0`` = neutral drift (fitness is 1.0 for + every cell). This disables selection but does not force + ``lineage_affinity`` to 0 when a target sequence is supplied. + Set ``> 0`` to enable affinity maturation; combine with + ``target_aa`` for a fixed sequence target. + beta: + Scaling factor for the affinity term in ``exp(−beta·distance)``. + target_aa: + Amino-acid sequence of the full receptor used to compute + per-cell affinity via a BLOSUM62-weighted distance (compared + position-wise against the cell's translated receptor; only + the overlapping prefix is scored when lengths differ). Must be + a non-empty string of standard amino-acid letters + (``ACDEFGHIKLMNPQRSTVWY``). When ``None``, an auto target is + generated from the founder by applying ``mature_substitutions`` + random residue changes whenever selection is enabled. In fully + neutral mode (``selection_strength=0`` and ``target_aa=None``), + no affinity model is built and ``lineage_affinity`` is 0. + mature_substitutions: + Number of amino-acid substitutions used to build the auto + target (when ``target_aa`` is ``None``). + s5f_model: + Bundled S5F kernel name for within-lineage mutation context + (``"hh_s5f"``, ``"hkl_s5f"``, …). + allow_extinction: + Sampling draws from the LIVING final-generation population, so a + founder that draws 0 offspring goes extinct and yields zero + observed cells/records. With ``allow_extinction=False`` (default) + each requested clone is conditioned on survival: an extinct family + is retried with a fresh deterministic sub-seed (up to a bounded + number of attempts) so you reliably get ``n_clones`` families. With + ``allow_extinction=True`` extinction is accepted and the extinct + clone is skipped, producing fewer families than ``n_clones``. + + **BCR-only guard:** ``clonal_lineage`` applies S5F somatic + hypermutation, which is a B-cell process. Calling it on a TCR-configured + experiment raises ``ValueError`` (immunoglobulin / BCR loci only). TCR + clone-size primitives exist in the engine but are not yet exposed as a + DSL workflow. + """ + import math + import warnings + + # --- BCR-only guard (mirror mutate()'s TCR rejection) --- + # clonal_lineage applies S5F somatic hypermutation, a B-cell process, + # so it must reject TCR loci. ``_is_tcr_refdata`` inspects the first V + # allele name prefix (TR* => TCR, IG* => BCR) on the already-bound + # refdata, exactly as mutate() does. Firing here, at call time and + # before compile(), guarantees the clear BCR-only message instead of a + # downstream cartridge / compile error. + if self._is_tcr_refdata(): + locus = self._refdata.v_allele(0).name if self._refdata.v_pool_size() else "?" + raise ValueError( + "clonal_lineage models B-cell somatic hypermutation and " + "supports immunoglobulin (BCR) loci only; the locus " + f"'{locus}' is a TCR locus. (TCR clone-size simulation is not " + "yet exposed in the DSL.)" + ) + # --- allow_extinction --- + if not isinstance(allow_extinction, bool): + raise ValueError( + f"allow_extinction must be a bool, got {allow_extinction!r}" + ) + + # --- n_clones --- + if isinstance(n_clones, bool) or not isinstance(n_clones, int) or n_clones < 1: + raise ValueError(f"n_clones must be a positive int, got {n_clones!r}") + # --- max_generations --- + if ( + isinstance(max_generations, bool) + or not isinstance(max_generations, int) + or max_generations < 1 + or max_generations > 1000 + ): + raise ValueError( + f"max_generations must be a positive int <= 1000, got {max_generations!r}" + ) + # --- n_max --- + if isinstance(n_max, bool) or not isinstance(n_max, int) or n_max < 1: + raise ValueError(f"n_max must be a positive int, got {n_max!r}") + # --- n_sample --- + if isinstance(n_sample, bool) or not isinstance(n_sample, int) or n_sample < 1: + raise ValueError(f"n_sample must be a positive int, got {n_sample!r}") + # --- rate --- + if not isinstance(rate, (int, float)) or not math.isfinite(rate) or rate < 0 or rate > 1: + raise ValueError(f"rate must be a float in [0, 1], got {rate!r}") + # --- lambda_base --- + if not isinstance(lambda_base, (int, float)) or not math.isfinite(lambda_base) or lambda_base < 0: + raise ValueError(f"lambda_base must be a finite non-negative float, got {lambda_base!r}") + # --- beta --- + if not isinstance(beta, (int, float)) or not math.isfinite(beta) or beta < 0: + raise ValueError(f"beta must be a finite non-negative float, got {beta!r}") + # --- selection_strength --- + if not isinstance(selection_strength, (int, float)) or not math.isfinite(selection_strength) or selection_strength < 0: + raise ValueError( + f"selection_strength must be a finite non-negative float, got {selection_strength!r}" + ) + # --- mature_substitutions --- + if ( + isinstance(mature_substitutions, bool) + or not isinstance(mature_substitutions, int) + or mature_substitutions < 0 + ): + raise ValueError( + f"mature_substitutions must be a non-negative int, got {mature_substitutions!r}" + ) + # --- target_aa --- + _VALID_AA = set("ACDEFGHIKLMNPQRSTVWY") + if target_aa is not None: + if not isinstance(target_aa, str) or len(target_aa) == 0: + raise ValueError( + "target_aa must be a non-empty amino-acid string " + "(letters from ACDEFGHIKLMNPQRSTVWY)" + ) + target_aa = target_aa.upper() + invalid = set(target_aa) - _VALID_AA + if invalid: + raise ValueError( + f"target_aa contains invalid characters {sorted(invalid)!r}; " + "only standard amino-acid letters (ACDEFGHIKLMNPQRSTVWY) are allowed" + ) + if len(target_aa) < 30: + warnings.warn( + f"target_aa has length {len(target_aa)}, which is shorter than a typical " + "receptor sequence (~300+ aa). If this is an epitope sequence rather than " + "the full receptor, affinity scoring will be based only on the overlapping " + "prefix — consider supplying the full translated receptor instead.", + UserWarning, + stacklevel=2, + ) + # --- s5f_model (validate at call time, not at run time) --- + from GenAIRR._s5f_loader import _BUILTIN_S5F_MODELS + _s5f_key = s5f_model.lower().strip() + if _s5f_key not in _BUILTIN_S5F_MODELS: + avail = ", ".join(f'"{k}"' for k in sorted(_BUILTIN_S5F_MODELS)) + raise ValueError( + f"Unknown s5f_model {s5f_model!r}. Available: {avail}" + ) + # --- reject duplicate fork steps --- + for s in self._steps: + if isinstance(s, (_ClonalForkStep, _RepertoireForkStep, _LineageForkStep)): + raise ValueError( + "clonal_lineage() / expand_clones() / clonal_repertoire() " + "can only be called once per pipeline" + ) + # --- descendant-phase guard (same as expand_clones) --- + for step in self._steps: + offending_method = _descendant_phase_step_classifier(step) + if offending_method is None: + continue + raise ValueError( + f"{offending_method} must be called after " + f"clonal_lineage(); it is descendant-specific and must be sampled " + f"independently for each clone member. Move " + f"{offending_method}(...) after clonal_lineage(...)." + ) + self._steps.append( + _LineageForkStep( + n_clones=n_clones, + max_generations=max_generations, + n_max=n_max, + n_sample=n_sample, + rate=rate, + lambda_base=lambda_base, + selection_strength=selection_strength, + beta=beta, + target_aa=target_aa, + mature_substitutions=mature_substitutions, + s5f_model=s5f_model, + allow_extinction=allow_extinction, + ) + ) + return self + def mutate( self, *, @@ -1181,17 +1575,20 @@ def _check_v_subregion_rates_satisfiable( ) def _has_clonal_fork(self) -> bool: - """Whether :meth:`expand_clones` has already been appended. + """Whether any clonal fork has already been appended. Used by the DSL ordering guards on :meth:`invert_d`, - :meth:`receptor_revision`, and :meth:`expand_clones` itself. - Each step lowers into either the pre-fork (per-clone) or - the post-fork (per-descendant) plan; misordered calls used - to silently lower into the wrong half and produce records - with empty / default fields. The guards reject those - configurations at the DSL boundary. + :meth:`receptor_revision`, and the clonal fork methods. + Each fork has a pre-fork parent/founder phase; recombination-time + mechanisms must be inherited by every descendant, emitted copy, + or lineage node. Misordered calls used to lower into the wrong + half and produce records with empty / default fields. The guards + reject those configurations at the DSL boundary. """ - return any(isinstance(s, _ClonalForkStep) for s in self._steps) + return any( + isinstance(s, (_ClonalForkStep, _RepertoireForkStep, _LineageForkStep)) + for s in self._steps + ) def _is_tcr_refdata(self) -> bool: """Detect whether the bound refdata is a TCR locus. @@ -1659,10 +2056,11 @@ def invert_d(self, *, prob: float = 0.05) -> "Experiment": # even at prob=1.0. Reject at the DSL boundary instead. if self._has_clonal_fork(): raise ValueError( - "invert_d must be called before expand_clones(); D " + "invert_d must be called before the clonal fork; D " "inversion is a recombination-time decision and must " "be inherited by all clone descendants. Move the " - "invert_d(...) call before expand_clones(...)." + "invert_d(...) call before clonal_lineage(...), " + "clonal_repertoire(...), or expand_clones(...)." ) if not isinstance(prob, (int, float)): raise ValueError( @@ -1757,10 +2155,11 @@ def receptor_revision(self, *, prob: float = 0.05) -> "Experiment": if self._has_clonal_fork(): raise ValueError( "receptor_revision must be called before " - "expand_clones(); receptor revision is a " + "the clonal fork; receptor revision is a " "recombination-time decision and must be inherited by " "all clone descendants. Move the " - "receptor_revision(...) call before expand_clones(...)." + "receptor_revision(...) call before clonal_lineage(...), " + "clonal_repertoire(...), or expand_clones(...)." ) if not isinstance(prob, (int, float)): raise ValueError( @@ -2227,6 +2626,124 @@ def compile(self, *, allow_curatable_refdata: Optional[bool] = None): metadata=self._metadata, ) + # if a `_RepertoireForkStep` is present, split the step list + # at it and compile the pre-fork (per-clone) simulator plus an + # optional post-fork (per-read) simulator, mirroring the + # `_ClonalForkStep` branch. Per-clone sizes are drawn at run + # time from the heavy-tailed distribution; identical reads are + # collapsed into `duplicate_count`-carrying records. + repertoire_idx = next( + ( + i + for i, s in enumerate(self._steps) + if isinstance(s, _RepertoireForkStep) + ), + None, + ) + if repertoire_idx is not None: + repertoire_step: _RepertoireForkStep = self._steps[repertoire_idx] + pre_steps = self._steps[:repertoire_idx] + post_steps = self._steps[repertoire_idx + 1 :] + pre_simulator = self._build_simulator( + pre_steps, + contracts, + any_lock, + replace_fn=_replace, + allow_curatable_refdata=allow_curatable_refdata, + ) + # post_steps may be empty — that's the pure-copy case + # (each clone collapses to one record with + # duplicate_count = size). When present, the post-fork + # plan inherits the parent's V/D/J/NP backbone, so + # any_lock=False and no recombination facts on this side + # (same as the clonal branch). + post_simulator = None + if post_steps: + post_simulator = self._build_simulator( + post_steps, + contracts, + any_lock=False, + replace_fn=_replace, + allow_curatable_refdata=allow_curatable_refdata, + ) + return CompiledRepertoireExperiment( + pre_simulator, + post_simulator, + repertoire_step, + self._refdata, + dataconfig=self._dataconfig, + metadata=self._metadata, + ) + + # if a `_LineageForkStep` is present, compile a + # CompiledLineageExperiment: pre-fork steps (recombine) become + # the founder simulator; post-fork steps must be empty (the + # lineage engine handles mutation internally). + lineage_idx = next( + (i for i, s in enumerate(self._steps) if isinstance(s, _LineageForkStep)), + None, + ) + if lineage_idx is not None: + lineage_step: _LineageForkStep = self._steps[lineage_idx] + pre_steps = self._steps[:lineage_idx] + post_steps = self._steps[lineage_idx + 1:] + # Steps after clonal_lineage() are per-observed-cell + # library-prep / sequencing artefact passes (the same + # post-fork set expand_clones() allows). SHM is internal + # to the lineage engine, so .mutate() is rejected; the + # paired-end read layout is not yet wired through the + # per-cell corruption merge, so reject it for now too. + for s in post_steps: + if isinstance(s, _MutateStep): + raise ValueError( + "SHM is internal to clonal_lineage; do not add " + ".mutate() after it. Set the within-lineage SHM rate " + "via clonal_lineage(rate=...)." + ) + if isinstance(s, _PairedEndStep): + raise ValueError( + "paired_end not yet supported with clonal_lineage; " + "apply per-read library-prep passes (sequencing_errors, " + "pcr_amplify, polymerase_indels, end_loss_*, " + "ambiguous_base_calls, random_strand_orientation) instead." + ) + if not isinstance(s, _CorruptStep): + raise ValueError( + "Only per-read library-prep / sequencing artefact passes " + "may follow clonal_lineage(); got " + f"{type(s).__name__}." + ) + pre_simulator = self._build_simulator( + pre_steps, + contracts, + any_lock, + replace_fn=_replace, + allow_curatable_refdata=allow_curatable_refdata, + ) + # Build the per-cell corruption simulator from the + # post-fork steps, mirroring the clonal branch's + # post_simulator (no recombination facts on this side, so + # any_lock=False and the analyzer skips the productive + # precondition check). + post_simulator = None + if post_steps: + post_simulator = self._build_simulator( + post_steps, + contracts, + any_lock=False, + replace_fn=_replace, + allow_curatable_refdata=allow_curatable_refdata, + ) + return CompiledLineageExperiment( + pre_simulator, + lineage_step, + self._refdata, + post_simulator=post_simulator, + post_steps=tuple(post_steps), + dataconfig=self._dataconfig, + metadata=self._metadata, + ) + simulator = self._build_simulator( self._steps, contracts, @@ -2374,6 +2891,33 @@ def run_records( expose_provenance=expose_provenance, validate_records=validate_records, ) + elif isinstance(compiled, CompiledRepertoireExperiment): + if n is not None: + raise ValueError( + "The 'n' parameter is not supported for clonal_repertoire " + "experiments. The number of records depends on the per-clone " + "sizes drawn from the heavy-tailed distribution and the " + "read-collapse, not a fixed product." + ) + result = compiled.run_records( + seed=seed, + strict=strict, + expose_provenance=expose_provenance, + validate_records=validate_records, + ) + elif isinstance(compiled, CompiledLineageExperiment): + if n is not None: + raise ValueError( + "The 'n' parameter is not supported for clonal_lineage experiments. " + "The number of observed records depends on the lineage trees " + "grown from n_clones / n_sample / selection, not a fixed product." + ) + result = compiled.run_records( + seed=seed, + strict=strict, + expose_provenance=expose_provenance, + validate_records=validate_records, + ) else: result = compiled.run_records( n=1 if n is None else n, @@ -2539,4 +3083,3 @@ def _steps_with_locks_resolved(self) -> List[Any]: def __repr__(self) -> str: return f"" - diff --git a/src/GenAIRR/result.py b/src/GenAIRR/result.py index f5d8054..b2baabc 100644 --- a/src/GenAIRR/result.py +++ b/src/GenAIRR/result.py @@ -1318,3 +1318,38 @@ def _write_delimited( # don't carry literal ``"None"`` strings. row = {k: ("" if v is None else v) for k, v in source.items()} writer.writerow(row) + + +class SimulationResultWithLineages(SimulationResult): + """A :class:`SimulationResult` that also carries per-clone lineage trees. + + Produced by :meth:`CompiledLineageExperiment.run_records`. Adds a + ``.lineage_trees`` property that exposes the raw + :class:`~GenAIRR._engine.LineageTree` objects (one per clone) for + ground-truth export via ``.to_newick()``, ``.to_fasta()``, and + ``.to_node_table_tsv()``. + """ + + __slots__ = ("_lineage_trees",) + + def __init__( + self, + records: "Sequence[Dict[str, Any]]", + outcomes: "Optional[Sequence]" = None, + parents: "Optional[Sequence]" = None, + lineage_trees: "Optional[Sequence]" = None, + ) -> None: + super().__init__(records, outcomes, parents) + self._lineage_trees: "Optional[List]" = ( + list(lineage_trees) if lineage_trees is not None else None + ) + + @property + def lineage_trees(self) -> "Optional[List]": + """Per-clone :class:`~GenAIRR._engine.LineageTree` objects, or ``None``. + + Each tree supports ``.validate()``, ``.to_newick()``, + ``.to_fasta()``, and ``.to_node_table_tsv()`` for ground-truth + export and downstream phylogenetic analysis. + """ + return self._lineage_trees diff --git a/tests/test_clonal_dsl_ordering_guards.py b/tests/test_clonal_dsl_ordering_guards.py index a7a0452..44a8042 100644 --- a/tests/test_clonal_dsl_ordering_guards.py +++ b/tests/test_clonal_dsl_ordering_guards.py @@ -70,7 +70,7 @@ def test_invert_d_after_expand_clones_raises() -> None: exp.invert_d(prob=1.0) msg = str(exc_info.value) # Must name BOTH the correct ordering and the biological reason. - assert "invert_d must be called before expand_clones" in msg, msg + assert "invert_d must be called before the clonal fork" in msg, msg assert "recombination" in msg.lower(), msg assert "inherited" in msg, msg @@ -93,7 +93,7 @@ def test_receptor_revision_after_expand_clones_raises() -> None: exp.receptor_revision(prob=1.0) msg = str(exc_info.value) assert ( - "receptor_revision must be called before expand_clones" in msg + "receptor_revision must be called before the clonal fork" in msg ), msg assert "recombination" in msg.lower(), msg assert "inherited" in msg, msg @@ -225,7 +225,7 @@ def test_error_message_invert_d_names_recombination_time() -> None: exp.invert_d(prob=0.5) msg = str(exc_info.value) # The correct ordering. - assert "before expand_clones" in msg + assert "before the clonal fork" in msg # The biological reason — pin the phrase that names the # mechanism. assert re.search(r"recombination[- ]?time", msg.lower()) @@ -242,7 +242,7 @@ def test_error_message_receptor_revision_names_recombination_time() -> None: with pytest.raises(ValueError) as exc_info: exp.receptor_revision(prob=0.5) msg = str(exc_info.value) - assert "before expand_clones" in msg + assert "before the clonal fork" in msg assert re.search(r"recombination[- ]?time", msg.lower()) assert "Move" in msg or "move" in msg @@ -273,7 +273,7 @@ def test_pin_guarded_invert_d_post_fork_drop_cannot_reach_runtime() -> None: is rejected at the DSL boundary so the silent runtime behaviour is structurally unreachable. Pin the rejection rather than the broken runtime output.""" - with pytest.raises(ValueError, match="before expand_clones"): + with pytest.raises(ValueError, match="before the clonal fork"): ( ga.Experiment.on("human_igh") .recombine() @@ -284,7 +284,7 @@ def test_pin_guarded_invert_d_post_fork_drop_cannot_reach_runtime() -> None: def test_pin_guarded_receptor_revision_post_fork_drop_cannot_reach_runtime() -> None: """Bug B's silent drop closed at the DSL boundary.""" - with pytest.raises(ValueError, match="before expand_clones"): + with pytest.raises(ValueError, match="before the clonal fork"): ( ga.Experiment.on("human_igh") .recombine() @@ -349,3 +349,38 @@ def test_has_clonal_fork_helper_reflects_pipeline_state() -> None: assert base._has_clonal_fork() is False forked = base.expand_clones(n_clones=1, per_clone=1) assert forked._has_clonal_fork() is True + + +# ────────────────────────────────────────────────────────────────── +# A pipeline may contain at most one clonal fork — stacking any two +# of expand_clones / clonal_lineage / clonal_repertoire raises a +# clear "once per pipeline" error at DSL time (not a confusing +# downstream TypeError at compile()). +# ────────────────────────────────────────────────────────────────── + + +@pytest.mark.parametrize( + "first, second", + [ + ("clonal_repertoire", "clonal_lineage"), + ("clonal_lineage", "clonal_repertoire"), + ("expand_clones", "clonal_lineage"), + ("expand_clones", "clonal_repertoire"), + ("clonal_lineage", "expand_clones"), + ("clonal_repertoire", "expand_clones"), + ], +) +def test_two_clonal_forks_rejected_once_per_pipeline(first: str, second: str) -> None: + """Any second clonal fork is rejected with the canonical + "once per pipeline" message, regardless of which fork came + first.""" + kwargs = { + "expand_clones": dict(n_clones=1, per_clone=2), + "clonal_lineage": dict(n_clones=1), + "clonal_repertoire": dict(n_clones=1), + } + exp = getattr( + ga.Experiment.on("human_igh").recombine(), first + )(**kwargs[first]) + with pytest.raises(ValueError, match="once per pipeline"): + getattr(exp, second)(**kwargs[second]) diff --git a/tests/test_clonal_lineage_corruption.py b/tests/test_clonal_lineage_corruption.py new file mode 100644 index 0000000..1de6e27 --- /dev/null +++ b/tests/test_clonal_lineage_corruption.py @@ -0,0 +1,76 @@ +"""Library-prep / sequencing artefact passes applied to clonal_lineage reads. + +``clonal_lineage`` produces per-observed-cell AIRR records. Prior to this +slice the compiler rejected ALL steps following the lineage fork, so the +reads were pristine — no sequencing / PCR / indel noise, a regression vs +``expand_clones``. These tests pin the corruption-aware path: each observed +cell now gets independent corruption, and its record reports BOTH the +founder recombination provenance (trims, v/d/j) AND the SHM counts AND the +corruption counts (``n_quality_errors`` etc.). +""" +import pytest + +import GenAIRR as ga + + +def test_clonal_lineage_with_corruption_applies_artefacts(): + exp = ( + ga.Experiment.on("human_igh") + .recombine() + .clonal_lineage( + n_clones=3, max_generations=4, n_max=200, n_sample=20, rate=0.02 + ) + .sequencing_errors(rate=0.05) + ) + result = exp.run_records(seed=0) + assert len(result.records) > 0 + # Corruption was actually applied to at least one observed cell. + assert any(r.get("n_quality_errors", 0) > 0 for r in result.records) + for r in result.records: + # Founder recombination provenance survives the merge. + assert r["v_call"] + assert r["clone_id"] in (0, 1, 2) + assert "lineage_node_id" in r + # SHM per-segment counts stay self-consistent after corruption merge. + assert r["n_mutations"] == ( + r["n_v_mutations"] + + r["n_d_mutations"] + + r["n_j_mutations"] + + r["n_np_mutations"] + ) + + +def test_clonal_lineage_rejects_mutate_after(): + with pytest.raises(Exception): + ( + ga.Experiment.on("human_igh") + .recombine() + .clonal_lineage(n_clones=2, n_sample=5) + .mutate(rate=0.05) + .compile() + ) + + +def test_clonal_lineage_rejects_paired_end_after(): + with pytest.raises(Exception): + ( + ga.Experiment.on("human_igh") + .recombine() + .clonal_lineage(n_clones=2, n_sample=5) + .paired_end(r1_length=150, insert_size=300) + .compile() + ) + + +def test_clonal_lineage_without_corruption_still_works(): + result = ( + ga.Experiment.on("human_igh") + .recombine() + .clonal_lineage(n_clones=2, n_sample=10, rate=0.02) + # seed chosen so the families survive: sampling draws from the living + # final generation, so an all-extinct seed (e.g. 0) yields zero records. + .run_records(seed=1) + ) + assert len(result.records) > 0 + # No corruption pass → no quality errors stamped. + assert all(r.get("n_quality_errors", 0) == 0 for r in result.records) diff --git a/tests/test_clonal_lineage_dsl.py b/tests/test_clonal_lineage_dsl.py new file mode 100644 index 0000000..5642792 --- /dev/null +++ b/tests/test_clonal_lineage_dsl.py @@ -0,0 +1,57 @@ +import pytest +import GenAIRR as ga + + +def _exp(**kw): + base = dict(n_clones=3, max_generations=6, n_max=200, n_sample=20, + rate=0.05, lambda_base=1.6) + base.update(kw) + return ga.Experiment.on("human_igh").recombine().clonal_lineage(**base) + + +def test_clonal_lineage_runs_and_tags_records(): + result = _exp().run_records(seed=0) + # Sampling draws from the LIVING final-generation population, so an extinct + # clone (founder drew 0 offspring) contributes no records. By default the + # founder-survival guard retries extinct clones with fresh deterministic + # sub-seeds, so every requested clone survives and all clone_ids are present. + assert len(result.records) > 0 + cids = {r["clone_id"] for r in result.records} + assert cids == {0, 1, 2} + for r in result.records: + assert r["v_call"] # real recombination provenance + assert r["sequence"] + assert "lineage_node_id" in r + assert "lineage_generation" in r + assert "lineage_abundance" in r + assert "lineage_affinity" in r + # mutation counts are self-consistent (pool-derived) + per_seg = (r["n_v_mutations"] + r["n_d_mutations"] + + r["n_j_mutations"] + r["n_np_mutations"]) + assert r["n_mutations"] == per_seg + + +def test_clonal_lineage_exposes_per_clone_trees(): + result = _exp(n_clones=2).run_records(seed=1) + trees = result.lineage_trees + assert trees is not None and len(trees) == 2 + for t in trees: + t.validate() # raises if malformed + assert t.to_newick().endswith(";") + assert t.to_fasta().startswith(">node") + + +def test_clonal_lineage_selection_raises_mean_affinity(): + neutral = _exp(n_clones=2, selection_strength=0.0, target_aa="A" * 100, + beta=0.001, max_generations=10, n_sample=40).run_records(seed=5) + selected = _exp(n_clones=2, selection_strength=50.0, target_aa="A" * 100, + beta=0.001, max_generations=10, n_sample=40).run_records(seed=5) + def mean_aff(res): + a = [r["lineage_affinity"] for r in res.records] + return sum(a) / max(1, len(a)) + assert mean_aff(selected) > mean_aff(neutral) + + +def test_clonal_lineage_validation_rejects_bad_args(): + with pytest.raises((ValueError, TypeError)): + ga.Experiment.on("human_igh").recombine().clonal_lineage(n_clones=0) diff --git a/tests/test_clonal_lineage_validation.py b/tests/test_clonal_lineage_validation.py new file mode 100644 index 0000000..4ae3267 --- /dev/null +++ b/tests/test_clonal_lineage_validation.py @@ -0,0 +1,294 @@ +"""Tests for clonal_lineage DSL validation (Fix 1–4).""" +import pytest +import GenAIRR as ga + + +def _base_exp(**kw): + defaults = dict(n_clones=2, max_generations=4, n_max=100, n_sample=10, + rate=0.03, lambda_base=1.5) + defaults.update(kw) + return ga.Experiment.on("human_igh").recombine().clonal_lineage(**defaults) + + +# --------------------------------------------------------------------------- +# Fix 1: lambda_mut is removed — passing it must be a TypeError +# --------------------------------------------------------------------------- + +def test_lambda_mut_removed_raises_typeerror(): + """lambda_mut is no longer a parameter; passing it must raise TypeError.""" + with pytest.raises(TypeError): + _base_exp(lambda_mut=0.1) + + +# --------------------------------------------------------------------------- +# Fix 3: target_aa validation +# --------------------------------------------------------------------------- + +def test_target_aa_empty_string_raises(): + with pytest.raises(ValueError, match="non-empty amino-acid"): + _base_exp(target_aa="") + + +def test_target_aa_invalid_chars_raises(): + with pytest.raises(ValueError, match="invalid characters"): + _base_exp(target_aa="not-an-aa-$$") + + +def test_target_aa_invalid_chars_mixed_raises(): + with pytest.raises(ValueError, match="invalid characters"): + _base_exp(target_aa="ACDEFG1HIKLM") + + +def test_target_aa_non_aa_chars_raises(): + """Characters outside the 20 standard AAs (X, B, Z, U, O, numbers) must be rejected.""" + with pytest.raises(ValueError): + _base_exp(target_aa="XBZUOACDEFGHIKL") + with pytest.raises(ValueError): + _base_exp(target_aa="ACDEFG0HIKLMN") + + +def test_target_aa_none_is_valid(): + """None (auto-target) must be accepted without error.""" + exp = _base_exp(target_aa=None) + assert exp is not None + + +def test_target_aa_valid_long_string(): + """A realistic receptor-length AA string must be accepted.""" + aa = "ACDEFGHIKLMNPQRSTVWY" * 20 # 400 aa — long enough for no warning + exp = _base_exp(target_aa=aa) + assert exp is not None + + +def test_target_aa_short_triggers_warning(recwarn): + """A very short target_aa (< 30 aa) must emit a UserWarning.""" + aa = "ACDEFGHIKLMN" # 12 aa — below the 30-aa soft threshold + import warnings + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _base_exp(target_aa=aa) + user_warnings = [w for w in caught if issubclass(w.category, UserWarning)] + assert len(user_warnings) == 1 + assert "receptor" in str(user_warnings[0].message).lower() + + +# --------------------------------------------------------------------------- +# Fix 3: s5f_model validation at call time +# --------------------------------------------------------------------------- + +def test_s5f_model_bogus_raises_at_call_time(): + """A typo in s5f_model must error at clonal_lineage() call time.""" + with pytest.raises(ValueError, match="Unknown s5f_model"): + _base_exp(s5f_model="bogus") + + +def test_s5f_model_valid_accepted(): + for model in ("hh_s5f", "hkl_s5f", "hh_s5f_60", "hh_s5f_opposite"): + exp = _base_exp(s5f_model=model) + assert exp is not None + + +# --------------------------------------------------------------------------- +# Fix 4: run_records kwargs on the lineage path +# --------------------------------------------------------------------------- + +def _compiled_lineage(): + return _base_exp() + + +def test_n_not_none_raises_for_lineage(): + """Record count is not a fixed product for lineage; passing n is rejected.""" + with pytest.raises(ValueError): + _compiled_lineage().run_records(n=5) + + +def test_seed_and_strict_still_work_for_lineage(): + """seed and strict must work.""" + result = _compiled_lineage().run_records(seed=42, strict=False) + assert len(result.records) > 0 + + +# --------------------------------------------------------------------------- +# validate_records / expose_provenance now SUPPORTED on the lineage path +# --------------------------------------------------------------------------- + +def _lineage_exp(**kw): + """A clonal_lineage experiment that reliably yields observed records.""" + defaults = dict( + n_clones=2, max_generations=6, n_max=300, n_sample=15, + rate=0.05, lambda_base=1.5, selection_strength=0.0, target_aa=None, + ) + defaults.update(kw) + return ga.Experiment.on("human_igh").recombine().clonal_lineage(**defaults) + + +# seed chosen so the (single-founder) families survive: sampling now draws from +# the living final generation, so an all-extinct seed yields zero records. +_SURVIVING_SEED = 1 + + +def test_validate_records_true_no_corruption_does_not_raise(): + result = _lineage_exp().run_records(seed=_SURVIVING_SEED, validate_records=True) + assert len(result.records) > 0 + + +def test_validate_records_true_with_corruption_does_not_raise(): + exp = _lineage_exp().sequencing_errors(rate=0.02) + result = exp.run_records(seed=_SURVIVING_SEED, validate_records=True) + assert len(result.records) > 0 + + +def test_outcomes_are_retained_and_index_aligned(): + result = _lineage_exp().run_records(seed=0) + assert result.outcomes is not None + assert len(result.outcomes) == len(result.records) + + +def test_outcomes_retained_with_corruption(): + result = _lineage_exp().sequencing_errors(rate=0.02).run_records(seed=0) + assert result.outcomes is not None + assert len(result.outcomes) == len(result.records) + + +def test_validate_records_called_directly_is_clean_no_corruption(): + exp = _lineage_exp() + refdata = exp.compile().refdata + result = exp.run_records(seed=0) + report = result.validate_records(refdata) + assert report.ok, report.summary() + + +def test_validate_records_called_directly_is_clean_with_corruption(): + exp = _lineage_exp().sequencing_errors(rate=0.02) + refdata = exp.compile().refdata + result = exp.run_records(seed=0) + report = result.validate_records(refdata) + assert report.ok, report.summary() + + +def test_expose_provenance_adds_truth_v_call_no_corruption(): + result = _lineage_exp().run_records(seed=_SURVIVING_SEED, expose_provenance=True) + assert len(result.records) > 0 + for rec in result.records: + assert "truth_v_call" in rec + assert rec["truth_v_call"] + + +def test_expose_provenance_adds_truth_v_call_with_corruption(): + exp = _lineage_exp().sequencing_errors(rate=0.02) + result = exp.run_records(seed=_SURVIVING_SEED, expose_provenance=True) + assert len(result.records) > 0 + for rec in result.records: + assert "truth_v_call" in rec + assert rec["truth_v_call"] + + +# --------------------------------------------------------------------------- +# TCR guard: clonal_lineage models B-cell SHM => BCR/Ig only +# --------------------------------------------------------------------------- + +def test_clonal_lineage_rejects_tcr_locus(): + """clonal_lineage applies S5F SHM (a B-cell process) so it must reject + TCR loci with a clear BCR-only message — mirroring mutate()'s TCR guard. + The guard must fire at clonal_lineage() call time (before compile) so the + error is the BCR-only message, not a cartridge/compile error. + """ + with pytest.raises(ValueError) as excinfo: + ( + ga.Experiment.on("human_tcrb") + .allow_curatable_refdata() + .recombine() + .clonal_lineage(n_clones=2, n_sample=10) + ) + msg = str(excinfo.value) + assert "BCR" in msg + assert "TCR" in msg + + +def test_clonal_lineage_allows_bcr_locus(): + """Sanity: the BCR-only guard does not fire on an Ig locus.""" + exp = ga.Experiment.on("human_igh").recombine().clonal_lineage( + n_clones=1, n_sample=5 + ) + assert exp is not None + + +# --------------------------------------------------------------------------- +# Founder-survival guard: every requested clone yields a surviving family +# --------------------------------------------------------------------------- + +def test_survival_guard_yields_all_clones_by_default(): + """By default (allow_extinction=False) every requested clone survives via + deterministic retry, so clone_ids == {0..n_clones-1} — even at a + lambda_base where a single founder goes extinct ~25% of the time. + """ + result = ( + ga.Experiment.on("human_igh") + .recombine() + .clonal_lineage(n_clones=5, max_generations=6, n_max=200, + n_sample=15, rate=0.05, lambda_base=1.5) + .run_records(seed=0) + ) + cids = {r["clone_id"] for r in result.records} + assert cids == {0, 1, 2, 3, 4} + + +def test_survival_guard_is_deterministic(): + """Same top-level seed => same result, even with retries.""" + def run(): + return ( + ga.Experiment.on("human_igh") + .recombine() + .clonal_lineage(n_clones=5, max_generations=6, n_max=200, + n_sample=15, rate=0.05, lambda_base=1.5) + .run_records(seed=0) + ) + a = run().records + b = run().records + assert len(a) == len(b) + assert [r["sequence"] for r in a] == [r["sequence"] for r in b] + assert [r["clone_id"] for r in a] == [r["clone_id"] for r in b] + + +def test_allow_extinction_true_allows_missing_clones(): + """allow_extinction=True accepts extinction and skips extinct clones, so + clone_ids is a (possibly proper) subset of the requested range. + """ + result = ( + ga.Experiment.on("human_igh") + .recombine() + .clonal_lineage(n_clones=5, max_generations=6, n_max=200, + n_sample=15, rate=0.05, lambda_base=1.5, + allow_extinction=True) + .run_records(seed=0) + ) + cids = {r["clone_id"] for r in result.records} + assert cids <= {0, 1, 2, 3, 4} + + +# --------------------------------------------------------------------------- +# AIRR-standard duplicate_count mirrors lineage_abundance +# --------------------------------------------------------------------------- + +def test_records_emit_duplicate_count_equal_to_abundance(): + result = _base_exp(n_clones=3, lambda_base=1.6).run_records(seed=0) + assert len(result.records) > 0 + for rec in result.records: + assert "duplicate_count" in rec + assert rec["duplicate_count"] == rec["lineage_abundance"] + + +# --------------------------------------------------------------------------- +# Smoke test: a valid call still works end-to-end +# --------------------------------------------------------------------------- + +def test_valid_clonal_lineage_call_works(): + result = _base_exp( + n_clones=2, + selection_strength=0.0, + target_aa=None, + ).run_records(seed=7) + assert len(result.records) > 0 + for rec in result.records: + assert "lineage_affinity" in rec + assert "clone_id" in rec diff --git a/tests/test_clonal_plan_split_contract.py b/tests/test_clonal_plan_split_contract.py index b65d1b9..6d4a45c 100644 --- a/tests/test_clonal_plan_split_contract.py +++ b/tests/test_clonal_plan_split_contract.py @@ -188,7 +188,10 @@ def test_pin_scaffold_ancestor_phase_post_fork_rejects(method) -> None: .recombine() .expand_clones(n_clones=1, per_clone=2) ) - with pytest.raises(ValueError, match="before expand_clones"): + # Message generalized to cover all three clonal forks + # (clonal_lineage / clonal_repertoire / expand_clones); the + # rejection behaviour is unchanged. + with pytest.raises(ValueError, match="before the clonal fork"): _ANCESTOR_PHASE_BUILDERS[method](exp) diff --git a/tests/test_clonal_repertoire.py b/tests/test_clonal_repertoire.py new file mode 100644 index 0000000..a774b3a --- /dev/null +++ b/tests/test_clonal_repertoire.py @@ -0,0 +1,123 @@ +"""Tests for the non-tree clonal repertoire DSL (``clonal_repertoire``). + +``clonal_repertoire`` generalizes the deprecated ``expand_clones``: per +clone a size is drawn from a heavy-tailed distribution (with an +unexpanded-singleton fraction), that many reads are emitted through the +post-fork library-prep / sequencing passes, and identical reads are +genotype-collapsed into AIRR records carrying ``duplicate_count``. +""" + +from collections import Counter + +import pytest + +import GenAIRR as ga + + +def test_bcr_repertoire_with_corruption_has_clone_ids_and_duplicate_count(): + r = (ga.Experiment.on("human_igh").recombine() + .clonal_repertoire(n_clones=20, max_size=50, unexpanded_fraction=0.3) + .sequencing_errors(rate=0.005) + .run_records(seed=0)) + assert len(r.records) > 0 + assert all("duplicate_count" in rec and rec["duplicate_count"] >= 1 for rec in r.records) + assert all("clone_id" in rec for rec in r.records) + per_clone = Counter(rec["clone_id"] for rec in r.records) + assert min(per_clone.values()) >= 1 + + +def test_pure_copy_no_postfork_one_record_per_clone(): + r = (ga.Experiment.on("human_igh").recombine() + .clonal_repertoire(n_clones=10, max_size=100).run_records(seed=1)) + # no post-fork passes -> each clone collapses to ONE record with + # duplicate_count = size + per_clone = Counter(rec["clone_id"] for rec in r.records) + assert all(c == 1 for c in per_clone.values()) + assert all(rec["duplicate_count"] >= 1 for rec in r.records) + + +def test_tcr_repertoire_runs_no_shm(): + r = (ga.Experiment.on("human_tcrb").allow_curatable_refdata().recombine() + .clonal_repertoire(n_clones=10, max_size=50).run_records(seed=0)) + assert len(r.records) > 0 + assert all(rec.get("n_mutations", 0) == 0 for rec in r.records) + assert all("duplicate_count" in rec for rec in r.records) + + +def test_tcr_repertoire_rejects_mutate(): + with pytest.raises(Exception): + (ga.Experiment.on("human_tcrb").allow_curatable_refdata().recombine() + .clonal_repertoire(n_clones=5, max_size=10).mutate(rate=0.05).compile()) + + +def test_determinism(): + mk = lambda: (ga.Experiment.on("human_igh").recombine() + .clonal_repertoire(n_clones=8, max_size=40).run_records(seed=3)) + a, b = mk(), mk() + assert [r["sequence"] for r in a.records] == [r["sequence"] for r in b.records] + assert [r["duplicate_count"] for r in a.records] == [r["duplicate_count"] for r in b.records] + + +def test_validation_works(): + r = (ga.Experiment.on("human_igh").recombine() + .clonal_repertoire(n_clones=5, max_size=30).sequencing_errors(rate=0.003) + .run_records(seed=0, validate_records=True)) + assert len(r.records) > 0 + + +def test_clonal_repertoire_rejects_premature_mutate(): + # .mutate() before clonal_repertoire is a descendant-phase step and + # must be rejected (SHM is descendant-specific). + with pytest.raises(ValueError): + (ga.Experiment.on("human_igh").recombine() + .mutate(rate=0.05).clonal_repertoire(n_clones=5, max_size=10)) + + +def test_clonal_repertoire_rejects_double_fork(): + with pytest.raises(ValueError): + (ga.Experiment.on("human_igh").recombine() + .clonal_repertoire(n_clones=5, max_size=10) + .clonal_repertoire(n_clones=3, max_size=5)) + + +@pytest.mark.parametrize("method", ["invert_d", "receptor_revision"]) +def test_recombination_time_edits_reject_after_clonal_repertoire(method): + exp = ( + ga.Experiment.on("human_igh") + .recombine() + .clonal_repertoire(n_clones=5, max_size=10) + ) + with pytest.raises(ValueError, match="before the clonal fork"): + getattr(exp, method)(prob=0.05) + + +@pytest.mark.parametrize("method", ["invert_d", "receptor_revision"]) +def test_recombination_time_edits_reject_after_clonal_lineage(method): + exp = ( + ga.Experiment.on("human_igh") + .recombine() + .clonal_lineage(n_clones=1, max_generations=2, n_sample=2) + ) + with pytest.raises(ValueError, match="before the clonal fork"): + getattr(exp, method)(prob=0.05) + + +def test_clonal_repertoire_validates_args(): + base = ga.Experiment.on("human_igh").recombine() + with pytest.raises(ValueError): + base.clonal_repertoire(n_clones=0) + with pytest.raises(ValueError): + ga.Experiment.on("human_igh").recombine().clonal_repertoire( + n_clones=5, size_distribution="bogus") + with pytest.raises(ValueError): + ga.Experiment.on("human_igh").recombine().clonal_repertoire( + n_clones=5, exponent=0.0) + with pytest.raises(ValueError): + ga.Experiment.on("human_igh").recombine().clonal_repertoire( + n_clones=5, sigma=-1.0) + with pytest.raises(ValueError): + ga.Experiment.on("human_igh").recombine().clonal_repertoire( + n_clones=5, max_size=0) + with pytest.raises(ValueError): + ga.Experiment.on("human_igh").recombine().clonal_repertoire( + n_clones=5, unexpanded_fraction=1.5) diff --git a/tests/test_clone_size_sampler.py b/tests/test_clone_size_sampler.py new file mode 100644 index 0000000..becb4ca --- /dev/null +++ b/tests/test_clone_size_sampler.py @@ -0,0 +1,29 @@ +from GenAIRR import _engine + + +def test_sample_clone_sizes_basic(): + sizes = _engine.sample_clone_sizes(500, 0) # defaults: power_law exp 2 + assert len(sizes) == 500 + assert all(s >= 1 for s in sizes) + assert max(sizes) > 1 # heavy tail reaches >1 + + +def test_sample_clone_sizes_deterministic(): + a = _engine.sample_clone_sizes(200, 7, kind="power_law", exponent=2.0) + b = _engine.sample_clone_sizes(200, 7, kind="power_law", exponent=2.0) + assert a == b + + +def test_unexpanded_fraction_forces_singletons(): + sizes = _engine.sample_clone_sizes(1000, 3, unexpanded_fraction=0.4) + assert sum(1 for s in sizes if s == 1) >= 400 + + +def test_lognormal_and_validation(): + import pytest + sizes = _engine.sample_clone_sizes(100, 1, kind="lognormal", mu=1.0, sigma=1.0, max_size=10000) + assert len(sizes) == 100 and all(1 <= s <= 10000 for s in sizes) + with pytest.raises(ValueError): + _engine.sample_clone_sizes(10, 0, kind="bogus") + with pytest.raises(ValueError): + _engine.sample_clone_sizes(10, 0, unexpanded_fraction=2.0) diff --git a/tests/test_docs_website_contract.py b/tests/test_docs_website_contract.py index 1078e0b..f6c9712 100644 --- a/tests/test_docs_website_contract.py +++ b/tests/test_docs_website_contract.py @@ -42,6 +42,7 @@ _AUDIT_DOC = _REPO_ROOT / "audit-docs" / "docs_website_audit.md" _README = _REPO_ROOT / "README.md" _WEBSITE = _REPO_ROOT / "website" +_SITE_DOCS = _REPO_ROOT / "site_docs" _DOCS = _REPO_ROOT / "docs" _OLD_DOCS = _REPO_ROOT / "_old_docs" _DEPLOY_WORKFLOW = _REPO_ROOT / ".github" / "workflows" / "deploy-docs.yml" @@ -302,9 +303,16 @@ def test_pin_present_readme_hosted_doc_urls_resolve_to_live_website_pages() -> N continue # Strip query string / fragment if present. path = path.split("?", 1)[0].split("#", 1)[0] - target = _WEBSITE / path - if not target.is_file(): - missing.append((url, str(target))) + # The live site is the MkDocs build of `site_docs/` (deploy-docs.yml). + # A hosted `

.html` URL resolves if EITHER it exists as a flat page + # in the preserved `website/` rollback tree, OR it maps to a MkDocs + # source page `site_docs/

.md` (use_directory_urls: false ⇒ + # `foo/bar.md` → `foo/bar.html`). + resolves = (_WEBSITE / path).is_file() + if not resolves and path.endswith(".html"): + resolves = (_SITE_DOCS / (path[:-len(".html")] + ".md")).is_file() + if not resolves: + missing.append((url, str(_WEBSITE / path))) assert not missing, ( "README hosted-doc URLs do not resolve to live website " "pages:\n" diff --git a/tests/test_expand_clones_deprecation.py b/tests/test_expand_clones_deprecation.py new file mode 100644 index 0000000..5ef1532 --- /dev/null +++ b/tests/test_expand_clones_deprecation.py @@ -0,0 +1,32 @@ +"""Deprecation contract for ``Experiment.expand_clones``. + +Verifies that: + +1. Calling ``expand_clones()`` emits a ``DeprecationWarning`` that + mentions ``clonal_lineage``. +2. Despite the warning, ``expand_clones()`` continues to produce the + correct star-topology clonal output (n_clones * per_clone records + with the expected ``clone_id`` values). +""" + +import warnings + +import pytest + +import GenAIRR as ga + + +def test_expand_clones_emits_deprecation_warning_but_still_works(): + exp = ga.Experiment.on("human_igh").recombine() + with pytest.warns(DeprecationWarning, match="clonal_lineage"): + exp = exp.expand_clones(n_clones=2, per_clone=3) + # Pipeline continues to work after the deprecation warning. + exp = exp.mutate(count=5) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + result = exp.run_records(seed=0) + # 2 clones × 3 descendants = 6 records. + assert len(result) == 6 + # Exactly the expected clone IDs are present. + clone_ids = {r["clone_id"] for r in result} + assert clone_ids == {0, 1} diff --git a/tests/test_lineage_engine.py b/tests/test_lineage_engine.py new file mode 100644 index 0000000..b8f4aba --- /dev/null +++ b/tests/test_lineage_engine.py @@ -0,0 +1,207 @@ +import pytest +import GenAIRR as ga +from GenAIRR import _engine +from GenAIRR._s5f_loader import load_builtin_s5f_kernel + +S5F_MODEL = "hh_s5f" + + +def _founder(): + compiled = ga.Experiment.on("human_igh").recombine().compile() + outcome = compiled.run(n=1, seed=0)[0] + return outcome.final_simulation() + + +def _kernel(): + return load_builtin_s5f_kernel(S5F_MODEL) + + +def test_simulate_lineage_produces_valid_tree(): + founder = _founder() + mut, sub = _kernel() + tree = _engine.simulate_lineage(founder, mut, sub, 0.05, 1.5, 0.0, 8, 500, 30, 2024) + tree.validate() + assert len(tree) >= 1 + nodes = tree.nodes() + assert nodes[0].parent_id is None + assert nodes[0].generation == 0 + assert any(n.mutations_from_parent > 0 for n in nodes) or len(tree) == 1 + + +def test_simulate_lineage_exports_are_wellformed(): + founder = _founder() + mut, sub = _kernel() + tree = _engine.simulate_lineage(founder, mut, sub, 0.05, 1.5, 0.0, 8, 500, 30, 7) + nwk = tree.to_newick() + assert nwk.endswith(";") + assert nwk.count("(") == nwk.count(")") + assert tree.to_fasta().startswith(">node") + tsv = tree.to_node_table_tsv() + assert tsv.splitlines()[0].startswith("node_id\t") + assert len(tsv.splitlines()) == len(tree) + 1 + + +def test_simulate_lineage_is_deterministic(): + founder = _founder() + mut, sub = _kernel() + a = _engine.simulate_lineage(founder, mut, sub, 0.05, 1.5, 0.0, 8, 500, 30, 99) + b = _engine.simulate_lineage(founder, mut, sub, 0.05, 1.5, 0.0, 8, 500, 30, 99) + assert a.to_newick() == b.to_newick() + assert a.to_fasta() == b.to_fasta() + + +def test_simulate_lineage_rejects_bad_kernel(): + founder = _founder() + with pytest.raises(ValueError): + _engine.simulate_lineage(founder, [0.1] * 10, [0.1] * 10, 0.05, 1.5, 0.0, 8, 500, 30, 0) + + +def test_simulate_lineage_rejects_nonfinite_or_negative_kernel(): + founder = _founder() + _mut, sub = _kernel() + # NaN in the mutability table must raise (not panic across the FFI boundary). + with pytest.raises(ValueError): + _engine.simulate_lineage( + founder, [float("nan")] * 1024, sub, 0.05, 1.5, 0.0, 8, 500, 30, 0 + ) + # Negative value in the mutability table must raise too. + with pytest.raises(ValueError): + _engine.simulate_lineage( + founder, [-1.0] + [0.0] * 1023, sub, 0.05, 1.5, 0.0, 8, 500, 30, 0 + ) + + +def test_simulate_lineage_rejects_zero_n_sample(): + founder = _founder() + mut, sub = _kernel() + with pytest.raises(ValueError): + _engine.simulate_lineage(founder, mut, sub, 0.05, 1.5, 0.0, 8, 500, 0, 0) + + +def test_simulate_lineage_rejects_nonfinite_lambda(): + founder = _founder() + mut, sub = _kernel() + # NaN/inf/negative lambda_base must raise, not silently return a founder-only tree. + for bad in (float("nan"), float("inf"), -1.0): + with pytest.raises(ValueError): + _engine.simulate_lineage(founder, mut, sub, 0.05, bad, 0.0, 8, 500, 30, 0) + # lambda_mut is validated too. + with pytest.raises(ValueError): + _engine.simulate_lineage(founder, mut, sub, 0.05, 1.5, float("nan"), 8, 500, 30, 0) + + +def test_simulate_lineage_rejects_excessive_max_generations(): + founder = _founder() + mut, sub = _kernel() + with pytest.raises(ValueError): + _engine.simulate_lineage(founder, mut, sub, 0.05, 1.5, 0.0, 5000, 500, 30, 0) + + +def test_affinity_node_getter_and_neutral_default(): + founder = _founder() + mut, sub = _kernel() + # Default call (no affinity args) is the neutral path; affinity getter exists. + tree = _engine.simulate_lineage(founder, mut, sub, 0.05, 1.5, 0.0, 8, 500, 30, 2024) + nodes = tree.nodes() + assert all(hasattr(n, "affinity") for n in nodes) + # neutral path leaves affinity at 0.0 + assert all(n.affinity == 0.0 for n in nodes) + + +def test_affinity_selection_raises_mean_affinity(): + founder = _founder() + mut, sub = _kernel() + founder_aa_len = 100 # target length need not match; distance uses min length + # Use "A" (alanine) target with small beta so exp(-beta*d) stays in (0,1) + # and doesn't underflow. "W"*100 + beta=1.0 collapses to denorm for IgH + # sequences, making neutral and selected indistinguishable. + target = "A" * founder_aa_len # a fixed explicit target both runs share + common = dict(target_aa=target, beta=0.001, mature_substitutions=5) + # neutral: selection_strength = 0 (affinities populated but not selected on) + neutral = _engine.simulate_lineage( + founder, mut, sub, 0.1, 1.6, 0.0, 12, 800, 60, 4242, + selection_strength=0.0, **common, + ) + # strong selection toward the same target, same seed + selected = _engine.simulate_lineage( + founder, mut, sub, 0.1, 1.6, 0.0, 12, 800, 60, 4242, + selection_strength=50.0, **common, + ) + + def mean_aff(tree): + ns = tree.nodes() + return sum(n.affinity for n in ns) / max(1, len(ns)) + + # selection should enrich high-affinity cells → higher mean affinity + assert mean_aff(selected) > mean_aff(neutral), ( + f"selected {mean_aff(selected)} !> neutral {mean_aff(neutral)}" + ) + # affinities are populated (in (0,1]) when a model is active + assert all(0.0 < n.affinity <= 1.0 + 1e-9 for n in selected.nodes()) + + +def test_affinity_auto_target_is_deterministic(): + founder = _founder() + mut, sub = _kernel() + a = _engine.simulate_lineage(founder, mut, sub, 0.1, 1.5, 0.0, 10, 500, 40, 7, + selection_strength=5.0) + b = _engine.simulate_lineage(founder, mut, sub, 0.1, 1.5, 0.0, 10, 500, 40, 7, + selection_strength=5.0) + assert a.to_newick() == b.to_newick() + assert [n.affinity for n in a.nodes()] == [n.affinity for n in b.nodes()] + + +def test_simulate_family_outcomes_extinct_founder_yields_no_observed(): + """lambda_base=0 => founder never divides => extinct => zero observed cells. + + Sampling must draw from the LIVING final-generation population, not all tree + leaves. An extinct clone (no living cells) is not observed, so it contributes + no node/observed Outcomes. + """ + c = ga.Experiment.on("human_igh").recombine().compile() + founder = c.run(n=1, seed=0)[0] # full Outcome (not just the Simulation) + refdata = c.refdata + mut, sub = _kernel() + fam = _engine.simulate_family_outcomes( + founder, refdata, mut, sub, 0.05, 0.0, 0.0, 6, 300, 30, 2024 + ) + assert len(fam.observed_outcomes()) == 0 + assert all(o is None for o in fam.node_outcomes()) + assert all(not n.observed and n.abundance == 0 for n in fam.tree().nodes()) + + +def test_simulate_family_outcomes_healthy_lambda_produces_observed(): + """A healthy lambda_base produces observed cells whose abundances sum to n_sample. + + Scans seeds because a single founder can go extinct even at a healthy rate. + """ + c = ga.Experiment.on("human_igh").recombine().compile() + founder = c.run(n=1, seed=0)[0] # full Outcome (not just the Simulation) + refdata = c.refdata + mut, sub = _kernel() + n_sample = 30 + for seed in range(20): + fam = _engine.simulate_family_outcomes( + founder, refdata, mut, sub, 0.05, 1.6, 0.0, 6, 300, n_sample, seed + ) + observed = [n for n in fam.tree().nodes() if n.observed] + if not observed: + continue # extinct for this seed + assert len(fam.observed_outcomes()) == len(observed) + assert sum(n.abundance for n in observed) == n_sample + max_gen = max(n.generation for n in fam.tree().nodes()) + assert all(n.generation == max_gen for n in observed) + break + else: + raise AssertionError("no seed in 0..20 produced a surviving family") + + +def test_affinity_rejects_bad_params(): + founder = _founder() + mut, sub = _kernel() + with pytest.raises(ValueError): + _engine.simulate_lineage(founder, mut, sub, 0.05, 1.5, 0.0, 8, 500, 30, 0, + selection_strength=float("nan")) + with pytest.raises(ValueError): + _engine.simulate_lineage(founder, mut, sub, 0.05, 1.5, 0.0, 8, 500, 30, 0, + beta=-1.0) diff --git a/tests/test_lineage_mutation_counts.py b/tests/test_lineage_mutation_counts.py new file mode 100644 index 0000000..ae6771b --- /dev/null +++ b/tests/test_lineage_mutation_counts.py @@ -0,0 +1,30 @@ +import GenAIRR as ga +from GenAIRR import _engine +from GenAIRR._s5f_loader import load_builtin_s5f_kernel + + +def _founder_and_refdata(): + compiled = ga.Experiment.on("human_igh").recombine().compile() + return compiled.run(n=1, seed=0)[0], compiled.refdata + + +def test_lineage_record_mutation_counts_are_consistent(): + founder, refdata = _founder_and_refdata() + mut, sub = load_builtin_s5f_kernel("hh_s5f") + fam = _engine.simulate_family_outcomes(founder, refdata, mut, sub, 0.1, 1.6, 0.0, 8, 300, 30, 7) + recs = fam.airr_records(refdata) # NEW method (Task 1b) + assert len(recs) >= 1 + saw_mutated = False + for r in recs: + per_seg = r["n_v_mutations"] + r["n_d_mutations"] + r["n_j_mutations"] + r["n_np_mutations"] + assert r["n_mutations"] == per_seg, ( + f"n_mutations {r['n_mutations']} != sum per-segment {per_seg}" + ) + v_sub = (r["n_fwr1_mutations"] + r["n_cdr1_mutations"] + r["n_fwr2_mutations"] + + r["n_cdr2_mutations"] + r["n_fwr3_mutations"] + r["n_v_unannotated_mutations"]) + assert r["n_v_mutations"] == v_sub, ( + f"n_v_mutations {r['n_v_mutations']} != sum subregion {v_sub}" + ) + if r["n_mutations"] > 0: + saw_mutated = True + assert saw_mutated, "expected at least one mutated node record" diff --git a/tests/test_lineage_outcome_validation.py b/tests/test_lineage_outcome_validation.py new file mode 100644 index 0000000..9e98ca8 --- /dev/null +++ b/tests/test_lineage_outcome_validation.py @@ -0,0 +1,102 @@ +""" +Correctness test: per-node Outcomes from simulate_family_outcomes must be +self-consistent so that native build_airr_record + validate_record produce +correct mutation-count fields and pass mutation-count invariants. + +Bug fixed: node Outcomes were built with an empty event ledger but a +nonzero Simulation.mutation_count, so validate_record would false-fail the +mutation-count sum invariant on mutated nodes. + +Note: AlleleCallTieSetMismatch issues on D can still appear for SHM-mutated +nodes (SHM changes D-segment match scores, making formerly-unique calls +ambiguous). These are a biological reality of the lineage path, not caused +by the event-ledger bug, and are not checked here. +""" +import GenAIRR as ga +from GenAIRR import _engine +from GenAIRR._s5f_loader import load_builtin_s5f_kernel + +# Mutation-count issue kinds that this fix must eliminate. +_MUTATION_COUNT_KINDS = { + "NMutationsMismatch", + "NVMutationsMismatch", + "NDMutationsMismatch", + "NJMutationsMismatch", + "NNpMutationsMismatch", + "MutationCountSumMismatch", + "NFwr1MutationsMismatch", + "NCdr1MutationsMismatch", + "NFwr2MutationsMismatch", + "NCdr2MutationsMismatch", + "NFwr3MutationsMismatch", + "NVUnannotatedMutationsMismatch", + "VSubregionMutationCountSumMismatch", +} + + +def _founder_refdata(): + c = ga.Experiment.on("human_igh").recombine().compile() + return c.run(n=1, seed=0)[0], c.refdata + + +def test_node_outcomes_no_mutation_count_issues(): + """After the fix, no mutation-count validation issues on any lineage node.""" + founder, refdata = _founder_refdata() + mut, sub = load_builtin_s5f_kernel("hh_s5f") + fam = _engine.simulate_family_outcomes(founder, refdata, mut, sub, 0.05, 1.6, 0.0, 6, 300, 30, 7) + saw_mut = False + for o in fam.observed_outcomes(): + issues = o.validate_record(refdata) + mut_issues = [i for i in issues if i["kind"] in _MUTATION_COUNT_KINDS] + assert len(mut_issues) == 0, f"mutation-count validate_record issues: {mut_issues}" + rec = o.airr_record(refdata) + if rec["n_mutations"] > 0: + saw_mut = True + per_seg = (rec["n_v_mutations"] + rec["n_d_mutations"] + + rec["n_j_mutations"] + rec["n_np_mutations"]) + assert rec["n_mutations"] == per_seg, ( + f"n_mutations {rec['n_mutations']} != per-segment sum {per_seg}" + ) + v_sub = (rec["n_fwr1_mutations"] + rec["n_cdr1_mutations"] + + rec["n_fwr2_mutations"] + rec["n_cdr2_mutations"] + + rec["n_fwr3_mutations"] + rec["n_v_unannotated_mutations"]) + assert rec["n_v_mutations"] == v_sub, ( + f"n_v_mutations {rec['n_v_mutations']} != subregion sum {v_sub}" + ) + assert saw_mut, "expected at least one mutated observed node" + + +def test_node_outcomes_validate_clean_under_heavy_shm(): + """Heavy SHM lineage: every observed node Outcome must validate cleanly. + + Regression for the stale live-call cache bug: node sims carried the + founder's allele calls instead of the mutated sequence's, so under heavy + SHM validate_record reported AlleleCallTieSetMismatch. After refreshing + the live-call cache on each observed node sim, this must be clean. + """ + founder, refdata = _founder_refdata() + mut, sub = load_builtin_s5f_kernel("hh_s5f") + fam = _engine.simulate_family_outcomes(founder, refdata, mut, sub, 0.05, 1.6, 0.0, 6, 300, 30, 7) + issues_total = 0 + for o in fam.observed_outcomes(): + issues_total += len(o.validate_record(refdata)) + assert issues_total == 0, f"validate_record reported {issues_total} issues under heavy SHM" + + +def test_airr_records_mutation_counts_consistent(): + """PyFamilyOutcome.airr_records() produces consistent mutation counts.""" + founder, refdata = _founder_refdata() + mut, sub = load_builtin_s5f_kernel("hh_s5f") + fam = _engine.simulate_family_outcomes(founder, refdata, mut, sub, 0.05, 1.6, 0.0, 6, 300, 30, 7) + recs = fam.airr_records(refdata) + assert len(recs) >= 1 + saw_mut = False + for rec in recs: + per_seg = (rec["n_v_mutations"] + rec["n_d_mutations"] + + rec["n_j_mutations"] + rec["n_np_mutations"]) + assert rec["n_mutations"] == per_seg, ( + f"n_mutations {rec['n_mutations']} != per-segment sum {per_seg}" + ) + if rec["n_mutations"] > 0: + saw_mut = True + assert saw_mut, "expected at least one mutated node record"