From 8c108dbab4772969e3ba304d58f57510c2183321 Mon Sep 17 00:00:00 2001 From: sfilges Date: Tue, 3 Mar 2026 15:19:08 +0100 Subject: [PATCH 1/3] Further tuning of BLAST parameters --- CHANGELOG.md | 18 +++++++++ config/designer_default_config.json | 11 +++-- config/designer_lenient_config.json | 9 ++++- src/plexus/blast/blast_runner.py | 15 ++++++- src/plexus/blast/specificity.py | 19 ++++++++- src/plexus/config.py | 45 +++++++++++++++++++-- src/plexus/designer/design.py | 2 + src/plexus/pipeline.py | 4 ++ src/plexus/utils/utils.py | 50 ++++++++++++++++++++++- src/plexus/version.py | 2 +- tests/test_blast_runner.py | 45 +++++++++++++++++++++ tests/test_blast_specificity.py | 11 +++++ tests/test_config.py | 6 ++- tests/test_utils.py | 62 ++++++++++++++++++++++++++++- 14 files changed, 283 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7714c56..378a0d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,24 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.0.1] - 2026-03-03 + +### Changed + +- **BLAST E-value raised to Primer-BLAST default** (`config.py`): `blast_evalue` 1000 → 30000 to match the Primer-BLAST default and ensure short, gapped primer alignments are not silently dropped. +- **BLAST word size now explicit** (`config.py`, `specificity.py`, `pipeline.py`): New `blast_word_size` field (default 7) passed to `blastn -word_size`, matching the Primer-BLAST / blastn-short default. Previously relied on the implicit blastn-short default; now configurable. +- **Tightened poly-X default** (`config.py`): `primer_max_poly_x` default lowered from 5 to 4. +- **Increased BLAST `max_mismatches` default** (`config.py`): 2 → 3 so that 3-mismatch paralog alignments (e.g. DCAF12L1) are classified as "predicted bound". +- **BLAST mismatch penalty** (`config.py`): New `blast_penalty` default changed from the blastn-short default of -3 to -1. With reward=1/penalty=-3, a 21bp primer with 3 mismatches scores only 9 (E ≈ 190,000) and is silently dropped by BLAST. With penalty=-1, the same alignment scores 15 (E ≈ 70), making paralog off-targets like DCAF12L1 detectable. +- Updated both `designer_default_config.json` and `designer_lenient_config.json` to match the new defaults. + +### Added + +- **G/C-specific poly-run filter** (`utils.py`): New `find_max_poly_gc()` function rejects primers with homopolymer runs of G or C exceeding `primer_max_poly_gc` (default 3). This catches primers like the DCAF12L2 `GGGGCCGGATGTTCTGCTG` forward primer that passed the old poly-X filter. Wired through `check_kmer()`, `generate_kmers()`, `filter_kmers()`, and `design.py`. +- **BLAST `-evalue` parameter** (`config.py`, `blast_runner.py`, `specificity.py`, `pipeline.py`): New `blast_evalue` field (default 30000.0) passed to `blastn -evalue`, ensuring weak but real off-target alignments are not silently dropped before the annotator can evaluate them. +- **BLAST `-reward` / `-penalty` parameters** (`config.py`, `blast_runner.py`, `specificity.py`, `pipeline.py`): New `blast_reward` (default 1) and `blast_penalty` (default -1) fields override the blastn-short scoring defaults, improving sensitivity for mismatched paralog alignments. +- Tests for `find_max_poly_gc`, `check_kmer` poly-GC integration, BLAST evalue/reward/penalty/word_size parameter forwarding, and specificity check threading. + ## [1.0.0] - 2026-03-03 First stable release. All v1.0 roadmap items complete — see `docs/ROADMAP.md` for the full list. diff --git a/config/designer_default_config.json b/config/designer_default_config.json index a0e9dda..95e5dba 100644 --- a/config/designer_default_config.json +++ b/config/designer_default_config.json @@ -8,7 +8,7 @@ "primer_min_length": 15, "primer_max_length": 30, "PRIMER_OPT_BOUND": 98.0, - "PRIMER_MIN_BOUND": -10.0, + "PRIMER_MIN_BOUND": 35.0, "PRIMER_MAX_BOUND": 120.0, "junction_padding_bases": 3, "forward_tail": "GGACACTCTTTCCCTACACGACGCTCTTCCGATCTAAAAAAAAAAAAAAAAAAAATGGGAAAGAGTGTCC", @@ -20,7 +20,8 @@ "primer_min_gc": 30, "primer_max_gc": 70, "primer_gc_clamp": 1, - "primer_max_poly_x": 5, + "primer_max_poly_x": 4, + "primer_max_poly_gc": 3, "primer_max_n": 0, "PRIMER_MAX_SELF_ANY_TH": 45.0, "PRIMER_MAX_SELF_END_TH": 35.0, @@ -73,7 +74,11 @@ "blast_parameters": { "length_threshold": 15, "evalue_threshold": 10.0, - "max_mismatches": 2, + "max_mismatches": 3, + "blast_evalue": 30000.0, + "blast_word_size": 7, + "blast_reward": 1, + "blast_penalty": -1, "max_amplicon_size": 2000, "ontarget_tolerance": 5 }, diff --git a/config/designer_lenient_config.json b/config/designer_lenient_config.json index 759a34c..352875d 100644 --- a/config/designer_lenient_config.json +++ b/config/designer_lenient_config.json @@ -20,7 +20,8 @@ "primer_min_gc": 30, "primer_max_gc": 70, "primer_gc_clamp": 0, - "primer_max_poly_x": 5, + "primer_max_poly_x": 4, + "primer_max_poly_gc": 3, "primer_max_n": 0, "PRIMER_MAX_SELF_ANY_TH": 45.0, "PRIMER_MAX_SELF_END_TH": 35.0, @@ -73,7 +74,11 @@ "blast_parameters": { "length_threshold": 15, "evalue_threshold": 10.0, - "max_mismatches": 2, + "max_mismatches": 3, + "blast_evalue": 30000.0, + "blast_word_size": 7, + "blast_reward": 1, + "blast_penalty": -1, "max_amplicon_size": 2000, "ontarget_tolerance": 5 }, diff --git a/src/plexus/blast/blast_runner.py b/src/plexus/blast/blast_runner.py index cf9bb9f..4433c7a 100644 --- a/src/plexus/blast/blast_runner.py +++ b/src/plexus/blast/blast_runner.py @@ -91,7 +91,14 @@ def create_database(self): return self def run( - self, output_archive, word_size=None, task="blastn-short", num_threads: int = 1 + self, + output_archive, + word_size=None, + task="blastn-short", + num_threads: int = 1, + evalue: float | None = None, + reward: int | None = None, + penalty: int | None = None, ): """ Run blast, writing a BLAST archive to `output_archive`. @@ -120,6 +127,12 @@ def run( ] if word_size is not None: cmd.extend(["-word_size", str(word_size)]) + if evalue is not None: + cmd.extend(["-evalue", str(evalue)]) + if reward is not None: + cmd.extend(["-reward", str(reward)]) + if penalty is not None: + cmd.extend(["-penalty", str(penalty)]) if num_threads > 1: cmd.extend(["-num_threads", str(num_threads)]) diff --git a/src/plexus/blast/specificity.py b/src/plexus/blast/specificity.py index bc1a8b0..4db1f74 100644 --- a/src/plexus/blast/specificity.py +++ b/src/plexus/blast/specificity.py @@ -16,9 +16,13 @@ def run_specificity_check( *, length_threshold: int = 15, evalue_threshold: float = 10.0, - max_mismatches: int = 2, + max_mismatches: int = 3, max_amplicon_size: int = 2000, ontarget_tolerance: int = 5, + blast_evalue: float = 30000.0, + blast_word_size: int = 7, + blast_reward: int = 1, + blast_penalty: int = -1, ): """ Run BLAST on all candidate primers in the panel to check for specificity @@ -36,6 +40,10 @@ def run_specificity_check( max_mismatches: Maximum mismatches in a 3'-anchored alignment. max_amplicon_size: Maximum amplicon size (bp) to consider. ontarget_tolerance: Coordinate tolerance (bp) for on-target classification. + blast_evalue: E-value passed to blastn via -evalue to control search sensitivity. + blast_word_size: Word size passed to blastn via -word_size. + blast_reward: Match reward passed to blastn via -reward. + blast_penalty: Mismatch penalty passed to blastn via -penalty. """ logger.info("Starting specificity check (BLAST)...") os.makedirs(work_dir, exist_ok=True) @@ -59,7 +67,14 @@ def run_specificity_check( runner = BlastRunner(input_fasta, genome_fasta) runner.create_database() try: - runner.run(output_archive=blast_archive, num_threads=num_threads) + runner.run( + output_archive=blast_archive, + num_threads=num_threads, + evalue=blast_evalue, + word_size=blast_word_size, + reward=blast_reward, + penalty=blast_penalty, + ) runner.reformat_output_as_table(blast_table) finally: if os.path.exists(blast_archive): diff --git a/src/plexus/config.py b/src/plexus/config.py index 6af183c..68a26ab 100644 --- a/src/plexus/config.py +++ b/src/plexus/config.py @@ -70,7 +70,8 @@ class SingleplexDesignParameters(BaseModel): ) # Sequence composition constraints - primer_max_poly_x: int = Field(default=5, ge=1, le=10) + primer_max_poly_x: int = Field(default=4, ge=1, le=10) + primer_max_poly_gc: int = Field(default=3, ge=1, le=10) primer_max_n: int = Field(default=0, ge=0, le=5) # Thermodynamic thresholds (°C) @@ -272,15 +273,53 @@ class BlastParameters(BaseModel): ), ) max_mismatches: int = Field( - default=2, + default=3, ge=0, le=5, description=( "Maximum mismatches allowed in a 3'-anchored alignment for it to be " - "classified as 'predicted bound'. A primer with 1-2 mismatches in a " + "classified as 'predicted bound'. A primer with 1-3 mismatches in a " "15+ bp 3' stretch will still extend in PCR." ), ) + blast_evalue: float = Field( + default=30000.0, + gt=0.0, + description=( + "E-value passed to blastn via -evalue. Matches the Primer-BLAST " + "default (30000) to ensure short, gapped primer alignments are " + "not silently dropped by BLAST before plexus can evaluate them." + ), + ) + blast_word_size: int = Field( + default=7, + ge=4, + le=28, + description=( + "Word size passed to blastn via -word_size. Matches the " + "Primer-BLAST / blastn-short default of 7, which is appropriate " + "for primer-length queries (<30 bp)." + ), + ) + blast_reward: int = Field( + default=1, + ge=1, + le=5, + description="Match reward passed to blastn via -reward.", + ) + blast_penalty: int = Field( + default=-1, + ge=-5, + le=-1, + description=( + "Mismatch penalty passed to blastn via -penalty. " + "The blastn-short default (-3) causes 3-mismatch paralog alignments " + "to score too low (E > 100,000) for BLAST to report them. " + "A gentler penalty (-1) keeps these alignments reportable " + "(E ~ 70 for a 21bp primer with 3 mismatches) while the annotator's " + "3'-anchored filters still reject true noise." + ), + ) max_amplicon_size: int = Field( default=2000, ge=100, diff --git a/src/plexus/designer/design.py b/src/plexus/designer/design.py index 34ccfb6..0248243 100644 --- a/src/plexus/designer/design.py +++ b/src/plexus/designer/design.py @@ -64,6 +64,7 @@ def design_multiplex_primers(panel: MultiplexPanel) -> MultiplexPanel: min_primer_length = singleplex.primer_min_length max_primer_length = singleplex.primer_max_length max_poly_X = singleplex.primer_max_poly_x + max_poly_gc = singleplex.primer_max_poly_gc max_N = singleplex.primer_max_n min_gc = singleplex.primer_min_gc max_gc = singleplex.primer_max_gc @@ -135,6 +136,7 @@ def design_multiplex_primers(panel: MultiplexPanel) -> MultiplexPanel: min_gc=min_gc, max_gc=max_gc, gc_clamp=gc_clamp, + max_poly_gc=max_poly_gc, ) if orientation == "forward": left_kmers = kmers diff --git a/src/plexus/pipeline.py b/src/plexus/pipeline.py index e63412e..59ddc59 100644 --- a/src/plexus/pipeline.py +++ b/src/plexus/pipeline.py @@ -534,6 +534,10 @@ def run_pipeline( max_mismatches=blast_config.max_mismatches, max_amplicon_size=blast_config.max_amplicon_size, ontarget_tolerance=blast_config.ontarget_tolerance, + blast_evalue=blast_config.blast_evalue, + blast_word_size=blast_config.blast_word_size, + blast_reward=blast_config.blast_reward, + blast_penalty=blast_config.blast_penalty, ) result.steps_completed.append("specificity_checked") logger.info("Specificity check complete") diff --git a/src/plexus/utils/utils.py b/src/plexus/utils/utils.py index f9ec903..a80c4c1 100644 --- a/src/plexus/utils/utils.py +++ b/src/plexus/utils/utils.py @@ -210,6 +210,43 @@ def reverse_complement(dna: str) -> str: # =============================================== +def find_max_poly_gc(kmer: str, n: int) -> bool: + """Check if a DNA sequence has a homopolymer run of G or C exceeding *n*. + + Same logic as :func:`find_max_poly_X` (counts consecutive *identical* + bases) but only triggers when the repeated base is G or C. + + Parameters + ---------- + kmer : str + The DNA sequence to check. + n : int + Maximum allowed consecutive identical G/C bases. + + Returns + ------- + bool + True if a run of G or C exceeding *n* is found, False otherwise. + """ + if len(kmer) <= n: + return False + + kmer = kmer.upper() + current_base = kmer[0] + count = 1 + + for i in range(1, len(kmer)): + if kmer[i] == current_base: + count += 1 + if count > n and current_base in "GC": + return True + else: + current_base = kmer[i] + count = 1 + + return False + + def generate_kmers( target_name: str, target_sequence: str, @@ -222,6 +259,7 @@ def generate_kmers( min_gc: int = 30, max_gc: int = 70, gc_clamp: int = 0, + max_poly_gc: int = 3, ) -> list: """ Generate k-mers as candidate primers. @@ -237,6 +275,7 @@ def generate_kmers( max_N - Max times N bases can occur anywhere in the kmer min_gc - Minimum GC content of kmer max_gc - Maximum GC content of kmer + max_poly_gc - Max consecutive G or C bases allowed """ from plexus.designer.primer import Primer @@ -279,6 +318,7 @@ def generate_kmers( min_gc=min_gc, max_gc=max_gc, gc_clamp=gc_clamp, + max_poly_gc=max_poly_gc, ): kmer_filtered_counter += 1 @@ -331,23 +371,25 @@ def check_kmer( min_gc: int = 30, max_gc: int = 70, gc_clamp: int = 0, + max_poly_gc: int = 3, ) -> bool: """ Filter kmers (putative primers) based on GC-content, number of 'N' bases, - and number of repeated bases (polyX). + number of repeated bases (polyX), and consecutive G/C runs. """ kmer_gc = gc_content(kmer) passed = ( min_gc <= kmer_gc <= max_gc # Should be True and not check_N_in_kmers(kmer, max_N) # Should be False and not find_max_poly_X(kmer, max_poly_X) # Should be False + and not find_max_poly_gc(kmer, max_poly_gc) # Should be False ) if passed and gc_clamp > 0 and not check_gc_clamp(kmer): return False return passed -def filter_kmers(kmers, max_poly_X=4, max_N=0): +def filter_kmers(kmers, max_poly_X=4, max_N=0, max_poly_gc=3): """ Filter kmers (putative primers). """ @@ -366,6 +408,10 @@ def filter_kmers(kmers, max_poly_X=4, max_N=0): if find_max_poly_X(kmer, max_poly_X): continue + # Exclude kmers with too many consecutive G/C bases. + if find_max_poly_gc(kmer, max_poly_gc): + continue + filtered_kmers.append(kmer) return filtered_kmers diff --git a/src/plexus/version.py b/src/plexus/version.py index 5becc17..5c4105c 100644 --- a/src/plexus/version.py +++ b/src/plexus/version.py @@ -1 +1 @@ -__version__ = "1.0.0" +__version__ = "1.0.1" diff --git a/tests/test_blast_runner.py b/tests/test_blast_runner.py index 6cd3755..cc77ac6 100644 --- a/tests/test_blast_runner.py +++ b/tests/test_blast_runner.py @@ -151,6 +151,51 @@ def test_run_omits_num_threads_when_1(runner, tmp_path): assert "-num_threads" not in cmd +def test_run_includes_evalue_when_provided(runner, tmp_path): + """evalue parameter is included in the blastn command when set.""" + output_archive = str(tmp_path / "out.asn") + runner.db_path = "/fake/db" + with patch("subprocess.run") as mock_run: + runner.run(output_archive, evalue=1000.0) + cmd = mock_run.call_args[0][0] + assert "-evalue" in cmd + assert "1000.0" in cmd + + +def test_run_omits_evalue_when_none(runner, tmp_path): + """evalue parameter is omitted from the blastn command when None.""" + output_archive = str(tmp_path / "out.asn") + runner.db_path = "/fake/db" + with patch("subprocess.run") as mock_run: + runner.run(output_archive, evalue=None) + cmd = mock_run.call_args[0][0] + assert "-evalue" not in cmd + + +def test_run_includes_reward_penalty_when_provided(runner, tmp_path): + """reward and penalty parameters are included in the blastn command.""" + output_archive = str(tmp_path / "out.asn") + runner.db_path = "/fake/db" + with patch("subprocess.run") as mock_run: + runner.run(output_archive, reward=1, penalty=-1) + cmd = mock_run.call_args[0][0] + assert "-reward" in cmd + assert "1" in cmd + assert "-penalty" in cmd + assert "-1" in cmd + + +def test_run_omits_reward_penalty_when_none(runner, tmp_path): + """reward and penalty are omitted from the blastn command when None.""" + output_archive = str(tmp_path / "out.asn") + runner.db_path = "/fake/db" + with patch("subprocess.run") as mock_run: + runner.run(output_archive, reward=None, penalty=None) + cmd = mock_run.call_args[0][0] + assert "-reward" not in cmd + assert "-penalty" not in cmd + + def test_get_dataframe(runner, tmp_path): output_table = tmp_path / "output.txt" runner.output_table = str(output_table) diff --git a/tests/test_blast_specificity.py b/tests/test_blast_specificity.py index 55ccfe4..87ef6f7 100644 --- a/tests/test_blast_specificity.py +++ b/tests/test_blast_specificity.py @@ -178,6 +178,10 @@ def test_run_specificity_check_forwards_blast_parameters(mock_panel, tmp_path): evalue_threshold=5.0, max_mismatches=1, max_amplicon_size=5000, + blast_evalue=500.0, + blast_word_size=11, + blast_reward=2, + blast_penalty=-3, ) # Verify annotator received custom thresholds @@ -188,6 +192,13 @@ def test_run_specificity_check_forwards_blast_parameters(mock_panel, tmp_path): # Verify finder received custom max amplicon size finder_instance.find_amplicons.assert_called_once_with(max_size_bp=5000) + # Verify blast parameters were forwarded to runner.run() + _, run_kwargs = runner_instance.run.call_args + assert run_kwargs.get("evalue") == 500.0 + assert run_kwargs.get("word_size") == 11 + assert run_kwargs.get("reward") == 2 + assert run_kwargs.get("penalty") == -3 + def test_run_specificity_check_no_hits(mock_panel, tmp_path): with ( diff --git a/tests/test_config.py b/tests/test_config.py index b080d21..4d6c5b6 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -191,7 +191,11 @@ def test_default_values(self): params = BlastParameters() assert params.length_threshold == 15 assert params.evalue_threshold == 10.0 - assert params.max_mismatches == 2 + assert params.max_mismatches == 3 + assert params.blast_evalue == 30000.0 + assert params.blast_word_size == 7 + assert params.blast_reward == 1 + assert params.blast_penalty == -1 assert params.max_amplicon_size == 2000 assert params.ontarget_tolerance == 5 diff --git a/tests/test_utils.py b/tests/test_utils.py index 425b77e..4e83614 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,7 +2,7 @@ # Tests for utility functions (GC clamp, check_kmer, etc.) # ================================================================================ -from plexus.utils.utils import check_gc_clamp, check_kmer +from plexus.utils.utils import check_gc_clamp, check_kmer, find_max_poly_gc class TestCheckGcClamp: @@ -62,3 +62,63 @@ def test_gc_clamp_enabled_accepts_good_3prime(self): """gc_clamp=1 accepts primers with 1-3 G/C at 3' end.""" seq = "GCGCGATATAATATC" # last 5 = ATATC, GC=1 assert check_kmer(seq, gc_clamp=1) is True + + +class TestFindMaxPolyGc: + """Tests for the G/C-specific poly-run filter.""" + + def test_four_g_rejected(self): + """GGGG (4 > 3) should be rejected.""" + assert find_max_poly_gc("ATGGGGAT", 3) is True + + def test_four_c_rejected(self): + """CCCC (4 > 3) should be rejected.""" + assert find_max_poly_gc("ATCCCCAT", 3) is True + + def test_three_g_passes(self): + """GGG (3 <= 3) should pass.""" + assert find_max_poly_gc("ATGGGATC", 3) is False + + def test_three_c_passes(self): + """CCC (3 <= 3) should pass.""" + assert find_max_poly_gc("ATCCCATG", 3) is False + + def test_four_a_passes(self): + """AAAA is not G/C, should pass.""" + assert find_max_poly_gc("GCAAAAGC", 3) is False + + def test_four_t_passes(self): + """TTTT is not G/C, should pass.""" + assert find_max_poly_gc("GCTTTTGC", 3) is False + + def test_mixed_gc_passes(self): + """GGCC has G-run=2 and C-run=2, both <= 3.""" + assert find_max_poly_gc("ATGGCCAT", 3) is False + + def test_short_sequence_passes(self): + """Sequence shorter than threshold always passes.""" + assert find_max_poly_gc("GGG", 3) is False + + def test_lowercase_handled(self): + """Lowercase G/C should be detected.""" + assert find_max_poly_gc("atggggatc", 3) is True + + +class TestCheckKmerPolyGc: + """Test max_poly_gc integration in check_kmer.""" + + def test_gggg_rejected_by_default(self): + """A primer with GGGG is rejected by the default max_poly_gc=3.""" + # GGGGCCGGATGTTCTGCTG — the real DCAF12L2 primer + seq = "GGGGCCGGATGTTCTGCTG" + assert check_kmer(seq, max_poly_X=5, max_poly_gc=3) is False + + def test_ggg_passes(self): + """A primer with GGG passes max_poly_gc=3.""" + seq = "GGGACCGGATGTTCTGCTG" + assert check_kmer(seq, max_poly_X=5, max_poly_gc=3) is True + + def test_poly_gc_can_be_relaxed(self): + """Setting max_poly_gc=4 allows GGGG.""" + seq = "GGGGCCGGATGTTCTGCTG" + assert check_kmer(seq, max_poly_X=5, max_poly_gc=4) is True From df2e5df168f4aa45aa490a380601436183df17ee Mon Sep 17 00:00:00 2001 From: sfilges Date: Tue, 3 Mar 2026 16:17:22 +0100 Subject: [PATCH 2/3] Added additional blast parameters for better performance --- CHANGELOG.md | 1 + config/designer_default_config.json | 6 +++-- config/designer_lenient_config.json | 2 ++ src/plexus/blast/blast_runner.py | 6 +++++ src/plexus/blast/specificity.py | 4 +++ src/plexus/config.py | 20 ++++++++++++++ src/plexus/pipeline.py | 2 ++ tests/test_blast_runner.py | 42 +++++++++++++++++++++++++++++ tests/test_blast_specificity.py | 4 +++ tests/test_config.py | 2 ++ 10 files changed, 87 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 378a0d0..2828667 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **G/C-specific poly-run filter** (`utils.py`): New `find_max_poly_gc()` function rejects primers with homopolymer runs of G or C exceeding `primer_max_poly_gc` (default 3). This catches primers like the DCAF12L2 `GGGGCCGGATGTTCTGCTG` forward primer that passed the old poly-X filter. Wired through `check_kmer()`, `generate_kmers()`, `filter_kmers()`, and `design.py`. - **BLAST `-evalue` parameter** (`config.py`, `blast_runner.py`, `specificity.py`, `pipeline.py`): New `blast_evalue` field (default 30000.0) passed to `blastn -evalue`, ensuring weak but real off-target alignments are not silently dropped before the annotator can evaluate them. - **BLAST `-reward` / `-penalty` parameters** (`config.py`, `blast_runner.py`, `specificity.py`, `pipeline.py`): New `blast_reward` (default 1) and `blast_penalty` (default -1) fields override the blastn-short scoring defaults, improving sensitivity for mismatched paralog alignments. +- **BLAST `-max_hsps` and `-dust` flags** (`config.py`, `blast_runner.py`, `specificity.py`, `pipeline.py`): New `blast_max_hsps` (default 100) limits HSPs per query-subject pair, and `blast_dust` (default `"yes"`) enables low-complexity masking. Together these cut BLAST runtime significantly after the evalue/penalty sensitivity increase, without affecting off-target detection (the annotator already discards low-quality HSPs). - Tests for `find_max_poly_gc`, `check_kmer` poly-GC integration, BLAST evalue/reward/penalty/word_size parameter forwarding, and specificity check threading. ## [1.0.0] - 2026-03-03 diff --git a/config/designer_default_config.json b/config/designer_default_config.json index 95e5dba..1b4ae7c 100644 --- a/config/designer_default_config.json +++ b/config/designer_default_config.json @@ -8,7 +8,7 @@ "primer_min_length": 15, "primer_max_length": 30, "PRIMER_OPT_BOUND": 98.0, - "PRIMER_MIN_BOUND": 35.0, + "PRIMER_MIN_BOUND": 30.0, "PRIMER_MAX_BOUND": 120.0, "junction_padding_bases": 3, "forward_tail": "GGACACTCTTTCCCTACACGACGCTCTTCCGATCTAAAAAAAAAAAAAAAAAAAATGGGAAAGAGTGTCC", @@ -20,7 +20,7 @@ "primer_min_gc": 30, "primer_max_gc": 70, "primer_gc_clamp": 1, - "primer_max_poly_x": 4, + "primer_max_poly_x": 5, "primer_max_poly_gc": 3, "primer_max_n": 0, "PRIMER_MAX_SELF_ANY_TH": 45.0, @@ -79,6 +79,8 @@ "blast_word_size": 7, "blast_reward": 1, "blast_penalty": -1, + "blast_max_hsps": 100, + "blast_dust": "yes", "max_amplicon_size": 2000, "ontarget_tolerance": 5 }, diff --git a/config/designer_lenient_config.json b/config/designer_lenient_config.json index 352875d..02d7b2e 100644 --- a/config/designer_lenient_config.json +++ b/config/designer_lenient_config.json @@ -79,6 +79,8 @@ "blast_word_size": 7, "blast_reward": 1, "blast_penalty": -1, + "blast_max_hsps": 100, + "blast_dust": "yes", "max_amplicon_size": 2000, "ontarget_tolerance": 5 }, diff --git a/src/plexus/blast/blast_runner.py b/src/plexus/blast/blast_runner.py index 4433c7a..34641cc 100644 --- a/src/plexus/blast/blast_runner.py +++ b/src/plexus/blast/blast_runner.py @@ -99,6 +99,8 @@ def run( evalue: float | None = None, reward: int | None = None, penalty: int | None = None, + max_hsps: int | None = None, + dust: str | None = None, ): """ Run blast, writing a BLAST archive to `output_archive`. @@ -133,6 +135,10 @@ def run( cmd.extend(["-reward", str(reward)]) if penalty is not None: cmd.extend(["-penalty", str(penalty)]) + if max_hsps is not None: + cmd.extend(["-max_hsps", str(max_hsps)]) + if dust is not None: + cmd.extend(["-dust", dust]) if num_threads > 1: cmd.extend(["-num_threads", str(num_threads)]) diff --git a/src/plexus/blast/specificity.py b/src/plexus/blast/specificity.py index 4db1f74..295bfda 100644 --- a/src/plexus/blast/specificity.py +++ b/src/plexus/blast/specificity.py @@ -23,6 +23,8 @@ def run_specificity_check( blast_word_size: int = 7, blast_reward: int = 1, blast_penalty: int = -1, + blast_max_hsps: int = 100, + blast_dust: str = "yes", ): """ Run BLAST on all candidate primers in the panel to check for specificity @@ -74,6 +76,8 @@ def run_specificity_check( word_size=blast_word_size, reward=blast_reward, penalty=blast_penalty, + max_hsps=blast_max_hsps, + dust=blast_dust, ) runner.reformat_output_as_table(blast_table) finally: diff --git a/src/plexus/config.py b/src/plexus/config.py index 68a26ab..04bbf2f 100644 --- a/src/plexus/config.py +++ b/src/plexus/config.py @@ -320,6 +320,26 @@ class BlastParameters(BaseModel): "3'-anchored filters still reject true noise." ), ) + blast_max_hsps: int = Field( + default=100, + ge=1, + le=10000, + description=( + "Maximum HSPs per query-subject pair passed to blastn via -max_hsps. " + "Each chromosome is one subject; 100 is generous for real binding sites " + "and avoids wasting time on thousands of low-quality HSPs." + ), + ) + blast_dust: str = Field( + default="yes", + pattern=r"^(yes|no|\d+ \d+ \d+)$", + description=( + "Low-complexity filter passed to blastn via -dust. " + "blastn-short defaults to 'no'; enabling it ('yes') avoids extending " + "seeds in Alu/LINE regions, improving runtime without affecting " + "off-target detection." + ), + ) max_amplicon_size: int = Field( default=2000, ge=100, diff --git a/src/plexus/pipeline.py b/src/plexus/pipeline.py index 59ddc59..25ab685 100644 --- a/src/plexus/pipeline.py +++ b/src/plexus/pipeline.py @@ -538,6 +538,8 @@ def run_pipeline( blast_word_size=blast_config.blast_word_size, blast_reward=blast_config.blast_reward, blast_penalty=blast_config.blast_penalty, + blast_max_hsps=blast_config.blast_max_hsps, + blast_dust=blast_config.blast_dust, ) result.steps_completed.append("specificity_checked") logger.info("Specificity check complete") diff --git a/tests/test_blast_runner.py b/tests/test_blast_runner.py index cc77ac6..0d84bbd 100644 --- a/tests/test_blast_runner.py +++ b/tests/test_blast_runner.py @@ -196,6 +196,48 @@ def test_run_omits_reward_penalty_when_none(runner, tmp_path): assert "-penalty" not in cmd +def test_run_includes_max_hsps_when_provided(runner, tmp_path): + """max_hsps parameter is included in the blastn command when set.""" + output_archive = str(tmp_path / "out.asn") + runner.db_path = "/fake/db" + with patch("subprocess.run") as mock_run: + runner.run(output_archive, max_hsps=100) + cmd = mock_run.call_args[0][0] + assert "-max_hsps" in cmd + assert "100" in cmd + + +def test_run_omits_max_hsps_when_none(runner, tmp_path): + """max_hsps parameter is omitted from the blastn command when None.""" + output_archive = str(tmp_path / "out.asn") + runner.db_path = "/fake/db" + with patch("subprocess.run") as mock_run: + runner.run(output_archive, max_hsps=None) + cmd = mock_run.call_args[0][0] + assert "-max_hsps" not in cmd + + +def test_run_includes_dust_when_provided(runner, tmp_path): + """dust parameter is included in the blastn command when set.""" + output_archive = str(tmp_path / "out.asn") + runner.db_path = "/fake/db" + with patch("subprocess.run") as mock_run: + runner.run(output_archive, dust="yes") + cmd = mock_run.call_args[0][0] + assert "-dust" in cmd + assert "yes" in cmd + + +def test_run_omits_dust_when_none(runner, tmp_path): + """dust parameter is omitted from the blastn command when None.""" + output_archive = str(tmp_path / "out.asn") + runner.db_path = "/fake/db" + with patch("subprocess.run") as mock_run: + runner.run(output_archive, dust=None) + cmd = mock_run.call_args[0][0] + assert "-dust" not in cmd + + def test_get_dataframe(runner, tmp_path): output_table = tmp_path / "output.txt" runner.output_table = str(output_table) diff --git a/tests/test_blast_specificity.py b/tests/test_blast_specificity.py index 87ef6f7..c3cf862 100644 --- a/tests/test_blast_specificity.py +++ b/tests/test_blast_specificity.py @@ -182,6 +182,8 @@ def test_run_specificity_check_forwards_blast_parameters(mock_panel, tmp_path): blast_word_size=11, blast_reward=2, blast_penalty=-3, + blast_max_hsps=50, + blast_dust="no", ) # Verify annotator received custom thresholds @@ -198,6 +200,8 @@ def test_run_specificity_check_forwards_blast_parameters(mock_panel, tmp_path): assert run_kwargs.get("word_size") == 11 assert run_kwargs.get("reward") == 2 assert run_kwargs.get("penalty") == -3 + assert run_kwargs.get("max_hsps") == 50 + assert run_kwargs.get("dust") == "no" def test_run_specificity_check_no_hits(mock_panel, tmp_path): diff --git a/tests/test_config.py b/tests/test_config.py index 4d6c5b6..f5d1949 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -196,6 +196,8 @@ def test_default_values(self): assert params.blast_word_size == 7 assert params.blast_reward == 1 assert params.blast_penalty == -1 + assert params.blast_max_hsps == 100 + assert params.blast_dust == "yes" assert params.max_amplicon_size == 2000 assert params.ontarget_tolerance == 5 From a2b9c7fad72048bb45c31b18ecdb4f2c3198b3b0 Mon Sep 17 00:00:00 2001 From: sfilges Date: Tue, 3 Mar 2026 17:02:50 +0100 Subject: [PATCH 3/3] Further tuning to the BLAST off-target detection --- CHANGELOG.md | 2 + config/designer_default_config.json | 1 + config/designer_lenient_config.json | 1 + docs/ROADMAP.md | 1 + src/plexus/blast/annotator.py | 21 ++++-- src/plexus/blast/blast_runner.py | 44 +++++++++---- src/plexus/blast/specificity.py | 32 ++++------ src/plexus/config.py | 13 ++++ src/plexus/pipeline.py | 1 + tests/test_blast_annotator.py | 48 ++++++++++++++ tests/test_blast_runner.py | 99 +++++++++++++++++++++-------- tests/test_blast_specificity.py | 31 +++++---- 12 files changed, 220 insertions(+), 74 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2828667..e10a55f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **BLAST `-evalue` parameter** (`config.py`, `blast_runner.py`, `specificity.py`, `pipeline.py`): New `blast_evalue` field (default 30000.0) passed to `blastn -evalue`, ensuring weak but real off-target alignments are not silently dropped before the annotator can evaluate them. - **BLAST `-reward` / `-penalty` parameters** (`config.py`, `blast_runner.py`, `specificity.py`, `pipeline.py`): New `blast_reward` (default 1) and `blast_penalty` (default -1) fields override the blastn-short scoring defaults, improving sensitivity for mismatched paralog alignments. - **BLAST `-max_hsps` and `-dust` flags** (`config.py`, `blast_runner.py`, `specificity.py`, `pipeline.py`): New `blast_max_hsps` (default 100) limits HSPs per query-subject pair, and `blast_dust` (default `"yes"`) enables low-complexity masking. Together these cut BLAST runtime significantly after the evalue/penalty sensitivity increase, without affecting off-target detection (the annotator already discards low-quality HSPs). +- **Direct tabular BLAST output** (`blast_runner.py`, `specificity.py`): `BlastRunner.run()` now accepts `output_table=` to write tabular results (`-outfmt 6`) directly, eliminating the intermediate archive (`-outfmt 11`) write/read and the `blast_formatter` subprocess. The specificity check uses this new path by default, removing the `try/finally` archive cleanup. The legacy `output_archive=` parameter is deprecated but still functional. +- **3'-end tolerance for BLAST annotation** (`annotator.py`, `config.py`, `specificity.py`, `pipeline.py`): New `three_prime_tolerance` parameter (default 3) relaxes the `from_3prime` check from `qend == qlen` to `qlen - qend <= tolerance`. BLAST's local alignment clips terminal mismatches, causing hits like the DCAF12L1 reverse primer (19/21bp aligned, 2bp clipped at 3' end) to be wrongly discarded as "not 3'-anchored" even though Primer-BLAST detects them via semi-global alignment. - Tests for `find_max_poly_gc`, `check_kmer` poly-GC integration, BLAST evalue/reward/penalty/word_size parameter forwarding, and specificity check threading. ## [1.0.0] - 2026-03-03 diff --git a/config/designer_default_config.json b/config/designer_default_config.json index 1b4ae7c..9fb5a25 100644 --- a/config/designer_default_config.json +++ b/config/designer_default_config.json @@ -75,6 +75,7 @@ "length_threshold": 15, "evalue_threshold": 10.0, "max_mismatches": 3, + "three_prime_tolerance": 3, "blast_evalue": 30000.0, "blast_word_size": 7, "blast_reward": 1, diff --git a/config/designer_lenient_config.json b/config/designer_lenient_config.json index 02d7b2e..037e371 100644 --- a/config/designer_lenient_config.json +++ b/config/designer_lenient_config.json @@ -75,6 +75,7 @@ "length_threshold": 15, "evalue_threshold": 10.0, "max_mismatches": 3, + "three_prime_tolerance": 3, "blast_evalue": 30000.0, "blast_word_size": 7, "blast_reward": 1, diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index 77df852..148ac86 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -639,6 +639,7 @@ Add a `max_candidates_per_junction` parameter to `MultiplexPickerParameters` (de **File: `src/plexus/resources.py`** Add presets for: + - `hg19` / `GRCh37` (legacy clinical datasets) - `mm39` (mouse, common model organism) - `GRCh38_no_alt` (analysis-set FASTA without alt contigs, preferred for clinical pipelines) diff --git a/src/plexus/blast/annotator.py b/src/plexus/blast/annotator.py index ed83742..562872f 100644 --- a/src/plexus/blast/annotator.py +++ b/src/plexus/blast/annotator.py @@ -20,14 +20,27 @@ def __init__(self, blast_df, target_map=None): self.target_map = target_map or {} def build_annotation_dict( - self, length_threshold=12, evalue_threshold=4, max_mismatches=2 + self, + length_threshold=12, + evalue_threshold=4, + max_mismatches=2, + three_prime_tolerance=0, ): """ - Build a dictionary specifying conditions for annotation - + Build a dictionary specifying conditions for annotation. + + Parameters + ---------- + three_prime_tolerance : int + Maximum number of unaligned bases at the 3' end of the primer + for a hit to still be considered "3'-anchored". BLAST's local + alignment clips terminal mismatches, so a primer with a mismatch + at or near the 3' end will have ``qend < qlen``. A tolerance + of 2-3 catches these cases without accepting 5'-only hits. """ self.annotations = { - "from_3prime": lambda row: row["qend"] == row["qlen"], + "from_3prime": lambda row: (row["qlen"] - row["qend"]) + <= three_prime_tolerance, "length_pass_3prime": lambda row: ( row["length"] >= length_threshold and row["mismatch"] <= max_mismatches diff --git a/src/plexus/blast/blast_runner.py b/src/plexus/blast/blast_runner.py index 34641cc..ccbba9b 100644 --- a/src/plexus/blast/blast_runner.py +++ b/src/plexus/blast/blast_runner.py @@ -1,4 +1,5 @@ import os +import warnings import pandas as pd from loguru import logger @@ -30,8 +31,7 @@ def __init__(self, input_fasta, reference_fasta): """ Interface for running BLAST: - Creates BLAST database - - Runs BLAST in archive mode - - Reformats to a table + - Runs BLAST (direct tabular output or legacy archive mode) - Converts table to a pandas data frame params @@ -92,7 +92,9 @@ def create_database(self): def run( self, - output_archive, + output_archive=None, + *, + output_table=None, word_size=None, task="blastn-short", num_threads: int = 1, @@ -103,16 +105,34 @@ def run( dust: str | None = None, ): """ - Run blast, writing a BLAST archive to `output_archive`. + Run blastn and write results. Uses ``-task blastn-short`` by default, which is tuned for primer-length queries (<30 bp) with word_size=7, reward 1, penalty −3, and gap costs 5/2. - Note that we run with output format 11 ``-outfmt 11`` to - produce the archive; from this format you can convert to - all other formats. + Parameters + ---------- + output_table : str + Path for direct tabular output (``-outfmt 6``). Preferred. + output_archive : str + *Deprecated.* Path for a BLAST archive (``-outfmt 11``). + Use ``output_table`` instead to skip the archive/reformat step. """ + if output_table is not None: + outfmt = f"6 {self.BLAST_COLS}" + out_path = output_table + elif output_archive is not None: + warnings.warn( + "output_archive is deprecated; use output_table for direct " + "tabular output and avoid the archive/reformat step.", + DeprecationWarning, + stacklevel=2, + ) + outfmt = "11" + out_path = output_archive + else: + raise ValueError("Either output_table or output_archive must be provided.") cmd = [ "blastn", @@ -123,9 +143,9 @@ def run( "-task", task, "-outfmt", - "11", + outfmt, "-out", - output_archive, + out_path, ] if word_size is not None: cmd.extend(["-word_size", str(word_size)]) @@ -144,8 +164,10 @@ def run( run_command(cmd, check=True, retries=2) - # Save as instance variable, for reformattings - self.output_archive = output_archive + if output_table is not None: + self.output_table = output_table + else: + self.output_archive = output_archive return self diff --git a/src/plexus/blast/specificity.py b/src/plexus/blast/specificity.py index 295bfda..ccf4f2b 100644 --- a/src/plexus/blast/specificity.py +++ b/src/plexus/blast/specificity.py @@ -17,6 +17,7 @@ def run_specificity_check( length_threshold: int = 15, evalue_threshold: float = 10.0, max_mismatches: int = 3, + three_prime_tolerance: int = 3, max_amplicon_size: int = 2000, ontarget_tolerance: int = 5, blast_evalue: float = 30000.0, @@ -40,6 +41,8 @@ def run_specificity_check( length_threshold: Minimum 3'-anchored alignment length (bp) to predict binding. evalue_threshold: E-value cutoff for predicted binding. max_mismatches: Maximum mismatches in a 3'-anchored alignment. + three_prime_tolerance: Max unaligned bases at the 3' end for a hit to be + considered 3'-anchored. Compensates for BLAST clipping terminal mismatches. max_amplicon_size: Maximum amplicon size (bp) to consider. ontarget_tolerance: Coordinate tolerance (bp) for on-target classification. blast_evalue: E-value passed to blastn via -evalue to control search sensitivity. @@ -63,28 +66,20 @@ def run_specificity_check( panel.save_candidate_primers_to_fasta(input_fasta) # 3. Run BLAST - blast_archive = os.path.join(work_dir, "blast_archive") blast_table = os.path.join(work_dir, "blast_table.txt") runner = BlastRunner(input_fasta, genome_fasta) runner.create_database() - try: - runner.run( - output_archive=blast_archive, - num_threads=num_threads, - evalue=blast_evalue, - word_size=blast_word_size, - reward=blast_reward, - penalty=blast_penalty, - max_hsps=blast_max_hsps, - dust=blast_dust, - ) - runner.reformat_output_as_table(blast_table) - finally: - if os.path.exists(blast_archive): - os.remove(blast_archive) - logger.debug(f"Removed BLAST archive: {blast_archive}") - + runner.run( + output_table=blast_table, + num_threads=num_threads, + evalue=blast_evalue, + word_size=blast_word_size, + reward=blast_reward, + penalty=blast_penalty, + max_hsps=blast_max_hsps, + dust=blast_dust, + ) blast_df = runner.get_dataframe() if blast_df.empty: @@ -98,6 +93,7 @@ def run_specificity_check( length_threshold=length_threshold, evalue_threshold=evalue_threshold, max_mismatches=max_mismatches, + three_prime_tolerance=three_prime_tolerance, ) annotator.add_annotations() diff --git a/src/plexus/config.py b/src/plexus/config.py index 04bbf2f..d117038 100644 --- a/src/plexus/config.py +++ b/src/plexus/config.py @@ -282,6 +282,19 @@ class BlastParameters(BaseModel): "15+ bp 3' stretch will still extend in PCR." ), ) + three_prime_tolerance: int = Field( + default=3, + ge=0, + le=5, + description=( + "Maximum number of unaligned bases at the 3' end of the primer for " + "a BLAST hit to still be considered '3'-anchored'. BLAST's local " + "alignment clips terminal mismatches, so a primer with mismatches " + "at or near the 3' end will have qend < qlen. A tolerance of 3 " + "catches these real binding events (e.g. DCAF12L1 paralogs) that " + "Primer-BLAST detects via its semi-global alignment." + ), + ) blast_evalue: float = Field( default=30000.0, gt=0.0, diff --git a/src/plexus/pipeline.py b/src/plexus/pipeline.py index 25ab685..eb8cbb5 100644 --- a/src/plexus/pipeline.py +++ b/src/plexus/pipeline.py @@ -532,6 +532,7 @@ def run_pipeline( length_threshold=blast_config.length_threshold, evalue_threshold=blast_config.evalue_threshold, max_mismatches=blast_config.max_mismatches, + three_prime_tolerance=blast_config.three_prime_tolerance, max_amplicon_size=blast_config.max_amplicon_size, ontarget_tolerance=blast_config.ontarget_tolerance, blast_evalue=blast_config.blast_evalue, diff --git a/tests/test_blast_annotator.py b/tests/test_blast_annotator.py index e32aac6..f0c1a2f 100644 --- a/tests/test_blast_annotator.py +++ b/tests/test_blast_annotator.py @@ -154,3 +154,51 @@ def test_length_pass_rejects_high_mismatch(): # 2 mismatches: from_3prime=True and length_pass_3prime=True assert bool(df.iloc[1]["from_3prime"]) is True assert bool(df.iloc[1]["length_pass_3prime"]) is True + + +def test_three_prime_tolerance_catches_clipped_terminal_mismatches(): + """ + BLAST's local alignment clips terminal mismatches, so a primer with a + mismatch at or near the 3' end will have qend < qlen. With + three_prime_tolerance > 0, these hits should be classified as from_3prime. + + This is the DCAF12L1 scenario: a 21bp reverse primer aligns 19bp + (qend=19, qlen=21) because BLAST clips 2 terminal mismatched bases. + """ + cols = "qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore sstrand qlen".split() + data = [ + # Exact 3' match: qend == qlen + ["SEQ_0", "chr1", 100.0, 21, 0, 0, 1, 21, 100, 120, 0.4, 35.0, "minus", 21], + # 2bp clipped at 3' end: qend=19, qlen=21 (the DCAF12L1 hit) + ["SEQ_0", "chr1", 89.5, 19, 2, 0, 1, 19, 200, 218, 298.0, 25.4, "minus", 21], + # 5bp clipped at 3' end: qend=16, qlen=21 — too far from 3' end + ["SEQ_0", "chr1", 87.5, 16, 2, 0, 1, 16, 300, 315, 900.0, 22.0, "minus", 21], + ] + df = pd.DataFrame(data, columns=cols) + + # tolerance=0 (old behaviour): only exact 3' match passes + annotator0 = BlastResultsAnnotator(df.copy()) + annotator0.build_annotation_dict( + length_threshold=15, + evalue_threshold=10.0, + max_mismatches=3, + three_prime_tolerance=0, + ) + annotator0.add_annotations() + assert bool(annotator0.blast_df.iloc[0]["from_3prime"]) is True + assert bool(annotator0.blast_df.iloc[1]["from_3prime"]) is False # missed! + assert bool(annotator0.blast_df.iloc[2]["from_3prime"]) is False + + # tolerance=3: the 2bp-clipped hit now passes + annotator3 = BlastResultsAnnotator(df.copy()) + annotator3.build_annotation_dict( + length_threshold=15, + evalue_threshold=10.0, + max_mismatches=3, + three_prime_tolerance=3, + ) + annotator3.add_annotations() + assert bool(annotator3.blast_df.iloc[0]["from_3prime"]) is True + assert bool(annotator3.blast_df.iloc[1]["from_3prime"]) is True # now caught + assert bool(annotator3.blast_df.iloc[1]["predicted_bound"]) is True + assert bool(annotator3.blast_df.iloc[2]["from_3prime"]) is False # still too far diff --git a/tests/test_blast_runner.py b/tests/test_blast_runner.py index 0d84bbd..7fdc00d 100644 --- a/tests/test_blast_runner.py +++ b/tests/test_blast_runner.py @@ -1,3 +1,4 @@ +import warnings from unittest.mock import patch import pandas as pd @@ -62,12 +63,12 @@ def isfile_v4(path): def test_run_command_defaults(runner, tmp_path): - """Default run uses -task blastn-short and omits -word_size.""" - output_archive = tmp_path / "output.asn" + """Default run with output_table uses -task blastn-short, outfmt 6, and omits -word_size.""" + output_table = tmp_path / "output.txt" runner.db_path = "/fake/db" with patch("subprocess.run") as mock_run: - runner.run(str(output_archive)) + runner.run(output_table=str(output_table)) args, kwargs = mock_run.call_args command = args[0] @@ -80,17 +81,18 @@ def test_run_command_defaults(runner, tmp_path): assert "blastn-short" in command assert "-word_size" not in command assert "-outfmt" in command + assert any("6 qseqid sseqid" in arg for arg in command) assert "-out" in command - assert str(output_archive) in command + assert str(output_table) in command def test_run_command_explicit_word_size(runner, tmp_path): """Explicit word_size is included in the command.""" - output_archive = tmp_path / "output.asn" + output_table = tmp_path / "output.txt" runner.db_path = "/fake/db" with patch("subprocess.run") as mock_run: - runner.run(str(output_archive), word_size=11) + runner.run(output_table=str(output_table), word_size=11) args, kwargs = mock_run.call_args command = args[0] @@ -102,11 +104,11 @@ def test_run_command_explicit_word_size(runner, tmp_path): def test_run_command_custom_task(runner, tmp_path): """Custom task parameter is passed through.""" - output_archive = tmp_path / "output.asn" + output_table = tmp_path / "output.txt" runner.db_path = "/fake/db" with patch("subprocess.run") as mock_run: - runner.run(str(output_archive), task="blastn") + runner.run(output_table=str(output_table), task="blastn") args, kwargs = mock_run.call_args command = args[0] @@ -133,30 +135,30 @@ def test_reformat_output_command(runner, tmp_path): def test_run_emits_num_threads_when_gt_1(runner, tmp_path): - output_archive = str(tmp_path / "out.asn") + output_table = str(tmp_path / "out.txt") runner.db_path = "/fake/db" with patch("subprocess.run") as mock_run: - runner.run(output_archive, num_threads=4) + runner.run(output_table=output_table, num_threads=4) cmd = mock_run.call_args[0][0] assert "-num_threads" in cmd assert "4" in cmd def test_run_omits_num_threads_when_1(runner, tmp_path): - output_archive = str(tmp_path / "out.asn") + output_table = str(tmp_path / "out.txt") runner.db_path = "/fake/db" with patch("subprocess.run") as mock_run: - runner.run(output_archive, num_threads=1) + runner.run(output_table=output_table, num_threads=1) cmd = mock_run.call_args[0][0] assert "-num_threads" not in cmd def test_run_includes_evalue_when_provided(runner, tmp_path): """evalue parameter is included in the blastn command when set.""" - output_archive = str(tmp_path / "out.asn") + output_table = str(tmp_path / "out.txt") runner.db_path = "/fake/db" with patch("subprocess.run") as mock_run: - runner.run(output_archive, evalue=1000.0) + runner.run(output_table=output_table, evalue=1000.0) cmd = mock_run.call_args[0][0] assert "-evalue" in cmd assert "1000.0" in cmd @@ -164,20 +166,20 @@ def test_run_includes_evalue_when_provided(runner, tmp_path): def test_run_omits_evalue_when_none(runner, tmp_path): """evalue parameter is omitted from the blastn command when None.""" - output_archive = str(tmp_path / "out.asn") + output_table = str(tmp_path / "out.txt") runner.db_path = "/fake/db" with patch("subprocess.run") as mock_run: - runner.run(output_archive, evalue=None) + runner.run(output_table=output_table, evalue=None) cmd = mock_run.call_args[0][0] assert "-evalue" not in cmd def test_run_includes_reward_penalty_when_provided(runner, tmp_path): """reward and penalty parameters are included in the blastn command.""" - output_archive = str(tmp_path / "out.asn") + output_table = str(tmp_path / "out.txt") runner.db_path = "/fake/db" with patch("subprocess.run") as mock_run: - runner.run(output_archive, reward=1, penalty=-1) + runner.run(output_table=output_table, reward=1, penalty=-1) cmd = mock_run.call_args[0][0] assert "-reward" in cmd assert "1" in cmd @@ -187,10 +189,10 @@ def test_run_includes_reward_penalty_when_provided(runner, tmp_path): def test_run_omits_reward_penalty_when_none(runner, tmp_path): """reward and penalty are omitted from the blastn command when None.""" - output_archive = str(tmp_path / "out.asn") + output_table = str(tmp_path / "out.txt") runner.db_path = "/fake/db" with patch("subprocess.run") as mock_run: - runner.run(output_archive, reward=None, penalty=None) + runner.run(output_table=output_table, reward=None, penalty=None) cmd = mock_run.call_args[0][0] assert "-reward" not in cmd assert "-penalty" not in cmd @@ -198,10 +200,10 @@ def test_run_omits_reward_penalty_when_none(runner, tmp_path): def test_run_includes_max_hsps_when_provided(runner, tmp_path): """max_hsps parameter is included in the blastn command when set.""" - output_archive = str(tmp_path / "out.asn") + output_table = str(tmp_path / "out.txt") runner.db_path = "/fake/db" with patch("subprocess.run") as mock_run: - runner.run(output_archive, max_hsps=100) + runner.run(output_table=output_table, max_hsps=100) cmd = mock_run.call_args[0][0] assert "-max_hsps" in cmd assert "100" in cmd @@ -209,20 +211,20 @@ def test_run_includes_max_hsps_when_provided(runner, tmp_path): def test_run_omits_max_hsps_when_none(runner, tmp_path): """max_hsps parameter is omitted from the blastn command when None.""" - output_archive = str(tmp_path / "out.asn") + output_table = str(tmp_path / "out.txt") runner.db_path = "/fake/db" with patch("subprocess.run") as mock_run: - runner.run(output_archive, max_hsps=None) + runner.run(output_table=output_table, max_hsps=None) cmd = mock_run.call_args[0][0] assert "-max_hsps" not in cmd def test_run_includes_dust_when_provided(runner, tmp_path): """dust parameter is included in the blastn command when set.""" - output_archive = str(tmp_path / "out.asn") + output_table = str(tmp_path / "out.txt") runner.db_path = "/fake/db" with patch("subprocess.run") as mock_run: - runner.run(output_archive, dust="yes") + runner.run(output_table=output_table, dust="yes") cmd = mock_run.call_args[0][0] assert "-dust" in cmd assert "yes" in cmd @@ -230,10 +232,10 @@ def test_run_includes_dust_when_provided(runner, tmp_path): def test_run_omits_dust_when_none(runner, tmp_path): """dust parameter is omitted from the blastn command when None.""" - output_archive = str(tmp_path / "out.asn") + output_table = str(tmp_path / "out.txt") runner.db_path = "/fake/db" with patch("subprocess.run") as mock_run: - runner.run(output_archive, dust=None) + runner.run(output_table=output_table, dust=None) cmd = mock_run.call_args[0][0] assert "-dust" not in cmd @@ -251,3 +253,44 @@ def test_get_dataframe(runner, tmp_path): assert isinstance(df, pd.DataFrame) assert len(df) == 1 assert df.iloc[0]["qseqid"] == "Q1" + + +def test_run_direct_tabular_output(runner, tmp_path): + """output_table= produces outfmt 6 and sets self.output_table.""" + output_table = str(tmp_path / "table.txt") + runner.db_path = "/fake/db" + + with patch("subprocess.run"): + runner.run(output_table=output_table) + + assert runner.output_table == output_table + assert not hasattr(runner, "output_archive") + + +def test_run_legacy_archive_emits_deprecation_warning(runner, tmp_path): + """output_archive= still works but emits DeprecationWarning.""" + output_archive = str(tmp_path / "out.asn") + runner.db_path = "/fake/db" + + with patch("subprocess.run") as mock_run: + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + runner.run(output_archive) + + assert len(w) == 1 + assert issubclass(w[0].category, DeprecationWarning) + assert "output_archive is deprecated" in str(w[0].message) + + cmd = mock_run.call_args[0][0] + assert "-outfmt" in cmd + idx = cmd.index("-outfmt") + assert cmd[idx + 1] == "11" + assert runner.output_archive == output_archive + + +def test_run_raises_when_no_output_specified(runner): + """ValueError when neither output_table nor output_archive is given.""" + runner.db_path = "/fake/db" + + with pytest.raises(ValueError, match="Either output_table or output_archive"): + runner.run() diff --git a/tests/test_blast_specificity.py b/tests/test_blast_specificity.py index c3cf862..07b230e 100644 --- a/tests/test_blast_specificity.py +++ b/tests/test_blast_specificity.py @@ -43,22 +43,14 @@ def mock_panel(): return panel -def test_blast_archive_removed_after_run(mock_panel, tmp_path): - """The BLAST archive file is cleaned up after a successful specificity check.""" - blast_archive = tmp_path / "blast_archive" - +def test_direct_tabular_no_archive(mock_panel, tmp_path): + """Specificity check uses output_table directly, no archive file is created.""" with ( patch("plexus.blast.specificity.BlastRunner") as MockRunner, patch("plexus.blast.specificity.BlastResultsAnnotator") as MockAnnotator, patch("plexus.blast.specificity.AmpliconFinder") as MockFinder, ): runner_instance = MockRunner.return_value - - def fake_run(output_archive, **kwargs): - # Simulate BLAST creating the archive file - open(output_archive, "w").close() - - runner_instance.run.side_effect = fake_run runner_instance.get_dataframe.return_value = pd.DataFrame({"dummy": [1]}) annotator_instance = MockAnnotator.return_value @@ -71,7 +63,14 @@ def fake_run(output_archive, **kwargs): run_specificity_check(mock_panel, str(tmp_path), "fake_genome.fa") - assert not blast_archive.exists(), "BLAST archive should be removed after run" + # Verify output_table= was used (not output_archive) + _, run_kwargs = runner_instance.run.call_args + assert "output_table" in run_kwargs + assert run_kwargs["output_table"].endswith("blast_table.txt") + # output_archive should NOT appear in the call + assert run_kwargs.get("output_archive") is None + # reformat_output_as_table should never be called + runner_instance.reformat_output_as_table.assert_not_called() def test_run_specificity_check_integration(mock_panel, tmp_path): @@ -149,6 +148,7 @@ def test_run_specificity_check_forwards_num_threads(mock_panel, tmp_path): _, kwargs = runner_instance.run.call_args assert kwargs.get("num_threads") == 6 + assert "output_table" in kwargs def test_run_specificity_check_forwards_blast_parameters(mock_panel, tmp_path): @@ -186,9 +186,13 @@ def test_run_specificity_check_forwards_blast_parameters(mock_panel, tmp_path): blast_dust="no", ) - # Verify annotator received custom thresholds + # Verify annotator received custom thresholds (three_prime_tolerance + # uses default since not overridden in this call) annotator_instance.build_annotation_dict.assert_called_once_with( - length_threshold=20, evalue_threshold=5.0, max_mismatches=1 + length_threshold=20, + evalue_threshold=5.0, + max_mismatches=1, + three_prime_tolerance=3, ) # Verify finder received custom max amplicon size @@ -202,6 +206,7 @@ def test_run_specificity_check_forwards_blast_parameters(mock_panel, tmp_path): assert run_kwargs.get("penalty") == -3 assert run_kwargs.get("max_hsps") == 50 assert run_kwargs.get("dust") == "no" + assert "output_table" in run_kwargs def test_run_specificity_check_no_hits(mock_panel, tmp_path):