dholab · nrminor · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
@@ -0,0 +1,72 @@
+name: Integration
+
+on:
+  push:
+    branches: [main, develop]
+    paths:
+      - "main.nf"
+      - "nextflow.config"
+      - "conf/**"
+      - "bin/**"
+      - "lib/**"
+      - "modules/**/*.nf"
+      - "subworkflows/**/*.nf"
+      - "workflows/**/*.nf"
+      - "tests/**"
+      - "pyproject.toml"
+      - "pixi.lock"
+      - "uv.lock"
+      - ".github/workflows/integration.yml"
+  pull_request:
+    branches: [main, develop]
+    paths:
+      - "main.nf"
+      - "nextflow.config"
+      - "conf/**"
+      - "bin/**"
+      - "lib/**"
+      - "modules/**/*.nf"
+      - "subworkflows/**/*.nf"
+      - "workflows/**/*.nf"
+      - "tests/**"
+      - "pyproject.toml"
+      - "pixi.lock"
+      - "uv.lock"
+      - ".github/workflows/integration.yml"
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  mini-sra-nextflow:
+    name: Mini SRA Nextflow integration
+    runs-on: ubuntu-latest
+    timeout-minutes: 120
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup Java 17
+        uses: actions/setup-java@v4
+        with:
+          distribution: "temurin"
+          java-version: "17"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v2
+
+      - name: Set up Pixi environment
+        uses: prefix-dev/setup-pixi@v0.9.6
+        with:
+          pixi-version: v0.68.0
+          cache: true
+
+      - name: Check Pixi-provided Nextflow
+        run: pixi run nextflow -version
+
+      - name: Run mini SRA integration test
+        env:
+          NVD_INTEGRATION_PROFILE: test
+        run: pixi run e2e-test-ci
diff --git a/.gitignore b/.gitignore
@@ -97,5 +97,8 @@
 
 # testing data
 !/tests
+!/tests/*.py
 !/tests/data
 !/tests/data/*
+!/tests/scripts
+!/tests/scripts/*.py
diff --git a/README.md b/README.md
@@ -173,6 +173,14 @@ For local FASTQ files:
 nvd samplesheet generate --from-dir ./fastqs --platform illumina --output samplesheet.csv
 ```
 
+If your Illumina FASTQs use CASAVA-style names like `patient-001_S7_L003_R1_001.fastq.gz`, add `--sanitize` when you want the generated `sample_id` to be `patient-001` rather than `patient-001_S7_L003`:
+
+```bash
+nvd samplesheet generate --from-dir ./fastqs --platform illumina --sanitize --output samplesheet.csv
+```
+
+`--sanitize` only changes generated sample IDs. It does not concatenate multiple lanes; multi-lane Illumina inputs still produce one row per discovered read pair.
+
 For Nanopore/ONT files, use `--platform ont`:
 
 ```bash
@@ -350,6 +358,9 @@ nvd taxonomy ensure --taxonomy-dir /path/to/taxdump
 nvd taxonomy status --taxonomy-dir /path/to/taxdump
 ```
 
+Existing taxonomy data is reused even when it is older than the freshness warning window. NVD downloads or rebuilds taxonomy only when required files are absent, which avoids mutating shared HPC taxonomy directories merely because they are old.
+
+
 You can also set:
 
 ```bash

diff --git a/bin/annotate_blast_lca.py b/bin/annotate_blast_lca.py
@@ -36,7 +36,6 @@
 """
 
 import argparse
-import os
 from dataclasses import dataclass
 from pathlib import Path
 
@@ -54,6 +53,28 @@
 DEFAULT_MAX_E = 1e-10
 DEFAULT_DELTA_S = 5.0  # ΔS window for "near ties"
 DEFAULT_MIN_SUPPORT = 0.8  # dominance threshold
+INPUT_COLUMNS = [
+    "task",
+    "sample",
+    "qseqid",
+    "qlen",
+    "sseqid",
+    "stitle",
+    "length",
+    "pident",
+    "evalue",
+    "bitscore",
+    "sscinames",
+    "staxids",
+    "rank",
+]
+LCA_COLUMNS = [
+    "adjusted_taxid",
+    "adjusted_taxid_name",
+    "adjusted_taxid_rank",
+    "adjustment_method",
+]
+OUTPUT_COLUMNS = [*INPUT_COLUMNS, *LCA_COLUMNS]
 
 
 @dataclass
@@ -78,6 +99,20 @@ class LcaParams:
     min_support: float = DEFAULT_MIN_SUPPORT
 
 
+def input_has_data_rows(path: Path) -> bool:
+    """Return true when a TSV has a non-blank row after its header line."""
+    if path.stat().st_size == 0:
+        return False
+    with path.open(encoding="utf-8") as handle:
+        next(handle, None)  # skip the header; blank trailing lines are not data
+        return any(line.strip() for line in handle)
+
+
+def write_empty_lca_output(path: Path) -> None:
+    """Write just the tab-joined OUTPUT_COLUMNS header line for a no-hit sample."""
+    path.write_text("\t".join(OUTPUT_COLUMNS) + "\n", encoding="utf-8")
+
+
 def filter_blast_hits(unfiltered_hits: pl.LazyFrame, params: LcaParams) -> pl.LazyFrame:
     """
     Filter BLAST hits by quality thresholds and identify near-tie cases.
@@ -477,10 +512,11 @@ def main() -> None:
     )
 
     logger.info("Loading NCBI taxonomy database...")
-    # Handle empty input file
-    if os.path.getsize(args.input_file) == 0:
-        logger.warning("Input file is empty. Creating empty output file.")
-        Path(args.output_file).touch()
+    input_path = Path(args.input_file)
+    output_path = Path(args.output_file)
+    if not input_has_data_rows(input_path):
+        logger.warning("Input file has no BLAST data rows. Creating header-only output file.")
+        write_empty_lca_output(output_path)
         return
 
     with taxonomy.open(

diff --git a/bin/filter_non_virus_blast_nodes.py b/bin/filter_non_virus_blast_nodes.py
@@ -11,6 +11,8 @@
 
 import pandas as pd
 
+LINEAGE_TOKEN_PARTS = 2
+
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
@@ -21,10 +23,19 @@ def parse_args() -> argparse.Namespace:
     return parser.parse_args()
 
 
+def is_virus_lineage(lineage: object) -> bool:
+    """Report whether any ``rank:name`` lineage token is named exactly Viruses."""
+    pairs = (item.strip().split(":", maxsplit=1) for item in str(lineage).split(";"))
+    return any(
+        len(pair) == LINEAGE_TOKEN_PARTS and pair[1].strip() == "Viruses"
+        for pair in pairs
+    )
+
+
 def contains_non_phage_viruses(group: pd.DataFrame) -> bool:
-    virus_hits = group[group["rank"].str.contains("root:Viruses", na=False)]
+    virus_hits = group[group["rank"].apply(is_virus_lineage)]
     non_phage_viruses = virus_hits[
-        ~virus_hits["stitle"].str.contains("phage", case=False)
+        ~virus_hits["stitle"].str.contains("phage", case=False, na=False)
     ]
     return len(non_phage_viruses) > 0
 

diff --git a/bin/select_top_blast_hits.py b/bin/select_top_blast_hits.py
@@ -202,7 +202,7 @@ def main() -> None:
         return
 
     top_k_lf = select_top_hits(args.input_file, args.blast_retention_count)
-    top_k_lf.sink_csv(args.output_file, separator="\t")
+    top_k_lf.collect().write_csv(args.output_file, separator="\t")
 
 
 if __name__ == "__main__":

diff --git a/bin/test_annotate_blast_lca.py b/bin/test_annotate_blast_lca.py
@@ -8,13 +8,15 @@
 - End-to-end output format
 """
 
+import csv
 import sys
 from pathlib import Path
 
 import polars as pl
 import polars.exceptions
 import pytest
 from annotate_blast_lca import (
+    OUTPUT_COLUMNS,
     LcaParams,
     _find_lca_list,
     filter_blast_hits,
@@ -474,7 +476,7 @@ def test_empty_input_creates_empty_output(
         mock_taxonomy_open,
         tmp_path: Path,
     ):
-        """Empty input file creates empty output file without error."""
+        """Empty input file creates a header-only output without error."""
         # Create empty input file
         blast_file = tmp_path / "empty.txt"
         blast_file.touch()
@@ -494,6 +496,41 @@ def test_empty_input_creates_empty_output(
         finally:
             sys.argv = original_argv
 
-        # Output file should exist and be empty
         assert output_file.exists()
-        assert output_file.stat().st_size == 0
+        with output_file.open(newline="") as handle:
+            rows = list(csv.reader(handle, delimiter="\t"))
+
+        assert rows == [OUTPUT_COLUMNS]
+
+    def test_header_only_input_creates_header_only_output(
+        self,
+        test_taxonomy_sqlite: Path,
+        mock_taxonomy_open,
+        tmp_path: Path,
+    ):
+        """A BLAST table holding only its header row yields a header-only LCA table."""
+        blast_file = tmp_path / "header_only.txt"
+        blast_file.write_text(
+            "task\tsample\tqseqid\tqlen\tsseqid\tstitle\tlength\tpident\tevalue\tbitscore\tsscinames\tstaxids\trank\n",
+        )
+
+        output_file = tmp_path / "output.txt"
+
+        original_argv = sys.argv  # main() is driven through sys.argv in this test
+        try:
+            sys.argv = [
+                "annotate_blast_lca.py",
+                "-i",
+                str(blast_file),
+                "-o",
+                str(output_file),
+            ]
+            main()
+        finally:
+            sys.argv = original_argv  # restore the real argv even if main() raises
+
+        assert output_file.exists()
+        with output_file.open(newline="") as handle:
+            rows = list(csv.reader(handle, delimiter="\t"))
+
+        assert rows == [OUTPUT_COLUMNS]  # exactly one row: the header
diff --git a/bin/test_filter_non_virus_blast_nodes.py b/bin/test_filter_non_virus_blast_nodes.py
@@ -0,0 +1,55 @@
+"""Tests for filtering annotated BLAST rows to viral query groups."""
+
+from __future__ import annotations
+
+import pandas as pd
+from filter_non_virus_blast_nodes import contains_non_phage_viruses
+
+
+def group_with(*, rank: str, stitle: str = "viral reference") -> pd.DataFrame:
+    """Return a single-hit ``contig1`` group carrying the given lineage and title."""
+    columns = {
+        "qseqid": ["contig1"],
+        "rank": [rank],
+        "stitle": [stitle],
+    }
+    # One row is enough: the predicate under test only reads ``rank`` and ``stitle``.
+    return pd.DataFrame(columns)
+
+
+def test_superkingdom_viruses_lineage_is_kept() -> None:
+    """A lineage carrying a ``superkingdom:Viruses`` token is treated as viral."""
+    lineage = "root:cellular organisms; superkingdom:Viruses"
+
+    assert contains_non_phage_viruses(group_with(rank=lineage))
+
+
+def test_acellular_root_viruses_lineage_is_kept() -> None:
+    """A lineage carrying an ``acellular root:Viruses`` token is treated as viral."""
+    lineage = "acellular root:Viruses; realm:Riboviria"
+
+    assert contains_non_phage_viruses(group_with(rank=lineage))
+
+
+def test_non_viral_lineage_is_rejected() -> None:
+    """A purely bacterial lineage must not cause the query group to be kept."""
+    lineage = "domain:Bacteria; phylum:Pseudomonadota"
+
+    assert not contains_non_phage_viruses(group_with(rank=lineage))
+
+
+def test_viral_phage_only_group_is_rejected() -> None:
+    """A viral group whose only hit title mentions a phage is still excluded."""
+    phage_hits = group_with(
+        stitle="Escherichia phage lambda",
+        rank="acellular root:Viruses; realm:Duplodnaviria",
+    )
+    keep = contains_non_phage_viruses(phage_hits)
+    assert not keep
+
+
+def test_substring_lookalike_is_not_viral() -> None:
+    """A token name that merely contains ``Viruses`` must not count as viral."""
+    lineage = "note:NotVirusesMaybe; domain:Bacteria"
+
+    assert not contains_non_phage_viruses(group_with(rank=lineage))
diff --git a/docs/nvd_cli_guide.md b/docs/nvd_cli_guide.md
@@ -27,7 +27,7 @@ The CLI is not meant to hide the pipeline from you. It is meant to keep importan
 ```bash
 nvd setup # run on install; does not need to be run for each nvd run
 
-nvd samplesheet generate --from-dir ./fastqs --platform illumina --output samplesheet.csv
+nvd samplesheet generate --from-dir ./fastqs --platform illumina --sanitize --output samplesheet.csv
 nvd params init run.yaml # then edit run.yaml in an editor with YAML language support
 nvd params check run.yaml
 nvd run --params-file run.yaml
@@ -149,6 +149,14 @@ Generate a samplesheet from a directory of FASTQ files:
 nvd samplesheet generate --from-dir ./fastqs --platform illumina --output samplesheet.csv
 ```
 
+For Illumina/CASAVA filenames, add `--sanitize` when generated sample IDs should drop the sample-number and lane suffixes. For example, `patient-001_S7_L003_R1_001.fastq.gz` and `patient-001_S7_L003_R2_001.fastq.gz` become sample ID `patient-001` instead of `patient-001_S7_L003`:
+
+```bash
+nvd samplesheet generate --from-dir ./fastqs --platform illumina --sanitize --output samplesheet.csv
+```
+
+`--sanitize` only changes generated sample IDs. It does not concatenate multiple lanes; multi-lane Illumina samples still produce one row per discovered read pair.
+
 For Nanopore/ONT reads:
 
 ```bash
@@ -190,6 +198,9 @@ NVD uses NCBI taxonomy for BLAST annotation and LCA resolution. For local work,
 nvd taxonomy ensure --taxonomy-dir /path/to/taxdump
 ```
 
+Existing taxonomy data is reused even when it is old; only missing required files trigger a download or build. This is the safest behavior for shared HPC references because worker jobs do not mutate a shared taxonomy directory merely because `nodes.dmp` is older than a freshness window.
+
+
 Inspect the current state:
 
 ```bash