Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Integration workflow: provisions Java, uv and the Pixi environment, then
# runs the mini SRA end-to-end test through `pixi run e2e-test-ci`.
name: Integration

on:
  push:
    branches: [main, develop]
    # Only pipeline code, tests, lockfiles and this workflow trigger a run.
    # Keep this list in sync with the pull_request list below.
    paths:
      - "main.nf"
      - "nextflow.config"
      - "conf/**"
      - "bin/**"
      - "lib/**"
      - "modules/**/*.nf"
      - "subworkflows/**/*.nf"
      - "workflows/**/*.nf"
      - "tests/**"
      - "pyproject.toml"
      - "pixi.lock"
      - "uv.lock"
      - ".github/workflows/integration.yml"
  pull_request:
    branches: [main, develop]
    paths:
      - "main.nf"
      - "nextflow.config"
      - "conf/**"
      - "bin/**"
      - "lib/**"
      - "modules/**/*.nf"
      - "subworkflows/**/*.nf"
      - "workflows/**/*.nf"
      - "tests/**"
      - "pyproject.toml"
      - "pixi.lock"
      - "uv.lock"
      - ".github/workflows/integration.yml"
  workflow_dispatch:

# The job may run for up to 120 minutes, so cancel superseded pull-request
# runs instead of letting them queue. Push and manual runs are never cancelled.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

# Least privilege: the job only needs to read the repository.
permissions:
  contents: read

jobs:
  mini-sra-nextflow:
    name: Mini SRA Nextflow integration
    runs-on: ubuntu-latest
    timeout-minutes: 120

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Java 17
        uses: actions/setup-java@v4
        with:
          distribution: "temurin"
          java-version: "17"

      - name: Install uv
        uses: astral-sh/setup-uv@v2

      - name: Set up Pixi environment
        uses: prefix-dev/setup-pixi@v0.9.6
        with:
          pixi-version: v0.68.0
          cache: true

      # Fail fast if the Pixi environment does not expose a working Nextflow.
      - name: Check Pixi-provided Nextflow
        run: pixi run nextflow -version

      - name: Run mini SRA integration test
        env:
          NVD_INTEGRATION_PROFILE: test
        run: pixi run e2e-test-ci
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -97,5 +97,8 @@

# testing data
!/tests
!/tests/*.py
!/tests/data
!/tests/data/*
!/tests/scripts
!/tests/scripts/*.py
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,14 @@ For local FASTQ files:
nvd samplesheet generate --from-dir ./fastqs --platform illumina --output samplesheet.csv
```

If your Illumina FASTQs use CASAVA-style names like `patient-001_S7_L003_R1_001.fastq.gz`, add `--sanitize` when you want the generated `sample_id` to be `patient-001` rather than `patient-001_S7_L003`:

```bash
nvd samplesheet generate --from-dir ./fastqs --platform illumina --sanitize --output samplesheet.csv
```

`--sanitize` only changes generated sample IDs. It does not concatenate multiple lanes; multi-lane Illumina inputs still produce one row per discovered read pair.

For Nanopore/ONT files, use `--platform ont`:

```bash
Expand Down Expand Up @@ -350,6 +358,9 @@ nvd taxonomy ensure --taxonomy-dir /path/to/taxdump
nvd taxonomy status --taxonomy-dir /path/to/taxdump
```

Existing taxonomy data is reused even when it is older than the freshness warning window. NVD downloads or rebuilds taxonomy only when required files are absent, which avoids mutating shared HPC taxonomy directories merely because they are old.


You can also set:

```bash
Expand Down
46 changes: 41 additions & 5 deletions bin/annotate_blast_lca.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
"""

import argparse
import os
from dataclasses import dataclass
from pathlib import Path

Expand All @@ -54,6 +53,28 @@
DEFAULT_MAX_E = 1e-10
DEFAULT_DELTA_S = 5.0 # ΔS window for "near ties"
DEFAULT_MIN_SUPPORT = 0.8 # dominance threshold
# Column names, in order, of the BLAST hits TSV this script reads.
INPUT_COLUMNS = [
    "task",
    "sample",
    "qseqid",
    "qlen",
    "sseqid",
    "stitle",
    "length",
    "pident",
    "evalue",
    "bitscore",
    "sscinames",
    "staxids",
    "rank",
]

# Columns appended after the input columns in the LCA-annotated output.
LCA_COLUMNS = [
    "adjusted_taxid",
    "adjusted_taxid_name",
    "adjusted_taxid_rank",
    "adjustment_method",
]

# Full output header: every input column followed by the LCA columns.
OUTPUT_COLUMNS = INPUT_COLUMNS + LCA_COLUMNS


@dataclass
Expand All @@ -78,6 +99,20 @@ class LcaParams:
min_support: float = DEFAULT_MIN_SUPPORT


def input_has_data_rows(path: Path) -> bool:
    """Return True when a TSV has at least one non-blank row after its header.

    The first line is always treated as the header and skipped. Blank or
    whitespace-only lines after it do not count as data, so a header followed
    only by trailing newlines is reported the same as a header-only file.

    Args:
        path: Location of the TSV file to inspect.

    Returns:
        True if any line after the first contains non-whitespace characters;
        False for zero-byte, header-only, or header-plus-blank-lines files.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    with path.open(encoding="utf-8") as handle:
        # Consume the header; on a zero-byte file this is a harmless no-op.
        next(handle, None)
        # Stops at the first non-blank line, so real data files are not
        # scanned beyond their first data row.
        return any(line.strip() for line in handle)


def write_empty_lca_output(path: Path) -> None:
    """Write an LCA TSV containing only the header row.

    Used for samples with no BLAST hits: the file holds the tab-joined
    ``OUTPUT_COLUMNS`` followed by a single newline and no data rows.
    """
    header_line = "\t".join(OUTPUT_COLUMNS)
    path.write_text(header_line + "\n", encoding="utf-8")


def filter_blast_hits(unfiltered_hits: pl.LazyFrame, params: LcaParams) -> pl.LazyFrame:
"""
Filter BLAST hits by quality thresholds and identify near-tie cases.
Expand Down Expand Up @@ -477,10 +512,11 @@ def main() -> None:
)

logger.info("Loading NCBI taxonomy database...")
# Handle empty input file
if os.path.getsize(args.input_file) == 0:
logger.warning("Input file is empty. Creating empty output file.")
Path(args.output_file).touch()
input_path = Path(args.input_file)
output_path = Path(args.output_file)
if not input_has_data_rows(input_path):
logger.warning("Input file has no BLAST data rows. Creating header-only output file.")
write_empty_lca_output(output_path)
return

with taxonomy.open(
Expand Down
15 changes: 13 additions & 2 deletions bin/filter_non_virus_blast_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

import pandas as pd

LINEAGE_TOKEN_PARTS = 2


def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
Expand All @@ -21,10 +23,19 @@ def parse_args() -> argparse.Namespace:
return parser.parse_args()


def is_virus_lineage(lineage: object) -> bool:
    """Report whether any lineage token is named exactly ``Viruses``.

    The lineage is stringified and split on ``;``. Each token is split once
    on ``:`` and only the part after the colon is compared, so a token such
    as ``note:NotVirusesMaybe`` does not match. Tokens without a colon are
    ignored.
    """
    token_fields = (
        raw_token.strip().split(":", maxsplit=1)
        for raw_token in str(lineage).split(";")
    )
    return any(
        len(fields) == LINEAGE_TOKEN_PARTS and fields[1].strip() == "Viruses"
        for fields in token_fields
    )


def contains_non_phage_viruses(group: pd.DataFrame) -> bool:
    """Return True when the group has at least one viral, non-phage hit.

    A row counts as viral when ``is_virus_lineage`` accepts its ``rank``
    value, and as phage when its ``stitle`` contains "phage" in any letter
    case. Rows with a missing ``stitle`` are treated as non-phage.

    Args:
        group: Rows to inspect; must provide ``rank`` and ``stitle`` columns.

    Returns:
        True if any row is viral and not a phage hit, otherwise False.
    """
    # Coerce to bool so an empty group still yields a boolean mask rather
    # than an object-typed result from ``apply``.
    is_viral = group["rank"].apply(is_virus_lineage).astype(bool)
    # na=False keeps rows with a missing title from poisoning the mask.
    is_phage = group["stitle"].str.contains("phage", case=False, na=False)
    return bool((is_viral & ~is_phage).any())

Expand Down
2 changes: 1 addition & 1 deletion bin/select_top_blast_hits.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ def main() -> None:
return

top_k_lf = select_top_hits(args.input_file, args.blast_retention_count)
top_k_lf.sink_csv(args.output_file, separator="\t")
top_k_lf.collect().write_csv(args.output_file, separator="\t")


if __name__ == "__main__":
Expand Down
43 changes: 40 additions & 3 deletions bin/test_annotate_blast_lca.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
- End-to-end output format
"""

import csv
import sys
from pathlib import Path

import polars as pl
import polars.exceptions
import pytest
from annotate_blast_lca import (
OUTPUT_COLUMNS,
LcaParams,
_find_lca_list,
filter_blast_hits,
Expand Down Expand Up @@ -474,7 +476,7 @@ def test_empty_input_creates_empty_output(
mock_taxonomy_open,
tmp_path: Path,
):
"""Empty input file creates empty output file without error."""
"""Empty input file creates a header-only output without error."""
# Create empty input file
blast_file = tmp_path / "empty.txt"
blast_file.touch()
Expand All @@ -494,6 +496,41 @@ def test_empty_input_creates_empty_output(
finally:
sys.argv = original_argv

# Output file should exist and be empty
assert output_file.exists()
assert output_file.stat().st_size == 0
with output_file.open(newline="") as handle:
rows = list(csv.reader(handle, delimiter="\t"))

assert rows == [OUTPUT_COLUMNS]

def test_header_only_input_creates_header_only_output(
    self,
    test_taxonomy_sqlite: Path,
    mock_taxonomy_open,
    tmp_path: Path,
):
    """A BLAST table holding only its header yields a header-only LCA file."""
    header_only_tsv = tmp_path / "header_only.txt"
    header_only_tsv.write_text(
        "task\tsample\tqseqid\tqlen\tsseqid\tstitle\tlength\tpident\tevalue\tbitscore\tsscinames\tstaxids\trank\n",
    )
    lca_tsv = tmp_path / "output.txt"

    # main() reads its arguments from sys.argv, so swap it in for the call
    # and always restore the real value afterwards.
    saved_argv = sys.argv
    sys.argv = [
        "annotate_blast_lca.py",
        "-i",
        str(header_only_tsv),
        "-o",
        str(lca_tsv),
    ]
    try:
        main()
    finally:
        sys.argv = saved_argv

    assert lca_tsv.exists()
    with lca_tsv.open(newline="") as handle:
        parsed_rows = list(csv.reader(handle, delimiter="\t"))

    assert parsed_rows == [OUTPUT_COLUMNS]
55 changes: 55 additions & 0 deletions bin/test_filter_non_virus_blast_nodes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Tests for filtering annotated BLAST rows to viral query groups."""

from __future__ import annotations

import pandas as pd
from filter_non_virus_blast_nodes import contains_non_phage_viruses


def group_with(*, rank: str, stitle: str = "viral reference") -> pd.DataFrame:
    """Return a one-row frame standing in for a single-qseqid hit group.

    The row always has ``qseqid`` set to ``contig1``; ``rank`` and ``stitle``
    come from the keyword arguments.
    """
    single_hit = {"qseqid": "contig1", "rank": rank, "stitle": stitle}
    return pd.DataFrame([single_hit])


def test_superkingdom_viruses_lineage_is_kept() -> None:
    """A lineage carrying ``superkingdom:Viruses`` keeps the query group."""
    legacy_lineage = "root:cellular organisms; superkingdom:Viruses"

    assert contains_non_phage_viruses(group_with(rank=legacy_lineage))


def test_acellular_root_viruses_lineage_is_kept() -> None:
    """A lineage carrying ``acellular root:Viruses`` keeps the query group."""
    acellular_lineage = "acellular root:Viruses; realm:Riboviria"

    assert contains_non_phage_viruses(group_with(rank=acellular_lineage))


def test_non_viral_lineage_is_rejected() -> None:
    """A purely bacterial lineage must not keep the query group."""
    bacterial_lineage = "domain:Bacteria; phylum:Pseudomonadota"

    assert not contains_non_phage_viruses(group_with(rank=bacterial_lineage))


def test_viral_phage_only_group_is_rejected() -> None:
    """A viral group whose only hit is a phage title is still excluded."""
    phage_only_group = group_with(
        stitle="Escherichia phage lambda",
        rank="acellular root:Viruses; realm:Duplodnaviria",
    )

    assert not contains_non_phage_viruses(phage_only_group)


def test_substring_lookalike_is_not_viral() -> None:
    """A token that merely contains "Viruses" as a substring is not viral."""
    lookalike_lineage = "note:NotVirusesMaybe; domain:Bacteria"

    assert not contains_non_phage_viruses(group_with(rank=lookalike_lineage))
13 changes: 12 additions & 1 deletion docs/nvd_cli_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ The CLI is not meant to hide the pipeline from you. It is meant to keep importan
```bash
nvd setup # run on install; does not need to be run for each nvd run

nvd samplesheet generate --from-dir ./fastqs --platform illumina --output samplesheet.csv
nvd samplesheet generate --from-dir ./fastqs --platform illumina --sanitize --output samplesheet.csv
nvd params init run.yaml # then edit run.yaml in an editor with YAML language support
nvd params check run.yaml
nvd run --params-file run.yaml
Expand Down Expand Up @@ -149,6 +149,14 @@ Generate a samplesheet from a directory of FASTQ files:
nvd samplesheet generate --from-dir ./fastqs --platform illumina --output samplesheet.csv
```

For Illumina/CASAVA filenames, add `--sanitize` when generated sample IDs should drop the sample-number and lane suffixes. For example, `patient-001_S7_L003_R1_001.fastq.gz` and `patient-001_S7_L003_R2_001.fastq.gz` become sample ID `patient-001` instead of `patient-001_S7_L003`:

```bash
nvd samplesheet generate --from-dir ./fastqs --platform illumina --sanitize --output samplesheet.csv
```

`--sanitize` only changes generated sample IDs. It does not concatenate multiple lanes; multi-lane Illumina samples still produce one row per discovered read pair.

For Nanopore/ONT reads:

```bash
Expand Down Expand Up @@ -190,6 +198,9 @@ NVD uses NCBI taxonomy for BLAST annotation and LCA resolution. For local work,
nvd taxonomy ensure --taxonomy-dir /path/to/taxdump
```

Existing taxonomy data is reused even when old, while missing required files still trigger download/build. This is the safest behavior for shared HPC references because worker jobs do not mutate a shared taxonomy directory merely because `nodes.dmp` is older than a freshness window.


Inspect the current state:

```bash
Expand Down
Loading
Loading