diff --git a/README.md b/README.md index d6aca64a..93f08552 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ **nf-cmgg/preprocessing** is a bioinformatics pipeline that demultiplexes and aligns raw sequencing data. It also performs basic QC and coverage analysis. +The pipeline also includes an optional per-sample FGUMI consensus branch for UMI-aware DNA processing. The pipeline is built using Nextflow, a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible. @@ -24,6 +25,7 @@ Steps include: - Run QC using [`MultiQC SAV`](https://github.com/MultiQC/MultiQC_SAV) - Read QC and trimming using [`fastp`](https://github.com/OpenGene/fastp) or [`falco`](https://github.com/smithlabcode/falco) - Alignment using either [`bwa`](https://github.com/lh3/bwa), [`bwa-mem2`](https://github.com/bwa-mem2/bwa-mem2), [`bowtie2`](https://github.com/BenLangmead/bowtie2), [`dragmap`](https://github.com/Illumina/DRAGMAP), [`snap`](https://github.com/amplab/snap) or [`strobe`](https://github.com/ksahlin/strobealign) for DNA-seq and [`STAR`](https://github.com/alexdobin/STAR) for RNA-seq +- Optional FGUMI consensus branch for `fgumi_aware` DNA samples (`extract -> snap/zipper -> group -> simplex -> filter -> sort`) - Duplicate marking using [`bamsormadup`](https://gitlab.com/german.tischler/biobambam2) or [`samtools markdup`](http://www.htslib.org/doc/samtools-markdup.html) - Coverage analysis using [`mosdepth`](https://github.com/brentp/mosdepth) and [`samtools coverage`](http://www.htslib.org/doc/samtools-coverage.html) - Alignment QC using [`samtools flagstat`](http://www.htslib.org/doc/samtools-flagstat.html), [`samtools stats`](http://www.htslib.org/doc/samtools-stats.html), [`samtools idxstats`](http://www.htslib.org/doc/samtools-idxstats.html) and [`picard 
CollectHsMetrics`](https://broadinstitute.github.io/picard/command-line-overview.html#CollectHsMetrics), [`picard CollectWgsMetrics`](https://broadinstitute.github.io/picard/command-line-overview.html#CollectWgsMetrics), [`picard CollectMultipleMetrics`](https://broadinstitute.github.io/picard/command-line-overview.html#CollectMultipleMetrics) diff --git a/assets/schema_input.json b/assets/schema_input.json index 8f666f27..74a0a44a 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -53,6 +53,24 @@ "description": "Run markdup in UMI-aware mode. This applies to Samtools only and requires the UMI to be in the read name.", "default": false }, + "fgumi_aware": { + "meta": ["fgumi_aware"], + "type": "boolean", + "description": "Enable UMI-aware consensus processing through the fgumi branch.", + "default": false + }, + "fgumi_read_structures": { + "meta": ["fgumi_read_structures"], + "type": "string", + "description": "Read structures passed to fgumi extract for this sample.", + "default": null + }, + "fgumi_extract_umis_from_read_names": { + "meta": ["fgumi_extract_umis_from_read_names"], + "type": "boolean", + "description": "Override fgumi extraction from read names for this sample.", + "default": null + }, "skip_trimming": { "meta": ["skip_trimming"], "type": "boolean", diff --git a/assets/schema_sampleinfo.json b/assets/schema_sampleinfo.json index 092cc130..ab6dfda3 100644 --- a/assets/schema_sampleinfo.json +++ b/assets/schema_sampleinfo.json @@ -93,6 +93,24 @@ "description": "Run markdup in UMI-aware mode. 
This applies to Samtools only and requires the UMI to be in the read name.", "default": false }, + "fgumi_aware": { + "meta": ["fgumi_aware"], + "type": "boolean", + "description": "Enable UMI-aware consensus processing through the fgumi branch.", + "default": false + }, + "fgumi_read_structures": { + "meta": ["fgumi_read_structures"], + "type": "string", + "description": "Read structures passed to fgumi extract for this sample.", + "default": null + }, + "fgumi_extract_umis_from_read_names": { + "meta": ["fgumi_extract_umis_from_read_names"], + "type": "boolean", + "description": "Override fgumi extraction from read names for this sample.", + "default": null + }, "skip_trimming": { "meta": ["skip_trimming"], "type": "boolean", diff --git a/conf/modules.config b/conf/modules.config index 95978df0..28ca1730 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -231,6 +231,163 @@ process { } } + //// FGUMI extract (step 1) + withName: '.*FGUMI_EXTRACT' { + cpus = 4 + memory = 16.GB + ext.prefix = { "${meta.id}.fgumi.unmapped" } + ext.args = { + [ + "--read-group-id ${meta.readgroup?.ID ?: meta.id}", + meta.readgroup?.PL ? "--platform ${meta.readgroup.PL}" : "", + meta.readgroup?.PU ? "--platform-unit \"${meta.readgroup.PU}\"" : "", + meta.readgroup?.PM ? "--platform-model \"${meta.readgroup.PM}\"" : "", + meta.readgroup?.CN ? "--sequencing-center \"${meta.readgroup.CN}\"" : "", + meta.readgroup?.PI ? "--predicted-insert-size ${meta.readgroup.PI}" : "", + meta.readgroup?.DS ? "--description \"${meta.readgroup.DS}\"" : "", + meta.readgroup?.DT ? "--run-date \"${meta.readgroup.DT}\"" : "", + "--read-structures ${meta.fgumi_read_structures ?: '+T +T'}", + ((meta.fgumi_extract_umis_from_read_names != null ? meta.fgumi_extract_umis_from_read_names : true) ? 
"--extract-umis-from-read-names" : ""), + "--compression-level ${params.fgumi_compression_level}", + ].join(" ").trim() + } + } + + //// FGUMI fastq | SNAP | zipper (step 3a) + withName: '.*FGUMI_SNAP_ZIPPER:FGUMI_SNAP_ALIGN' { + cpus = 16 + memory = 64.GB + ext.prefix = { "${meta.id}.fgumi" } + ext.args = { + [ + "-b-", + "-sm 20", + params.fgumi_snap_ignore_mismatched_pairs ? "-I" : "", + "-hc-", + "-S id", + "-sa", + "-xf 2", + meta.readgroup ? "-R \"@RG\\t" + meta.readgroup.findResults { rg -> rg.value?.trim() ? "${rg.key}:${rg.value}" : null }.join("\\t") + "\"" : "", + ].join(" ").trim() + } + } + + //// Queryname sort unmapped BAM before zipper (step 3b) + withName: '.*FGUMI_SNAP_ZIPPER:SAMTOOLS_QNAME_SORT_UNMAPPED' { + cpus = 16 + memory = 64.GB + ext.prefix = { "${meta.id}.fgumi.unmapped.queryname" } + ext.args = { + [ + "-n", + ].join(" ").trim() + } + } + + //// Queryname sort mapped stream before zipper (step 3c) + withName: '.*FGUMI_SNAP_ZIPPER:SAMTOOLS_QNAME_SORT_MAPPED' { + cpus = 16 + memory = 64.GB + ext.prefix = { "${meta.id}.fgumi.snap.queryname" } + ext.args = { + [ + "-n", + "--output-fmt sam", + ].join(" ").trim() + } + } + + //// FGUMI zipper between queryname-sorted streams (step 3d) + withName: '.*FGUMI_SNAP_ZIPPER:FGUMI_ZIPPER' { + cpus = 16 + memory = 64.GB + ext.prefix = { "${meta.id}.fgumi" } + ext.args = { + [ + "--threads ${task.cpus}", + ].join(" ").trim() + } + } + + //// FGUMI template-coordinate sort after zipper (step 3e) + withName: '.*FGUMI_SNAP_ZIPPER:FGUMI_TEMPLATE_SORT' { + cpus = 16 + memory = 64.GB + ext.prefix = { "${meta.id}.fgumi.template" } + ext.args = { + [ + "--order template-coordinate", + "--max-memory ${params.fgumi_sort_max_memory}", + "--memory-per-thread=${params.fgumi_sort_memory_per_thread ? 
'true' : 'false'}", + ].join(" ").trim() + } + } + + //// FGUMI group (step 4): a per-sample meta.fgumi_compression_level wins, otherwise params.fgumi_compression_level is used (the param FGUMI_EXTRACT and FGUMI_SIMPLEX already read) + withName: '.*FGUMI_GROUP' { + cpus = 8 + memory = 32.GB + ext.prefix = { "${meta.id}.fgumi.group" } + ext.args = { + [ + "--edits ${meta.fgumi_group_edits != null ? meta.fgumi_group_edits : 1}", + "--compression-level ${meta.fgumi_compression_level != null ? meta.fgumi_compression_level : params.fgumi_compression_level}", + ].join(" ").trim() + } + } + + //// FGUMI simplex (step 5) + withName: '.*FGUMI_SIMPLEX' { + cpus = 8 + memory = 32.GB + ext.prefix = { "${meta.id}.fgumi.simplex" } + ext.args = { + [ + "--queue-memory ${params.fgumi_queue_memory}", + params.fgumi_queue_memory_per_thread ? "--queue-memory-per-thread" : "", + "--compression-level ${params.fgumi_compression_level}", + ].join(" ").trim() + } + } + + //// FGUMI filter (step 7a); no extra CLI args are configured here + withName: '.*FGUMI_FILTER' { + cpus = 4 + memory = 16.GB + ext.prefix = { "${meta.id}.fgumi.filter" } + ext.args = '' + } + + //// FGUMI coordinate sort/index after filter (step 7); NOTE(review): shares ext.prefix with FGUMI_FILTER - confirm the output name cannot clash with the filter BAM it reads + withName: '.*FGUMI_SORT' { + cpus = 8 + memory = 32.GB + ext.prefix = { "${meta.id}.fgumi.filter" } + ext.args = { + [ + "--order coordinate", + "--write-index", + "--threads ${task.cpus}", + "--max-memory ${params.fgumi_sort_max_memory}", + "--memory-per-thread=${params.fgumi_sort_memory_per_thread ? 'true' : 'false'}", + ].join(" ").trim() + } + } + + //// UMI step 7b now uses samtools sort to produce CRAM directly. 
+ withName: '.*UMI_CONSENSUS_FGUMI:SAMTOOLS_SORT' { + cpus = 8 + memory = 32.GB + ext.prefix = { "${meta.id}.fgumi.filter.sorted" } + ext.args = { + [ + "--write-index", + "--output-fmt cram,version=3.0", + "--output-fmt-option archive", + ].join(" ").trim() + } + } + // coverage //// Mosdepth withName: '.*COVERAGE:MOSDEPTH' { diff --git a/docs/README.md b/docs/README.md index 78d3e752..f255a916 100644 --- a/docs/README.md +++ b/docs/README.md @@ -4,6 +4,8 @@ The nf-cmgg/preprocessing documentation is split into the following pages: - [Usage](usage.md) - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. +- [Analysis flow](analysis_flow.md) + - Mermaid overview of the current fgumi-aware analysis path and integration into the FASTQ to CRAM workflow. - [Output](output.md) - An overview of the different results produced by the pipeline and how to interpret them. - [Parameters](parameters.md) diff --git a/docs/analysis_flow.md b/docs/analysis_flow.md new file mode 100644 index 00000000..951e905b --- /dev/null +++ b/docs/analysis_flow.md @@ -0,0 +1,68 @@ +# nf-cmgg/preprocessing: Analysis Flow + +## FGUMI-aware DNA flow (new branch) + +The diagram below summarizes the current fgumi-aware branch and how it joins the common FASTQ to CRAM path. +It reflects the current wiring in `FASTQ_TO_CRAM` and `UMI_CONSENSUS_FGUMI`, including the early branching and data joins. 
+ +```mermaid +flowchart TD + A[Input channel: meta + reads + aligner + index + fasta + gtf] --> B{sample_type} + B -->|RNA| R1[FASTQ_ALIGN_RNA] + B -->|DNA| C{"meta.fgumi_aware == true"} + + C -->|false| D1[FASTQ_ALIGN_DNA non-UMI] + C -->|true| U0[Enter UMI_CONSENSUS_FGUMI] --> U1 + + subgraph UMI_CONSENSUS_FGUMI + U1["Step 1: FGUMI_EXTRACT<br/>(reads -> unmapped BAM with UMI tags)"] + U1J["Join with reference assets<br/>SNAP index + fasta + dict from meta.genome_data"] + U2[Step 3: FGUMI_SNAP_ZIPPER] + U2a["samtools sort -n<br/>unmapped BAM"] + U2b[fgumi fastq] + U2c[snap-aligner paired] + U2d["samtools sort -n<br/>post-SNAP"] + U2e[fgumi zipper] + U2f["fgumi sort --order template-coordinate"] + U3[Step 4: FGUMI_GROUP] + U4[Step 5: FGUMI_SIMPLEX] + U5J[Join simplex BAM with fasta] + U5[Step 7a: FGUMI_FILTER] + U6["Step 7b: SAMTOOLS_SORT<br/>coordinate sort + CRAM index"] + + U1 --> U1J --> U2 + U2 --> U2a --> U2b --> U2c --> U2d --> U2e --> U2f + U2f --> U3 --> U4 --> U5J --> U5 --> U6 + end + + D1 --> M1["Markdup branch selector<br/>bamsormadup | samtools | sort"] + R1 --> M1 + + U6 --> MIX1[Mix UMI CRAM/CRAI into common postprocess stream] + U3 --> MET1[grouping_metrics] + U3 --> MET2[family_size_histogram] + U4 --> MET3[consensus_metrics] + U5 --> MET4[filtering_metrics] + U6 --> MET5[filtered_consensus_cram] + + M1 --> P1[BIOBAMBAM_BAMSORMADUP or SAMTOOLS_SORMADUP or SAMTOOLS_SORT] + P1 --> COMP{bam or cram} + MIX1 --> COMP + + COMP -->|bam| CVT[SAMTOOLS_CONVERT to CRAM] + COMP -->|cram| OUT1[cram + crai] + CVT --> OUT1 + + OUT1 --> E1[emit: cram_crai] + MET1 --> E2[emit: sormadup_metrics] + MET2 --> E3[emit: family_size_histogram] + MET5 --> E4[emit: filtered_consensus_cram] +``` + +## Notes + +- The fgumi branch is opt-in per sample via `fgumi_aware`. + - Step numbering mirrors the implementation comments: steps 1, 3, 4, 5, and 7 are executed in this branch. + - The "step 2" nomenclature from upstream fgumi Basic Workflow is intentionally absent in this pipeline path. 
+- UMI-specific metrics are emitted and mixed into the common reporting stream. +- The output still converges to the shared CRAM/crai downstream path. diff --git a/docs/usage.md b/docs/usage.md index f59c809a..61755d15 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,6 +1,8 @@ # nf-cmgg/preprocessing: Usage -Parameter documentation can be found [here](parameters.md) +Parameter documentation can be found in the [pipeline parameters reference](parameters.md). + +The current fgumi-aware branch and overall integration into the FASTQ to CRAM flow are documented in the [analysis flow diagram](analysis_flow.md). ## Introduction @@ -26,6 +28,7 @@ A `fastq` samplesheet file consisting of paired-end data may look something like aligner: bwamem markdup: bamsormadup umi_aware: false + fgumi_aware: false skip_trimming: false trim_front: 0 trim_tail: 0 @@ -67,6 +70,10 @@ Following table shows the fields that are used by the `fastq` samplesheet: An [example samplesheet](../tests/inputs/test.yml) has been provided with the pipeline. +> [!NOTE] +> `umi_aware` and `fgumi_aware` are independent options. +> Use `umi_aware` for samtools markdup UMI mode, and `fgumi_aware` to run the fgumi consensus branch. + ### Flowcell samplesheet A `flowcell` samplesheet file consisting of one sequencing run may look something like the one below. @@ -102,6 +109,7 @@ A `flowcell` sample info JSON/YML file consisting for one sequencing run may loo aligner: bwamem markdup: bamsormadup umi_aware: false + fgumi_aware: false skip_trimming: false trim_front: 0 trim_tail: 0 diff --git a/main.nf b/main.nf index 8d7667e2..7f9d4e24 100644 --- a/main.nf +++ b/main.nf @@ -178,6 +178,9 @@ workflow { rna_junctions = PREPROCESSING.out.rna_junctions align_reports = PREPROCESSING.out.align_reports sormadup_metrics = PREPROCESSING.out.sormadup_metrics + // Additional UMI consensus outputs. 
+ family_size_histogram = PREPROCESSING.out.family_size_histogram + umi_filtered_consensus_cram = PREPROCESSING.out.umi_filtered_consensus_cram mosdepth_global = PREPROCESSING.out.mosdepth_global mosdepth_summary = PREPROCESSING.out.mosdepth_summary mosdepth_regions = PREPROCESSING.out.mosdepth_regions @@ -275,6 +278,17 @@ output { metrics >> (meta.library ? "${meta.library}/${meta.samplename}/${meta.samplename}.duplicate_metrics.txt" : "${meta.samplename}/${meta.samplename}.duplicate_metrics.txt") } } + // UMI consensus artefacts are published per sample next to CRAM outputs. + family_size_histogram { + path { meta, histogram -> + histogram >> (meta.library ? "${meta.library}/${meta.samplename}/${histogram.name}" : "${meta.samplename}/${histogram.name}") + } + } + umi_filtered_consensus_cram { + path { meta, cram -> + cram >> (meta.library ? "${meta.library}/${meta.samplename}/${cram.name}" : "${meta.samplename}/${cram.name}") + } + } mosdepth_global { path { meta, _file -> return (meta.library ? 
"${meta.library}/${meta.samplename}/" : "${meta.samplename}/") diff --git a/modules.json b/modules.json index 328d7a78..691ef5e7 100644 --- a/modules.json +++ b/modules.json @@ -50,6 +50,31 @@ "git_sha": "6d46786420b4d7bc88eba026eb389c0c5535d120", "installed_by": ["modules"] }, + "fgumi/extract": { + "branch": "master", + "git_sha": "52e5a02f344a873b1dc133fa1c4602f639e252c4", + "installed_by": ["modules"] + }, + "fgumi/filter": { + "branch": "master", + "git_sha": "4d10fe71e9f35f6cad3649a77ab58c59cc3bfc4d", + "installed_by": ["modules"] + }, + "fgumi/group": { + "branch": "master", + "git_sha": "4d10fe71e9f35f6cad3649a77ab58c59cc3bfc4d", + "installed_by": ["modules"] + }, + "fgumi/simplex": { + "branch": "master", + "git_sha": "4d10fe71e9f35f6cad3649a77ab58c59cc3bfc4d", + "installed_by": ["modules"] + }, + "fgumi/sort": { + "branch": "master", + "git_sha": "b5bd7ec03fe7cad8f8ab319c2f9881b256d055c6", + "installed_by": ["modules"] + }, "gnu/sort": { "branch": "master", "git_sha": "6d46786420b4d7bc88eba026eb389c0c5535d120", diff --git a/modules/local/fgumi/snapalign/main.nf b/modules/local/fgumi/snapalign/main.nf new file mode 100644 index 00000000..7520e846 --- /dev/null +++ b/modules/local/fgumi/snapalign/main.nf @@ -0,0 +1,48 @@ +// Converts the unmapped BAM to FASTQ with `fgumi fastq` and pipes it into `snap-aligner paired` (interleaved FASTQ on stdin). +// Emits the SNAP BAM as `mapped_bam` and passes the staged input BAM through unchanged as `unmapped_bam`. +// NOTE(review): only the fgumi version is published to the versions topic; the snap-aligner version is not recorded. +process FGUMI_SNAP_ALIGN { + tag "$meta.id" + label 'process_high' + + container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/763a833519c23555be888065f492215f57344155106972e272a0f8df78c57659/data' + : 'community.wave.seqera.io/library/fgumi_samtools_snap-aligner:c985f9394623a414'}" + + input: + tuple val(meta), path(unmapped_bam), path(index, stageAs: "index/*"), path(fasta), path(dict) + + output: + tuple val(meta), path("${prefix}.snap.bam"), emit: mapped_bam + tuple val(meta), path(unmapped_bam), emit: unmapped_bam + tuple val("${task.process}"), val('fgumi'), eval("fgumi --version | sed 's/^fgumi //;q'"), topic: versions, emit: versions_fgumi + + when: + task.ext.when == null || task.ext.when + + script: + def snap_args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}.fgumi" + + """ + # SNAP index directory is resolved from staged index content (the directory holding the first OverflowTable* file found). + INDEX_FILE=\$(find -L ./ -name "OverflowTable*" -print -quit) + [ -z "\$INDEX_FILE" ] && echo "Snap index files not found" 1>&2 && exit 1 + INDEX=\$(dirname "\$INDEX_FILE") + + fgumi fastq --input ${unmapped_bam} \ + | snap-aligner paired \ + "\$INDEX" \ + -pairedInterleavedFastq - \ + -o ${prefix}.snap.bam \ + -t ${task.cpus} \ + ${snap_args} + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}.fgumi" + """ + touch ${prefix}.snap.bam + touch ${unmapped_bam} + """ +} diff --git a/modules/local/fgumi/zipper/main.nf b/modules/local/fgumi/zipper/main.nf new file mode 100644 index 00000000..ad25f337 --- /dev/null +++ b/modules/local/fgumi/zipper/main.nf @@ -0,0 +1,38 @@ +process FGUMI_ZIPPER { + tag "$meta.id" + label 'process_high' + + container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/763a833519c23555be888065f492215f57344155106972e272a0f8df78c57659/data' + : 'community.wave.seqera.io/library/fgumi_samtools_snap-aligner:c985f9394623a414'}" + + input: + tuple val(meta), path(mapped_sam), path(unmapped_qname_bam), path(fasta), path(dict) + + output: + tuple val(meta), path("${prefix}.zipper.bam"), emit: bam + tuple val("${task.process}"), val('fgumi'), eval("fgumi --version | sed 's/^fgumi //;q'"), topic: versions, emit: versions_fgumi + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}.fgumi" + + """ + # mapped_sam and unmapped_qname_bam must be queryname-sorted in the same order. + fgumi zipper \ + --unmapped ${unmapped_qname_bam} \ + --reference ${fasta} \ + ${args} \ + --output ${prefix}.zipper.bam \ + < ${mapped_sam} + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}.fgumi" + """ + touch ${prefix}.zipper.bam + """ +} diff --git a/modules/nf-core/fgumi/extract/environment.yml b/modules/nf-core/fgumi/extract/environment.yml new file mode 100644 index 00000000..7960b0e5 --- /dev/null +++ b/modules/nf-core/fgumi/extract/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::fgumi=0.2.0" diff --git a/modules/nf-core/fgumi/extract/main.nf b/modules/nf-core/fgumi/extract/main.nf new file mode 100644 index 00000000..374b1a1b --- /dev/null +++ b/modules/nf-core/fgumi/extract/main.nf @@ -0,0 +1,38 @@ +process FGUMI_EXTRACT { + tag "${meta.id}" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/394ef34259ba03c393d25ce2559530fe6df2c6f125a27be69da89ca7a1c70e30/data' : + 'community.wave.seqera.io/library/fgumi:0.3.0--bcbb552cbefcbda1' }" + + input: + tuple val(meta), path(reads), val(library) + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val("${task.process}"), val('fgumi'), eval('fgumi --version | sed "s/^fgumi //"'), topic: versions, emit: versions_fgumi + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + fgumi extract \\ + --inputs ${reads.join(' ')} \\ + --output ${prefix}.bam \\ + ${args} \\ + --sample ${prefix} \\ + --library "${library}" + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + """ +} diff --git a/modules/nf-core/fgumi/extract/meta.yml b/modules/nf-core/fgumi/extract/meta.yml new file mode 100644 index 00000000..7d62b590 --- /dev/null +++ b/modules/nf-core/fgumi/extract/meta.yml @@ -0,0 +1,70 @@ +name: fgumi_extract +description: Extract unique molecular indices (UMIs) from FASTQ files and write + an unaligned BAM file. +keywords: + - umi + - extract + - fastq + - bam +tools: + - fgumi: + description: High-performance tools for working with UMI-tagged sequencing + data. + homepage: https://github.com/fulcrumgenomics/fgumi + documentation: https://docs.rs/fgumi + tool_dev_url: https://github.com/fulcrumgenomics/fgumi + licence: + - "MIT" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Input FASTQ files used for UMI extraction. + pattern: "*.fastq.gz" + ontologies: + - edam: http://edamontology.org/format_3989 + - library: + type: string + description: Library name to store in the output BAM read group. 
+output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.bam": + type: file + description: Unaligned BAM with extracted UMIs in SAM tags. + pattern: "*.bam" + ontologies: [] + versions_fgumi: + - - ${task.process}: + type: string + description: The process the versions were collected from + - fgumi: + type: string + description: The tool name + - 'fgumi --version | sed "s/^fgumi //"': + type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - ${task.process}: + type: string + description: The process the versions were collected from + - fgumi: + type: string + description: The tool name + - 'fgumi --version | sed "s/^fgumi //"': + type: eval + description: The expression to obtain the version of the tool +authors: + - "@atrigila" +maintainers: + - "@atrigila" diff --git a/modules/nf-core/fgumi/extract/tests/main.nf.test b/modules/nf-core/fgumi/extract/tests/main.nf.test new file mode 100644 index 00000000..44ce2572 --- /dev/null +++ b/modules/nf-core/fgumi/extract/tests/main.nf.test @@ -0,0 +1,73 @@ +nextflow_process { + + name "Test Process FGUMI_EXTRACT" + script "../main.nf" + process "FGUMI_EXTRACT" + + tag "modules" + tag "modules_nfcore" + tag "fgumi" + tag "fgumi/extract" + + config "./nextflow.config" + + test("homo_sapiens - [fastq1, fastq2]") { + + when { + params { + module_args = '' + } + process { + """ + input[0] = [ + [ id:'test' ], + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_2.fastq.gz', checkIfExists: true) + ], + 'illumina', + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(sanitizeOutput(process.out)).match() } + ) + } + + } + + test("homo_sapiens - [fastq1, fastq2] - stub") { + + options "-stub" + + 
when { + params { + module_args = "--read-structures +T +M" + } + process { + """ + input[0] = [ + [ id:'test' ], + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test.umi_2.fastq.gz', checkIfExists: true) + ], + 'test', + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(sanitizeOutput(process.out)).match() } + ) + } + + } +} diff --git a/modules/nf-core/fgumi/extract/tests/main.nf.test.snap b/modules/nf-core/fgumi/extract/tests/main.nf.test.snap new file mode 100644 index 00000000..8c054caf --- /dev/null +++ b/modules/nf-core/fgumi/extract/tests/main.nf.test.snap @@ -0,0 +1,54 @@ +{ + "homo_sapiens - [fastq1, fastq2]": { + "content": [ + { + "bam": [ + [ + { + "id": "test" + }, + "test.bam:md5,9c2476e5c354f57bed109e582b8953be" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_EXTRACT", + "fgumi", + "0.2.0" + ] + ] + } + ], + "timestamp": "2026-05-07T12:48:51.477812468", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.0" + } + }, + "homo_sapiens - [fastq1, fastq2] - stub": { + "content": [ + { + "bam": [ + [ + { + "id": "test" + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_EXTRACT", + "fgumi", + "0.2.0" + ] + ] + } + ], + "timestamp": "2026-05-07T12:49:21.767581495", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.0" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/fgumi/extract/tests/nextflow.config b/modules/nf-core/fgumi/extract/tests/nextflow.config new file mode 100644 index 00000000..54ef8845 --- /dev/null +++ b/modules/nf-core/fgumi/extract/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: FGUMI_EXTRACT { + ext.args = { "${params.module_args}" } + } +} diff --git a/modules/nf-core/fgumi/filter/environment.yml b/modules/nf-core/fgumi/filter/environment.yml new file mode 
100644 index 00000000..7960b0e5 --- /dev/null +++ b/modules/nf-core/fgumi/filter/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::fgumi=0.2.0" diff --git a/modules/nf-core/fgumi/filter/main.nf b/modules/nf-core/fgumi/filter/main.nf new file mode 100644 index 00000000..01d68133 --- /dev/null +++ b/modules/nf-core/fgumi/filter/main.nf @@ -0,0 +1,56 @@ +process FGUMI_FILTER { + tag "${meta.id}" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container + ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/394ef34259ba03c393d25ce2559530fe6df2c6f125a27be69da89ca7a1c70e30/data' : + 'community.wave.seqera.io/library/fgumi:0.3.0--bcbb552cbefcbda1' }" + + input: + tuple val(meta), path(bam) + tuple val(meta2), path(fasta) + val min_reads + val keep_rejected + + output: + tuple val(meta), path("${prefix}.bam") , emit: bam + tuple val(meta), path("${prefix}.rejects.bam"), emit: rejects, optional: true + tuple val(meta), path("${prefix}.stats.txt") , emit: stats + tuple val("${task.process}"), val('fgumi'), eval('fgumi --version | sed "s/^fgumi //"'), topic: versions, emit: versions_fgumi + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}_consensus_filtered" + def rejects_command = keep_rejected ? 
"--rejects ${prefix}.rejects.bam" : '' + if ("${bam}" == "${prefix}.bam") { + error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!") + } + + """ + fgumi filter \\ + --input ${bam} \\ + --output ${prefix}.bam \\ + --ref ${fasta} \\ + --min-reads ${min_reads} \\ + --threads ${task.cpus} \\ + --stats ${prefix}.stats.txt \\ + ${rejects_command} \\ + ${args} + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}_consensus_filtered" + def rejects_command = keep_rejected ? "touch ${prefix}.rejects.bam" : '' + if ("${bam}" == "${prefix}.bam") { + error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!") + } + """ + touch ${prefix}.bam + ${rejects_command} + touch ${prefix}.stats.txt + """ +} diff --git a/modules/nf-core/fgumi/filter/meta.yml b/modules/nf-core/fgumi/filter/meta.yml new file mode 100644 index 00000000..b6af0220 --- /dev/null +++ b/modules/nf-core/fgumi/filter/meta.yml @@ -0,0 +1,108 @@ +name: "fgumi_filter" +description: | + Filters consensus reads generated by simplex or duplex consensus calling. + This is a high-performance replacement for fgbio FilterConsensusReads. +keywords: + - umi + - filter + - consensus + - bam +tools: + - "fgumi": + description: "High-performance tools for working with UMI-tagged sequencing data." + homepage: "https://github.com/fulcrumgenomics/fgumi" + documentation: "https://docs.rs/fgumi" + tool_dev_url: "https://github.com/fulcrumgenomics/fgumi" + licence: + - "MIT" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Consensus BAM file to be filtered + pattern: "*.bam" + ontologies: + - edam: "http://edamontology.org/format_2572" + - - meta2: + type: map + description: | + Groovy Map containing genome information + e.g. 
[ id:'genome' ] + - fasta: + type: file + description: Reference genome FASTA file + pattern: "*.{fa,fasta,fna}" + ontologies: + - edam: "http://edamontology.org/format_1929" + - min_reads: + type: integer + description: Minimum number of reads required to keep a consensus read + - keep_rejected: + type: boolean + description: Whether to keep rejected reads in a separate BAM file +output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "${prefix}.bam": + type: file + description: Filtered consensus BAM file + pattern: "*.bam" + ontologies: + - edam: "http://edamontology.org/format_2572" + rejects: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "${prefix}.rejects.bam": + type: file + description: Optional BAM file containing reads that were filtered out + pattern: "*.rejects.bam" + ontologies: + - edam: "http://edamontology.org/format_2572" + stats: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "${prefix}.stats.txt": + type: file + description: Optional text file containing filtering statistics + pattern: "*.stats.txt" + ontologies: [] + versions_fgumi: + - - ${task.process}: + type: string + description: The process the versions were collected from + - fgumi: + type: string + description: The tool name + - 'fgumi --version | sed "s/^fgumi //"': + type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - ${task.process}: + type: string + description: The process the versions were collected from + - fgumi: + type: string + description: The tool name + - 'fgumi --version | sed "s/^fgumi //"': + type: eval + description: The expression to obtain the version of the tool +authors: + - "@sppearce" +maintainers: + - "@sppearce" diff --git a/modules/nf-core/fgumi/filter/tests/main.nf.test b/modules/nf-core/fgumi/filter/tests/main.nf.test new file mode 100644 index 00000000..26fd52cf --- /dev/null +++ b/modules/nf-core/fgumi/filter/tests/main.nf.test @@ -0,0 +1,103 @@ +nextflow_process { + + name "Test Process FGUMI_FILTER" + script "../main.nf" + process "FGUMI_FILTER" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "fgumi" + tag "fgumi/filter" + tag "fgumi/sort" + tag "fgumi/group" + tag "fgumi/simplex" + + setup { + run("FGUMI_SORT") { + script "../../sort/main.nf" + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/umi/test.paired_end.unsorted_tagged.bam', checkIfExists: true) + ] + """ + } + + } + run("FGUMI_GROUP") { + script "../../group/main.nf" + process { + """ + input[0] = FGUMI_SORT.out.bam + input[1] = 'adjacency' + """ + } + } + run("FGUMI_SIMPLEX") { + script "../../simplex/main.nf" + process { + """ + input[0] = FGUMI_GROUP.out.bam + input[1] = 1 + input[2] = false + """ + } + } + } + + test("homo_sapiens - bam") { + + when { + process { + """ + input[0] = 
FGUMI_SIMPLEX.out.bam + input[1] = [ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ] + input[2] = 1 + input[3] = false + """ + } + } + + then { + assert process.success + assertAll( + // bam file is non deterministic in its output order + { assert snapshot(sanitizeOutput(process.out, unstableKeys: ['bam'])).match() } + ) + } + + } + + test("homo_sapiens - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = FGUMI_SIMPLEX.out.bam + input[1] = [ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ] + input[2] = 1 + input[3] = false + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot(sanitizeOutput(process.out)).match() } + ) + } + + } + +} diff --git a/modules/nf-core/fgumi/filter/tests/main.nf.test.snap b/modules/nf-core/fgumi/filter/tests/main.nf.test.snap new file mode 100644 index 00000000..a658d249 --- /dev/null +++ b/modules/nf-core/fgumi/filter/tests/main.nf.test.snap @@ -0,0 +1,76 @@ +{ + "homo_sapiens - bam": { + "content": [ + { + "bam": [ + [ + { + "id": "test" + }, + "test_consensus_filtered.bam" + ] + ], + "rejects": [ + + ], + "stats": [ + [ + { + "id": "test" + }, + "test_consensus_filtered.stats.txt:md5,88f50c970ee78378013d32614dde9b47" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_FILTER", + "fgumi", + "0.2.0" + ] + ] + } + ], + "timestamp": "2026-06-05T09:14:15.950832036", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.0" + } + }, + "homo_sapiens - bam - stub": { + "content": [ + { + "bam": [ + [ + { + "id": "test" + }, + "test_consensus_filtered.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "rejects": [ + + ], + "stats": [ + [ + { + "id": "test" + }, + "test_consensus_filtered.stats.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_FILTER", + "fgumi", + "0.2.0" + ] + ] + } + ], + 
"timestamp": "2026-06-05T09:15:57.252685521", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.0" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/fgumi/filter/tests/nextflow.config b/modules/nf-core/fgumi/filter/tests/nextflow.config new file mode 100644 index 00000000..28103edf --- /dev/null +++ b/modules/nf-core/fgumi/filter/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: FGUMI_SORT { + ext.args = '--order template-coordinate' + } +} diff --git a/modules/nf-core/fgumi/group/environment.yml b/modules/nf-core/fgumi/group/environment.yml new file mode 100644 index 00000000..7960b0e5 --- /dev/null +++ b/modules/nf-core/fgumi/group/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::fgumi=0.2.0" diff --git a/modules/nf-core/fgumi/group/main.nf b/modules/nf-core/fgumi/group/main.nf new file mode 100644 index 00000000..44876751 --- /dev/null +++ b/modules/nf-core/fgumi/group/main.nf @@ -0,0 +1,51 @@ +process FGUMI_GROUP { + tag "${meta.id}" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/a5/a510706f3481fae12ff6100d6e4ad298b8bf464a2d93a6afe35e9cf26542d080/data' : + 'community.wave.seqera.io/library/fgumi:0.2.0--fe028e7a64e5da27' }" + + input: + tuple val(meta), path(bam) + val strategy + + output: + tuple val(meta), path("*.bam") , emit: bam + tuple val(meta), path("*.family_size_histogram.txt"), emit: histogram + tuple val(meta), path("*.grouping_metrics.txt") , emit: metrics + tuple val("${task.process}"), val('fgumi'), eval('fgumi --version | sed "s/^fgumi //"'), topic: versions, emit: versions_fgumi + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}_umi-grouped" + + if ("${bam}" == "${prefix}.bam") { + error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!") + } + + """ + fgumi group \\ + --input ${bam} \\ + --output ${prefix}.bam \\ + --strategy ${strategy} \\ + --family-size-histogram ${prefix}.family_size_histogram.txt \\ + --grouping-metrics ${prefix}.grouping_metrics.txt \\ + ${args} + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}_umi-grouped" + if ("${bam}" == "${prefix}.bam") { + error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!") + } + """ + touch ${prefix}.bam + touch ${prefix}.family_size_histogram.txt + touch ${prefix}.grouping_metrics.txt + """ +} diff --git a/modules/nf-core/fgumi/group/meta.yml b/modules/nf-core/fgumi/group/meta.yml new file mode 100644 index 00000000..4a3a3c3c --- /dev/null +++ b/modules/nf-core/fgumi/group/meta.yml @@ -0,0 +1,100 @@ +name: "fgumi_group" +description: | + Groups reads together that appear to have come from the same original molecule. + Reads are grouped by template, and then templates are sorted by the 5' mapping positions + of the reads from the template. Reads that have the same end positions are then sub-grouped + by UMI sequence.
This is a high-performance replacement for fgbio GroupReadsByUmi. +keywords: + - umi + - groupreads + - bam +tools: + - "fgumi": + description: "High-performance tools for working with UMI-tagged sequencing data." + homepage: "https://github.com/fulcrumgenomics/fgumi" + documentation: "https://docs.rs/fgumi" + tool_dev_url: "https://github.com/fulcrumgenomics/fgumi" + licence: + - "MIT" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: | + BAM file containing reads with UMI tags. The file must be template-coordinate sorted. + pattern: "*.bam" + ontologies: + - edam: "http://edamontology.org/format_2572" + - strategy: + type: string + enum: + - "identity" + - "edit" + - "adjacency" + - "paired" + description: | + Required argument: defines the UMI assignment strategy. + Must be chosen among: identity, edit, adjacency, paired. +output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.bam": + type: file + description: UMI-grouped BAM file + pattern: "*.bam" + ontologies: + - edam: "http://edamontology.org/format_2572" + histogram: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.family_size_histogram.txt": + type: file + description: Histogram of tag family size counts + pattern: "*.family_size_histogram.txt" + metrics: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
[ id:'test', single_end:false ] + - "*.grouping_metrics.txt": + type: file + description: Optional output of UMI grouping metrics + pattern: "*.grouping_metrics.txt" + versions_fgumi: + - - ${task.process}: + type: string + description: The process the versions were collected from + - fgumi: + type: string + description: The tool name + - fgumi --version | sed "s/^fgumi //": + type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - ${task.process}: + type: string + description: The process the versions were collected from + - fgumi: + type: string + description: The tool name + - fgumi --version | sed "s/^fgumi //": + type: eval + description: The expression to obtain the version of the tool +authors: + - "@sppearce" +maintainers: + - "@sppearce" diff --git a/modules/nf-core/fgumi/group/tests/main.nf.test b/modules/nf-core/fgumi/group/tests/main.nf.test new file mode 100644 index 00000000..6ed0f513 --- /dev/null +++ b/modules/nf-core/fgumi/group/tests/main.nf.test @@ -0,0 +1,74 @@ +nextflow_process { + + name "Test Process FGUMI_GROUP" + script "../main.nf" + process "FGUMI_GROUP" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "fgumi" + tag "fgumi/group" + tag "fgumi/sort" + + test("sarscov2 - bam") { + + setup { + run("FGUMI_SORT") { + script "../../sort/main.nf" + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/umi/test.paired_end.unsorted_tagged.bam', checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = FGUMI_SORT.out.bam + input[1] = 'adjacency' + """ + } + } + + then { + assert process.success + assertAll( + // bam file is non deterministic in its output order + { assert snapshot(sanitizeOutput(process.out, unstableKeys: ['bam'])).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + 
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/umi/test.paired_end.unsorted_tagged.bam', checkIfExists: true) + ] + input[1] = 'adjacency' + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot(sanitizeOutput(process.out)).match() } + ) + } + + } + +} diff --git a/modules/nf-core/fgumi/group/tests/main.nf.test.snap b/modules/nf-core/fgumi/group/tests/main.nf.test.snap new file mode 100644 index 00000000..7595f16c --- /dev/null +++ b/modules/nf-core/fgumi/group/tests/main.nf.test.snap @@ -0,0 +1,86 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "bam": [ + [ + { + "id": "test" + }, + "test_umi-grouped.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "histogram": [ + [ + { + "id": "test" + }, + "test_umi-grouped.family_size_histogram.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "metrics": [ + [ + { + "id": "test" + }, + "test_umi-grouped.grouping_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_GROUP", + "fgumi", + "0.2.0" + ] + ] + } + ], + "timestamp": "2026-06-05T14:26:33.524685932", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.0" + } + }, + "sarscov2 - bam": { + "content": [ + { + "bam": [ + [ + { + "id": "test" + }, + "test_umi-grouped.bam" + ] + ], + "histogram": [ + [ + { + "id": "test" + }, + "test_umi-grouped.family_size_histogram.txt:md5,4f680ae7e7413c4b88f7ee82fd237162" + ] + ], + "metrics": [ + [ + { + "id": "test" + }, + "test_umi-grouped.grouping_metrics.txt:md5,6d32f1f0d9277fe6f07d5e6ff56e70ac" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_GROUP", + "fgumi", + "0.2.0" + ] + ] + } + ], + "timestamp": "2026-06-05T14:26:27.388945914", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.0" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/fgumi/group/tests/nextflow.config b/modules/nf-core/fgumi/group/tests/nextflow.config new file mode 100644 index 00000000..28103edf --- /dev/null +++ 
b/modules/nf-core/fgumi/group/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: FGUMI_SORT { + ext.args = '--order template-coordinate' + } +} diff --git a/modules/nf-core/fgumi/simplex/environment.yml b/modules/nf-core/fgumi/simplex/environment.yml new file mode 100644 index 00000000..7960b0e5 --- /dev/null +++ b/modules/nf-core/fgumi/simplex/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::fgumi=0.2.0" diff --git a/modules/nf-core/fgumi/simplex/main.nf b/modules/nf-core/fgumi/simplex/main.nf new file mode 100644 index 00000000..9c02c7cb --- /dev/null +++ b/modules/nf-core/fgumi/simplex/main.nf @@ -0,0 +1,54 @@ +process FGUMI_SIMPLEX { + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container + ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/a5/a510706f3481fae12ff6100d6e4ad298b8bf464a2d93a6afe35e9cf26542d080/data' + : 'community.wave.seqera.io/library/fgumi:0.2.0--fe028e7a64e5da27'}" + + input: + tuple val(meta), path(grouped_bam) + val min_reads + val keep_rejected + + output: + tuple val(meta), path("${prefix}.bam") , emit: bam + tuple val(meta), path("${prefix}.rejects.bam"), emit: rejects, optional: true + tuple val(meta), path("${prefix}.stats.txt") , emit: stats + tuple val("${task.process}"), val('fgumi'), eval('fgumi --version | sed "s/^fgumi //"'), topic: versions, emit: versions_fgumi + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}_simplex_unmapped" + def rejects_command = keep_rejected ? 
"--rejects ${prefix}.rejects.bam" : '' + + if ("${grouped_bam}" == "${prefix}.bam") { + error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!") + } + + """ + fgumi simplex \\ + --input ${grouped_bam} \\ + --output ${prefix}.bam \\ + --min-reads ${min_reads} \\ + --threads ${task.cpus} \\ + --stats ${prefix}.stats.txt \\ + ${rejects_command} \\ + ${args} + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}_simplex_unmapped" + if ("${grouped_bam}" == "${prefix}.bam") { + error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!") + } + """ + touch ${prefix}.bam + touch ${prefix}.rejects.bam + touch ${prefix}.stats.txt + """ +} diff --git a/modules/nf-core/fgumi/simplex/meta.yml b/modules/nf-core/fgumi/simplex/meta.yml new file mode 100644 index 00000000..0f7a751a --- /dev/null +++ b/modules/nf-core/fgumi/simplex/meta.yml @@ -0,0 +1,98 @@ +name: "fgumi_simplex" +description: | + Calls simplex consensus sequences from reads with the same unique molecular tag. + This is a high-performance replacement for fgbio CallMolecularConsensusReads. +keywords: + - umi + - consensus + - simplex + - bam +tools: + - "fgumi": + description: "High-performance tools for working with UMI-tagged sequencing data." + homepage: "https://github.com/fulcrumgenomics/fgumi" + documentation: "https://docs.rs/fgumi" + tool_dev_url: "https://github.com/fulcrumgenomics/fgumi" + licence: + - "MIT" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - grouped_bam: + type: file + description: | + The input SAM or BAM file, grouped by UMIs + pattern: "*.{bam,sam}" + ontologies: + - edam: "http://edamontology.org/format_2572" + - min_reads: + type: integer + description: Minimum number of original reads to build each consensus read. 
+ - keep_rejected: + type: boolean + description: If true, output rejected reads to a separate BAM file +output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "${prefix}.bam": + type: file + description: | + Output SAM or BAM file with simplex consensus reads. + pattern: "*.{bam,sam}" + ontologies: + - edam: "http://edamontology.org/format_2572" + rejects: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "${prefix}.rejects.bam": + type: file + description: Optional BAM file containing reads that were rejected + pattern: "*.rejects.bam" + ontologies: + - edam: "http://edamontology.org/format_2572" # BAM + stats: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "${prefix}.stats.txt": + type: file + description: Optional text file containing consensus statistics + pattern: "*.stats.txt" + versions_fgumi: + - - ${task.process}: + type: string + description: The process the versions were collected from + - fgumi: + type: string + description: The tool name + - fgumi --version | sed "s/^fgumi //": + type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - ${task.process}: + type: string + description: The process the versions were collected from + - fgumi: + type: string + description: The tool name + - fgumi --version | sed "s/^fgumi //": + type: eval + description: The expression to obtain the version of the tool +authors: + - "@sppearce" +maintainers: + - "@sppearce" diff --git a/modules/nf-core/fgumi/simplex/tests/main.nf.test b/modules/nf-core/fgumi/simplex/tests/main.nf.test new file mode 100644 index 00000000..2442b881 --- /dev/null +++ b/modules/nf-core/fgumi/simplex/tests/main.nf.test @@ -0,0 +1,108 @@ +nextflow_process { + + name "Test Process FGUMI_SIMPLEX" + script 
"../main.nf" + process "FGUMI_SIMPLEX" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "fgumi" + tag "fgumi/simplex" + tag "fgumi/sort" + tag "fgumi/group" + + setup { + run("FGUMI_SORT") { + script "../../sort/main.nf" + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/umi/test.paired_end.unsorted_tagged.bam', checkIfExists: true) + ] + """ + } + } + run("FGUMI_GROUP") { + script "../../group/main.nf" + process { + """ + input[0] = FGUMI_SORT.out.bam + input[1] = 'adjacency' + """ + } + } + } + + test("homo_sapiens - bam") { + + when { + process { + """ + input[0] = FGUMI_GROUP.out.bam + input[1] = 1 + input[2] = false + """ + } + } + + then { + assert process.success + assertAll( + // bam file is non deterministic in its output order + { assert snapshot(sanitizeOutput(process.out, unstableKeys: ['bam'])).match() } + ) + } + + } + + test("homo_sapiens - bam - with rejects") { + + when { + process { + """ + input[0] = FGUMI_GROUP.out.bam + input[1] = 1 + input[2] = true + """ + } + } + + then { + assert process.success + assertAll( + // bam file is non deterministic in its output order + { assert snapshot(sanitizeOutput(process.out, unstableKeys: ['bam', 'rejects'])).match() } + ) + } + + } + + test("homo_sapiens - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_grouped.bam', checkIfExists: true) + ] + input[1] = 1 + input[2] = false + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/fgumi/simplex/tests/main.nf.test.snap b/modules/nf-core/fgumi/simplex/tests/main.nf.test.snap new file mode 100644 index 00000000..80a6adfa --- /dev/null +++ b/modules/nf-core/fgumi/simplex/tests/main.nf.test.snap @@ -0,0 +1,154 @@ 
+{ + "homo_sapiens - bam - with rejects": { + "content": [ + { + "bam": [ + [ + { + "id": "test" + }, + "test_simplex_unmapped.bam" + ] + ], + "rejects": [ + [ + { + "id": "test" + }, + "test_simplex_unmapped.rejects.bam" + ] + ], + "stats": [ + [ + { + "id": "test" + }, + "test_simplex_unmapped.stats.txt:md5,61bfbca538c809387368c351412732ee" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_SIMPLEX", + "fgumi", + "0.2.0" + ] + ] + } + ], + "timestamp": "2026-06-05T14:26:50.835328059", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.0" + } + }, + "homo_sapiens - bam": { + "content": [ + { + "bam": [ + [ + { + "id": "test" + }, + "test_simplex_unmapped.bam" + ] + ], + "rejects": [ + + ], + "stats": [ + [ + { + "id": "test" + }, + "test_simplex_unmapped.stats.txt:md5,61bfbca538c809387368c351412732ee" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_SIMPLEX", + "fgumi", + "0.2.0" + ] + ] + } + ], + "timestamp": "2026-06-05T09:30:50.970124786", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.0" + } + }, + "homo_sapiens - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_simplex_unmapped.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_simplex_unmapped.rejects.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test_simplex_unmapped.stats.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + "FGUMI_SIMPLEX", + "fgumi", + "0.2.0" + ] + ], + "bam": [ + [ + { + "id": "test" + }, + "test_simplex_unmapped.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "rejects": [ + [ + { + "id": "test" + }, + "test_simplex_unmapped.rejects.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "stats": [ + [ + { + "id": "test" + }, + "test_simplex_unmapped.stats.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_SIMPLEX", + "fgumi", + "0.2.0" + ] + ] + } + ], + "timestamp": "2026-06-05T14:26:58.463534187", + "meta": { + 
"nf-test": "0.9.5", + "nextflow": "26.04.0" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/fgumi/simplex/tests/nextflow.config b/modules/nf-core/fgumi/simplex/tests/nextflow.config new file mode 100644 index 00000000..28103edf --- /dev/null +++ b/modules/nf-core/fgumi/simplex/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: FGUMI_SORT { + ext.args = '--order template-coordinate' + } +} diff --git a/modules/nf-core/fgumi/sort/environment.yml b/modules/nf-core/fgumi/sort/environment.yml new file mode 100644 index 00000000..7960b0e5 --- /dev/null +++ b/modules/nf-core/fgumi/sort/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::fgumi=0.2.0" diff --git a/modules/nf-core/fgumi/sort/main.nf b/modules/nf-core/fgumi/sort/main.nf new file mode 100644 index 00000000..00679c9f --- /dev/null +++ b/modules/nf-core/fgumi/sort/main.nf @@ -0,0 +1,45 @@ +process FGUMI_SORT { + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/a5/a510706f3481fae12ff6100d6e4ad298b8bf464a2d93a6afe35e9cf26542d080/data' : + 'community.wave.seqera.io/library/fgumi:0.2.0--fe028e7a64e5da27' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path("*.{csi,bai}"), emit: index, optional: true + tuple val("${task.process}"), val('fgumi'), eval('fgumi --version | sed "s/^fgumi //"'), topic: versions, emit: versions_fgumi + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}_sorted" + + if ("${bam}" == "${prefix}.bam") { + error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!") + } + + """ + fgumi sort \\ + --input ${bam} \\ + --output ${prefix}.bam \\ + --threads ${task.cpus} \\ + ${args} + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}_sorted" + if ("${bam}" == "${prefix}.bam") { + error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!") + } + """ + touch ${prefix}.bam + """ +} diff --git a/modules/nf-core/fgumi/sort/meta.yml b/modules/nf-core/fgumi/sort/meta.yml new file mode 100644 index 00000000..1dc25228 --- /dev/null +++ b/modules/nf-core/fgumi/sort/meta.yml @@ -0,0 +1,82 @@ +name: "fgumi_sort" +description: | + Sorts a SAM or BAM file. Several sort orders are available, including coordinate, + queryname, and template-coordinate. This is a high-performance replacement for fgbio SortBam. +keywords: + - sort + - bam + - sam +tools: + - "fgumi": + description: "High-performance tools for working with UMI-tagged sequencing data." + homepage: "https://github.com/fulcrumgenomics/fgumi" + documentation: "https://docs.rs/fgumi" + tool_dev_url: "https://github.com/fulcrumgenomics/fgumi" + licence: + - "MIT" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
[ id:'test', single_end:false ] + - bam: + type: file + description: | + The input SAM or BAM file to be sorted. + pattern: "*.{bam,sam}" + ontologies: + - edam: "http://edamontology.org/format_2572" +output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.bam": + type: file + description: | + Sorted output BAM file. + pattern: "*.bam" + ontologies: + - edam: "http://edamontology.org/format_2572" + index: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.{csi,bai}": + type: file + description: | + Index file if the bam file is coordinate sorted. + pattern: "*.{csi,bai}" + ontologies: + - edam: "http://edamontology.org/format_3327" + versions_fgumi: + - - ${task.process}: + type: string + description: The process the versions were collected from + - fgumi: + type: string + description: The tool name + - 'fgumi --version | sed "s/^fgumi //"': + type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - ${task.process}: + type: string + description: The process the versions were collected from + - fgumi: + type: string + description: The tool name + - 'fgumi --version | sed "s/^fgumi //"': + type: eval + description: The expression to obtain the version of the tool +authors: + - "@sppearce" +maintainers: + - "@sppearce" diff --git a/modules/nf-core/fgumi/sort/tests/main.nf.test b/modules/nf-core/fgumi/sort/tests/main.nf.test new file mode 100644 index 00000000..fc549bf5 --- /dev/null +++ b/modules/nf-core/fgumi/sort/tests/main.nf.test @@ -0,0 +1,89 @@ +nextflow_process { + + name "Test Process FGUMI_SORT" + script "../main.nf" + process "FGUMI_SORT" + + tag "modules" + tag "modules_nfcore" + tag "fgumi" + tag "fgumi/sort" + + test("sarscov2 - bam") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 
'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + ] + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + bam(process.out.bam.get(0).get(1)).getReadsMD5(), + process.out.findAll { key, val -> key.startsWith('versions') } + ).match() } + ) + } + + } + + test("sarscov2 - bam - template-coordinate") { + + when { + params { + module_args = '--order template-coordinate' + } + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + ] + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + bam(process.out.bam.get(0).get(1)).getReadsMD5(), + process.out.findAll { key, val -> key.startsWith('versions') } + ).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + ] + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/fgumi/sort/tests/main.nf.test.snap b/modules/nf-core/fgumi/sort/tests/main.nf.test.snap new file mode 100644 index 00000000..07a9c7ee --- /dev/null +++ b/modules/nf-core/fgumi/sort/tests/main.nf.test.snap @@ -0,0 +1,87 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_sorted.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + [ + "FGUMI_SORT", + "fgumi", + "0.2.0" + ] + ], + "bam": [ + [ + { + "id": "test" + }, + "test_sorted.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "index": [ + + ], + "versions_fgumi": [ + [ + "FGUMI_SORT", + "fgumi", + "0.2.0" + ] + ] + } + ], + "timestamp": "2026-06-04T13:47:11.834557157", + "meta": { + "nf-test": "0.9.5", + 
"nextflow": "26.04.0" + } + }, + "sarscov2 - bam": { + "content": [ + "461d8083b03a321eb1902ad544fd7d2f", + { + "versions_fgumi": [ + [ + "FGUMI_SORT", + "fgumi", + "0.2.0" + ] + ] + } + ], + "timestamp": "2026-06-04T13:47:00.183383916", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.0" + } + }, + "sarscov2 - bam - template-coordinate": { + "content": [ + "461d8083b03a321eb1902ad544fd7d2f", + { + "versions_fgumi": [ + [ + "FGUMI_SORT", + "fgumi", + "0.2.0" + ] + ] + } + ], + "timestamp": "2026-06-04T13:47:05.881895696", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.0" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/fgumi/sort/tests/nextflow.config b/modules/nf-core/fgumi/sort/tests/nextflow.config new file mode 100644 index 00000000..4ad67e9b --- /dev/null +++ b/modules/nf-core/fgumi/sort/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: FGUMI_SORT { + ext.args = { params.module_args } + } +} diff --git a/nextflow.config b/nextflow.config index 20e156f9..3af8b3db 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,6 +11,30 @@ params { igenomes_base = '/references/' igenomes_ignore = false + // Analysis options + split_fastq = 100000000 + genelists = null + + // UMI consensus (fgumi) options + fgumi_group_strategy = 'adjacency' + fgumi_group_edits = 1 + fgumi_simplex_min_reads = 1 + fgumi_queue_memory = 768 + fgumi_queue_memory_per_thread= true + fgumi_compression_level = 1 + fgumi_sort_max_memory = '2G' + fgumi_sort_memory_per_thread = true + fgumi_snap_ignore_mismatched_pairs = true + + // MultiQC options + multiqc_config = null + multiqc_title = null + multiqc_logo = null + max_multiqc_email_size = '25.MB' + multiqc_methods_description = null + + // Boilerplate options + outdir = null publish_dir_mode = 'copy' monochrome_logs = false hook_url = System.getenv('HOOK_URL') diff --git a/nextflow_schema.json b/nextflow_schema.json index 36936165..d3c99338 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json 
@@ -73,6 +73,58 @@ "exists": true, "format": "directory-path", "description": "Directory containing gene list bed files for granular coverage analysis" + }, + "fgumi_group_strategy": { + "type": "string", + "default": "adjacency", + "description": "UMI grouping strategy for fgumi group.", + "enum": ["identity", "edit", "adjacency", "paired"] + }, + "fgumi_group_edits": { + "type": "integer", + "default": 1, + "minimum": 0, + "description": "Maximum UMI edit distance used by fgumi group." + }, + "fgumi_simplex_min_reads": { + "type": "integer", + "default": 1, + "minimum": 1, + "description": "Minimum number of reads required per UMI family for fgumi simplex consensus generation." + }, + "fgumi_queue_memory": { + "type": "integer", + "default": 768, + "minimum": 64, + "description": "fgumi queue-memory budget in MB." + }, + "fgumi_queue_memory_per_thread": { + "type": "boolean", + "default": true, + "description": "Scale fgumi queue-memory by allocated thread count." + }, + "fgumi_compression_level": { + "type": "integer", + "default": 1, + "minimum": 0, + "maximum": 12, + "description": "Compression level for fgumi BAM outputs." + }, + "fgumi_sort_max_memory": { + "type": "string", + "default": "2G", + "pattern": "^\\d+(\\.\\d+)?[KMG]$", + "description": "Memory budget for fgumi sort, for example 2G or 768M." + }, + "fgumi_sort_memory_per_thread": { + "type": "boolean", + "default": true, + "description": "Scale fgumi sort memory by allocated thread count." + }, + "fgumi_snap_ignore_mismatched_pairs": { + "type": "boolean", + "default": true, + "description": "Pass -I to SNAP to ignore mismatched read IDs in paired-end input." 
} } }, diff --git a/subworkflows/local/fastq_to_aligned_cram/main.nf b/subworkflows/local/fastq_to_aligned_cram/main.nf index bf87f087..18be9689 100644 --- a/subworkflows/local/fastq_to_aligned_cram/main.nf +++ b/subworkflows/local/fastq_to_aligned_cram/main.nf @@ -13,6 +13,7 @@ include { SAMTOOLS_SORT } from "../../../modules/nf-core/samtools/sort/m // SUBWORKFLOWS include { FASTQ_ALIGN_DNA } from '../../nf-core/fastq_align_dna/main' include { FASTQ_ALIGN_RNA } from '../../local/fastq_align_rna/main' +include { UMI_CONSENSUS_FGUMI } from '../../local/umi_consensus/main.nf' // FUNCTIONS include { getGenomeAttribute } from '../../local/utils_nfcore_preprocessing_pipeline' @@ -42,12 +43,27 @@ workflow FASTQ_TO_CRAM { } .set { ch_meta_reads_aligner_index_fasta_datatype } - // align fastq files per sample - // ALIGNMENT([meta,fastq], index, sort) + ch_meta_reads_aligner_index_fasta_datatype.dna + .branch { meta, reads, aligner, index, fasta -> + // fgumi consensus is opt-in via fgumi_aware to avoid changing samtools umi_aware semantics. + umi: meta.fgumi_aware == true + return [meta, reads, aligner, index, fasta] + non_umi: true + return [meta, reads, aligner, index, fasta] + } + .set { ch_dna_to_align } + + // Align non-UMI DNA fastq files per sample FASTQ_ALIGN_DNA( - ch_meta_reads_aligner_index_fasta_datatype.dna, + ch_dna_to_align.non_umi, false, ) + + // UMI-aware fgumi branch (steps 1, 3, 4, 5, 6, 7 in fgumi Basic Workflow) + UMI_CONSENSUS_FGUMI( + ch_dna_to_align.umi + ) + FASTQ_ALIGN_RNA( ch_meta_reads_aligner_index_fasta_datatype.rna ) @@ -100,6 +116,18 @@ workflow FASTQ_TO_CRAM { ch_markdup_index = channel.empty() + // UMI branch outputs are mixed into the common markdup/metrics streams. 
+ ch_markdup_index = ch_markdup_index.mix( + UMI_CONSENSUS_FGUMI.out.cram_crai + ) + ch_sormadup_metrics = ch_sormadup_metrics.mix(UMI_CONSENSUS_FGUMI.out.grouping_metrics) + ch_sormadup_metrics = ch_sormadup_metrics.mix(UMI_CONSENSUS_FGUMI.out.family_size_histogram) + ch_sormadup_metrics = ch_sormadup_metrics.mix(UMI_CONSENSUS_FGUMI.out.consensus_metrics) + ch_sormadup_metrics = ch_sormadup_metrics.mix(UMI_CONSENSUS_FGUMI.out.filtering_metrics) + ch_family_size_histogram = UMI_CONSENSUS_FGUMI.out.family_size_histogram + ch_filtered_consensus_cram = UMI_CONSENSUS_FGUMI.out.filtered_consensus_cram + ch_zipper_diagnostics = UMI_CONSENSUS_FGUMI.out.zipper_diagnostics + // BIOBAMBAM_BAMSORMADUP([meta, [bam, bam]], fasta, fai) BIOBAMBAM_BAMSORMADUP(ch_bam_fasta_fai.bamsormadup) ch_markdup_index = ch_markdup_index.mix(BIOBAMBAM_BAMSORMADUP.out.bam.join(BIOBAMBAM_BAMSORMADUP.out.bam_index, failOnMismatch: true, failOnDuplicate: true)) @@ -149,8 +177,12 @@ workflow FASTQ_TO_CRAM { emit: cram_crai = ch_cram_crai + // UMI-specific output channels for downstream reporting and publishing. 
+ filtered_consensus_cram = ch_filtered_consensus_cram + zipper_diagnostics = ch_zipper_diagnostics rna_splice_junctions = FASTQ_ALIGN_RNA.out.splice_junctions rna_junctions = FASTQ_ALIGN_RNA.out.junctions sormadup_metrics = ch_sormadup_metrics + family_size_histogram = ch_family_size_histogram align_reports = FASTQ_ALIGN_DNA.out.reports } diff --git a/subworkflows/local/fgumi_snapzipper/main.nf b/subworkflows/local/fgumi_snapzipper/main.nf new file mode 100644 index 00000000..efc96188 --- /dev/null +++ b/subworkflows/local/fgumi_snapzipper/main.nf @@ -0,0 +1,54 @@ +#!/usr/bin/env nextflow + +// MODULES +include { FGUMI_SNAP_ALIGN } from "../../../modules/local/fgumi/snapalign/main.nf" +include { FGUMI_ZIPPER } from "../../../modules/local/fgumi/zipper/main.nf" +include { FGUMI_SORT as FGUMI_TEMPLATE_SORT } from "../../../modules/nf-core/fgumi/sort/main.nf" +include { SAMTOOLS_SORT as SAMTOOLS_QNAME_SORT_UNMAPPED } from "../../../modules/nf-core/samtools/sort/main.nf" +include { SAMTOOLS_SORT as SAMTOOLS_QNAME_SORT_MAPPED } from "../../../modules/nf-core/samtools/sort/main.nf" + +workflow FGUMI_SNAP_ZIPPER { + take: + ch_meta_unmapped_index_fasta_dict + + main: + FGUMI_SNAP_ALIGN(ch_meta_unmapped_index_fasta_dict) + + // Queryname sort the unmapped BAM in parallel with mapped BAM sort. + SAMTOOLS_QNAME_SORT_UNMAPPED( + FGUMI_SNAP_ALIGN.out.unmapped_bam + .join( + ch_meta_unmapped_index_fasta_dict.map { meta, _unmapped_bam, _index, fasta, _dict -> [meta, fasta] }, + by: 0, + ) + .map { meta, unmapped_bam, fasta -> [meta, unmapped_bam, fasta] }, + '' + ) + + // Sort mapped alignments by queryname and emit SAM for zipper stdin. 
+ SAMTOOLS_QNAME_SORT_MAPPED( + FGUMI_SNAP_ALIGN.out.mapped_bam + .join( + ch_meta_unmapped_index_fasta_dict.map { meta, _unmapped_bam, _index, fasta, _dict -> [meta, fasta] }, + by: 0, + ) + .map { meta, mapped_bam, fasta -> [meta, mapped_bam, fasta] }, + '' + ) + + FGUMI_ZIPPER( + SAMTOOLS_QNAME_SORT_MAPPED.out.sam + .join(SAMTOOLS_QNAME_SORT_UNMAPPED.out.bam, by: 0) + .join( + ch_meta_unmapped_index_fasta_dict.map { meta, _unmapped_bam, _index, fasta, dict -> [meta, fasta, dict] }, + by: 0, + ) + .map { meta, mapped_sam, unmapped_qname_bam, fasta, dict -> [meta, mapped_sam, unmapped_qname_bam, fasta, dict] } + ) + + FGUMI_TEMPLATE_SORT(FGUMI_ZIPPER.out.bam) + + emit: + bam = FGUMI_TEMPLATE_SORT.out.bam + versions_fgumi = FGUMI_SNAP_ALIGN.out.versions_fgumi.mix(FGUMI_ZIPPER.out.versions_fgumi).mix(FGUMI_TEMPLATE_SORT.out.versions_fgumi) +} diff --git a/subworkflows/local/umi_consensus/main.nf b/subworkflows/local/umi_consensus/main.nf new file mode 100644 index 00000000..2fa24401 --- /dev/null +++ b/subworkflows/local/umi_consensus/main.nf @@ -0,0 +1,81 @@ +#!/usr/bin/env nextflow + +// MODULES +include { FGUMI_EXTRACT } from "../../../modules/nf-core/fgumi/extract/main.nf" +include { FGUMI_FILTER } from "../../../modules/nf-core/fgumi/filter/main.nf" +include { FGUMI_GROUP } from "../../../modules/nf-core/fgumi/group/main.nf" +include { FGUMI_SIMPLEX } from "../../../modules/nf-core/fgumi/simplex/main.nf" +include { FGUMI_SNAP_ZIPPER } from "../fgumi_snapzipper/main.nf" +include { SAMTOOLS_SORT } from "../../../modules/nf-core/samtools/sort/main.nf" + +// FUNCTIONS +include { getGenomeAttribute } from '../../local/utils_nfcore_preprocessing_pipeline' + +workflow UMI_CONSENSUS_FGUMI { + take: + ch_meta_reads_aligner_index_fasta // channel: [mandatory] [meta, reads, aligner, index, fasta] + + main: + // Step numbers follow the fgumi basic workflow terminology (this path executes steps 1, 3, 4, 5, and 7). 
+ // Step 1: build an unmapped BAM with UMI tags from input FASTQ. + FGUMI_EXTRACT( + ch_meta_reads_aligner_index_fasta + .map { meta, reads, _aligner, _index, _fasta -> [meta, reads, (meta.readgroup?.LB ?: meta.library ?: meta.id)] } + ) + + // Step 3: align with SNAP, zipper tags back, then template-coordinate sort. + FGUMI_SNAP_ZIPPER( + FGUMI_EXTRACT.out.bam + .join( + ch_meta_reads_aligner_index_fasta.map { meta, _reads, _aligner, _index, fasta -> + [meta, getGenomeAttribute(meta.genome_data, 'snap'), fasta, getGenomeAttribute(meta.genome_data, 'dict')] + }, + by: 0, + ) + .map { meta, unmapped_bam, snap_index, fasta, dict -> [meta, unmapped_bam, snap_index, fasta, dict] } + ) + + FGUMI_GROUP( + FGUMI_SNAP_ZIPPER.out.bam, + (params.fgumi_group_strategy ?: 'adjacency') + ) + + FGUMI_SIMPLEX( + FGUMI_GROUP.out.bam, + (params.fgumi_simplex_min_reads ?: 1), + false + ) + + // Step 7: filter consensus reads, then coordinate-sort/index for downstream CRAM conversion. + FGUMI_FILTER( + FGUMI_SIMPLEX.out.bam, + FGUMI_SIMPLEX.out.bam + .join( + ch_meta_reads_aligner_index_fasta.map { meta, _reads, _aligner, _index, fasta -> [meta, fasta] }, + by: 0, + ) + .map { meta, _bam, fasta -> [meta, fasta] }, + "1,1,1", + false + ) + + SAMTOOLS_SORT( + FGUMI_FILTER.out.bam + .join( + ch_meta_reads_aligner_index_fasta.map { meta, _reads, _aligner, _index, fasta -> [meta, fasta] }, + by: 0, + ) + .map { meta, bam, fasta -> [meta, bam, fasta] }, + "crai" + ) + + emit: + cram_crai = SAMTOOLS_SORT.out.cram.join(SAMTOOLS_SORT.out.crai, failOnMismatch: true, failOnDuplicate: true) + // Compatibility output kept for downstream interfaces; currently not produced by this branch. 
+ zipper_diagnostics = channel.empty() + grouping_metrics = FGUMI_GROUP.out.metrics + family_size_histogram = FGUMI_GROUP.out.histogram + consensus_metrics = FGUMI_SIMPLEX.out.stats + filtering_metrics = FGUMI_FILTER.out.stats + filtered_consensus_cram = SAMTOOLS_SORT.out.cram +} diff --git a/tests/config/snapalign_custom_prefix.config b/tests/config/snapalign_custom_prefix.config new file mode 100644 index 00000000..21d053f1 --- /dev/null +++ b/tests/config/snapalign_custom_prefix.config @@ -0,0 +1,5 @@ +process { + withName: 'FGUMI_SNAP_ALIGN' { + ext.prefix = 'snapcustom' + } +} diff --git a/tests/inputs/fgumi/snap_index/OverflowTable.txt b/tests/inputs/fgumi/snap_index/OverflowTable.txt new file mode 100644 index 00000000..48cdce85 --- /dev/null +++ b/tests/inputs/fgumi/snap_index/OverflowTable.txt @@ -0,0 +1 @@ +placeholder diff --git a/tests/inputs/test.yml b/tests/inputs/test.yml index 3a432b4b..9f2ca977 100644 --- a/tests/inputs/test.yml +++ b/tests/inputs/test.yml @@ -49,3 +49,18 @@ run_coverage: true fastq_1: https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/fastq/test_R1.fastq.gz fastq_2: https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/fastq/test_R2.fastq.gz +# UMI consensus (fgumi) inputs +# Example DNA sample with fgumi_aware enabled for fgumi processing. 
+- id: sample1 + samplename: sample1_chr21 + library: test_library + organism: Homo sapiens + tag: WES + sample_type: DNA + aligner: snap + markdup: bamsormadup + fgumi_aware: true + run_coverage: true + fastq_1: https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/fastq/sample1_S31_R1_001.fastq.gz + fastq_2: https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/fastq/sample1_S31_R2_001.fastq.gz + diff --git a/tests/modules/local/fgumi/snapalign/main.nf.test b/tests/modules/local/fgumi/snapalign/main.nf.test new file mode 100644 index 00000000..c3f47619 --- /dev/null +++ b/tests/modules/local/fgumi/snapalign/main.nf.test @@ -0,0 +1,81 @@ +nextflow_process { + + name "Test Process FGUMI_SNAP_ALIGN" + script "modules/local/fgumi/snapalign/main.nf" + process "FGUMI_SNAP_ALIGN" + + tag "modules" + tag "modules/local" + tag "modules/local/fgumi" + tag "modules/local/fgumi/snapalign" + + test("test - stub") { + // Stub-mode contract test: snapshots the mapped_bam, unmapped_bam and versions_fgumi outputs. 
+ options "-stub" + + when { + process { + """ + input[0] = [ + [id: "test", samplename: "test"], + file("https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/bam/unmapped.bam", checkIfExists: true), + file("${projectDir}/tests/inputs/fgumi/snap_index", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + } + + test("test - stub alt meta") { + options "-stub" + + when { + process { + """ + input[0] = [ + [id: "sample_alt", samplename: "sample_alt"], + file("https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/bam/unmapped.bam", checkIfExists: true), + file("${projectDir}/tests/inputs/fgumi/snap_index", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + } + + test("test - stub custom prefix") { + options "-stub -c ${projectDir}/tests/config/snapalign_custom_prefix.config" + + when { + process { + """ + input[0] = [ + [id: "prefix_case", samplename: "prefix_case"], + file("https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/bam/unmapped.bam", checkIfExists: true), + 
file("${projectDir}/tests/inputs/fgumi/snap_index", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + } +} diff --git a/tests/modules/local/fgumi/snapalign/main.nf.test.snap b/tests/modules/local/fgumi/snapalign/main.nf.test.snap new file mode 100644 index 00000000..e9314c16 --- /dev/null +++ b/tests/modules/local/fgumi/snapalign/main.nf.test.snap @@ -0,0 +1,185 @@ +{ + "test - stub alt meta": { + "content": [ + { + "0": [ + [ + { + "id": "sample_alt", + "samplename": "sample_alt" + }, + "sample_alt.fgumi.snap.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "sample_alt", + "samplename": "sample_alt" + }, + "unmapped.bam:md5,329308690118ca17867c78cc61a0fab0" + ] + ], + "2": [ + [ + "FGUMI_SNAP_ALIGN", + "fgumi", + "0.1.2" + ] + ], + "mapped_bam": [ + [ + { + "id": "sample_alt", + "samplename": "sample_alt" + }, + "sample_alt.fgumi.snap.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "unmapped_bam": [ + [ + { + "id": "sample_alt", + "samplename": "sample_alt" + }, + "unmapped.bam:md5,329308690118ca17867c78cc61a0fab0" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_SNAP_ALIGN", + "fgumi", + "0.1.2" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.4" + }, + "timestamp": "2026-06-19T14:48:47.467294268" + }, + "test - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "samplename": "test" + }, + "test.fgumi.snap.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "samplename": "test" + }, + 
"unmapped.bam:md5,329308690118ca17867c78cc61a0fab0" + ] + ], + "2": [ + [ + "FGUMI_SNAP_ALIGN", + "fgumi", + "0.1.2" + ] + ], + "mapped_bam": [ + [ + { + "id": "test", + "samplename": "test" + }, + "test.fgumi.snap.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "unmapped_bam": [ + [ + { + "id": "test", + "samplename": "test" + }, + "unmapped.bam:md5,329308690118ca17867c78cc61a0fab0" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_SNAP_ALIGN", + "fgumi", + "0.1.2" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-02T16:05:22.614091027" + }, + "test - stub custom prefix": { + "content": [ + { + "0": [ + [ + { + "id": "prefix_case", + "samplename": "prefix_case" + }, + "snapcustom.snap.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "prefix_case", + "samplename": "prefix_case" + }, + "unmapped.bam:md5,329308690118ca17867c78cc61a0fab0" + ] + ], + "2": [ + [ + "FGUMI_SNAP_ALIGN", + "fgumi", + "0.1.2" + ] + ], + "mapped_bam": [ + [ + { + "id": "prefix_case", + "samplename": "prefix_case" + }, + "snapcustom.snap.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "unmapped_bam": [ + [ + { + "id": "prefix_case", + "samplename": "prefix_case" + }, + "unmapped.bam:md5,329308690118ca17867c78cc61a0fab0" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_SNAP_ALIGN", + "fgumi", + "0.1.2" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.4" + }, + "timestamp": "2026-06-19T14:59:18.039374313" + } +} \ No newline at end of file diff --git a/tests/modules/local/fgumi/zipper/main.nf.test b/tests/modules/local/fgumi/zipper/main.nf.test new file mode 100644 index 00000000..0fe82acc --- /dev/null +++ b/tests/modules/local/fgumi/zipper/main.nf.test @@ -0,0 +1,34 @@ +nextflow_process { + + name "Test Process FGUMI_ZIPPER" + script "modules/local/fgumi/zipper/main.nf" + process "FGUMI_ZIPPER" + + tag "modules" + tag "modules/local" + tag "modules/local/fgumi" + tag 
"modules/local/fgumi/zipper" + + test("test - stub") { + options "-stub" + + when { + process { + """ + input[0] = [ + [id: "test", samplename: "test"], + file("https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/bam/unmapped.bam", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/bam/unmapped.bam", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + } +} diff --git a/tests/modules/local/fgumi/zipper/main.nf.test.snap b/tests/modules/local/fgumi/zipper/main.nf.test.snap new file mode 100644 index 00000000..dd22e5d2 --- /dev/null +++ b/tests/modules/local/fgumi/zipper/main.nf.test.snap @@ -0,0 +1,45 @@ +{ + "test - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "samplename": "test" + }, + "test.fgumi.zipper.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + "FGUMI_ZIPPER", + "fgumi", + "0.1.2" + ] + ], + "bam": [ + [ + { + "id": "test", + "samplename": "test" + }, + "test.fgumi.zipper.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_ZIPPER", + "fgumi", + "0.1.2" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.4" + }, + "timestamp": "2026-06-17T00:00:00.000000000" + } +} diff --git a/tests/subworkflows/local/fastq_to_aligned_cram/fgumi_umi_stub.nf.test b/tests/subworkflows/local/fastq_to_aligned_cram/fgumi_umi_stub.nf.test new file mode 100644 index 00000000..53aa2463 --- /dev/null +++ 
b/tests/subworkflows/local/fastq_to_aligned_cram/fgumi_umi_stub.nf.test @@ -0,0 +1,56 @@ +nextflow_workflow { + + name "Test Workflow FASTQ_TO_CRAM UMI fgumi stub" + script "subworkflows/local/fastq_to_aligned_cram/main.nf" + workflow "FASTQ_TO_CRAM" + + tag "subworkflows" + tag "subworkflows/local" + tag "subworkflows/local/fastq_to_aligned_cram" + tag "fgumi" + + test("fastq to cram - fgumi umi-aware - stub") { + // End-to-end UMI branch contract test in stub mode. + options "-stub" + when { + workflow { + """ + input[0] = Channel.of([ + [ + id: "UMI_consensus1", + samplename: "HT1080_chr20", + single_end: false, + sample_type: "DNA", + markdup: "bamsormadup", + fgumi_aware: true, + genome_data: [ + fasta: "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + fai: "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + dict: "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + snap: "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + ] + ], + [ + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/illumina/fastq/sample1_R1.fastq.gz", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/illumina/fastq/sample1_R2.fastq.gz", checkIfExists: true) + ], + "snap", + file("s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/", checkIfExists: true), + file("s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", checkIfExists: true), + [] + ]) + """ + } + } + + then { + assert workflow.success + // Explicitly assert newly exposed UMI channels. 
+ assert workflow.out.family_size_histogram.size() == 1 + assert workflow.out.filtered_consensus_cram.size() == 1 + assert snapshot( + sanitizeOutput(workflow.out, unstableKeys:["cram_crai"]) + ).match() + } + } +} diff --git a/tests/subworkflows/local/fastq_to_aligned_cram/fgumi_umi_stub.nf.test.snap b/tests/subworkflows/local/fastq_to_aligned_cram/fgumi_umi_stub.nf.test.snap new file mode 100644 index 00000000..a247c372 --- /dev/null +++ b/tests/subworkflows/local/fastq_to_aligned_cram/fgumi_umi_stub.nf.test.snap @@ -0,0 +1,162 @@ +{ + "fastq to cram - fgumi umi-aware - stub": { + "content": [ + { + "align_reports": [ + + ], + "cram_crai": [ + [ + { + "fgumi_aware": true, + "genome_data": { + "dict": "ref.dict", + "fai": "ref.fa.fai", + "fasta": "ref.fa", + "snap": "snap_index" + }, + "id": "UMI_consensus1", + "markdup": "bamsormadup", + "sample_type": "DNA", + "samplename": "HT1080_chr20", + "single_end": false + }, + "UMI_consensus1.cram", + "UMI_consensus1.cram.crai" + ] + ], + "family_size_histogram": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.group.family_size_histogram.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "filtered_consensus_cram": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + 
"OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.filter.cram:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "rna_junctions": [ + + ], + "rna_splice_junctions": [ + + ], + "sormadup_metrics": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.filter.filtering_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.group.family_size_histogram.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.group.grouping_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": 
"bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.simplex.consensus_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-09T10:20:21.479979029" + } +} \ No newline at end of file diff --git a/tests/subworkflows/local/fastq_to_aligned_cram/main.nf.test b/tests/subworkflows/local/fastq_to_aligned_cram/main.nf.test index f3082fc3..49b29e89 100644 --- a/tests/subworkflows/local/fastq_to_aligned_cram/main.nf.test +++ b/tests/subworkflows/local/fastq_to_aligned_cram/main.nf.test @@ -224,4 +224,50 @@ nextflow_workflow { ) } } + + test("fastq to cram - fgumi umi-aware - stub") { + options "-stub" + when { + workflow { + """ + // [meta, [fq_1,fq_2], aligner, index, fasta] + input[0] = Channel.of([ + [ + id: "UMI_consensus1", + samplename: "HT1080_chr20", + single_end: false, + sample_type: "DNA", + markdup: "bamsormadup", + fgumi_aware: true, + genome_data: [ + fasta: "${projectDir}/tests/inputs/fgumi/ref.fa", + fai: "${projectDir}/tests/inputs/fgumi/ref.fa.fai", + dict: "${projectDir}/tests/inputs/fgumi/ref.dict", + snap: "${projectDir}/tests/inputs/fgumi/snap_index" + ] + ], + [ + file("${projectDir}/tests/inputs/fgumi/R1.fastq.gz", checkIfExists: true), + file("${projectDir}/tests/inputs/fgumi/R2.fastq.gz", checkIfExists: true) + ], + "snap", + file("${projectDir}/tests/inputs/fgumi/snap_index", checkIfExists: true), + file("${projectDir}/tests/inputs/fgumi/ref.fa", checkIfExists: true), + [] + ]) + """ + } + } + + then { + assertAll( + { + assert workflow.success + assert snapshot( + sanitizeOutput(workflow.out, unstableKeys:["cram_crai"]) + ).match() + } + ) + } + } } diff --git
a/tests/subworkflows/local/umi_consensus/main.nf.test b/tests/subworkflows/local/umi_consensus/main.nf.test new file mode 100644 index 00000000..f0648c8f --- /dev/null +++ b/tests/subworkflows/local/umi_consensus/main.nf.test @@ -0,0 +1,55 @@ +nextflow_workflow { + + name "Test Workflow UMI_CONSENSUS_FGUMI" + script "subworkflows/local/umi_consensus/main.nf" + workflow "UMI_CONSENSUS_FGUMI" + + tag "subworkflows" + tag "subworkflows/local" + tag "subworkflows/local/umi_consensus" + tag "fgumi" + + test("umi consensus fgumi - stub") { + options "-stub" + when { + workflow { + """ + input[0] = Channel.of([ + [ + id: "UMI_consensus1", + samplename: "HT1080_chr20", + single_end: false, + sample_type: "DNA", + markdup: "bamsormadup", + fgumi_aware: true, + genome_data: [ + fasta: "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + fai: "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + dict: "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + snap: "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + ] + ], + [ + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/illumina/fastq/sample1_R1.fastq.gz", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/illumina/fastq/sample1_R2.fastq.gz", checkIfExists: true) + ], + "snap", + file("s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/", checkIfExists: true), + file("s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", checkIfExists: true) + ]) + """ + } + } + + then { + assert workflow.success + assert workflow.out.cram_crai.size() == 1 + assert workflow.out.grouping_metrics.size() == 1 + assert workflow.out.family_size_histogram.size() == 1 + assert 
workflow.out.consensus_metrics.size() == 1 + assert workflow.out.filtering_metrics.size() == 1 + assert workflow.out.filtered_consensus_cram.size() == 1 + assert snapshot(workflow.out).match() + } + } +} diff --git a/tests/subworkflows/local/umi_consensus/main.nf.test.snap b/tests/subworkflows/local/umi_consensus/main.nf.test.snap new file mode 100644 index 00000000..cdb95716 --- /dev/null +++ b/tests/subworkflows/local/umi_consensus/main.nf.test.snap @@ -0,0 +1,265 @@ +{ + "umi consensus fgumi - stub": { + "content": [ + { + "0": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + "fai": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + "dict": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + "snap": "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + } + }, + "UMI_consensus1.fgumi.filter.sorted.cram:md5,d41d8cd98f00b204e9800998ecf8427e", + "UMI_consensus1.fgumi.filter.sorted.cram.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + "fai": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + "dict": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + "snap": "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + 
} + }, + "UMI_consensus1.fgumi.group.grouping_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + "fai": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + "dict": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + "snap": "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + } + }, + "UMI_consensus1.fgumi.group.grouping_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + "fai": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + "dict": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + "snap": "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + } + }, + "UMI_consensus1.fgumi.group.family_size_histogram.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + "fai": 
"s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + "dict": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + "snap": "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + } + }, + "UMI_consensus1.fgumi.simplex.stats.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + "fai": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + "dict": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + "snap": "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + } + }, + "UMI_consensus1.fgumi.filter.stats.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + "fai": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + "dict": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + "snap": "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + } + }, + "UMI_consensus1.fgumi.filter.sorted.cram:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "consensus_metrics": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": 
false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + "fai": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + "dict": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + "snap": "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + } + }, + "UMI_consensus1.fgumi.simplex.stats.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cram_crai": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + "fai": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + "dict": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + "snap": "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + } + }, + "UMI_consensus1.fgumi.filter.sorted.cram:md5,d41d8cd98f00b204e9800998ecf8427e", + "UMI_consensus1.fgumi.filter.sorted.cram.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "family_size_histogram": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + "fai": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + "dict": 
"s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + "snap": "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + } + }, + "UMI_consensus1.fgumi.group.family_size_histogram.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "filtered_consensus_cram": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + "fai": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + "dict": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + "snap": "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + } + }, + "UMI_consensus1.fgumi.filter.sorted.cram:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "filtering_metrics": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + "fai": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + "dict": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + "snap": "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + } + }, + "UMI_consensus1.fgumi.filter.stats.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "grouping_metrics": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": 
true, + "genome_data": { + "fasta": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + "fai": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + "dict": "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + "snap": "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + } + }, + "UMI_consensus1.fgumi.group.grouping_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "zipper_diagnostics": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.4" + }, + "timestamp": "2026-06-19T09:47:14.672939014" + } +} \ No newline at end of file diff --git a/workflows/preprocessing.nf b/workflows/preprocessing.nf index 46c93597..fc8549f0 100644 --- a/workflows/preprocessing.nf +++ b/workflows/preprocessing.nf @@ -271,7 +271,9 @@ workflow PREPROCESSING { FASTQ_TO_CRAM( ch_meta_reads_aligner_index_fasta_gtf ) + // Collect both standard and UMI-specific metrics for MultiQC. ch_multiqc_files = ch_multiqc_files.mix(FASTQ_TO_CRAM.out.sormadup_metrics) + ch_multiqc_files = ch_multiqc_files.mix(FASTQ_TO_CRAM.out.family_size_histogram) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -328,7 +330,6 @@ workflow PREPROCESSING { BAM_QC.out.samtools_idxstats, BAM_QC.out.picard_multiplemetrics, BAM_QC.out.picard_wgsmetrics, - BAM_QC.out.picard_wgsmetrics, BAM_QC.out.picard_hsmetrics, ) @@ -432,6 +433,10 @@ workflow PREPROCESSING { rna_junctions = FASTQ_TO_CRAM.out.rna_junctions align_reports = FASTQ_TO_CRAM.out.align_reports sormadup_metrics = FASTQ_TO_CRAM.out.sormadup_metrics + // UMI-specific outputs exposed at workflow level. 
+ family_size_histogram = FASTQ_TO_CRAM.out.family_size_histogram + umi_filtered_consensus_cram = FASTQ_TO_CRAM.out.filtered_consensus_cram + umi_zipper_diagnostics = FASTQ_TO_CRAM.out.zipper_diagnostics mosdepth_global = COVERAGE.out.mosdepth_global mosdepth_summary = COVERAGE.out.mosdepth_summary mosdepth_regions = COVERAGE.out.mosdepth_regions