nf-cmgg · freerkvandijk · Apr 7, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/README.md b/README.md
@@ -15,6 +15,7 @@
 
 **nf-cmgg/preprocessing** is a bioinformatics pipeline that demultiplexes and aligns raw sequencing data.
 It also performs basic QC and coverage analysis.
+The pipeline also includes an optional per-sample FGUMI consensus branch for UMI-aware DNA processing.
 
 The pipeline is built using Nextflow, a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible.
 
@@ -24,6 +25,7 @@ Steps include:
 - Run QC using [`MultiQC SAV`](https://github.com/MultiQC/MultiQC_SAV)
 - Read QC and trimming using [`fastp`](https://github.com/OpenGene/fastp) or [`falco`](https://github.com/smithlabcode/falco)
 - Alignment using either [`bwa`](https://github.com/lh3/bwa), [`bwa-mem2`](https://github.com/bwa-mem2/bwa-mem2), [`bowtie2`](https://github.com/BenLangmead/bowtie2), [`dragmap`](https://github.com/Illumina/DRAGMAP), [`snap`](https://github.com/amplab/snap) or [`strobe`](https://github.com/ksahlin/strobealign) for DNA-seq and [`STAR`](https://github.com/alexdobin/STAR) for RNA-seq
+- Optional FGUMI consensus branch for `fgumi_aware` DNA samples (`extract -> snap/zipper -> group -> simplex -> filter -> sort`)
 - Duplicate marking using [`bamsormadup`](https://gitlab.com/german.tischler/biobambam2) or [`samtools markdup`](http://www.htslib.org/doc/samtools-markdup.html)
 - Coverage analysis using [`mosdepth`](https://github.com/brentp/mosdepth) and [`samtools coverage`](http://www.htslib.org/doc/samtools-coverage.html)
 - Alignment QC using [`samtools flagstat`](http://www.htslib.org/doc/samtools-flagstat.html), [`samtools stats`](http://www.htslib.org/doc/samtools-stats.html), [`samtools idxstats`](http://www.htslib.org/doc/samtools-idxstats.html) and [`picard CollectHsMetrics`](https://broadinstitute.github.io/picard/command-line-overview.html#CollectHsMetrics), [`picard CollectWgsMetrics`](https://broadinstitute.github.io/picard/command-line-overview.html#CollectWgsMetrics), [`picard CollectMultipleMetrics`](https://broadinstitute.github.io/picard/command-line-overview.html#CollectMultipleMetrics)

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -53,6 +53,24 @@
                 "description": "Run markdup in UMI-aware mode. This applies to Samtools only and requires the UMI to be in the read name.",
                 "default": false
             },
+            "fgumi_aware": {
+                "meta": ["fgumi_aware"],
+                "type": "boolean",
+                "description": "Enable UMI-aware consensus processing through the fgumi branch.",
+                "default": false
+            },
+            "fgumi_read_structures": {
+                "meta": ["fgumi_read_structures"],
+                "type": "string",
+                "description": "Read structures passed to fgumi extract for this sample. When unset, `+T +T` is used.",
+                "default": null
+            },
+            "fgumi_extract_umis_from_read_names": {
+                "meta": ["fgumi_extract_umis_from_read_names"],
+                "type": "boolean",
+                "description": "Whether fgumi extract takes UMIs from the read names for this sample. When unset, extraction from read names is enabled.",
+                "default": null
+            },
             "skip_trimming": {
                 "meta": ["skip_trimming"],
                 "type": "boolean",

diff --git a/assets/schema_sampleinfo.json b/assets/schema_sampleinfo.json
@@ -93,6 +93,24 @@
                 "description": "Run markdup in UMI-aware mode. This applies to Samtools only and requires the UMI to be in the read name.",
                 "default": false
             },
+            "fgumi_aware": {
+                "meta": ["fgumi_aware"],
+                "type": "boolean",
+                "description": "Enable UMI-aware consensus processing through the fgumi branch.",
+                "default": false
+            },
+            "fgumi_read_structures": {
+                "meta": ["fgumi_read_structures"],
+                "type": "string",
+                "description": "Read structures passed to fgumi extract for this sample. When unset, `+T +T` is used.",
+                "default": null
+            },
+            "fgumi_extract_umis_from_read_names": {
+                "meta": ["fgumi_extract_umis_from_read_names"],
+                "type": "boolean",
+                "description": "Whether fgumi extract takes UMIs from the read names for this sample. When unset, extraction from read names is enabled.",
+                "default": null
+            },
             "skip_trimming": {
                 "meta": ["skip_trimming"],
                 "type": "boolean",

diff --git a/conf/modules.config b/conf/modules.config
@@ -231,6 +231,163 @@ process {
         }
     }
 
+    //// FGUMI extract (step 1): builds read-group, read-structure and UMI-extraction arguments per sample
+    withName: '.*FGUMI_EXTRACT' {
+        cpus       = 4
+        memory     = 16.GB
+        ext.prefix = { "${meta.id}.fgumi.unmapped" }
+        ext.args   = {
+            [
+                "--read-group-id ${meta.readgroup?.ID ?: meta.id}", // sample id is used when the readgroup has no ID
+                meta.readgroup?.PL ? "--platform ${meta.readgroup.PL}" : "", // optional readgroup fields below are only passed when set
+                meta.readgroup?.PU ? "--platform-unit \"${meta.readgroup.PU}\"" : "",
+                meta.readgroup?.PM ? "--platform-model \"${meta.readgroup.PM}\"" : "",
+                meta.readgroup?.CN ? "--sequencing-center \"${meta.readgroup.CN}\"" : "",
+                meta.readgroup?.PI ? "--predicted-insert-size ${meta.readgroup.PI}" : "",
+                meta.readgroup?.DS ? "--description \"${meta.readgroup.DS}\"" : "",
+                meta.readgroup?.DT ? "--run-date \"${meta.readgroup.DT}\"" : "",
+                "--read-structures ${meta.fgumi_read_structures ?: '+T +T'}", // '+T +T' applies when the sample provides no read structures
+                ((meta.fgumi_extract_umis_from_read_names != null ? meta.fgumi_extract_umis_from_read_names : true) ? "--extract-umis-from-read-names" : ""), // on unless the sample explicitly sets the flag to false
+                "--compression-level ${params.fgumi_compression_level}",
+            ].join(" ").trim()
+        }
+    }
+
+    //// SNAP alignment inside FGUMI_SNAP_ZIPPER (step 3a); the zipper itself is configured separately as step 3d
+    withName: '.*FGUMI_SNAP_ZIPPER:FGUMI_SNAP_ALIGN' {
+        cpus       = 16
+        memory     = 64.GB
+        ext.prefix = { "${meta.id}.fgumi" }
+        ext.args   = {
+            [
+                "-b-",
+                "-sm 20",
+                params.fgumi_snap_ignore_mismatched_pairs ? "-I" : "", // -I only when params.fgumi_snap_ignore_mismatched_pairs is enabled
+                "-hc-",
+                "-S id",
+                "-sa",
+                "-xf 2",
+                meta.readgroup ? "-R \"@RG\\t" + meta.readgroup.findResults { rg -> rg.value?.trim() ? "${rg.key}:${rg.value}" : null }.join("\\t") + "\"" : "", // @RG line from the non-blank readgroup entries
+            ].join(" ").trim()
+        }
+    }
+
+    //// Queryname sort of the unmapped BAM so it can be paired with the mapped stream by the zipper (step 3b)
+    withName: '.*FGUMI_SNAP_ZIPPER:SAMTOOLS_QNAME_SORT_UNMAPPED' {
+        cpus       = 16
+        memory     = 64.GB
+        ext.prefix = { "${meta.id}.fgumi.unmapped.queryname" }
+        ext.args   = {
+            [
+                "-n", // sort by read name instead of coordinate
+            ].join(" ").trim()
+        }
+    }
+
+    //// Queryname sort of the SNAP-mapped stream before zipper (step 3c)
+    withName: '.*FGUMI_SNAP_ZIPPER:SAMTOOLS_QNAME_SORT_MAPPED' {
+        cpus       = 16
+        memory     = 64.GB
+        ext.prefix = { "${meta.id}.fgumi.snap.queryname" }
+        ext.args   = {
+            [
+                "-n", // sort by read name instead of coordinate
+                "--output-fmt sam", // output is requested as SAM text rather than a compressed format
+            ].join(" ").trim()
+        }
+    }
+
+    //// FGUMI zipper over the two queryname-sorted streams (step 3d)
+    withName: '.*FGUMI_SNAP_ZIPPER:FGUMI_ZIPPER' {
+        cpus       = 16
+        memory     = 64.GB
+        ext.prefix = { "${meta.id}.fgumi" }
+        ext.args   = {
+            [
+                "--threads ${task.cpus}", // thread count follows the cpus directive of this selector
+            ].join(" ").trim()
+        }
+    }
+
+    //// FGUMI template-coordinate sort after zipper (step 3e)
+    withName: '.*FGUMI_SNAP_ZIPPER:FGUMI_TEMPLATE_SORT' {
+        cpus       = 16
+        memory     = 64.GB
+        ext.prefix = { "${meta.id}.fgumi.template" }
+        ext.args   = {
+            [
+                "--order template-coordinate", // NOTE(review): unlike FGUMI_SORT, no --threads is passed here although 16 cpus are requested — confirm the module sets it
+                "--max-memory ${params.fgumi_sort_max_memory}",
+                "--memory-per-thread=${params.fgumi_sort_memory_per_thread ? 'true' : 'false'}", // always rendered as an explicit true/false value
+            ].join(" ").trim()
+        }
+    }
+
+    //// FGUMI group (step 4): per-sample meta overrides win, otherwise pipeline-wide settings apply
+    withName: '.*FGUMI_GROUP' {
+        cpus       = 8
+        memory     = 32.GB
+        ext.prefix = { "${meta.id}.fgumi.group" }
+        ext.args   = {
+            [
+                "--edits ${meta.fgumi_group_edits != null ? meta.fgumi_group_edits : 1}", // per-sample override, otherwise 1
+                "--compression-level ${meta.fgumi_compression_level != null ? meta.fgumi_compression_level : params.fgumi_compression_level}", // falls back to the same parameter FGUMI_EXTRACT and FGUMI_SIMPLEX use instead of a hard-coded 1
+            ].join(" ").trim()
+        }
+    }
+
+    //// FGUMI simplex consensus calling (step 5)
+    withName: '.*FGUMI_SIMPLEX' {
+        cpus       = 8
+        memory     = 32.GB
+        ext.prefix = { "${meta.id}.fgumi.simplex" }
+        ext.args   = {
+            [
+                "--queue-memory ${params.fgumi_queue_memory}",
+                params.fgumi_queue_memory_per_thread ? "--queue-memory-per-thread" : "", // bare flag, only added when the parameter is enabled
+                "--compression-level ${params.fgumi_compression_level}",
+            ].join(" ").trim()
+        }
+    }
+
+    //// FGUMI filter (step 7a); sorting and indexing are configured in separate selectors
+    withName: '.*FGUMI_FILTER' {
+        cpus       = 4
+        memory     = 16.GB
+        ext.prefix = { "${meta.id}.fgumi.filter" }
+        ext.args   = '' // no additional arguments are passed from this config
+    }
+
+    //// FGUMI coordinate sort/index after filter (step 7)
+    withName: '.*FGUMI_SORT' {
+        cpus       = 8
+        memory     = 32.GB
+        ext.prefix = { "${meta.id}.fgumi.filter" } // NOTE(review): same prefix as FGUMI_FILTER — if this process consumes the filter output, input and output names may collide; also confirm this selector is still wired now that SAMTOOLS_SORT handles step 7b
+        ext.args   = {
+            [
+                "--order coordinate",
+                "--write-index",
+                "--threads ${task.cpus}", // thread count follows the cpus directive of this selector
+                "--max-memory ${params.fgumi_sort_max_memory}",
+                "--memory-per-thread=${params.fgumi_sort_memory_per_thread ? 'true' : 'false'}", // always rendered as an explicit true/false value
+            ].join(" ").trim()
+        }
+    }
+
+    //// Samtools sort of the filtered consensus reads, written directly as indexed CRAM (step 7b)
+    withName: '.*UMI_CONSENSUS_FGUMI:SAMTOOLS_SORT' {
+        cpus       = 8
+        memory     = 32.GB
+        ext.prefix = { "${meta.id}.fgumi.filter.sorted" }
+        ext.args   = {
+            [
+                "--write-index", // index is produced alongside the sorted output
+                "--output-fmt cram,version=3.0", // output format is pinned to CRAM 3.0
+                "--output-fmt-option archive",
+            ].join(" ").trim()
+        }
+    }
+
     // coverage
     //// Mosdepth
     withName: '.*COVERAGE:MOSDEPTH' {

diff --git a/docs/README.md b/docs/README.md
@@ -4,6 +4,8 @@ The nf-cmgg/preprocessing documentation is split into the following pages:
 
 - [Usage](usage.md)
   - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags.
+- [Analysis flow](analysis_flow.md)
+  - Mermaid overview of the fgumi-aware analysis path and its integration into the FASTQ to CRAM workflow.
 - [Output](output.md)
   - An overview of the different results produced by the pipeline and how to interpret them.
 - [Parameters](parameters.md)

diff --git a/docs/analysis_flow.md b/docs/analysis_flow.md
@@ -0,0 +1,68 @@
+# nf-cmgg/preprocessing: Analysis Flow
+
+## FGUMI-aware DNA flow (new branch)
+
+The diagram below summarizes the current fgumi-aware branch and how it joins the common FASTQ to CRAM path.
+It reflects the current wiring in `FASTQ_TO_CRAM` and `UMI_CONSENSUS_FGUMI`, including the early branching and data joins.
+
+```mermaid
+flowchart TD
+  A[Input channel: meta + reads + aligner + index + fasta + gtf] --> B{sample_type}
+  B -->|RNA| R1[FASTQ_ALIGN_RNA]
+  B -->|DNA| C{meta.fgumi_aware == true}
+
+  C -->|false| D1[FASTQ_ALIGN_DNA non-UMI]
+  C -->|true| U0[Enter UMI_CONSENSUS_FGUMI]
+
+    subgraph UMI_CONSENSUS_FGUMI
+      U1["Step 1: FGUMI_EXTRACT\n(reads -> unmapped BAM with UMI tags)"]
+      U1J[Join with reference assets\nSNAP index + fasta + dict from meta.genome_data]
+      U2[Step 3: FGUMI_SNAP_ZIPPER]
+      U2a[samtools sort -n\nunmapped BAM]
+      U2b[fgumi fastq]
+      U2c[snap-aligner paired]
+      U2d[samtools sort -n\npost-SNAP]
+      U2e[fgumi zipper]
+      U2f[fgumi sort --order template-coordinate]
+      U3[Step 4: FGUMI_GROUP]
+      U4[Step 5: FGUMI_SIMPLEX]
+      U5J[Join simplex BAM with fasta]
+      U5[Step 7a: FGUMI_FILTER]
+      U6[Step 7b: SAMTOOLS_SORT\ncoordinate sort + CRAM index]
+
+      U1 --> U1J --> U2
+      U2 --> U2a --> U2b --> U2c --> U2d --> U2e --> U2f
+      U2f --> U3 --> U4 --> U5J --> U5 --> U6
+    end
+
+    D1 --> M1["Markdup branch selector\nbamsormadup | samtools | sort"]
+    R1 --> M1
+
+    U6 --> MIX1[Mix UMI CRAM/CRAI into common postprocess stream]
+    U3 --> MET1[grouping_metrics]
+    U3 --> MET2[family_size_histogram]
+    U4 --> MET3[consensus_metrics]
+    U5 --> MET4[filtering_metrics]
+    U6 --> MET5[filtered_consensus_cram]
+
+    M1 --> P1[BIOBAMBAM_BAMSORMADUP or SAMTOOLS_SORMADUP or SAMTOOLS_SORT]
+    P1 --> COMP{bam or cram}
+    MIX1 --> COMP
+
+    COMP -->|bam| CVT[SAMTOOLS_CONVERT to CRAM]
+    COMP -->|cram| OUT1[cram + crai]
+    CVT --> OUT1
+
+    OUT1 --> E1[emit: cram_crai]
+    MET1 --> E2[emit: sormadup_metrics]
+    MET2 --> E3[emit: family_size_histogram]
+    MET5 --> E4[emit: filtered_consensus_cram]
+```
+
+## Notes
+
+- The fgumi branch is opt-in per sample via the `fgumi_aware` samplesheet field.
+  - Step numbering mirrors the implementation comments: steps 1, 3, 4, 5, and 7 are executed in this branch.
+  - The "step 2" nomenclature from upstream fgumi Basic Workflow is intentionally absent in this pipeline path.
+- UMI-specific metrics are emitted and mixed into the common reporting stream.
+- The output still converges on the shared CRAM/CRAI downstream path.
diff --git a/docs/usage.md b/docs/usage.md
@@ -1,6 +1,8 @@
 # nf-cmgg/preprocessing: Usage
 
-Parameter documentation can be found [here](parameters.md)
+Parameter documentation can be found in the [pipeline parameters reference](parameters.md).
+
+The fgumi-aware branch and its integration into the FASTQ to CRAM flow are documented in the [analysis flow diagram](analysis_flow.md).
 
 ## Introduction
 
@@ -26,6 +28,7 @@ A `fastq` samplesheet file consisting of paired-end data may look something like
   aligner: bwamem
   markdup: bamsormadup
   umi_aware: false
+  fgumi_aware: false
   skip_trimming: false
   trim_front: 0
   trim_tail: 0
@@ -67,6 +70,10 @@ Following table shows the fields that are used by the `fastq` samplesheet:
 
 An [example samplesheet](../tests/inputs/test.yml) has been provided with the pipeline.
 
+> [!NOTE]
+> `umi_aware` and `fgumi_aware` are independent options.
+> Use `umi_aware` for samtools markdup UMI mode, and `fgumi_aware` to run the fgumi consensus branch.
+
 ### Flowcell samplesheet
 
 A `flowcell` samplesheet file consisting of one sequencing run may look something like the one below.
@@ -102,6 +109,7 @@ A `flowcell` sample info JSON/YML file consisting for one sequencing run may loo
   aligner: bwamem
   markdup: bamsormadup
   umi_aware: false
+  fgumi_aware: false
   skip_trimming: false
   trim_front: 0
   trim_tail: 0

diff --git a/main.nf b/main.nf
@@ -178,6 +178,9 @@ workflow {
     rna_junctions              = PREPROCESSING.out.rna_junctions
     align_reports              = PREPROCESSING.out.align_reports
     sormadup_metrics           = PREPROCESSING.out.sormadup_metrics
+    // Additional UMI consensus outputs.
+    family_size_histogram      = PREPROCESSING.out.family_size_histogram
+    umi_filtered_consensus_cram = PREPROCESSING.out.umi_filtered_consensus_cram
     mosdepth_global            = PREPROCESSING.out.mosdepth_global
     mosdepth_summary           = PREPROCESSING.out.mosdepth_summary
     mosdepth_regions           = PREPROCESSING.out.mosdepth_regions
@@ -275,6 +278,17 @@ output {
             metrics >> (meta.library ? "${meta.library}/${meta.samplename}/${meta.samplename}.duplicate_metrics.txt" : "${meta.samplename}/${meta.samplename}.duplicate_metrics.txt")
         }
     }
+    // UMI consensus artefacts are published per sample next to CRAM outputs.
+    family_size_histogram { // published under [<library>/]<samplename>/, keeping the original file name
+        path { meta, histogram ->
+            histogram >> (meta.library ? "${meta.library}/${meta.samplename}/${histogram.name}" : "${meta.samplename}/${histogram.name}")
+        }
+    }
+    umi_filtered_consensus_cram { // same layout as family_size_histogram; NOTE(review): only the CRAM file is routed here — confirm whether an index also needs publishing
+        path { meta, cram ->
+            cram >> (meta.library ? "${meta.library}/${meta.samplename}/${cram.name}" : "${meta.samplename}/${cram.name}")
+        }
+    }
     mosdepth_global {
         path { meta, _file ->
             return (meta.library ? "${meta.library}/${meta.samplename}/" : "${meta.samplename}/")