From 7beb2f1ffd8a1112780047430b77e9f346f2996a Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Sun, 5 Apr 2026 14:28:02 -0400 Subject: [PATCH 01/20] Integrate dnaseq-nextflow into ReFlow via new dnaseqExperiment template Replaces dnaSeqExperimentFromAccession and dnaSeqExperimentFromLocal with a single dnaseqExperiment datasetTemplate modeled after longReadRnaSeqExperiment. Adds snpAndCnvDNASeq.xml workflow (processSingleExperiment + loadSingleExperiment via dnaseq-nextflow), MakeDnaSeqNextflowConfig.pm, and MakeDnaSeqLoadNextflowConfig.pm. Co-Authored-By: Claude Sonnet 4.6 --- .../MakeDnaSeqLoadNextflowConfig.pm | 39 +++++ .../WorkflowSteps/MakeDnaSeqNextflowConfig.pm | 113 +++++++++++++ Main/lib/xml/workflow/snpAndCnvDNASeq.xml | 151 ++++++++++++++++++ Main/lib/xml/workflowTemplates/dnaseq.xml | 79 ++++++--- 4 files changed, 363 insertions(+), 19 deletions(-) create mode 100644 Main/lib/perl/WorkflowSteps/MakeDnaSeqLoadNextflowConfig.pm create mode 100644 Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm create mode 100644 Main/lib/xml/workflow/snpAndCnvDNASeq.xml diff --git a/Main/lib/perl/WorkflowSteps/MakeDnaSeqLoadNextflowConfig.pm b/Main/lib/perl/WorkflowSteps/MakeDnaSeqLoadNextflowConfig.pm new file mode 100644 index 000000000..33b1569eb --- /dev/null +++ b/Main/lib/perl/WorkflowSteps/MakeDnaSeqLoadNextflowConfig.pm @@ -0,0 +1,39 @@ +package ApiCommonWorkflow::Main::WorkflowSteps::MakeDnaSeqLoadNextflowConfig; + +@ISA = (ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep); + +use strict; +use warnings; +use ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep; + +sub run { + my ($self, $test, $undo) = @_; + + my $indelDir = $self->getParamValue("indelDir"); + my $extDbRlsSpec = $self->getParamValue("extDbRlsSpec"); + my $genomeExtDbRlsSpec = $self->getParamValue("genomeExtDbRlsSpec"); + + my $configPath = $self->getWorkflowDataDir() . "/" . 
$self->getParamValue("nextflowConfigFile"); + + if ($undo) { + $self->runCmd(0, "rm -rf $configPath"); + } else { + open(F, ">", $configPath) or die "$! :Can't open config file '$configPath' for writing"; + print F +" +params { + indelDir = \"$indelDir\" + extDbRlsSpec = '\"$extDbRlsSpec\"' + genomeExtDbRlsSpec = '\"$genomeExtDbRlsSpec\"' +} + +singularity { + enabled = true + autoMounts = true +} +"; + close(F); + } +} + +1; diff --git a/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm new file mode 100644 index 000000000..c6ed9ec18 --- /dev/null +++ b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm @@ -0,0 +1,113 @@ +package ApiCommonWorkflow::Main::WorkflowSteps::MakeDnaSeqNextflowConfig; + +@ISA = (ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep); + +use strict; +use warnings; +use ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep; +use GUS::ObjRelP::DbiDatabase; +use GUS::Supported::GusConfig; +use CBIL::Util::PropertySet; + +sub run { + my ($self, $test, $undo) = @_; + + my $workingDirRelativePath = $self->getParamValue("workingDirRelativePath"); + + my $sampleSheetFile = $self->getParamValue("sampleSheetFile"); + my $genomeFile = $self->getParamValue("genomeFile"); + my $gtfFile = $self->getParamValue("gtfFile"); + my $footprintFile = $self->getParamValue("footprintFile"); + my $ploidy = $self->getParamValue("ploidy"); + my $resultsDirectory = $self->getParamValue("resultsDirectory"); + my $organismAbbrev = $self->getParamValue("organismAbbrev"); + my $geneSourceIdOrthologFile = $self->getParamValue("geneSourceIdOrthologFile"); + my $chrsForCalcFile = $self->getParamValue("chrsForCalcFile"); + + my $nextflowConfigFile = $self->getWorkflowDataDir() . "/" . 
$self->getParamValue("nextflowConfigFile"); + + # Translate local paths to cluster-side paths + my $digestedSampleSheet = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $sampleSheetFile); + my $digestedGenomeFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $genomeFile); + my $digestedGtfFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $gtfFile); + my $digestedFootprintFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $footprintFile); + my $digestedResultsDir = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $resultsDirectory); + my $digestedOrthologFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $geneSourceIdOrthologFile); + my $digestedChrsForCalcFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $chrsForCalcFile); + + # Workflow config values + my $minCoverage = $self->getConfig("minCoverage"); + my $winLen = $self->getConfig("winLen"); + my $maxNumberOfReads = $self->getConfig("maxNumberOfReads"); + my $trimmomaticAdaptorsFile = $self->getConfig("trimmomaticAdaptorsFile"); + my $freebayesMinAltFraction = $self->getConfig("freebayesMinAltFraction"); + my $bwaThreads = $self->getConfig("bwaThreads"); + + my $executor = $self->getClusterExecutor(); + my $queue = $self->getClusterQueue(); + + # Look up taxon_id from the database for this organism + my $gusConfigFile = $ENV{GUS_HOME} . 
"/config/gus.config"; + die "Config file $gusConfigFile does not exist" unless -e $gusConfigFile; + + my @properties = (); + my $gusConfig = CBIL::Util::PropertySet->new($gusConfigFile, \@properties, 1); + + my $taxonSql = "select taxon_id from apidb.organism where abbrev = '$organismAbbrev'"; + + my $db = GUS::ObjRelP::DbiDatabase->new( + $gusConfig->{props}->{dbiDsn}, + $gusConfig->{props}->{databaseLogin}, + $gusConfig->{props}->{databasePassword}, + 0, 0, 1, + $gusConfig->{props}->{coreSchemaName} + ); + + my $dbh = $db->getQueryHandle(); + my $stmt = $dbh->prepare($taxonSql); + $stmt->execute(); + + my $taxonId; + while (my @row = $stmt->fetchrow_array()) { + $taxonId = $row[0]; + } + + if ($undo) { + $self->runCmd(0, "rm -rf $nextflowConfigFile"); + } else { + open(F, ">", $nextflowConfigFile) or die "$! :Can't open config file '$nextflowConfigFile' for writing"; + print F +" +params { + samplesheet = \"$digestedSampleSheet\" + bwaThreads = $bwaThreads + minCoverage = $minCoverage + genomeFastaFile = \"$digestedGenomeFile\" + gtfFile = \"$digestedGtfFile\" + footprintFile = \"$digestedFootprintFile\" + winLen = $winLen + ploidy = $ploidy + outputDir = \"$digestedResultsDir\" + trimmomaticAdaptorsFile = $trimmomaticAdaptorsFile + freebayesMinAltFraction = $freebayesMinAltFraction + maxNumberOfReads = $maxNumberOfReads + taxonId = \"$taxonId\" + geneSourceIdOrthologFile = \"$digestedOrthologFile\" + chrsForCalcFile = \"$digestedChrsForCalcFile\" +} + +process { + executor = '$executor' + queue = '$queue' +} + +singularity { + enabled = true + autoMounts = true +} +"; + close(F); + } +} + +1; diff --git a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml new file mode 100644 index 000000000..6b418d32c --- /dev/null +++ b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml @@ -0,0 +1,151 @@ + + + + + + + + + + + + + + + + + + $$parentDataDir$$/dnaseq + $$dataDir$$/analysisDir + $$analysisDirectory$$/results + 
$$analysisDirectory$$/nextflow.config + $$analysisDirectory$$/ngs-samples-nextflow.config + $$parentDataDir$$/final + $$dataDir$$/$$organismAbbrev$$.gtf + $$dataDir$$/$$organismAbbrev$$.fasta + $$dataDir$$/final + $$dataDir$$/loadSingleExperiment + $$loadSingleExperimentDir$$/nextflow.config + + + $$dataDir$$ + + + + $$analysisDirectory$$ + + + + + $$resultsDirectory$$ + + + + + $$loadSingleExperimentDir$$ + + + + + $$gtfFile$$ + $$gtfSymLink$$ + + + + + $$genomeFastaFile$$ + $$genomeSymLink$$ + + + + + $$finalDir$$ + $$finalSymLink$$ + + + + + $$organismAbbrev$$ + $$dataDir$$/geneSourceIdOrthologFile.tsv + $$dataDir$$/chrsForCalcsFile.tsv + + + + + $$gusConfigFile$$ + $$analysisDirectory$$ + $$finalSymLink$$ + $$analysisDirectory$$/ngs-samples-results + $$ngsSamplesNextflowConfigFile$$ + $$dataDir$$ + samplesheet.csv + $$fromSRA$$ + DNASeq + $$organismAbbrev$$ + + + + + $$gusConfigFile$$ + $$nextflowConfigFile$$ + $$dataDir$$ + $$analysisDirectory$$/ngs-samples-results/samplesheet.csv + $$genomeSymLink$$ + $$gtfSymLink$$ + $$footprintFile$$ + $$ploidy$$ + $$resultsDirectory$$ + $$organismAbbrev$$ + $$dataDir$$/geneSourceIdOrthologFile.tsv + $$dataDir$$/chrsForCalcsFile.tsv + + + + + + $$gusConfigFile$$ + $$projectName$$ + $$dataDir$$ + $$nextflowConfigFile$$ + $$ngsSamplesNextflowConfigFile$$ + $$organismAbbrev$$ + $$genomeExtDbRlsSpec$$ + false + $$experimentDatasetName$$|$$experimentDatasetVersion$$ + $$analysisDirectory$$ + VEuPathDB/dnaseq-nextflow + processSingleExperiment + + + + + + + + + $$loadNextflowConfigFile$$ + $$dataDir$$ + $$resultsDirectory$$ + $$experimentDatasetName$$|$$experimentDatasetVersion$$ + $$genomeExtDbRlsSpec$$ + + + + + + $$gusConfigFile$$ + $$projectName$$ + $$dataDir$$ + $$loadSingleExperimentDir$$ + $$loadNextflowConfigFile$$ + $$loadSingleExperimentDir$$/results + $$organismAbbrev$$ + $$genomeExtDbRlsSpec$$ + false + $$experimentDatasetName$$|$$experimentDatasetVersion$$ + VEuPathDB/dnaseq-nextflow + loadSingleExperiment + + + + 
diff --git a/Main/lib/xml/workflowTemplates/dnaseq.xml b/Main/lib/xml/workflowTemplates/dnaseq.xml index 0fda986b5..65a2096f3 100644 --- a/Main/lib/xml/workflowTemplates/dnaseq.xml +++ b/Main/lib/xml/workflowTemplates/dnaseq.xml @@ -1,5 +1,6 @@ + @@ -30,16 +31,57 @@ $$geneFootprintFile$$ $$projectName$$ $$genomeExtDbRlsSpec$$ - $$gusConfigFile$$ + $$gusConfigFile$$ + + + + + + + + + + ${organismAbbrev}_${name}_dnaseqExperiment_RSRC + $$organismDatasetLoaderXmlFile$$ + $$dataDir$$ + $$gusConfigFile$$ + + + + + ${organismAbbrev} + ${projectName} + ${name} + ${organismAbbrev}_${name}_dnaseqExperiment_RSRC + ${version} + $$dataDir$$/${organismAbbrev}_${name}_dnaseqExperiment_RSRC + $$gtfFile$$ + $$genomeFastaFile$$ + $$geneFootprintFile$$ + ${ploidy} + ${fromSRA} + $$relativeWebServicesDir$$ + $$gusConfigFile$$ + $$genomeExtDbRlsSpec$$ + + + + + + + $$dataDir$$/$$referenceStrainOrganismAbbrev$$_mergeExperiments - + From 85a599f249580082d7e3e5493bb8677779d2cb13 Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Tue, 14 Apr 2026 11:44:35 -0400 Subject: [PATCH 02/20] Simplify dnaseq workflow: remove unused params, move shared queries to organism level - Remove trimmomaticAdaptorsFile, freebayesMinAltFraction, maxNumberOfReads from MakeDnaSeqNextflowConfig and MakeDnaSeqSingleExperimentNextflowConfig - Remove taxonId param and DB lookup from MakeDnaSeqNextflowConfig (no longer needed by dnaseq-nextflow pipeline) - Move retrieveGeneCNVAndPloidyQueries from snpAndCnvDNASeq to dnaseq template (organism-level files created once; symlinked into each experiment dir) - snpAndCnvDNASeq now handles processSingleExperiment only; load steps removed Co-Authored-By: Claude Sonnet 4.6 --- .../WorkflowSteps/MakeDnaSeqNextflowConfig.pm | 37 -------- ...akeDnaSeqSingleExperimentNextflowConfig.pm | 4 - Main/lib/xml/workflow/snpAndCnvDNASeq.xml | 63 +++++--------- Main/lib/xml/workflowTemplates/dnaseq.xml | 84 ++++--------------- Main/lib/xml/workflowTemplates/project.xml | 1 - 5 files 
changed, 34 insertions(+), 155 deletions(-) diff --git a/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm index c6ed9ec18..9fc9d4067 100644 --- a/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm +++ b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm @@ -5,9 +5,6 @@ package ApiCommonWorkflow::Main::WorkflowSteps::MakeDnaSeqNextflowConfig; use strict; use warnings; use ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep; -use GUS::ObjRelP::DbiDatabase; -use GUS::Supported::GusConfig; -use CBIL::Util::PropertySet; sub run { my ($self, $test, $undo) = @_; @@ -20,7 +17,6 @@ sub run { my $footprintFile = $self->getParamValue("footprintFile"); my $ploidy = $self->getParamValue("ploidy"); my $resultsDirectory = $self->getParamValue("resultsDirectory"); - my $organismAbbrev = $self->getParamValue("organismAbbrev"); my $geneSourceIdOrthologFile = $self->getParamValue("geneSourceIdOrthologFile"); my $chrsForCalcFile = $self->getParamValue("chrsForCalcFile"); @@ -38,40 +34,11 @@ sub run { # Workflow config values my $minCoverage = $self->getConfig("minCoverage"); my $winLen = $self->getConfig("winLen"); - my $maxNumberOfReads = $self->getConfig("maxNumberOfReads"); - my $trimmomaticAdaptorsFile = $self->getConfig("trimmomaticAdaptorsFile"); - my $freebayesMinAltFraction = $self->getConfig("freebayesMinAltFraction"); my $bwaThreads = $self->getConfig("bwaThreads"); my $executor = $self->getClusterExecutor(); my $queue = $self->getClusterQueue(); - # Look up taxon_id from the database for this organism - my $gusConfigFile = $ENV{GUS_HOME} . 
"/config/gus.config"; - die "Config file $gusConfigFile does not exist" unless -e $gusConfigFile; - - my @properties = (); - my $gusConfig = CBIL::Util::PropertySet->new($gusConfigFile, \@properties, 1); - - my $taxonSql = "select taxon_id from apidb.organism where abbrev = '$organismAbbrev'"; - - my $db = GUS::ObjRelP::DbiDatabase->new( - $gusConfig->{props}->{dbiDsn}, - $gusConfig->{props}->{databaseLogin}, - $gusConfig->{props}->{databasePassword}, - 0, 0, 1, - $gusConfig->{props}->{coreSchemaName} - ); - - my $dbh = $db->getQueryHandle(); - my $stmt = $dbh->prepare($taxonSql); - $stmt->execute(); - - my $taxonId; - while (my @row = $stmt->fetchrow_array()) { - $taxonId = $row[0]; - } - if ($undo) { $self->runCmd(0, "rm -rf $nextflowConfigFile"); } else { @@ -88,10 +55,6 @@ params { winLen = $winLen ploidy = $ploidy outputDir = \"$digestedResultsDir\" - trimmomaticAdaptorsFile = $trimmomaticAdaptorsFile - freebayesMinAltFraction = $freebayesMinAltFraction - maxNumberOfReads = $maxNumberOfReads - taxonId = \"$taxonId\" geneSourceIdOrthologFile = \"$digestedOrthologFile\" chrsForCalcFile = \"$digestedChrsForCalcFile\" } diff --git a/Main/lib/perl/WorkflowSteps/MakeDnaSeqSingleExperimentNextflowConfig.pm b/Main/lib/perl/WorkflowSteps/MakeDnaSeqSingleExperimentNextflowConfig.pm index 9ade04724..4579a431a 100644 --- a/Main/lib/perl/WorkflowSteps/MakeDnaSeqSingleExperimentNextflowConfig.pm +++ b/Main/lib/perl/WorkflowSteps/MakeDnaSeqSingleExperimentNextflowConfig.pm @@ -35,10 +35,8 @@ sub run { my $varscanPValue = $self->getConfig("varscanPValue"); my $varscanMinVarFreqSnp = $self->getConfig("varscanMinVarFreqSnp"); my $varscanMinVarFreqCons = $self->getConfig("varscanMinVarFreqCons"); - my $maxNumberOfReads = $self->getConfig("maxNumberOfReads"); my $hisat2Index = $self->getConfig("hisat2Index"); my $createIndex = $self->getConfig("createIndex"); - my $trimmomaticAdaptorsFile = $self->getConfig("trimmomaticAdaptorsFile"); my $ebiFtpUser = 
$self->getConfig("ebiFtpUser"); my $ebiFtpPassword = $self->getConfig("ebiFtpPassword"); @@ -93,11 +91,9 @@ params { hisat2Index = $hisat2Index createIndex = $createIndex outputDir = \"$clusterResultDir\" - trimmomaticAdaptorsFile = $trimmomaticAdaptorsFile varscanPValue = $varscanPValue varscanMinVarFreqSnp = $varscanMinVarFreqSnp varscanMinVarFreqCons = $varscanMinVarFreqCons - maxNumberOfReads = $maxNumberOfReads taxonId = \"$taxonId\" geneSourceIdOrthologFile = \"$geneSourceIdOrthologFile\" chrsForCalcFile = \"$chrsForCalcFile\" diff --git a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml index 6b418d32c..aca5a9003 100644 --- a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml +++ b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml @@ -1,4 +1,3 @@ - @@ -11,6 +10,8 @@ + + @@ -24,9 +25,8 @@ $$dataDir$$/$$organismAbbrev$$.gtf $$dataDir$$/$$organismAbbrev$$.fasta $$dataDir$$/final - $$dataDir$$/loadSingleExperiment - $$loadSingleExperimentDir$$/nextflow.config - + $$dataDir$$/geneSourceIdOrthologFile.tsv + $$dataDir$$/chrsForCalcsFile.tsv $$dataDir$$ @@ -41,11 +41,6 @@ - - $$loadSingleExperimentDir$$ - - - $$gtfFile$$ $$gtfSymLink$$ @@ -64,10 +59,15 @@ - - $$organismAbbrev$$ - $$dataDir$$/geneSourceIdOrthologFile.tsv - $$dataDir$$/chrsForCalcsFile.tsv + + $$geneSourceIdOrthologFile$$ + $$geneSourceIdOrthologSymLink$$ + + + + + $$chrsForCalcsFile$$ + $$chrsForCalcsSymLink$$ @@ -95,14 +95,14 @@ $$footprintFile$$ $$ploidy$$ $$resultsDirectory$$ - $$organismAbbrev$$ - $$dataDir$$/geneSourceIdOrthologFile.tsv - $$dataDir$$/chrsForCalcsFile.tsv + $$geneSourceIdOrthologSymLink$$ + $$chrsForCalcsSymLink$$ - + + - + $$gusConfigFile$$ $$projectName$$ $$dataDir$$ @@ -120,32 +120,9 @@ + + - - $$loadNextflowConfigFile$$ - $$dataDir$$ - $$resultsDirectory$$ - $$experimentDatasetName$$|$$experimentDatasetVersion$$ - $$genomeExtDbRlsSpec$$ - - - - - - $$gusConfigFile$$ - $$projectName$$ - $$dataDir$$ - $$loadSingleExperimentDir$$ - $$loadNextflowConfigFile$$ 
- $$loadSingleExperimentDir$$/results - $$organismAbbrev$$ - $$genomeExtDbRlsSpec$$ - false - $$experimentDatasetName$$|$$experimentDatasetVersion$$ - VEuPathDB/dnaseq-nextflow - loadSingleExperiment - - diff --git a/Main/lib/xml/workflowTemplates/dnaseq.xml b/Main/lib/xml/workflowTemplates/dnaseq.xml index 65a2096f3..a2079e224 100644 --- a/Main/lib/xml/workflowTemplates/dnaseq.xml +++ b/Main/lib/xml/workflowTemplates/dnaseq.xml @@ -7,11 +7,12 @@ - $$parentDataDir$$/dnaseq $$dataDir$$/geneFootprintFile.txt $$dataDir$$/$$organismAbbrev$$.gtf + $$dataDir$$/geneSourceIdOrthologFile.tsv + $$dataDir$$/chrsForCalcsFile.tsv $$dataDir$$ @@ -35,6 +36,13 @@ + + $$organismAbbrev$$ + $$geneSourceIdOrthologFile$$ + $$chrsForCalcsFile$$ + + + @@ -66,82 +74,18 @@ $$relativeWebServicesDir$$ $$gusConfigFile$$ $$genomeExtDbRlsSpec$$ + $$geneSourceIdOrthologFile$$ + $$chrsForCalcsFile$$ + - - $$dataDir$$/$$referenceStrainOrganismAbbrev$$_mergeExperiments + $$dataDir$$/$$organismAbbrev$$_mergeExperiments @@ -151,7 +95,7 @@ $$dataDir$$/**/results $$genomeFastaFile$$ $$gtfFile$$ - $$referenceStrainOrganismAbbrev$$ + $$organismAbbrev$$ $$dataDir$$/mergeExperiments/results cache.txt undoneStrains.txt diff --git a/Main/lib/xml/workflowTemplates/project.xml b/Main/lib/xml/workflowTemplates/project.xml index 05a209202..fdb2a5197 100644 --- a/Main/lib/xml/workflowTemplates/project.xml +++ b/Main/lib/xml/workflowTemplates/project.xml @@ -345,7 +345,6 @@ ${projectName}/${organismAbbrev}.xml $$dataDir$$/${organismAbbrev} ${organismAbbrev}_primary_genome_RSRC|${genomeVersion} - ${referenceStrainOrganismAbbrev} $$relativeWebServicesDir$$ $$dataDir$$/${organismAbbrev}/loadGenome/genomicSeqs.fa From 95de88a360d695b47aec5c55e2f71a5610163116 Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Tue, 14 Apr 2026 11:59:47 -0400 Subject: [PATCH 03/20] getTaxonId instead of orgAbbrev for cnvandploidyqueries script --- .../WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm | 10 ++++++++-- 
Main/lib/xml/workflowTemplates/dnaseq.xml | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm b/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm index 627a8aff2..7e34fa2ff 100755 --- a/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm +++ b/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm @@ -10,7 +10,13 @@ sub run { my $organismAbbrev = $self->getParamValue('organismAbbrev'); my $geneSourceIdOrthologFile = join("/", $self->getWorkflowDataDir(), $self->getParamValue("geneSourceIdOrthologFile")); my $chrsForCalcsFile = join("/", $self->getWorkflowDataDir(), $self->getParamValue('chrsForCalcsFile')); - + my $gusConfigFile = $self->getParamValue('gusConfigFile'); + $gusConfigFile = $self->getWorkflowDataDir() . "/$gusConfigFile"; + + # TODO: get the groups file from globalConfig? + + my $taxonId = $self->getOrganismInfo($test, $organismAbbrev, $gusConfigFile)->getTaxonId(); + if ($undo) { $self->runCmd(0, "rm -f $geneSourceIdOrthologFile"); $self->runCmd(0, "rm -f $chrsForCalcsFile"); @@ -19,7 +25,7 @@ sub run { $self->runCmd($test,"echo test > $geneSourceIdOrthologFile"); $self->runCmd($test,"echo test > $chrsForCalcsFile"); } else { - $self->runCmd($test,"runGeneCNVAndPloidyQuery --organismAbbrev $organismAbbrev --geneSourceIdOrthologFile $geneSourceIdOrthologFile --chrsForCalcsFile $chrsForCalcsFile"); + $self->runCmd($test,"runGeneCNVAndPloidyQuery --taxonId $taxonId --geneSourceIdOrthologFile $geneSourceIdOrthologFile --chrsForCalcsFile $chrsForCalcsFile"); } } } diff --git a/Main/lib/xml/workflowTemplates/dnaseq.xml b/Main/lib/xml/workflowTemplates/dnaseq.xml index a2079e224..2446dd05c 100644 --- a/Main/lib/xml/workflowTemplates/dnaseq.xml +++ b/Main/lib/xml/workflowTemplates/dnaseq.xml @@ -40,6 +40,7 @@ $$organismAbbrev$$ $$geneSourceIdOrthologFile$$ $$chrsForCalcsFile$$ + $$gusConfigFile$$ From d4a7ae455623906cca835c3717780b36b2f2735e Mon Sep 
17 00:00:00 2001 From: rdemko2332 Date: Tue, 14 Apr 2026 15:25:09 -0400 Subject: [PATCH 04/20] Adding ortho group file parameter to RetrieveGeneCNVAndPloidyQueries --- .../perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm b/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm index 7e34fa2ff..515450be2 100755 --- a/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm +++ b/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm @@ -12,8 +12,7 @@ sub run { my $chrsForCalcsFile = join("/", $self->getWorkflowDataDir(), $self->getParamValue('chrsForCalcsFile')); my $gusConfigFile = $self->getParamValue('gusConfigFile'); $gusConfigFile = $self->getWorkflowDataDir() . "/$gusConfigFile"; - - # TODO: get the groups file from globalConfig? + my $fullOrthoGroupsFile = $self->getSharedConfig("fullOrthoGroupsFile"); my $taxonId = $self->getOrganismInfo($test, $organismAbbrev, $gusConfigFile)->getTaxonId(); @@ -25,7 +24,7 @@ sub run { $self->runCmd($test,"echo test > $geneSourceIdOrthologFile"); $self->runCmd($test,"echo test > $chrsForCalcsFile"); } else { - $self->runCmd($test,"runGeneCNVAndPloidyQuery --taxonId $taxonId --geneSourceIdOrthologFile $geneSourceIdOrthologFile --chrsForCalcsFile $chrsForCalcsFile"); + $self->runCmd($test,"runGeneCNVAndPloidyQuery --taxonId $taxonId --geneSourceIdOrthologFile $geneSourceIdOrthologFile --chrsForCalcsFile $chrsForCalcsFile --orthoGroupFile $fullGroupsFile"); } } } From d79e9f513f05a50884688bf4c8074311b49a287b Mon Sep 17 00:00:00 2001 From: rdemko2332 Date: Tue, 14 Apr 2026 15:26:16 -0400 Subject: [PATCH 05/20] Updating runGeneCNVAndPloidyQuery to function from ortho group flat file and with new database schema --- Main/bin/runGeneCNVAndPloidyQuery | 100 ++++++++++++++++++------------ 1 file changed, 59 insertions(+), 41 deletions(-) diff --git 
a/Main/bin/runGeneCNVAndPloidyQuery b/Main/bin/runGeneCNVAndPloidyQuery index 818dc8f29..d3d85e0a2 100755 --- a/Main/bin/runGeneCNVAndPloidyQuery +++ b/Main/bin/runGeneCNVAndPloidyQuery @@ -7,43 +7,30 @@ use GUS::ObjRelP::DbiDatabase; use GUS::Supported::GusConfig; use CBIL::Util::PropertySet; -my ($gusConfigFile,$organismAbbrev,$geneSourceIdOrthologFile,$chrsForCalcsFile); -&GetOptions("organismAbbrev=s" => \$organismAbbrev, +my ($gusConfigFile,$taxonId,$orthoGroupFile,$geneSourceIdOrthologFile,$chrsForCalcsFile); +&GetOptions("taxonId=s" => \$taxonId, + "orthoGroupFile=s" => \$orthoGroupFile, "geneSourceIdOrthologFile=s" => \$geneSourceIdOrthologFile, "chrsForCalcsFile=s" => \$chrsForCalcsFile); -my $ploidy = 2; - -my $geneSourceSql = "with sequence as ( - select gf.source_id as gene_source_id - , gf.na_feature_id - , ns.source_id as contig_source_id - , ns.source_id as sequence_source_id - , ns.TAXON_ID - from dots.genefeature gf - , DOTS.NASEQUENCE ns - , SRES.ONTOLOGYTERM ot - where gf.na_sequence_id = ns.na_sequence_id - and ot.name = 'chromosome' - and ns.SEQUENCE_ONTOLOGY_ID = ot.ONTOLOGY_TERM_ID - and ns.taxon_id = (select taxon_id from apidb.organism where abbrev = '$organismAbbrev') - ), orthologs as ( - select gf.na_feature_id, sg.name - from dots.genefeature gf - , dots.SequenceSequenceGroup ssg - , dots.SequenceGroup sg - , core.TableInfo ti - where gf.na_feature_id = ssg.sequence_id - and ssg.sequence_group_id = sg.sequence_group_id - and ssg.source_table_id = ti.table_id - and ti.name = 'GeneFeature' - ) - select s.gene_source_id - , o.name - from sequence s - , orthologs o - where s.na_feature_id = o.na_feature_id"; - -my $chrsForCalcsSql = "select ns.source_id from dots.nasequence ns, sres.ontologyterm ot where ot.name = 'chromosome' and ot.ontology_term_id = ns.sequence_ontology_id and ns.taxon_id = (select taxon_id from apidb.organism where abbrev = '$organismAbbrev')"; + +my $proteinToGeneSql = " +SELECT aas.source_id AS protein_source_id, + 
gf.source_id AS gene_source_id +FROM dots.AASequence aas +JOIN dots.TranslatedAASequence tas ON aas.aa_sequence_id = tas.aa_sequence_id +JOIN dots.TranslatedAAFeature taf ON taf.aa_sequence_id = tas.aa_sequence_id +JOIN dots.Transcript t ON taf.na_feature_id = t.na_feature_id +JOIN dots.GeneFeature gf ON t.parent_id = gf.na_feature_id +WHERE aas.subclass_view = 'TranslatedAASequence' + AND aas.taxon_id = $taxonId + AND aas.taxon_id IN ( + SELECT taxon_id + FROM apidb.organism + WHERE is_annotated_genome = 1 + ) +"; + +my $chrsForCalcsSql = "select ns.source_id from dots.nasequence ns, sres.ontologyterm ot where ot.name = 'chromosome' and ot.ontology_term_id = ns.sequence_ontology_id and ns.taxon_id = $taxonId"; $gusConfigFile = $ENV{GUS_HOME}."/config/gus.config"; die "Config file $gusConfigFile does not exist" unless -e $gusConfigFile; @@ -59,19 +46,50 @@ my $db = GUS::ObjRelP::DbiDatabase-> new($gusConfig->{props}->{dbiDsn}, my $dbh = $db->getQueryHandle(); -my $orthoMclStmt = $dbh->prepare($geneSourceSql); -$orthoMclStmt->execute(); +my $proteinToGeneStmt = $dbh->prepare($proteinToGeneSql); +$proteinToGeneStmt->execute(); + +my %proteinToGene; +while (my @row = $proteinToGeneStmt->fetchrow_array()){ + $proteinToGene{$row[0]} = $row[1]; +} -open(GENE,">$geneSourceIdOrthologFile"); -while (my @row = $orthoMclStmt->fetchrow_array()){ - print GENE "$row[0]\t$row[1]\n"; +my %proteinToGroup; +open(GROUPS, "<$orthoGroupFile") or die "Cannot open $orthoGroupFile: $!"; +while (my $line = ) { + chomp $line; + my ($groupId, $proteinList) = split(/:\s*/, $line, 2); + next unless defined $proteinList; + foreach my $protein (split(/\s+/, $proteinList)) { + $proteinToGroup{$protein} = $groupId; + } +} +close GROUPS; + +my @proteinsWithNoGroup; +open(GENE, ">$geneSourceIdOrthologFile") or die "Cannot open $geneSourceIdOrthologFile: $!"; +while (my ($protein, $gene) = each %proteinToGene) { + my $group = $proteinToGroup{$protein}; + unless ($group) { + (my $altProtein = 
$protein) =~ s/:/\_/g; + $group = $proteinToGroup{$altProtein}; + } + if ($group) { + print GENE "$gene\t$group\n"; + } else { + push @proteinsWithNoGroup, $protein; + } } close GENE; +if (@proteinsWithNoGroup) { + print STDERR "The following proteins have no group assignment in $orthoGroupFile:\n" . join("\n", @proteinsWithNoGroup) . "\n"; +} + my $chrsForCalcs = $dbh->prepare($chrsForCalcsSql); $chrsForCalcs->execute(); -open(CHRS,">$chrsForCalcsFile"); +open(CHRS, ">$chrsForCalcsFile") or die "Cannot open $chrsForCalcsFile: $!"; while (my @row = $chrsForCalcs->fetchrow_array()){ print CHRS "$row[0]\t\n"; } From 3e3692c77fe84baa0c2589621c9319998d13ddda Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Fri, 15 May 2026 09:33:06 -0400 Subject: [PATCH 06/20] add repeatmasker bed and fix dependencies --- Main/lib/xml/workflow/snpAndCnvDNASeq.xml | 37 ++++++++++++++++++---- Main/lib/xml/workflowTemplates/dnaseq.xml | 12 ++++--- Main/lib/xml/workflowTemplates/project.xml | 1 + 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml index aca5a9003..cbc6313c1 100644 --- a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml +++ b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml @@ -15,16 +15,25 @@ + - $$parentDataDir$$/dnaseq + $$parentDataDir$$/dnaseqNextflow $$dataDir$$/analysisDir $$analysisDirectory$$/results $$analysisDirectory$$/nextflow.config $$analysisDirectory$$/ngs-samples-nextflow.config $$parentDataDir$$/final + $$dataDir$$/$$organismAbbrev$$.gtf $$dataDir$$/$$organismAbbrev$$.fasta + $$dataDir$$/$$organismAbbrev$$.bed.gz + $$dataDir$$/geneFootprintFile.txt + $$dataDir$$/final + + + + $$dataDir$$/geneSourceIdOrthologFile.tsv $$dataDir$$/chrsForCalcsFile.tsv @@ -53,6 +62,13 @@ + + $$repeatMaskedBed$$ + $$repeatMaskedBedSymLink$$ + + + + $$finalDir$$ $$finalSymLink$$ @@ -71,6 +87,12 @@ + + $$footprintFile$$ + $$footprintFileSymLink$$ + + + $$gusConfigFile$$ $$analysisDirectory$$ @@ 
-91,15 +113,21 @@ $$dataDir$$ $$analysisDirectory$$/ngs-samples-results/samplesheet.csv $$genomeSymLink$$ + $$repeatMaskedBedSymLink$$ $$gtfSymLink$$ - $$footprintFile$$ + $$footprintFileSymLink$$ $$ploidy$$ $$resultsDirectory$$ $$geneSourceIdOrthologSymLink$$ $$chrsForCalcsSymLink$$ + + + + + @@ -117,11 +145,6 @@ processSingleExperiment - - - - - diff --git a/Main/lib/xml/workflowTemplates/dnaseq.xml b/Main/lib/xml/workflowTemplates/dnaseq.xml index 2446dd05c..623274398 100644 --- a/Main/lib/xml/workflowTemplates/dnaseq.xml +++ b/Main/lib/xml/workflowTemplates/dnaseq.xml @@ -7,7 +7,8 @@ - + + $$parentDataDir$$/dnaseq $$dataDir$$/geneFootprintFile.txt $$dataDir$$/$$organismAbbrev$$.gtf @@ -33,9 +34,10 @@ $$projectName$$ $$genomeExtDbRlsSpec$$ $$gusConfigFile$$ - + + $$organismAbbrev$$ $$geneSourceIdOrthologFile$$ @@ -69,6 +71,7 @@ $$dataDir$$/${organismAbbrev}_${name}_dnaseqExperiment_RSRC $$gtfFile$$ $$genomeFastaFile$$ + $$repeatMaskedBed$$ $$geneFootprintFile$$ ${ploidy} ${fromSRA} @@ -78,6 +81,7 @@ $$geneSourceIdOrthologFile$$ $$chrsForCalcsFile$$ + @@ -89,7 +93,7 @@ $$dataDir$$/$$organismAbbrev$$_mergeExperiments - + diff --git a/Main/lib/xml/workflowTemplates/project.xml b/Main/lib/xml/workflowTemplates/project.xml index f8fa7206b..d9573e568 100644 --- a/Main/lib/xml/workflowTemplates/project.xml +++ b/Main/lib/xml/workflowTemplates/project.xml @@ -347,6 +347,7 @@ ${organismAbbrev}_primary_genome_RSRC|${genomeVersion} $$relativeWebServicesDir$$ $$dataDir$$/${organismAbbrev}/loadGenome/genomicSeqs.fa + $$dataDir$$/${organismAbbrev}/maskGenome/analysisDir/results/blocked.seq.bed.gz From f21e855c1f32d7bdab44a157a5ced61079df053f Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Fri, 15 May 2026 09:36:49 -0400 Subject: [PATCH 07/20] minor --- Main/lib/xml/workflow/snpAndCnvDNASeq.xml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml index 
cbc6313c1..076ee3cc9 100644 --- a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml +++ b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml @@ -24,18 +24,16 @@ $$analysisDirectory$$/ngs-samples-nextflow.config $$parentDataDir$$/final - $$dataDir$$/$$organismAbbrev$$.gtf $$dataDir$$/$$organismAbbrev$$.fasta $$dataDir$$/$$organismAbbrev$$.bed.gz - $$dataDir$$/geneFootprintFile.txt - - $$dataDir$$/final - - + $$dataDir$$/final + $$dataDir$$/$$organismAbbrev$$.gtf + $$dataDir$$/geneFootprintFile.txt $$dataDir$$/geneSourceIdOrthologFile.tsv $$dataDir$$/chrsForCalcsFile.tsv + $$dataDir$$ From c1c3a804ef5cd2f66e78207fd27e3a8fdec2b4f0 Mon Sep 17 00:00:00 2001 From: Sufen Hu Date: Mon, 18 May 2026 11:38:11 -0400 Subject: [PATCH 08/20] debug gusConfigFile --- Main/lib/perl/WorkflowSteps/MakeGtfForGuidedCufflinks.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Main/lib/perl/WorkflowSteps/MakeGtfForGuidedCufflinks.pm b/Main/lib/perl/WorkflowSteps/MakeGtfForGuidedCufflinks.pm index 0b8433ea9..171a72d33 100644 --- a/Main/lib/perl/WorkflowSteps/MakeGtfForGuidedCufflinks.pm +++ b/Main/lib/perl/WorkflowSteps/MakeGtfForGuidedCufflinks.pm @@ -20,7 +20,7 @@ sub run { my $project = $self->getParamValue("project"); my $genomeExtDbRlsSpec = $self->getParamValue("genomeExtDbRlsSpec"); my $cdsOnly = $self->getBooleanParamValue("cdsOnly"); -my $gusConfigFile = $self->getGusConfigFile(); + my $gusConfigFile = $self->getParamValue('gusConfigFile'); my $cmd = "makeGtf.pl --outputFile $workflowDataDir/$gtfDir/$outputFile --project $project --genomeExtDbRlsSpec '$genomeExtDbRlsSpec' --gusConfigFile $gusConfigFile"; From 2a22f73098eeef029db0edff6d1ec0be55ffa617 Mon Sep 17 00:00:00 2001 From: Sufen Hu Date: Mon, 18 May 2026 11:40:49 -0400 Subject: [PATCH 09/20] fix syntax error --- Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm 
b/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm index 515450be2..7b675ffd2 100755 --- a/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm +++ b/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm @@ -24,7 +24,7 @@ sub run { $self->runCmd($test,"echo test > $geneSourceIdOrthologFile"); $self->runCmd($test,"echo test > $chrsForCalcsFile"); } else { - $self->runCmd($test,"runGeneCNVAndPloidyQuery --taxonId $taxonId --geneSourceIdOrthologFile $geneSourceIdOrthologFile --chrsForCalcsFile $chrsForCalcsFile --orthoGroupFile $fullGroupsFile"); + $self->runCmd($test,"runGeneCNVAndPloidyQuery --taxonId $taxonId --geneSourceIdOrthologFile $geneSourceIdOrthologFile --chrsForCalcsFile $chrsForCalcsFile --orthoGroupFile $fullOrthoGroupsFile"); } } } From ae0cc7743e22f65894e92fc9d86b5271e8fb7843 Mon Sep 17 00:00:00 2001 From: Sufen Hu Date: Mon, 18 May 2026 11:45:00 -0400 Subject: [PATCH 10/20] add gusConfigFile and fullOrthoGroupsFile parameters --- Main/lib/xml/workflowTemplates/dnaseq.xml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Main/lib/xml/workflowTemplates/dnaseq.xml b/Main/lib/xml/workflowTemplates/dnaseq.xml index 623274398..469c60324 100644 --- a/Main/lib/xml/workflowTemplates/dnaseq.xml +++ b/Main/lib/xml/workflowTemplates/dnaseq.xml @@ -14,12 +14,14 @@ $$dataDir$$/$$organismAbbrev$$.gtf $$dataDir$$/geneSourceIdOrthologFile.tsv $$dataDir$$/chrsForCalcsFile.tsv + $$dataDir$$/fullOrthoGroupsFile $$dataDir$$ + $$gusConfigFile$$ $$organismAbbrev$$.gtf $$dataDir$$ $$projectName$$ @@ -43,6 +45,7 @@ $$geneSourceIdOrthologFile$$ $$chrsForCalcsFile$$ $$gusConfigFile$$ + $$fullOrthoGroupsFile$$ From f71ca1e91a0d26f12e2fc50e67e56b38788dd55e Mon Sep 17 00:00:00 2001 From: Sufen Hu Date: Mon, 18 May 2026 16:35:39 -0400 Subject: [PATCH 11/20] more debug of gusConfigFile --- Main/lib/perl/WorkflowSteps/MakeGtfForGuidedCufflinks.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/Main/lib/perl/WorkflowSteps/MakeGtfForGuidedCufflinks.pm b/Main/lib/perl/WorkflowSteps/MakeGtfForGuidedCufflinks.pm index 171a72d33..686cc9f55 100644 --- a/Main/lib/perl/WorkflowSteps/MakeGtfForGuidedCufflinks.pm +++ b/Main/lib/perl/WorkflowSteps/MakeGtfForGuidedCufflinks.pm @@ -20,7 +20,7 @@ sub run { my $project = $self->getParamValue("project"); my $genomeExtDbRlsSpec = $self->getParamValue("genomeExtDbRlsSpec"); my $cdsOnly = $self->getBooleanParamValue("cdsOnly"); - my $gusConfigFile = $self->getParamValue('gusConfigFile'); + my $gusConfigFile = $self->getWorkflowDataDir() . "/" . $self->getParamValue('gusConfigFile'); my $cmd = "makeGtf.pl --outputFile $workflowDataDir/$gtfDir/$outputFile --project $project --genomeExtDbRlsSpec '$genomeExtDbRlsSpec' --gusConfigFile $gusConfigFile"; From b15c9b1be241e02d8b50482bb613cbfc85751ee2 Mon Sep 17 00:00:00 2001 From: Sufen Hu Date: Mon, 18 May 2026 16:37:41 -0400 Subject: [PATCH 12/20] No need to include the fullOrthoGroupsFile paramValue --- Main/lib/xml/workflowTemplates/dnaseq.xml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Main/lib/xml/workflowTemplates/dnaseq.xml b/Main/lib/xml/workflowTemplates/dnaseq.xml index 469c60324..9b92d0dca 100644 --- a/Main/lib/xml/workflowTemplates/dnaseq.xml +++ b/Main/lib/xml/workflowTemplates/dnaseq.xml @@ -14,7 +14,6 @@ $$dataDir$$/$$organismAbbrev$$.gtf $$dataDir$$/geneSourceIdOrthologFile.tsv $$dataDir$$/chrsForCalcsFile.tsv - $$dataDir$$/fullOrthoGroupsFile $$dataDir$$ @@ -45,7 +44,6 @@ $$geneSourceIdOrthologFile$$ $$chrsForCalcsFile$$ $$gusConfigFile$$ - $$fullOrthoGroupsFile$$ From 6f4620a0f35215f7c2082d7fa779a3ba89f508a6 Mon Sep 17 00:00:00 2001 From: rdemko2332 Date: Wed, 20 May 2026 15:40:28 -0400 Subject: [PATCH 13/20] Removing entry parameter from ngs sample workflow in runNextflowOnCluster --- Main/lib/xml/workflow/runNextflowOnCluster.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Main/lib/xml/workflow/runNextflowOnCluster.xml 
b/Main/lib/xml/workflow/runNextflowOnCluster.xml index 5eeecd543..7c98c3248 100644 --- a/Main/lib/xml/workflow/runNextflowOnCluster.xml +++ b/Main/lib/xml/workflow/runNextflowOnCluster.xml @@ -42,7 +42,7 @@ VEuPathDB/ngs-samples-nextflow true $$organismAbbrev$$ - $$entry$$ + From 2de5def5e6cad63be92ee8f2b2456932af8223f4 Mon Sep 17 00:00:00 2001 From: Sufen Hu Date: Tue, 2 Jun 2026 13:20:06 -0400 Subject: [PATCH 14/20] add gusConfigFile parameter --- Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm b/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm index 7b675ffd2..3a9b8f455 100755 --- a/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm +++ b/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm @@ -24,7 +24,7 @@ sub run { $self->runCmd($test,"echo test > $geneSourceIdOrthologFile"); $self->runCmd($test,"echo test > $chrsForCalcsFile"); } else { - $self->runCmd($test,"runGeneCNVAndPloidyQuery --taxonId $taxonId --geneSourceIdOrthologFile $geneSourceIdOrthologFile --chrsForCalcsFile $chrsForCalcsFile --orthoGroupFile $fullOrthoGroupsFile"); + $self->runCmd($test,"runGeneCNVAndPloidyQuery --taxonId $taxonId --geneSourceIdOrthologFile $geneSourceIdOrthologFile --chrsForCalcsFile $chrsForCalcsFile --orthoGroupFile $fullOrthoGroupsFile --gusConfigFile $gusConfigFile"); } } } From d463146fcc669251ce27267d3a4356caffd8a507 Mon Sep 17 00:00:00 2001 From: Sufen Hu Date: Tue, 2 Jun 2026 13:22:31 -0400 Subject: [PATCH 15/20] use the gusConfigFile provided as a parameter instead of the one in $GUS_HOME --- Main/bin/runGeneCNVAndPloidyQuery | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Main/bin/runGeneCNVAndPloidyQuery b/Main/bin/runGeneCNVAndPloidyQuery index d3d85e0a2..d467ab48d 100755 --- a/Main/bin/runGeneCNVAndPloidyQuery +++ b/Main/bin/runGeneCNVAndPloidyQuery @@ -11,6 +11,7 
@@ my ($gusConfigFile,$taxonId,$orthoGroupFile,$geneSourceIdOrthologFile,$chrsForCa &GetOptions("taxonId=s" => \$taxonId, "orthoGroupFile=s" => \$orthoGroupFile, "geneSourceIdOrthologFile=s" => \$geneSourceIdOrthologFile, + "gusConfigFile=s" => \$gusConfigFile, "chrsForCalcsFile=s" => \$chrsForCalcsFile); my $proteinToGeneSql = " @@ -32,7 +33,7 @@ WHERE aas.subclass_view = 'TranslatedAASequence' my $chrsForCalcsSql = "select ns.source_id from dots.nasequence ns, sres.ontologyterm ot where ot.name = 'chromosome' and ot.ontology_term_id = ns.sequence_ontology_id and ns.taxon_id = $taxonId"; -$gusConfigFile = $ENV{GUS_HOME}."/config/gus.config"; +#$gusConfigFile = $ENV{GUS_HOME}."/config/gus.config"; die "Config file $gusConfigFile does not exist" unless -e $gusConfigFile; my @properties = (); From 6b506bab958ed208eef4069888d3333be1fd1943 Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Wed, 3 Jun 2026 14:18:47 -0400 Subject: [PATCH 16/20] gate clusterOptions on lsf executor; add maxMemoryGigs param for bwaMem Co-Authored-By: Claude Sonnet 4.6 --- .../WorkflowSteps/MakeDnaSeqNextflowConfig.pm | 66 +++++++++++++++---- 1 file changed, 52 insertions(+), 14 deletions(-) diff --git a/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm index 9fc9d4067..afc97c2be 100644 --- a/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm +++ b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm @@ -36,32 +36,70 @@ sub run { my $winLen = $self->getConfig("winLen"); my $bwaThreads = $self->getConfig("bwaThreads"); - my $executor = $self->getClusterExecutor(); - my $queue = $self->getClusterQueue(); + my $executor = $self->getClusterExecutor(); + my $queue = $self->getClusterQueue(); + my $maxMemoryGigs = eval { $self->getParamValue("maxMemoryGigs") }; + + my $isLsf = lc($executor) eq 'lsf'; + + # runFreebayes process block + my $freebayesBlock = " withName: 'runFreebayes' {\n"; + $freebayesBlock .= " maxRetries = 
1\n"; + $freebayesBlock .= " errorStrategy = { task.exitStatus in 130..140 ? 'retry' : 'finish' }\n"; + if ($isLsf) { + $freebayesBlock .= " clusterOptions = {\n"; + $freebayesBlock .= " (task.attempt > 1 && task.exitStatus in 130..140)\n"; + $freebayesBlock .= " ? '-M 12000 -R \"rusage [mem=12000] span[hosts=1]\"'\n"; + $freebayesBlock .= " : '-M 4000 -R \"rusage [mem=4000] span[hosts=1]\"'\n"; + $freebayesBlock .= " }\n"; + } + $freebayesBlock .= " }\n"; + + # bwaMem process block + my $bwaRetries = defined($maxMemoryGigs) ? 0 : 1; + my $bwaBlock = " withName: 'bwaMem' {\n"; + $bwaBlock .= " maxRetries = $bwaRetries\n"; + if (!defined($maxMemoryGigs)) { + $bwaBlock .= " errorStrategy = { task.exitStatus in 130..140 ? 'retry' : 'finish' }\n"; + } + if ($isLsf) { + if (defined($maxMemoryGigs)) { + my $memMb = int($maxMemoryGigs * 1024); + $bwaBlock .= " clusterOptions = '-M $memMb -R \"rusage [mem=$memMb] span[hosts=1]\"'\n"; + } else { + $bwaBlock .= " clusterOptions = {\n"; + $bwaBlock .= " (task.attempt > 1 && task.exitStatus in 130..140)\n"; + $bwaBlock .= " ? '-M 12000 -R \"rusage [mem=12000] span[hosts=1]\"'\n"; + $bwaBlock .= " : '-M 4000 -R \"rusage [mem=4000] span[hosts=1]\"'\n"; + $bwaBlock .= " }\n"; + } + } + $bwaBlock .= " }\n"; if ($undo) { $self->runCmd(0, "rm -rf $nextflowConfigFile"); } else { open(F, ">", $nextflowConfigFile) or die "$! 
:Can't open config file '$nextflowConfigFile' for writing"; - print F -" + print F " params { - samplesheet = \"$digestedSampleSheet\" - bwaThreads = $bwaThreads - minCoverage = $minCoverage - genomeFastaFile = \"$digestedGenomeFile\" - gtfFile = \"$digestedGtfFile\" - footprintFile = \"$digestedFootprintFile\" - winLen = $winLen - ploidy = $ploidy - outputDir = \"$digestedResultsDir\" + samplesheet = \"$digestedSampleSheet\" + bwaThreads = $bwaThreads + minCoverage = $minCoverage + genomeFastaFile = \"$digestedGenomeFile\" + gtfFile = \"$digestedGtfFile\" + footprintFile = \"$digestedFootprintFile\" + winLen = $winLen + ploidy = $ploidy + outputDir = \"$digestedResultsDir\" geneSourceIdOrthologFile = \"$digestedOrthologFile\" - chrsForCalcFile = \"$digestedChrsForCalcFile\" + chrsForCalcFile = \"$digestedChrsForCalcFile\" } process { executor = '$executor' queue = '$queue' +$freebayesBlock +$bwaBlock } singularity { From 448d72fa73c9f40b7a0e95b6616cd75eb0ba8283 Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Wed, 3 Jun 2026 14:29:35 -0400 Subject: [PATCH 17/20] dynamically size bwaMem memory from genome fasta using (Gb*3.3)+2 rule Co-Authored-By: Claude Sonnet 4.6 --- .../WorkflowSteps/MakeDnaSeqNextflowConfig.pm | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm index afc97c2be..3c304d957 100644 --- a/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm +++ b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm @@ -36,12 +36,33 @@ sub run { my $winLen = $self->getConfig("winLen"); my $bwaThreads = $self->getConfig("bwaThreads"); - my $executor = $self->getClusterExecutor(); - my $queue = $self->getClusterQueue(); - my $maxMemoryGigs = eval { $self->getParamValue("maxMemoryGigs") }; + my $executor = $self->getClusterExecutor(); + my $queue = $self->getClusterQueue(); + my $maxMemoryGigs = eval { 
$self->getParamValue("maxMemoryGigs") }; + my $genomeFastaFile = eval { $self->getParamValue("genomeFastaFile") }; my $isLsf = lc($executor) eq 'lsf'; + # Dynamic bwaMem memory: (genome_Gb * 3.3) + 2 GB, rounded up to next power of 2 for safety + my $bwaDefaultMemMb = 4 * 1024; # fallback: 4 GB + if (defined($genomeFastaFile)) { + my $genomeSize = 0; + open(my $fh, "<", $genomeFastaFile) or die "Cannot open genome fasta '$genomeFastaFile': $!"; + while (<$fh>) { + next if /^>/; + chomp; + $genomeSize += length($_); + } + close($fh); + my $genomeGb = $genomeSize / 1_000_000_000; + my $rawGb = ($genomeGb * 3.3) + 2; + my $safeGb = $rawGb * 1.25; + my $memGb = 1; + $memGb *= 2 while $memGb < $safeGb; + $bwaDefaultMemMb = $memGb * 1024; + } + my $bwaRetryMemMb = $bwaDefaultMemMb * 2; + # runFreebayes process block my $freebayesBlock = " withName: 'runFreebayes' {\n"; $freebayesBlock .= " maxRetries = 1\n"; @@ -69,8 +90,8 @@ sub run { } else { $bwaBlock .= " clusterOptions = {\n"; $bwaBlock .= " (task.attempt > 1 && task.exitStatus in 130..140)\n"; - $bwaBlock .= " ? '-M 12000 -R \"rusage [mem=12000] span[hosts=1]\"'\n"; - $bwaBlock .= " : '-M 4000 -R \"rusage [mem=4000] span[hosts=1]\"'\n"; + $bwaBlock .= " ? 
'-M $bwaRetryMemMb -R \"rusage [mem=$bwaRetryMemMb] span[hosts=1]\"'\n"; + $bwaBlock .= " : '-M $bwaDefaultMemMb -R \"rusage [mem=$bwaDefaultMemMb] span[hosts=1]\"'\n"; $bwaBlock .= " }\n"; } } From 269c121a53c40392dcac708f120a87f912c86647 Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Wed, 3 Jun 2026 14:31:10 -0400 Subject: [PATCH 18/20] pass genomeFastaFile to nextflowConfig step for dynamic memory sizing Co-Authored-By: Claude Sonnet 4.6 --- Main/lib/xml/workflow/snpAndCnvDNASeq.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml index 076ee3cc9..9b20bbb13 100644 --- a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml +++ b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml @@ -111,6 +111,7 @@ $$dataDir$$ $$analysisDirectory$$/ngs-samples-results/samplesheet.csv $$genomeSymLink$$ + $$genomeFastaFile$$ $$repeatMaskedBedSymLink$$ $$gtfSymLink$$ $$footprintFileSymLink$$ From d28c3195ad04c28790b206b900c31256713b4d13 Mon Sep 17 00:00:00 2001 From: Sufen Hu Date: Wed, 3 Jun 2026 23:40:39 -0400 Subject: [PATCH 19/20] debug $genomeFastaFile --- Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm index 3c304d957..67634db5c 100644 --- a/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm +++ b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm @@ -39,7 +39,7 @@ sub run { my $executor = $self->getClusterExecutor(); my $queue = $self->getClusterQueue(); my $maxMemoryGigs = eval { $self->getParamValue("maxMemoryGigs") }; - my $genomeFastaFile = eval { $self->getParamValue("genomeFastaFile") }; + my $genomeFastaFile = $self->getWorkflowDataDir() . "/" . 
eval { $self->getParamValue("genomeFastaFile") }; my $isLsf = lc($executor) eq 'lsf'; From 13a14b6a9c33b6aa79ec3eeade63fb8fbe01e803 Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Fri, 5 Jun 2026 16:35:22 -0400 Subject: [PATCH 20/20] add step to copy DNASeq bigwig files to webservices directory Adds CopyDnaseqBigwigToWebSvc workflow step and wires it into snpAndCnvDNASeq.xml after processSingleExperimentOnCluster. Iterates per-sample subdirectories in resultsDirectory and copies .bw files to $websiteFilesDir/.../dnaseq/bigwig/$experimentDatasetName/$sampleName/. Co-Authored-By: Claude Sonnet 4.6 --- .../WorkflowSteps/CopyDnaseqBigwigToWebSvc.pm | 71 +++++++++++++++++++ Main/lib/xml/workflow/snpAndCnvDNASeq.xml | 9 ++++ 2 files changed, 80 insertions(+) create mode 100644 Main/lib/perl/WorkflowSteps/CopyDnaseqBigwigToWebSvc.pm diff --git a/Main/lib/perl/WorkflowSteps/CopyDnaseqBigwigToWebSvc.pm b/Main/lib/perl/WorkflowSteps/CopyDnaseqBigwigToWebSvc.pm new file mode 100644 index 000000000..64cca7105 --- /dev/null +++ b/Main/lib/perl/WorkflowSteps/CopyDnaseqBigwigToWebSvc.pm @@ -0,0 +1,71 @@
+package ApiCommonWorkflow::Main::WorkflowSteps::CopyDnaseqBigwigToWebSvc;
+
+@ISA = (ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep);
+use strict;
+use warnings;
+use ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep;
+use ApiCommonWorkflow::Main::Util::OrganismInfo;
+
+# Copy the per-sample bigwig (*.bw) files of one DNA-seq experiment from the
+# workflow data directory into the website files directory; on undo, remove
+# the experiment's copy instead.
+#
+# $test is forwarded to runCmd, getWebsiteFilesDir and getOrganismInfo; $undo
+# selects the removal branch.
+# Step params read: copyFromDir and gusConfigFile (both relative to the
+# workflow data dir), organismAbbrev, relativeDir, experimentDatasetName.
+# Files land in
+#   $websiteFilesDir/$relativeDir/$organismNameForFiles/dnaseq/bigwig/$experimentDatasetName/$sample/
+# Dies when experimentDatasetName is empty, when the results directory cannot
+# be opened, or (outside test mode) when it has no sample subdirectories.
+sub run {
+  my ($self, $test, $undo) = @_;
+
+  my $copyFromDir = $self->getParamValue('copyFromDir');
+  my $organismAbbrev = $self->getParamValue('organismAbbrev');
+  my $relativeDir = $self->getParamValue('relativeDir');
+  my $experimentDatasetName = $self->getParamValue('experimentDatasetName');
+  my $gusConfigFile = $self->getParamValue('gusConfigFile');
+
+  # The undo branch runs "rm -rf" on a path ending in this name; with an empty
+  # name that path would be the bigwig directory shared by all experiments.
+  die "Param 'experimentDatasetName' must not be empty"
+    unless defined($experimentDatasetName) && length($experimentDatasetName);
+
+  my $workflowDataDir = $self->getWorkflowDataDir();
+  $gusConfigFile = "$workflowDataDir/$gusConfigFile";
+  my $websiteFilesDir = $self->getWebsiteFilesDir($test);
+
+  my $organismNameForFiles =
+    $self->getOrganismInfo($test, $organismAbbrev, $gusConfigFile)->getNameForFiles();
+
+  my $experimentCopyToDir = "$websiteFilesDir/$relativeDir/$organismNameForFiles/dnaseq/bigwig/$experimentDatasetName";
+  my $sourceDir = "$workflowDataDir/$copyFromDir";
+
+  if ($undo) {
+    # Undo only removes what this step created, so it does not check that the
+    # results directory still exists.
+    $self->runCmd(0, "rm -rf $experimentCopyToDir");
+  } else {
+    $self->testInputFile('copyFromDir', $sourceDir);
+    $self->runCmd($test, "mkdir -p $experimentCopyToDir");
+
+    # Every non-hidden subdirectory of the results directory is one sample;
+    # sorted so the copy order does not depend on readdir order.
+    opendir(my $dh, $sourceDir) or die "Cannot open results directory '$sourceDir': $!";
+    my @samples = sort grep { !/^\./ && -d "$sourceDir/$_" } readdir($dh);
+    closedir($dh);
+
+    # NOTE(review): presumably a test run has no real per-sample output from the
+    # upstream step, so an empty list is tolerated when $test is set -- confirm.
+    die "No sample subdirectories found in '$sourceDir'" unless @samples || $test;
+
+    foreach my $sample (@samples) {
+      my $sampleCopyToDir = "$experimentCopyToDir/$sample";
+      $self->runCmd($test, "mkdir -p $sampleCopyToDir");
+      $self->runCmd($test, "cp $sourceDir/$sample/*.bw $sampleCopyToDir/");
+    }
+  }
+}
+
+1;
diff --git a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml index 9b20bbb13..0a3ba564f 100644 --- a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml +++ b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml @@ -146,5 +146,14 @@ + + $$resultsDirectory$$ + $$organismAbbrev$$ + $$relativeWebServicesDir$$ + $$experimentDatasetName$$ + $$gusConfigFile$$ + + +