cleaning up and adding version info

DOH-JDJ0303 · Mar 29, 2024 · 7d15347 · 7d15347
1 parent 48cf569
commit 7d15347
Show file tree

Hide file tree

Showing 7 changed files with 116 additions and 47 deletions.
diff --git a/bin/consensus.sh b/bin/consensus.sh
@@ -1,4 +1,13 @@
 #!/bin/bash
+version="1.0"
+
+# consensus.sh
+# Author: Jared Johnson, [email protected]
+
+set -o pipefail
+
+# get version info
+if [ "$1" == "version" ]; then echo "${version}" && exit 0; fi
 
 # input
 name=$1

diff --git a/bin/input-qc.sh b/bin/input-qc.sh
@@ -1,6 +1,14 @@
 #!/bin/bash
+version="1.0"
+
+# input-qc.sh
+# Author: Jared Johnson, [email protected]
+
 set -o pipefail
 
+# get version info
+if [ "$1" == "version" ]; then echo "${version}" && exit 0; fi
+
 # input
 fasta=$1
 prefix=$2

diff --git a/bin/summary.R b/bin/summary.R
@@ -1,7 +1,8 @@
 #!/usr/bin/env Rscript
+version <- "1.0"
 
-#---- LIBRARIES ----#
-library(tidyverse)
+# summary.R
+# Author: Jared Johnson, [email protected]
 
 #---- ARGUMENTS ----#
 args <- commandArgs(trailingOnly = TRUE)
@@ -11,6 +12,15 @@ fastani_ava_file <- args[3]
 fastani_seeds_file <- args[4]
 seeds_file <- args[5]
 
+#---- VERSION ----#
+if (identical(clusters_file, "version")) { # identical() is FALSE (not NA) when the argument is missing
+  cat(version, sep = "\n")
+  quit(status = 0)
+}
+
+#---- LIBRARIES ----#
+library(tidyverse)
+
 #---- FUNCTIONS ----#
 basename_fa <- function(path){
     result <- basename(path) %>%

diff --git a/modules/local/consensus.nf b/modules/local/consensus.nf
@@ -22,5 +22,10 @@ process CONSENSUS {
     # collect consensus size info
     length=\$(cat ${prefix}.fa | grep -v '>' | tr -d '\n\t ' | wc -c)
     echo "${prefix},\${length}" > ${prefix}_length.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        consensus: \$(consensus.sh version)
+    END_VERSIONS
     """
 }
diff --git a/modules/local/input-qc.nf b/modules/local/input-qc.nf
@@ -28,5 +28,10 @@ process INPUT_QC {
     input-qc.sh \${seqs} ${prefix} "${expected_length}" "${params.len_threshold}"
     # set sequence count
     seq_count=\$(cat ${prefix}-qc-summary.csv | cut -f 5 -d ',' | grep -v 'filter4' | tr -d '\t\r\n ')
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        input-qc: \$(input-qc.sh version)
+    END_VERSIONS
     """
 }
diff --git a/modules/local/summary.nf b/modules/local/summary.nf
@@ -23,5 +23,10 @@ process SUMMARY {
     cat ${clusters} | grep -v 'seq,taxa,segment,cluster' > clusters-no-header.csv
     # run script
     summary.R clusters-no-header.csv ${lengths} ${ani_ava} ${ani_seeds} ${seeds}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        summary: \$(summary.R version)
+    END_VERSIONS
     """
 }
diff --git a/workflows/epitome.nf b/workflows/epitome.nf
@@ -42,7 +42,6 @@ include { FASTANI_AVA   } from '../modules/local/fastani'
 include { FASTANI_SEEDS } from '../modules/local/fastani'
 include { SUMMARY       } from '../modules/local/summary'
 
-
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
@@ -72,34 +71,54 @@ workflow EPITOME {
 
     ch_versions = Channel.empty()
 
-    Channel.fromPath(params.input)
+    /*
+    =============================================================================================================================
+        LOAD SAMPLESHEET
+    =============================================================================================================================
+    */
+
+    Channel
+        .fromPath(params.input)
         .splitCsv(header:true)
         .map{ tuple(it.taxa, it.segment, file(it.assembly, checkIfExists: true), it.length) }
-        .set{ manifest } 
+        .set{ manifest }
+
+    /*
+    =============================================================================================================================
+        QUALITY FILTER SEQUENCES
+    =============================================================================================================================
+    */
 
-    // MODULE: Filter low quality sequences
+    // MODULE: Filter low quality sequences & remove duplicates
     INPUT_QC(
         manifest
     )
 
-    //
+    /*
+    =============================================================================================================================
+        CLUSTER SEQUENCES 
+    =============================================================================================================================
+    */
     // MODULE: Run Mash
-    //
     MASH (
         INPUT_QC.out.assemblies
     )
     ch_versions = ch_versions.mix(MASH.out.versions.first())
 
-    // MODULE: CLUSTER
-    MASH.out.dist.filter{ taxa, segment, dist, count -> count.toInteger() <= 2000 }.set{ small_datasets }
-    MASH.out.dist.filter{ taxa, segment, dist, count -> count.toInteger() > 2000 }.set{ large_datasets }
+    // MODULE: Cluster sequences with cutree
+    // Small datasets
     CLUSTER (
-        small_datasets
+        MASH.out.dist.filter{ taxa, segment, dist, count -> count.toInteger() <= 2000 }
     )
+    ch_versions = ch_versions.mix(CLUSTER.out.versions.first())
+
+    // Large datasets - requires much more memory!
     CLUSTER_LARGE (
-        large_datasets
+        MASH.out.dist.filter{ taxa, segment, dist, count -> count.toInteger() > 2000 }
     )
+    ch_versions = ch_versions.mix(CLUSTER_LARGE.out.versions.first())
 
+    // Combine small and large dataset cluster results and add clean sequence paths
     CLUSTER
         .out
         .results
@@ -111,52 +130,78 @@ workflow EPITOME {
         .map{ taxa, segment, cluster, contigs, seqs, count -> [ taxa, segment, cluster, contigs, seqs, contigs.size() ] }
         .set{ clusters }
 
-    // MODULE: SEQTK_SUBSEQ
+    // MODULE: Split clusters into multi-fasta files
     SEQTK_SUBSEQ(
         clusters
     )
+    ch_versions = ch_versions.mix(SEQTK_SUBSEQ.out.versions.first())
+
+    /*
+    =============================================================================================================================
+        ALIGN SEQUENCE CLUSTERS 
+    =============================================================================================================================
+    */
 
-    // MODULE: MAFFT
+    // MODULE: Align clustered sequences with mafft - only performed on clusters containing more than one sequence 
     MAFFT(
         SEQTK_SUBSEQ
             .out
             .sequences
             .filter{ taxa, segment, cluster, seqs, count -> count > 1 }
             .map{ taxa, segment, cluster, seqs, count -> [ taxa, segment, cluster, seqs ] }
     )
-    // recombine with singletons
+    ch_versions = ch_versions.mix(MAFFT.out.versions.first())
+
+    // recombine with singletons (i.e., clusters containing 1 sequence)
     SEQTK_SUBSEQ
         .out
         .sequences
         .filter{ taxa, segment, cluster, seqs, count -> count == 1 }
         .map{ taxa, segment, cluster, seqs, count -> [ taxa, segment, cluster, seqs ] }
         .concat(MAFFT.out.fa)
         .set{ alignments }
-
+
+    /*
+    =============================================================================================================================
+        CREATE CONSENSUS 
+    =============================================================================================================================
+    */
     // MODULE: Create consensus sequences
     CONSENSUS(
         alignments
     )
+    ch_versions = ch_versions.mix(CONSENSUS.out.versions.first())
 
-    // MODULE: Run blastn
+    /*
+    =============================================================================================================================
+        GATHER DATA ON CONSENSUS SEQUENCES
+    =============================================================================================================================
+    */
+    // MODULE: Determine average nucleotide identity between consensus sequences
     FASTANI_AVA (
         CONSENSUS.out.fa.groupTuple(by: [0,1]).map{ taxa, segment, cluster, assembly, length -> [ taxa, segment, assembly, length.min() ] }
     )
-
+    ch_versions = ch_versions.mix(FASTANI_AVA.out.versions.first())
+    // Classify consensus sequences based on supplied seed sequences - if supplied
     if(params.seeds){
         Channel
             .fromPath(params.seeds)
             .splitCsv(header:true)
             .map{ tuple(it.ref, file(it.assembly)) }
             .set{ seeds }
-        // MODULE: Run blastn
+        // MODULE: Determine average nucleotide identity between the consensus sequences and seed sequences
         FASTANI_SEEDS (
             CONSENSUS.out.fa.map{ taxa, segment, cluster, assembly, length -> assembly }.collect(),
             seeds.map{ ref, assembly -> assembly }.collect()
-        )
+        )    
+        ch_versions = ch_versions.mix(FASTANI_SEEDS.out.versions.first())
     }
 
-
+    /*
+    =============================================================================================================================
+        SUMMARIZE RESULTS
+    =============================================================================================================================
+    */
     // MODULE: Create summary
     SUMMARY(
         CLUSTER.out.results.concat(CLUSTER_LARGE.out.results).splitText().collectFile(name: "all-clusters.csv"),
@@ -165,34 +210,16 @@ workflow EPITOME {
         params.seeds ? FASTANI_SEEDS.out.ani : [],
         params.seeds ? file(params.seeds) : []
     )
-    CUSTOM_DUMPSOFTWAREVERSIONS (
-        ch_versions.unique().collectFile(name: 'collated_versions.yml')
-    )
+    ch_versions = ch_versions.mix(SUMMARY.out.versions.first())
 
     /*
-    //
-    // MODULE: MultiQC
-    //
-    workflow_summary    = WorkflowRefmaker.paramsSummaryMultiqc(workflow, summary_params)
-    ch_workflow_summary = Channel.value(workflow_summary)
-
-    methods_description    = WorkflowRefmaker.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params)
-    ch_methods_description = Channel.value(methods_description)
-
-    ch_multiqc_files = Channel.empty()
-    ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
-    ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml'))
-    ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
-
-    MULTIQC (
-        ch_multiqc_files.collect(),
-        ch_multiqc_config.toList(),
-        ch_multiqc_custom_config.toList(),
-        ch_multiqc_logo.toList()
-    )
-    multiqc_report = MULTIQC.out.report.toList()
-
+    =============================================================================================================================
+        NEXTFLOW DEFAULTS
+    =============================================================================================================================
     */
+    CUSTOM_DUMPSOFTWAREVERSIONS (
+        ch_versions.unique().collectFile(name: 'collated_versions.yml')
+    )
 }
 
 /*