Skip to content

Commit

Permalink
improving validation subworkflow
Browse files Browse the repository at this point in the history
  • Loading branch information
DOH-JDJ0303 committed May 1, 2024
1 parent e040a71 commit a756e8c
Show file tree
Hide file tree
Showing 11 changed files with 153 additions and 46 deletions.
2 changes: 1 addition & 1 deletion bin/validate.sh → bin/val_gather.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

# validate.sh v1.0
# val_gather.sh v1.0
# Author: Jared Johnson, [email protected]

#----- INPUTS -----#
Expand Down
5 changes: 5 additions & 0 deletions bin/contig-pairs.sh → bin/val_pair.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
#!/bin/bash

# val_pair.sh v1.0
# Author: Jared Johnson, [email protected]

#----- INPUTS -----#
F1=$1
F2=$2
PREFIX=$3

#
cat $F1 | awk '{print $1}' > f1.fa
cat $F2 | awk '{print $1}' > f2.fa

Expand Down
8 changes: 4 additions & 4 deletions bin/val_report.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ library(tidyr)
# calculate global metrics
global_metrics <- function(results_file, pairs_file, metric){
if((file.exists(results_file) && file.exists(pairs_file))){
results <- read_csv(results_file) %>%
.$Result
n_missing_extra <- read_csv(pairs_file, col_names = F) %>%
df <- read_csv(results_file)
results <- df[,ncol(df)] %>% unlist()
n_missing_extra <- read_tsv(pairs_file, col_names = F) %>%
rename(seq1=1,
seq2=2) %>%
filter(seq1 == "null" | seq2 == "null") %>%
Expand All @@ -29,7 +29,7 @@ global_metrics <- function(results_file, pairs_file, metric){
}else(return(data.frame("metric" = metric)))
}

acc <- global_metrics("accuracy_null_results.csv","accuracy_null_pairs.csv", "Accuracy")
acc <- global_metrics("accuracy_results.csv","accuracy_pairs.csv", "Accuracy")
inter <- global_metrics("precision_inter_results.csv","precision_inter_pairs.csv", "Inter-Assay Reproducility")
intra <- global_metrics("precision_intra_results.csv","precision_intra_pairs.csv", "Intra-Assay Reproducility")

Expand Down
31 changes: 30 additions & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -242,5 +242,34 @@ process {
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

// Config for the PAIR process (modules/local/val_pair.nf): publishes the
// contig-pair listings (*.pairs.txt) it produces into <outdir>/validation.
withName: 'PAIR' {
ext.args = ""
// NOTE(review): empty ext.when closure — presumably "always run"; confirm intent.
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/validation" },
pattern: "*.pairs.txt"
]
}
// Config for the GATHER process (modules/local/val_gather.nf): publishes the
// per-metric CSV outputs into <outdir>/validation.
withName: 'GATHER' {
ext.args = ""
// NOTE(review): empty ext.when closure — presumably "always run"; confirm intent.
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/validation" },
pattern: "*.csv"
]
}
// Config for the REPORT process (modules/local/val_report.nf): publishes only
// the final validation-report.csv into <outdir>/validation.
withName: 'REPORT' {
ext.args = ""
// NOTE(review): empty ext.when closure — presumably "always run"; confirm intent.
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/validation" },
pattern: "validation-report.csv"
]
}
}
16 changes: 15 additions & 1 deletion modules/local/irma.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ process IRMA {
path module_template

output:
tuple val(meta), path('results/*.fasta'), emit: consensus
tuple val(meta), path('results/*.fa'), emit: consensus
tuple val(meta), path('results/*.bam'), emit: bam
tuple val(meta), path('results/logs/'), emit: logs
tuple val(meta), path('results/figures/'), emit: figures
Expand All @@ -26,13 +26,27 @@ process IRMA {
"""
# determine IRMA path
irma_path=\$(which IRMA)
# create module
mod=\$(shuf -er -n20 {A..Z} {a..z} {0..9} | tr -d '\n')
mv ${module_template} \${irma_path}_RES/modules/\${mod}
# combine references into single file
cat ${refs} > \${irma_path}_RES/modules/\${mod}/reference/consensus.fasta
# run IRMA
IRMA \${mod} ${reads[0]} ${reads[1]} results
# update fasta names and headers
for f in \$(ls results/*.fasta)
do
file=\${f##*/}
ref_id=\${file%.fasta}
PREFIX="${prefix}_\${ref_id}"
cat \${f} | sed "s/>.*/>\${PREFIX}/g" > results/\${PREFIX}.fa
done
# clean up
rm -r \${irma_path}_RES/modules/\${mod}
"""
Expand Down
4 changes: 3 additions & 1 deletion modules/local/ivar_consensus.nf
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ process IVAR_CONSENSUS {

script:
def args = task.ext.args ?: ''
prefix = "${meta.id}-${ref_id}"
prefix = "${meta.id}_${ref_id}"

"""
# setup for pipe
Expand All @@ -32,6 +32,8 @@ process IVAR_CONSENSUS {
-t ${params.ivar_t} \\
-q ${params.ivar_q} \\
${args}
sed -i 's/>.*/>${prefix}/g' ${prefix}.fa
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
4 changes: 2 additions & 2 deletions modules/local/val_metrics.nf → modules/local/val_gather.nf
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
process METRICS {
process GATHER {
label 'process_low'

conda "bioconda::mafft=7.520"
Expand All @@ -20,7 +20,7 @@ process METRICS {
def args = task.ext.args ?: ''
"""
mafft --auto ${fasta} > ${fasta.baseName}.aln
validate.sh ${fasta.baseName}.aln "${metric}"
val_gather.sh ${fasta.baseName}.aln "${metric}"
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
24 changes: 0 additions & 24 deletions modules/local/val_join.nf

This file was deleted.

2 changes: 1 addition & 1 deletion modules/local/val_pair.nf
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ process PAIR {
cat ${seqs2} > seq2.fa
# create contig pairs
contig-pairs.sh seq1.fa seq2.fa "${id}"
val_pair.sh seq1.fa seq2.fa "${id}"
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
2 changes: 1 addition & 1 deletion subworkflows/local/assemble.nf
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ workflow ASSEMBLE {
.out
.consensus
.transpose()
.map{ meta, consensus -> [meta, consensus.getSimpleName(), consensus] }
.map{ meta, consensus -> [meta, consensus.getSimpleName().replace(meta.id+'_', ''), consensus] }
.set{ ch_consensus }
}

Expand Down
101 changes: 91 additions & 10 deletions subworkflows/local/validate.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
//

include { PAIR } from '../../modules/local/val_pair'
include { METRICS } from '../../modules/local/val_metrics'
include { JOIN } from '../../modules/local/val_join'
include { GATHER } from '../../modules/local/val_gather'
include { REPORT } from '../../modules/local/val_report'


Expand Down Expand Up @@ -64,26 +63,108 @@ workflow VALIDATE {

ch_accuracy.concat(ch_inter_group).concat(ch_intra_group).set{ ch_datasets }

/*
=============================================================================================================================
DETERMINE PAIRWISE COMPARISONS
=============================================================================================================================
*/
PAIR (
ch_datasets
)

METRICS (
/*
=============================================================================================================================
GATHER METRICS
=============================================================================================================================
*/
GATHER (
PAIR.out.fasta.transpose()
)

METRICS
/*
=============================================================================================================================
JOIN ALL METRICS/PAIRS INTO SINGLE CHANNEL
=============================================================================================================================
*/
// Accuracy
GATHER
.out
.result
.groupTuple(by: [0,1])
.join(PAIR.out.pairs.groupTuple(by: [0,1]), by: [0,1])
.filter{ metric, type, results -> metric == "accuracy" }
.map{ metric, type, results -> results }
.splitText()
.filter(line -> line != "Sample,Truth,TP,FP,FN,Result\n")
.collectFile(name: "accuracy_results.csv")
.set{ ch_acc_res }

PAIR
.out
.pairs
.filter{ metric, type, pairs -> metric == "accuracy" }
.map{ metric, type, pairs -> pairs }
.splitText()
.collectFile(name: "accuracy_pairs.csv")
.set{ ch_acc_pair }

// Inter-assay Precision
GATHER
.out
.result
.filter{ metric, type, results -> metric == "precision" && type == "inter" }
.map{ metric, type, results -> results }
.splitText()
.filter(line -> line != "Sample1,Sample2,TP,PP,Result\n")
.collectFile(name: "precision_inter_results.csv")
.set{ ch_prec_inter_res }


PAIR
.out
.pairs
.filter{ metric, type, pairs -> metric == "precision" && type == "inter" }
.map{ metric, type, pairs -> pairs }
.splitText()
.collectFile(name: "precision_inter_pairs.csv")
.set{ ch_prec_inter_pair }

// Intra-assay Precision
GATHER
.out
.result
.filter{ metric, type, results -> metric == "precision" && type == "intra" }
.map{ metric, type, results -> results }
.splitText()
.filter(line -> line != "Sample1,Sample2,TP,PP,Result\n")
.collectFile(name: "precision_intra_results.csv")
.set{ ch_prec_intra_res }

PAIR
.out
.pairs
.filter{ metric, type, pairs -> metric == "precision" && type == "intra" }
.map{ metric, type, pairs -> pairs }
.splitText()
.collectFile(name: "precision_intra_pairs.csv")
.set{ ch_prec_intra_pair }

// Combine all
ch_acc_res
.concat(ch_acc_pair)
.concat(ch_prec_inter_res)
.concat(ch_prec_inter_pair)
.concat(ch_prec_intra_res)
.concat(ch_prec_intra_pair)
.flatten()
.collect()
.set{ ch_results }
JOIN (
ch_results
)

/*
=============================================================================================================================
GENERATE REPORT
=============================================================================================================================
*/
REPORT (
JOIN.out.results.flatten().collect()
ch_results
)

//emit:
Expand Down

0 comments on commit a756e8c

Please sign in to comment.