kraken2 --> sourmash; fast and accurate mode

DOH-JDJ0303 · Feb 6, 2024 · 05aa846 · 05aa846
1 parent b9fb6ee
commit 05aa846
Show file tree

Hide file tree

Showing 52 changed files with 1,906 additions and 74,021 deletions.
diff --git a/assets/databases/NCBI_Assembly_stats_2023-11-29.txt b/assets/databases/NCBI_Assembly_stats_2023-11-29.txt
diff --git a/assets/databases/genbank-2022.03-viral-k21.zip b/assets/databases/genbank-2022.03-viral-k21.zip
diff --git a/assets/databases/genbank-2022.03-viral.lineages.csv.gz b/assets/databases/genbank-2022.03-viral.lineages.csv.gz
diff --git a/bin/combine-summary.R b/bin/combine-summary.R
@@ -12,6 +12,7 @@ library(tidyr)
 
 # get list of summary lines
 files <- list.files("./", pattern = ".csv", full.names = T)
+print(files)
 # combine all lines
 df <- do.call(bind_rows, lapply(files, FUN=read.csv))
 # calculate depth of coverage, if any of the samples mapped

diff --git a/bin/k2_summary.R b/bin/k2_summary.R
diff --git a/bin/refs_summary.R → bin/ref-select_accurate.R b/bin/refs_summary.R → bin/ref-select_accurate.R
@@ -17,6 +17,7 @@ refs_comp <- args[2]
 sample <- args[3]
 gen_frac <- args[4]
 
+
 # load alignment file
 paf <- read_tsv(paf_file, col_names = F) %>%
   rename(CONTIG = 6,

diff --git a/bin/sm_summary.R b/bin/sm_summary.R
@@ -0,0 +1,21 @@
+#!/usr/bin/env Rscript
+
+# check for required packages
+list.of.packages <- c("readr", "dplyr","tidyr")
+new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
+if(length(new.packages)) install.packages(new.packages)
+
+# load packages
+library(readr)
+library(dplyr)
+library(tidyr)
+
+# load args
+args <- commandArgs(trailingOnly=T)
+sm_output <- args[1]
+sample <- args[2]
+
+# load Sourmash gather output
+df.sm <- read.csv(sm_output)
+colnames(df.sm)
+#  write.csv(file = paste0(sample,".k2-summary.csv"), quote = F, row.names = F)
diff --git a/bin/summaryline.R b/bin/summaryline.R
@@ -13,45 +13,21 @@ library(tidyr)
 # load args
 args <- commandArgs(trailingOnly=T)
 fastp2tbl <- args[1]
-k2_summary <- args[2]
+sm_summary <- args[2]
 samtoolstats2tbl <- args[3]
 assembly_stats <- args[4]
 sample <- args[5]
 ref <- args[6]
-refs_meta <- args[7]
-snvs <- args[8]
+snvs <- args[7]
 
 
 #----- Sample ID & Reference
 df.summaryline <- data.frame(ID = sample, REFERENCE = ref)
 
-#----- Add reference metadata -----#
-if(file.exists(refs_meta)){
-  df.refs_meta <- read.csv(refs_meta)
-  df.summaryline <- merge(df.summaryline, df.refs_meta, by = "REFERENCE", all.x = T)
-}
-
 #----- Full Read Stats -----#
 df.fastp2tbl <- read_csv(fastp2tbl)
 df.summaryline <- cbind(df.summaryline, df.fastp2tbl)
 
-#----- Kraken2 Summary -----#
-# load summary
-df.k2 <- read.csv(k2_summary)
-# determine main taxa - i.e., those present at >= 1% relative abundance
-main_taxa <- df.k2 %>%
-  filter(EST_PER_ABUND >= 1) %>%
-  .$TAXA %>%
-  paste(collapse = "; ")
-# determine other taxa - i.e., those preset at < 1% relative abundance
-other_taxa <- df.k2 %>%
-  filter(EST_PER_ABUND < 1) %>%
-  .$TAXA %>%
-  paste(collapse = "; ")
-# combine into a table
-df.k2_main_other <- data.frame(MAIN_TAXA = main_taxa, OTHER_TAXA = other_taxa)
-df.summaryline <- cbind(df.summaryline, df.k2_main_other)
-
 #----- Mapped Reads Stats -----#
 if(file.exists(samtoolstats2tbl)){
   df.samtoolstats2tbl <- read.csv(samtoolstats2tbl)
@@ -67,12 +43,15 @@ if(file.exists(assembly_stats)){
   df.summaryline <- cbind(df.summaryline, df.assembly_stats)
 }else(cat("\nNo assembly stats provided\n"))
 
+#---- Sourmash Species Summary ----#
+df.summaryline <- df.summaryline %>%
+  mutate(SPECIES_SUMMARY = gsub(readLines(sm_summary), pattern = ";$", replacement = ""))
+
 #----- Create Summaryline -----#
 # make ID always show up first
 column.names <- df.summaryline %>% 
   select(-ID) %>%
   colnames()
-df.summaryline <- df.summaryline %>%
-  select(ID, column.names) %>%
+df.summaryline <- df.summaryline[,c("ID", column.names)] %>%
   rename_with(toupper)
 write.csv(x=df.summaryline, file = "summaryline.csv", quote = F, row.names = F)
diff --git a/bin/vaper2 - Shortcut.lnk b/bin/vaper2 - Shortcut.lnk
diff --git a/conf/modules.config b/conf/modules.config
@@ -25,7 +25,7 @@ process {
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
     }
-    withName: 'PREPARE_REFS' {
+    withName: 'FORMAT_REFS' {
         ext.args            = ""
         ext.when            = {  }
         publishDir          = [
@@ -73,7 +73,16 @@ process {
             path: { "${params.outdir}/${meta.id}/qc/" }
         ]
     }
-    withName: 'KRAKEN2' {
+    withName: 'SOURMASH_SKETCH' {
+        ext.args            = "dna --param-string 'scaled=1000,k=21,abund'"
+        ext.when            = {  }
+        publishDir          = [
+            enabled: true,
+            mode: "${params.publish_dir_mode}",
+            path: { "${params.outdir}/${meta.id}/taxonomy/" }
+        ]
+    }
+    withName: 'SOURMASH_SEARCH' {
         ext.args            = ""
         ext.when            = {  }
         publishDir          = [
@@ -82,6 +91,15 @@ process {
             path: { "${params.outdir}/${meta.id}/taxonomy/" }
         ]
     }
+    withName: 'SOURMASH_GATHER' {
+        ext.args            = "--threshold-bp 500 -k 21"
+        ext.when            = {  }
+        publishDir          = [
+            enabled: true,
+            mode: "${params.publish_dir_mode}",
+            path: { "${params.outdir}/${meta.id}/taxonomy/" }
+        ]
+    }
     withName: 'MINIMAP2_ALIGN' {
         ext.args            = "-x asm20 --secondary=no"
         ext.when            = {  }

diff --git a/lib/WorkflowVaper.groovy b/lib/WorkflowVaper.groovy
@@ -15,11 +15,7 @@ class WorkflowVaper {
         genomeExistsError(params, log)
 
         if (!params.refs) {
-            Nextflow.error "No reference file was supplied. Please supply the path to a 'tar.gz' compressed directory containing reference fasta files using '--refs refs.tar.gz'. This file can be created using the command 'tar czvf refs.tar.gz refs' "
-        }
-
-        if (!params.k2db) {
-            Nextflow.error "No Kraken2 database was supplied. Please supply the path to a Kraken2 viral reference database. This file must be 'tar.gz compressed. Reference database can be found at 'https://benlangmead.github.io/aws-indexes/k2'"
+            Nextflow.error "No reference file was supplied "
         }
     }
 

diff --git a/modules.json b/modules.json
@@ -75,6 +75,16 @@
                         "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
                         "installed_by": ["modules"]
                     },
+                    "sourmash/gather": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": ["modules"]
+                    },
+                    "sourmash/sketch": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": ["modules"]
+                    },
                     "spades": {
                         "branch": "master",
                         "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",

diff --git a/modules/local/bwa_mem.nf b/modules/local/bwa_mem.nf
@@ -8,16 +8,14 @@ process BWA_MEM {
         'biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:219b6c272b25e7e642ae3ff0bf0c5c81a5135ab4-0' }"
 
     input:
-    tuple val(meta), val(ref), path(reads)
-    path  refs_tar
+    tuple val(meta), val(ref_id), path(ref), path(reads)
 
     output:
-    tuple val(meta), val(ref), path('*.bam'),           emit: bam
-    tuple val(meta), val(ref), path('*.coverage.txt'),  emit: coverage
-    tuple val(meta), val(ref), path('*.stats.txt'),     emit: stats
-    tuple val(meta), val(ref), path("*.read-list.txt"), emit: read_list
-
-    path "versions.yml", emit: versions
+    tuple val(meta), val(ref_id), path('*.bam'),           emit: bam
+    tuple val(meta), val(ref_id), path('*.coverage.txt'),  emit: coverage
+    tuple val(meta), val(ref_id), path('*.stats.txt'),     emit: stats
+    tuple val(meta), val(ref_id), path("*.read-list.txt"), emit: read_list
+    path "versions.yml",                                   emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -30,23 +28,18 @@ process BWA_MEM {
     # setup for pipe
     set -euxo pipefail
 
-    # extract references
-    mkdir refs
-    gzip -d ${refs_tar}
-    tar -xvhf *.tar -C refs
-
     # index the reference
-    bwa index refs/*/${ref}
+    bwa index ${ref}
 
     # run bwa mem, select only mapped reads, convert to .bam, and sort
-    bwa mem -t ${task.cpus} refs/*/${ref} ${reads[0]} ${reads[1]} | samtools view -b -F 4 - | samtools sort - > ${prefix}-${ref}.bam
+    bwa mem -t ${task.cpus} ${ref} ${reads[0]} ${reads[1]} | samtools view -b -F 4 - | samtools sort - > ${prefix}-${ref_id}.bam
 
     # gather read stats
-    samtools coverage ${prefix}-${ref}.bam > ${prefix}-${ref}.coverage.txt
-    samtools stats --threads ${task.cpus} ${prefix}-${ref}.bam > ${prefix}-${ref}.stats.txt
+    samtools coverage ${prefix}-${ref_id}.bam > ${prefix}-${ref_id}.coverage.txt
+    samtools stats --threads ${task.cpus} ${prefix}-${ref_id}.bam > ${prefix}-${ref_id}.stats.txt
 
     # get list of read headers for fastq extraction
-    samtools view ${prefix}-${ref}.bam | cut -f 1 | sort | uniq > ${prefix}-${ref}.read-list.txt
+    samtools view ${prefix}-${ref_id}.bam | cut -f 1 | sort | uniq > ${prefix}-${ref_id}.read-list.txt
     
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/modules/local/create-summaryline.nf b/modules/local/create-summaryline.nf
@@ -5,8 +5,7 @@ process SUMMARYLINE {
     container "docker.io/jdj0303/waphl-viral-base:1.0.0"
 
     input:
-    tuple val(meta), val(ref), path(fastp2tbl), path(k2_summary), path(samtoolstats2tbl), path(assembly_stats), path(vcf)
-    path refs_meta
+    tuple val(meta), val(ref_id), path(samtoolstats2tbl), path(assembly_stats), path(vcf), path(fastp2tbl), path(sm_summary)
 
     output:
     tuple val(meta), path("*.summaryline.csv"), emit: summaryline
@@ -18,16 +17,16 @@ process SUMMARYLINE {
     script: // This script is bundled with the pipeline, in nf-core/waphlviral/bin/
     """
     # determine number of SNVs
-    if [ -f ${vcf} ]
+    if [ -f "${vcf}" ]
     then
         snvs=\$(cat ${vcf} | grep -v "#" | wc -l)
     else
         snvs="NA"
     fi
 
     # create summaryline
-    summaryline.R "${fastp2tbl}" "${k2_summary}" "${samtoolstats2tbl}" "${assembly_stats}" "${prefix}" "${ref}" "${refs_meta}" \$(echo \$snvs)
+    summaryline.R "${fastp2tbl}" "${sm_summary}" "${samtoolstats2tbl}" "${assembly_stats}" "${prefix}" "${ref_id}"  "\$(echo \$snvs)"
     # rename using prefix and reference
-    mv summaryline.csv ${prefix}-${ref}.summaryline.csv
+    mv summaryline.csv "${prefix}-${ref_id}.summaryline.csv"
     """
 }
diff --git a/modules/local/prepare_refs.nf → modules/local/format_refs.nf b/modules/local/prepare_refs.nf → modules/local/format_refs.nf
@@ -1,5 +1,4 @@
-process PREPARE_REFS {
-    tag "$refs_tar"
+process FORMAT_REFS {
     label 'process_low'
 
     conda "bioconda::seqtk"
@@ -8,7 +7,7 @@ process PREPARE_REFS {
         'staphb/seqtk:1.3' }"
 
     input:
-    path refs_tar
+    path refs
 
     output:
     path "refs.fa",       emit: refs
@@ -21,13 +20,8 @@ process PREPARE_REFS {
     version = "1.0"
     script: // This script is bundled with the pipeline, in nf-core/waphlviral/bin/
     """
-    # extract references
-    mkdir refs
-    gzip -d ${refs_tar}
-    tar -xvhf *.tar -C refs
     # rename fasta headers and combine into multi-fasta file
-    refs=\$(ls -d refs/*/*)
-    for ref in \${refs}
+    for ref in \$(echo "${refs}")
     do
         seqtk seq -C \${ref} > tmp.fa
         seqtk rename tmp.fa \${ref##*/} >> refs.fa

diff --git a/modules/local/get_mapped_fastq.nf b/modules/local/get_mapped_fastq.nf
@@ -1,5 +1,5 @@
 process MAPPED_FASTQ {
-    tag "${meta.id}-${ref}"
+    tag "${meta.id}-${ref_id}"
     label 'process_single'
 
     conda "bioconda::seqtk=1.3"
@@ -8,7 +8,7 @@ process MAPPED_FASTQ {
         'biocontainers/seqtk:1.3--h5bf99c6_3' }"
 
     input:
-    tuple val(meta), val(ref), path(filter_list), path(reads)
+    tuple val(meta), val(ref_id), path(filter_list), path(reads)
 
     output:
     path "*.gz"         , emit: reads
@@ -28,15 +28,15 @@ process MAPPED_FASTQ {
         $args \\
         ${reads[0]} \\
         $filter_list | \\
-        gzip --no-name > ${prefix}-${ref}_R1.fastq.gz
+        gzip --no-name > ${prefix}-${ref_id}_R1.fastq.gz
 
     # reverse reads
     seqtk \\
         subseq \\
         $args \\
         ${reads[0]} \\
         $filter_list | \\
-        gzip --no-name > ${prefix}-${ref}_R2.fastq.gz
+        gzip --no-name > ${prefix}-${ref_id}_R2.fastq.gz
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":