Skip to content

Commit

Permalink
kraken2 --> sourmash; fast and accurate mode
Browse files Browse the repository at this point in the history
  • Loading branch information
DOH-JDJ0303 committed Feb 6, 2024
1 parent b9fb6ee commit 05aa846
Show file tree
Hide file tree
Showing 52 changed files with 1,906 additions and 74,021 deletions.
73,654 changes: 0 additions & 73,654 deletions assets/databases/NCBI_Assembly_stats_2023-11-29.txt

This file was deleted.

Binary file added assets/databases/genbank-2022.03-viral-k21.zip
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions bin/combine-summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ library(tidyr)

# get list of summary lines
files <- list.files("./", pattern = ".csv", full.names = T)
print(files)
# combine all lines
df <- do.call(bind_rows, lapply(files, FUN=read.csv))
# calculate depth of coverage, if any of the samples mapped
Expand Down
53 changes: 0 additions & 53 deletions bin/k2_summary.R

This file was deleted.

1 change: 1 addition & 0 deletions bin/refs_summary.R → bin/ref-select_accurate.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ refs_comp <- args[2]
sample <- args[3]
gen_frac <- args[4]


# load alignment file
paf <- read_tsv(paf_file, col_names = F) %>%
rename(CONTIG = 6,
Expand Down
21 changes: 21 additions & 0 deletions bin/sm_summary.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env Rscript

# check for required packages
list.of.packages <- c("readr", "dplyr","tidyr")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# load packages
library(readr)
library(dplyr)
library(tidyr)

# load args
args <- commandArgs(trailingOnly=T)
sm_output <- args[1]
sample <- args[2]

# load Sourmash gather output
df.sm <- read.csv(sm_output)
colnames(df.sm)
# write.csv(file = paste0(sample,".k2-summary.csv"), quote = F, row.names = F)
35 changes: 7 additions & 28 deletions bin/summaryline.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,45 +13,21 @@ library(tidyr)
# load args
args <- commandArgs(trailingOnly=T)
fastp2tbl <- args[1]
k2_summary <- args[2]
sm_summary <- args[2]
samtoolstats2tbl <- args[3]
assembly_stats <- args[4]
sample <- args[5]
ref <- args[6]
refs_meta <- args[7]
snvs <- args[8]
snvs <- args[7]


#----- Sample ID & Reference
df.summaryline <- data.frame(ID = sample, REFERENCE = ref)

#----- Add reference metadata -----#
if(file.exists(refs_meta)){
df.refs_meta <- read.csv(refs_meta)
df.summaryline <- merge(df.summaryline, df.refs_meta, by = "REFERENCE", all.x = T)
}

#----- Full Read Stats -----#
df.fastp2tbl <- read_csv(fastp2tbl)
df.summaryline <- cbind(df.summaryline, df.fastp2tbl)

#----- Kraken2 Summary -----#
# load summary
df.k2 <- read.csv(k2_summary)
# determine main taxa - i.e., those present at >= 1% relative abundance
main_taxa <- df.k2 %>%
filter(EST_PER_ABUND >= 1) %>%
.$TAXA %>%
paste(collapse = "; ")
# determine other taxa - i.e., those preset at < 1% relative abundance
other_taxa <- df.k2 %>%
filter(EST_PER_ABUND < 1) %>%
.$TAXA %>%
paste(collapse = "; ")
# combine into a table
df.k2_main_other <- data.frame(MAIN_TAXA = main_taxa, OTHER_TAXA = other_taxa)
df.summaryline <- cbind(df.summaryline, df.k2_main_other)

#----- Mapped Reads Stats -----#
if(file.exists(samtoolstats2tbl)){
df.samtoolstats2tbl <- read.csv(samtoolstats2tbl)
Expand All @@ -67,12 +43,15 @@ if(file.exists(assembly_stats)){
df.summaryline <- cbind(df.summaryline, df.assembly_stats)
}else(cat("\nNo assembly stats provided\n"))

#---- Sourmash Species Summary ----#
df.summaryline <- df.summaryline %>%
mutate(SPECIES_SUMMARY = gsub(readLines(sm_summary), pattern = ";$", replacement = ""))

#----- Create Summaryline -----#
# make ID always show up first
column.names <- df.summaryline %>%
select(-ID) %>%
colnames()
df.summaryline <- df.summaryline %>%
select(ID, column.names) %>%
df.summaryline <- df.summaryline[,c("ID", column.names)] %>%
rename_with(toupper)
write.csv(x=df.summaryline, file = "summaryline.csv", quote = F, row.names = F)
Binary file added bin/vaper2 - Shortcut.lnk
Binary file not shown.
22 changes: 20 additions & 2 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ process {
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}
withName: 'PREPARE_REFS' {
withName: 'FORMAT_REFS' {
ext.args = ""
ext.when = { }
publishDir = [
Expand Down Expand Up @@ -73,7 +73,16 @@ process {
path: { "${params.outdir}/${meta.id}/qc/" }
]
}
withName: 'KRAKEN2' {
withName: 'SOURMASH_SKETCH' {
ext.args = "dna --param-string 'scaled=1000,k=21,abund'"
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/${meta.id}/taxonomy/" }
]
}
withName: 'SOURMASH_SEARCH' {
ext.args = ""
ext.when = { }
publishDir = [
Expand All @@ -82,6 +91,15 @@ process {
path: { "${params.outdir}/${meta.id}/taxonomy/" }
]
}
withName: 'SOURMASH_GATHER' {
ext.args = "--threshold-bp 500 -k 21"
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/${meta.id}/taxonomy/" }
]
}
withName: 'MINIMAP2_ALIGN' {
ext.args = "-x asm20 --secondary=no"
ext.when = { }
Expand Down
6 changes: 1 addition & 5 deletions lib/WorkflowVaper.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,7 @@ class WorkflowVaper {
genomeExistsError(params, log)

if (!params.refs) {
Nextflow.error "No reference file was supplied. Please supply the path to a 'tar.gz' compressed directory containing reference fasta files using '--refs refs.tar.gz'. This file can be created using the command 'tar czvf refs.tar.gz refs' "
}

if (!params.k2db) {
Nextflow.error "No Kraken2 database was supplied. Please supply the path to a Kraken2 viral reference database. This file must be 'tar.gz compressed. Reference database can be found at 'https://benlangmead.github.io/aws-indexes/k2'"
Nextflow.error "No reference file was supplied "
}
}

Expand Down
10 changes: 10 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,16 @@
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": ["modules"]
},
"sourmash/gather": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": ["modules"]
},
"sourmash/sketch": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": ["modules"]
},
"spades": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
Expand Down
29 changes: 11 additions & 18 deletions modules/local/bwa_mem.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,14 @@ process BWA_MEM {
'biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:219b6c272b25e7e642ae3ff0bf0c5c81a5135ab4-0' }"

input:
tuple val(meta), val(ref), path(reads)
path refs_tar
tuple val(meta), val(ref_id), path(ref), path(reads)

output:
tuple val(meta), val(ref), path('*.bam'), emit: bam
tuple val(meta), val(ref), path('*.coverage.txt'), emit: coverage
tuple val(meta), val(ref), path('*.stats.txt'), emit: stats
tuple val(meta), val(ref), path("*.read-list.txt"), emit: read_list

path "versions.yml", emit: versions
tuple val(meta), val(ref_id), path('*.bam'), emit: bam
tuple val(meta), val(ref_id), path('*.coverage.txt'), emit: coverage
tuple val(meta), val(ref_id), path('*.stats.txt'), emit: stats
tuple val(meta), val(ref_id), path("*.read-list.txt"), emit: read_list
path "versions.yml", emit: versions

when:
task.ext.when == null || task.ext.when
Expand All @@ -30,23 +28,18 @@ process BWA_MEM {
# setup for pipe
set -euxo pipefail
# extract references
mkdir refs
gzip -d ${refs_tar}
tar -xvhf *.tar -C refs
# index the reference
bwa index refs/*/${ref}
bwa index ${ref}
# run bwa mem, select only mapped reads, convert to .bam, and sort
bwa mem -t ${task.cpus} refs/*/${ref} ${reads[0]} ${reads[1]} | samtools view -b -F 4 - | samtools sort - > ${prefix}-${ref}.bam
bwa mem -t ${task.cpus} ${ref} ${reads[0]} ${reads[1]} | samtools view -b -F 4 - | samtools sort - > ${prefix}-${ref_id}.bam
# gather read stats
samtools coverage ${prefix}-${ref}.bam > ${prefix}-${ref}.coverage.txt
samtools stats --threads ${task.cpus} ${prefix}-${ref}.bam > ${prefix}-${ref}.stats.txt
samtools coverage ${prefix}-${ref_id}.bam > ${prefix}-${ref_id}.coverage.txt
samtools stats --threads ${task.cpus} ${prefix}-${ref_id}.bam > ${prefix}-${ref_id}.stats.txt
# get list of read headers for fastq extraction
samtools view ${prefix}-${ref}.bam | cut -f 1 | sort | uniq > ${prefix}-${ref}.read-list.txt
samtools view ${prefix}-${ref_id}.bam | cut -f 1 | sort | uniq > ${prefix}-${ref_id}.read-list.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
9 changes: 4 additions & 5 deletions modules/local/create-summaryline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ process SUMMARYLINE {
container "docker.io/jdj0303/waphl-viral-base:1.0.0"

input:
tuple val(meta), val(ref), path(fastp2tbl), path(k2_summary), path(samtoolstats2tbl), path(assembly_stats), path(vcf)
path refs_meta
tuple val(meta), val(ref_id), path(samtoolstats2tbl), path(assembly_stats), path(vcf), path(fastp2tbl), path(sm_summary)

output:
tuple val(meta), path("*.summaryline.csv"), emit: summaryline
Expand All @@ -18,16 +17,16 @@ process SUMMARYLINE {
script: // This script is bundled with the pipeline, in nf-core/waphlviral/bin/
"""
# determine number of SNVs
if [ -f ${vcf} ]
if [ -f "${vcf}" ]
then
snvs=\$(cat ${vcf} | grep -v "#" | wc -l)
else
snvs="NA"
fi
# create summaryline
summaryline.R "${fastp2tbl}" "${k2_summary}" "${samtoolstats2tbl}" "${assembly_stats}" "${prefix}" "${ref}" "${refs_meta}" \$(echo \$snvs)
summaryline.R "${fastp2tbl}" "${sm_summary}" "${samtoolstats2tbl}" "${assembly_stats}" "${prefix}" "${ref_id}" "\$(echo \$snvs)"
# rename using prefix and reference
mv summaryline.csv ${prefix}-${ref}.summaryline.csv
mv summaryline.csv "${prefix}-${ref_id}.summaryline.csv"
"""
}
12 changes: 3 additions & 9 deletions modules/local/prepare_refs.nf → modules/local/format_refs.nf
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
process PREPARE_REFS {
tag "$refs_tar"
process FORMAT_REFS {
label 'process_low'

conda "bioconda::seqtk"
Expand All @@ -8,7 +7,7 @@ process PREPARE_REFS {
'staphb/seqtk:1.3' }"

input:
path refs_tar
path refs

output:
path "refs.fa", emit: refs
Expand All @@ -21,13 +20,8 @@ process PREPARE_REFS {
version = "1.0"
script: // This script is bundled with the pipeline, in nf-core/waphlviral/bin/
"""
# extract references
mkdir refs
gzip -d ${refs_tar}
tar -xvhf *.tar -C refs
# rename fasta headers and combine into multi-fasta file
refs=\$(ls -d refs/*/*)
for ref in \${refs}
for ref in \$(echo "${refs}")
do
seqtk seq -C \${ref} > tmp.fa
seqtk rename tmp.fa \${ref##*/} >> refs.fa
Expand Down
8 changes: 4 additions & 4 deletions modules/local/get_mapped_fastq.nf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
process MAPPED_FASTQ {
tag "${meta.id}-${ref}"
tag "${meta.id}-${ref_id}"
label 'process_single'

conda "bioconda::seqtk=1.3"
Expand All @@ -8,7 +8,7 @@ process MAPPED_FASTQ {
'biocontainers/seqtk:1.3--h5bf99c6_3' }"

input:
tuple val(meta), val(ref), path(filter_list), path(reads)
tuple val(meta), val(ref_id), path(filter_list), path(reads)

output:
path "*.gz" , emit: reads
Expand All @@ -28,15 +28,15 @@ process MAPPED_FASTQ {
$args \\
${reads[0]} \\
$filter_list | \\
gzip --no-name > ${prefix}-${ref}_R1.fastq.gz
gzip --no-name > ${prefix}-${ref_id}_R1.fastq.gz
# reverse reads
seqtk \\
subseq \\
$args \\
${reads[0]} \\
$filter_list | \\
gzip --no-name > ${prefix}-${ref}_R2.fastq.gz
gzip --no-name > ${prefix}-${ref_id}_R2.fastq.gz
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
Loading

0 comments on commit 05aa846

Please sign in to comment.