From f3fa78969720daaace3b3ceb2e499e15cdaeccb4 Mon Sep 17 00:00:00 2001
From: marcoteix <146956995+marcoteix@users.noreply.github.com>
Date: Mon, 23 Oct 2023 10:23:35 -0400
Subject: [PATCH 1/2] Initial commit

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1092589
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+# strainge-wdl
+WDL workflow for strain-level detection with StrainGE

From 3976d677e098520c07326dbe7cbb73f39e098bd8 Mon Sep 17 00:00:00 2001
From: Marco Teixeira
Date: Mon, 23 Oct 2023 10:27:20 -0400
Subject: [PATCH 2/2] Let there be light

---
 LICENSE                    | 27 +++++++++++++++
 tasks/strainGE.wdl         | 71 ++++++++++++++++++++++++++++++++++++++
 workflows/strainGE_PE.json | 14 ++++++++
 workflows/strainGE_PE.wdl  | 41 ++++++++++++++++++++++
 4 files changed, 153 insertions(+)
 create mode 100644 LICENSE
 create mode 100644 tasks/strainGE.wdl
 create mode 100644 workflows/strainGE_PE.json
 create mode 100644 workflows/strainGE_PE.wdl

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..3b4a8a5
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2016-2023, Broad Institute, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name Broad Institute, Inc. nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/tasks/strainGE.wdl b/tasks/strainGE.wdl
new file mode 100644
index 0000000..ae7503e
--- /dev/null
+++ b/tasks/strainGE.wdl
@@ -0,0 +1,71 @@
+version 1.0
+
+task StrainGE_PE {
+    input {
+        String sample_name
+        File reads_1
+        File reads_2
+        Int kmer_size
+        File straingst_reference_db
+        File straingst_reference_similarities
+        File straingst_reference_fastas
+        String docker = "marcoteix/strainge:0.0.2"
+        Int disk_size = 100
+        Int cpus = 4
+        Int memory = 16
+    }
+    parameter_meta {
+        sample_name: "Sample ID."
+        reads_1: "Input file containing clean forward (R1) reads."
+        reads_2: "Input file containing clean reverse (R2) reads."
+        kmer_size: "K-mer size used to k-merize the input reads. Should match the value used in the construction of the reference database."
+        straingst_reference_db: "HDF5 file containing the StrainGST reference database."
+        straingst_reference_fastas: "Path to the directory containing all the FASTA files used to build the StrainGST database."
+        straingst_kmerized_reads: "HDF5 file containing the k-merized input reads."
+        straingst_reference_db_used: "HDF5 file containing the StrainGST reference database used."
+        straingst_strains: "TSV file with the strains detected by StrainGST."
+        straingst_statistics: "TSV file with StrainGST sample statistics."
+        straingr_concat_fasta: "Concatenated FASTA file of all representative sequences in the StrainGST reference database."
+        straingr_read_alignment: "BAM file with reads aligned to the closest reference."
+        straingr_variants: "HDF5 file with variants detected by StrainGR."
+        straingr_report: "Human readable TSV file with a summary of StrainGR results."
+        strainge_docker: "StrainGE docker image."
+        strainge_version: "StrainGE version."
+        straingst_reference_similarities: "TSV with similarities between the sequences in the StrainGST reference database."
+    }
+    command <<<
+        # Abort on the first failing step; pipefail keeps a bwa failure from being masked by samtools' exit status.
+        set -euo pipefail
+        /opt/conda/envs/strainge/bin/strainge --version > VERSION.txt
+        /opt/conda/envs/strainge/bin/straingst kmerize -k ~{kmer_size} -o ~{sample_name}_kmerized_reads.hdf5 ~{reads_1} ~{reads_2}
+        /opt/conda/envs/strainge/bin/straingst run -O -o ~{sample_name}_straingst_results ~{straingst_reference_db} ~{sample_name}_kmerized_reads.hdf5
+        /opt/conda/envs/strainge/bin/straingr prepare-ref -s ~{sample_name}_straingst_results.strains.tsv -p "~{straingst_reference_fastas}/{ref}" \
+            -S ~{straingst_reference_similarities} -o ~{sample_name}_refs_concat.fasta
+        /opt/conda/envs/strainge/bin/bwa index ~{sample_name}_refs_concat.fasta
+        /opt/conda/envs/strainge/bin/bwa mem -I 300 -t 2 ~{sample_name}_refs_concat.fasta ~{reads_1} ~{reads_2} | /opt/conda/envs/strainge/bin/samtools sort -@ 2 -O BAM -o ~{sample_name}_straingr_alignment.bam
+        /opt/conda/envs/strainge/bin/samtools index ~{sample_name}_straingr_alignment.bam
+        /opt/conda/envs/strainge/bin/straingr call ~{sample_name}_refs_concat.fasta ~{sample_name}_straingr_alignment.bam --hdf5-out \
+            ~{sample_name}_straingr_variants.hdf5 --summary ~{sample_name}_straingr.tsv --tracks all
+    >>>
+    output {
+        File straingst_kmerized_reads = "~{sample_name}_kmerized_reads.hdf5"
+        File straingst_reference_db_used = "~{straingst_reference_db}"
+        File straingst_strains = "~{sample_name}_straingst_results.strains.tsv"
+        File straingst_statistics = "~{sample_name}_straingst_results.stats.tsv"
+        File straingr_concat_fasta = "~{sample_name}_refs_concat.fasta"
+        File straingr_read_alignment = "~{sample_name}_straingr_alignment.bam"
+        File straingr_variants = "~{sample_name}_straingr_variants.hdf5"
+        File straingr_report = "~{sample_name}_straingr.tsv"
+        String strainge_docker = "~{docker}"
+        String strainge_version = read_string("VERSION.txt")
+    }
+    runtime {
+        docker: "~{docker}"
+        memory: "~{memory} GB"
+        cpu: cpus
+        disks: "local-disk " + disk_size + " SSD"
+        disk: disk_size + " GB"
+        maxRetries: 1
+        preemptible: 0
+    }
+}
\ No newline at end of file
diff --git a/workflows/strainGE_PE.json b/workflows/strainGE_PE.json
new file mode 100644
index 0000000..51a830c
--- /dev/null
+++ b/workflows/strainGE_PE.json
@@ -0,0 +1,14 @@
+{
+    "strainge_pe.sample_id": "dzd-uci-780",
+    "strainge_pe.StrainGE_PE.memory": 16,
+    "strainge_pe.clean_reads_1": "/home/mcarvalh/strainge/test_sequencing_data/dzd-uci-780_R1.fastq.gz",
+    "strainge_pe.clean_reads_2": "/home/mcarvalh/strainge/test_sequencing_data/dzd-uci-780_R2.fastq.gz",
+    "strainge_pe.StrainGE_PE.docker": "marcoteix/strainge:0.0.2",
+    "strainge_pe.straingst_ref_fastas": "/home/mcarvalh/strainge/debug/fasta",
+    "strainge_pe.StrainGE_PE.disk_size": 100,
+    "strainge_pe.db_kmer_size": 23,
+    "strainge_pe.StrainGE_PE.cpus": 4,
+    "strainge_pe.straingst_reference": "/home/mcarvalh/strainge/debug/debug_straingst_db.hdf5",
+    "strainge_pe.straingst_similarities": "/home/mcarvalh/strainge/debug/similarities.tsv"
+}
+
diff --git a/workflows/strainGE_PE.wdl b/workflows/strainGE_PE.wdl
new file mode 100644
index 0000000..cacc981
--- /dev/null
+++ b/workflows/strainGE_PE.wdl
@@ -0,0 +1,41 @@
+version 1.0
+
+import "../tasks/strainGE.wdl" as strainge
+
+workflow strainge_pe {
+
+    meta {
+        description: "Strain-level detection and variant calling with StrainGE for paired-end reads."
+    }
+    input {
+        String sample_id
+        File clean_reads_1
+        File clean_reads_2
+        Int db_kmer_size
+        File straingst_reference
+        File straingst_similarities
+        File straingst_ref_fastas
+    }
+    call strainge.StrainGE_PE {
+        input:
+            sample_name = sample_id,
+            reads_1 = clean_reads_1,
+            reads_2 = clean_reads_2,
+            kmer_size = db_kmer_size,
+            straingst_reference_db = straingst_reference,
+            straingst_reference_fastas = straingst_ref_fastas,
+            straingst_reference_similarities = straingst_similarities
+    }
+    output {
+        File straingst_kmerized_reads = StrainGE_PE.straingst_kmerized_reads
+        File straingst_reference_db_used = StrainGE_PE.straingst_reference_db_used
+        File straingst_strains = StrainGE_PE.straingst_strains
+        File straingst_statistics = StrainGE_PE.straingst_statistics
+        File straingr_concat_fasta = StrainGE_PE.straingr_concat_fasta
+        File straingr_read_alignment = StrainGE_PE.straingr_read_alignment
+        File straingr_variants = StrainGE_PE.straingr_variants
+        File straingr_report = StrainGE_PE.straingr_report
+        String strainge_docker = StrainGE_PE.strainge_docker
+        String strainge_version = StrainGE_PE.strainge_version
+    }
+}
\ No newline at end of file