From 38b1caac7e8ea54e21233ad3a0ae4fcb5cad7487 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Fri, 4 Oct 2024 16:39:38 +0000 Subject: [PATCH 01/58] placeholder --- tasks/assembly/task_flye.wdl | 5 +++++ workflows/utilities/wf_flye_consensus.wdl | 12 ++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 tasks/assembly/task_flye.wdl create mode 100644 workflows/utilities/wf_flye_consensus.wdl diff --git a/tasks/assembly/task_flye.wdl b/tasks/assembly/task_flye.wdl new file mode 100644 index 000000000..aa400333e --- /dev/null +++ b/tasks/assembly/task_flye.wdl @@ -0,0 +1,5 @@ +version 1.0 + +task flye { + # placeholder +} \ No newline at end of file diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_consensus.wdl new file mode 100644 index 000000000..eb2992139 --- /dev/null +++ b/workflows/utilities/wf_flye_consensus.wdl @@ -0,0 +1,12 @@ +version 1.0 + +workflow flye_consensus { + meta { + description: "This workflow runs flye to generate a consensus genome assembly from long reads." + } + input { + File read1 + Int genome_size + Int threads + } +} \ No newline at end of file From cdf09133db6b5ffa819eebcb4d1e8c2e67bec93d Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 9 Oct 2024 18:30:12 +0000 Subject: [PATCH 02/58] make flye task --- tasks/assembly/task_flye.wdl | 82 ++++++++++++++++++++++- workflows/utilities/wf_flye_consensus.wdl | 8 +-- 2 files changed, 82 insertions(+), 8 deletions(-) diff --git a/tasks/assembly/task_flye.wdl b/tasks/assembly/task_flye.wdl index aa400333e..cd34e0989 100644 --- a/tasks/assembly/task_flye.wdl +++ b/tasks/assembly/task_flye.wdl @@ -1,5 +1,83 @@ version 1.0 -task flye { - # placeholder +task flye_consensus { + input { + File read1 + String samplename + + # data type options; by default, uses --nano-raw + Boolean ont_corrected = false + Boolean ont_high_quality = false + Boolean pacbio_raw = false + Boolean pacbio_corrected = false + Boolean pacbio_hifi = false + + Int? genome_length # requires `asm_coverage` + Int? asm_coverage # reduced coverage for initial disjointig assembly + + Int polishing_iterations = 1 + Int minimum_overlap = 1 + + Float? read_error_rate + Boolean uneven_coverage_mode = false + Boolean keep_haplotypes = false + Boolean no_alt_contigs = false + Boolean scaffold = false + + String? additonal_parameters + + Int cpu = 4 + Int disk_size = 100 + String docker + Int memory = 32 + } + command <<< + flye --version | tee VERSION + + # determine read type + if ~{ont_corrected}; then + READ_TYPE="--nano-corr" + elif ~{ont_high_quality}; then + READ_TYPE="--nano-hq" + elif ~{pacbio_raw}; then + READ_TYPE="--pacbio-raw" + elif ~{pacbio_corrected}; then + READ_TYPE="--pacbio-corr" + elif ~{pacbio_hifi}; then + READ_TYPE="--pacbio-hifi" + else + READ_TYPE="--nano-raw" + fi + + # genome size parameter requires asm_coverage + flye \ + ${READ_TYPE} ~{read1} \ + --iterations ~{polishing_iterations} \ + --min-overlap ~{minimum_overlap} \ + ~{if defined(asm_coverage) then "--genome-size " + genome_length else ""} \ + ~{"--asm-coverage " + asm_coverage} \ + ~{"--read-error " + read_error_rate} \ + ~{true="--meta" false="" uneven_coverage_mode} \ + ~{true="--keep-haplotypes" false="" keep_haplotypes} \ + ~{true="--no-alt-contigs" false="" no_alt_contigs} \ + ~{true="--scaffold" false="" scaffold} \ + ~{"--extra-params " + additonal_parameters} \ + --threads ~{cpu} \ + --out-dir . + + >>> + output { + File assembly = "~{samplename}.assembly.fasta" + String flye_version = read_string("VERSION") + String flye_docker = "~{docker}" + } + runtime { + docker: "~{docker}" + cpu: cpu + memory: "~{memory} GB" + disks: "local-disk " + disk_size + " HDD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } } \ No newline at end of file diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_consensus.wdl index eb2992139..781b83593 100644 --- a/workflows/utilities/wf_flye_consensus.wdl +++ b/workflows/utilities/wf_flye_consensus.wdl @@ -2,11 +2,7 @@ version 1.0 workflow flye_consensus { meta { - description: "This workflow runs flye to generate a consensus genome assembly from long reads." - } - input { - File read1 - Int genome_size - Int threads + description: "This workflow runs either flye, miniasm, or raven to generate a consensus genome assembly from long reads." } + # placeholder } \ No newline at end of file From 46629d81e97bf9abc48dcbcdc376ce46f660327f Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Thu, 10 Oct 2024 15:17:13 +0000 Subject: [PATCH 03/58] rename fasta --- tasks/assembly/task_flye.wdl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tasks/assembly/task_flye.wdl b/tasks/assembly/task_flye.wdl index cd34e0989..a63109dd3 100644 --- a/tasks/assembly/task_flye.wdl +++ b/tasks/assembly/task_flye.wdl @@ -1,6 +1,6 @@ version 1.0 -task flye_consensus { +task flye { input { File read1 String samplename @@ -16,7 +16,7 @@ task flye_consensus { Int? asm_coverage # reduced coverage for initial disjointig assembly Int polishing_iterations = 1 - Int minimum_overlap = 1 + Int? minimum_overlap Float? read_error_rate Boolean uneven_coverage_mode = false @@ -28,7 +28,7 @@ task flye_consensus { Int cpu = 4 Int disk_size = 100 - String docker + String docker = "us-docker.pkg.dev/general-theiagen/staphb/flye:2.9.4" Int memory = 32 } command <<< @@ -53,7 +53,7 @@ task flye_consensus { flye \ ${READ_TYPE} ~{read1} \ --iterations ~{polishing_iterations} \ - --min-overlap ~{minimum_overlap} \ + ~{"--min-overlap" + minimum_overlap} \ ~{if defined(asm_coverage) then "--genome-size " + genome_length else ""} \ ~{"--asm-coverage " + asm_coverage} \ ~{"--read-error " + read_error_rate} \ @@ -65,6 +65,8 @@ task flye_consensus { --threads ~{cpu} \ --out-dir . + mv assembly.fasta ~{samplename}.assembly.fasta + >>> output { File assembly = "~{samplename}.assembly.fasta" From b45b41dae782054c15f8059f7d8c6dea397f0271 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Fri, 11 Oct 2024 17:35:35 +0000 Subject: [PATCH 04/58] make workflow a workflow --- workflows/utilities/wf_flye_consensus.wdl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_consensus.wdl index 781b83593..16dd9ed20 100644 --- a/workflows/utilities/wf_flye_consensus.wdl +++ b/workflows/utilities/wf_flye_consensus.wdl @@ -1,8 +1,19 @@ version 1.0 +import "../../tasks/assembly/task_flye.wdl" as flye_task + workflow flye_consensus { meta { description: "This workflow runs either flye, miniasm, or raven to generate a consensus genome assembly from long reads." } + input { + File read1 + String samplename + } + call flye_task.flye { + input: + read1 = read1, + samplename = samplename + } # placeholder } \ No newline at end of file From 62b48e2710a1b6fc2cadfa513291319e8d334899 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Tue, 19 Nov 2024 14:03:10 -0600 Subject: [PATCH 05/58] update output files for flye --- tasks/assembly/task_flye.wdl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tasks/assembly/task_flye.wdl b/tasks/assembly/task_flye.wdl index a63109dd3..276802127 100644 --- a/tasks/assembly/task_flye.wdl +++ b/tasks/assembly/task_flye.wdl @@ -66,10 +66,14 @@ task flye { --out-dir . mv assembly.fasta ~{samplename}.assembly.fasta + mv assembly_info.txt ~{samplename}.assembly_info.txt + mv assembly_graph.gfa ~{samplename}.assembly_graph.gfa >>> output { File assembly = "~{samplename}.assembly.fasta" + File assembly_graph = "~{samplename}.assembly_graph.gfa" + File assembly_info = "~{samplename}.assembly_info.txt" String flye_version = read_string("VERSION") String flye_docker = "~{docker}" } From 787121c5c5892292c7eba4136363fc1ae1041666 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Tue, 19 Nov 2024 14:27:56 -0600 Subject: [PATCH 06/58] v1 bandage plot flye assembly visual --- tasks/assembly/task_bandage.wdl | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 tasks/assembly/task_bandage.wdl diff --git a/tasks/assembly/task_bandage.wdl b/tasks/assembly/task_bandage.wdl new file mode 100644 index 000000000..ea6ef10af --- /dev/null +++ b/tasks/assembly/task_bandage.wdl @@ -0,0 +1,29 @@ +version 1.0 + +task bandage_plot { + input { + File assembly_graph_gfa + String samplename + Int cpu = 2 + Int memory = 4 + Int disk_size = 10 + String docker = "staphb/bandage:0.8.1" + } + command <<< + bandage --version | tee VERSION + Bandage image ~{assembly_graph_gfa} ~{samplename}_bandage_plot.png + >>> + output { + File plot = "~{output_prefix}.png" + String version = read_string("VERSION") + } + runtime { + docker: "~{docker}" + cpu: cpu + memory: "~{memory} GB" + disks: "local-disk " + disk_size + " HDD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } +} From e9f29b4ad4bb14736e0b3f4bcac0b7bed041146a Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Tue, 19 Nov 2024 18:06:46 -0600 Subject: [PATCH 07/58] medaka initial commit --- tasks/assembly/task_medaka.wdl | 40 ++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 tasks/assembly/task_medaka.wdl diff --git a/tasks/assembly/task_medaka.wdl b/tasks/assembly/task_medaka.wdl new file mode 100644 index 000000000..9c2c4f367 --- /dev/null +++ b/tasks/assembly/task_medaka.wdl @@ -0,0 +1,40 @@ +version 1.0 + +task medaka_polish { + input { + File assembly_fasta + String samplename + File read1 + String medaka_model + Int cpu = 4 + Int memory = 16 + Int disk_size = 100 + String docker = "staphb/medaka:2.0.1" + } + command <<< + medaka --version | tee VERSION + medaka_consensus \ + -i ~{read1} \ + -d ~{assembly_fasta} \ + -m ~{medaka_model} \ + -o medaka_out \ + --threads ~{cpu} + # Rename output files with sample name + mv medaka_out/consensus.fasta ~{samplename}.polished.fasta + mv medaka_out/consensus.vcf.gz ~{samplename}.polished.vcf.gz + >>> + output { + File polished_fasta = "~{samplename}.polished.fasta" + File polished_vcf = "~{samplename}.polished.vcf.gz" + String medaka_version = read_string("VERSION") + } + runtime { + docker: "~{docker}" + cpu: cpu + memory: "~{memory} GB" + disks: "local-disk " + disk_size + " HDD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } +} From df7f79fac21ead5bcd482663700d98775b51053e Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Wed, 20 Nov 2024 12:09:52 -0600 Subject: [PATCH 08/58] initial commit racon framework --- tasks/assembly/racon_polish.wdl | 55 +++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 tasks/assembly/racon_polish.wdl diff --git a/tasks/assembly/racon_polish.wdl b/tasks/assembly/racon_polish.wdl new file mode 100644 index 000000000..147b71a55 --- /dev/null +++ b/tasks/assembly/racon_polish.wdl @@ -0,0 +1,55 @@ +version 1.0 + +task racon_polish { + input { + File assembly_fasta + File read1 + Int racon_rounds = 1 + Int cpu = 4 + Int memory = 16 + Int disk_size = 100 + String samplename + String minimap2_docker = "staphb/minimap2:2.24" + String racon_docker = "staphb/racon:1.4.20" + } + + command <<< + # Initialize variables + cp ~{assembly_fasta} ~{samplename}.unpolished.fasta + POLISHED_FASTA=~{samplename}.unpolished.fasta + + # Run Racon for the specified number of rounds + for i in $(seq 1 ~{racon_rounds}); do + echo "Polishing with Racon - Round $i of ~{racon_rounds}" + mkdir -p racon_round_${i} + + # Step 1: Generate PAF alignments with Minimap2 + minimap2 -t ~{cpu} -x map-ont $POLISHED_FASTA ~{read1} \ + > racon_round_${i}/alignments.paf + + # Step 2: Run Racon using the generated PAF + racon -t ~{cpu} ~{read1} racon_round_${i}/alignments.paf $POLISHED_FASTA \ + > racon_round_${i}/consensus.fasta + + # Update the polished FASTA for the next round + POLISHED_FASTA=racon_round_${i}/consensus.fasta + done + + # Copy final polished FASTA to the home directory with sample name + cp $POLISHED_FASTA ~{samplename}.polished.fasta + >>> + + output { + File raconpolished_fasta = "~{samplename}.polished.fasta" + } + + runtime { + docker: "~{racon_docker}" + cpu: cpu + memory: "~{memory} GB" + disks: "local-disk " + disk_size + " HDD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } +} \ No newline at end of file From 5a3fcd6f68fd0242cbcd0647d795fba1dbe6183a Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Thu, 21 Nov 2024 14:22:05 -0600 Subject: [PATCH 09/58] framework for tasks dnaapler porechop and racon --- tasks/assembly/task_dnaapler.wdl | 38 ++++++++++++++++++ tasks/assembly/task_porechop.wdl | 39 +++++++++++++++++++ ...racon_polish.wdl => task_racon_polish.wdl} | 0 3 files changed, 77 insertions(+) create mode 100644 tasks/assembly/task_dnaapler.wdl create mode 100644 tasks/assembly/task_porechop.wdl rename tasks/assembly/{racon_polish.wdl => task_racon_polish.wdl} (100%) diff --git a/tasks/assembly/task_dnaapler.wdl b/tasks/assembly/task_dnaapler.wdl new file mode 100644 index 000000000..14da4d5af --- /dev/null +++ b/tasks/assembly/task_dnaapler.wdl @@ -0,0 +1,38 @@ +version 1.0 + +task dnaapler_all { + input { + File input_fasta # Input genome assembly in FASTA format + String output_dir # Directory to store output + String prefix + Int threads = 4 + Int disk_size = 100 + Int memory = 16 + String docker # making Dockerfile for new dnaapler version + } + + command <<< + dnaapler --version | tee VERSION + + # Run DnaApler with the 'all' subcommand + dnaapler all \ + -i ~{input_fasta} \ + -o ~{output_dir} \ + -p ~{prefix} \ + -t ~{threads} + >>> + + output { + File reoriented_fasta = "~{output_dir}/~{prefix}_reoriented.fasta" # Reoriented genome assembly + String dnaapler_version = read_string("VERSION") # DnaApler version + } + runtime { + docker: "~{docker}" + cpu: threads + memory: "~{memory} GB" + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } + diff --git a/tasks/assembly/task_porechop.wdl b/tasks/assembly/task_porechop.wdl new file mode 100644 index 000000000..23e59f387 --- /dev/null +++ b/tasks/assembly/task_porechop.wdl @@ -0,0 +1,39 @@ +version 1.0 + +workflow porechopabi_workflow { + input { + Array[File] raw_reads # Input raw reads (FASTQ format) + String samplename + Int cpu = 4 + Int memory = 8 + Int disk_size = 50 + String docker #need to create dockerfile + } + + command <<< + porechop_abi --version | tee VERSION + + # Combine input reads into a single FASTQ file (if more than one file) + cat ~{sep=" " raw_reads} > combined_raw_reads.fastq + + # Run Porechop + porechop_abi \ + -input combined_raw_reads.fastq \ + -output ~{samplename}.trimmed.fastq \ + --threads ~{cpu} + >>> + + output { + File trimmed_reads = "~{samplename}.trimmed.fastq" # Cleaned reads + String porechopabi_version = read_string("VERSION") # Porechop version + } + runtime { + docker: "~{docker}" + cpu: cpu + memory: "~{memory} GB" + disks: "local-disk " + disk_size + " HDD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } +} diff --git a/tasks/assembly/racon_polish.wdl b/tasks/assembly/task_racon_polish.wdl similarity index 100% rename from tasks/assembly/racon_polish.wdl rename to tasks/assembly/task_racon_polish.wdl From d6fe1bc426fc3d6c2b691e7c8b05384c9cee32df Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Fri, 22 Nov 2024 15:59:56 -0600 Subject: [PATCH 10/58] update docker images --- tasks/assembly/task_bandage.wdl | 2 +- tasks/assembly/task_dnaapler.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/assembly/task_bandage.wdl b/tasks/assembly/task_bandage.wdl index ea6ef10af..b7305f7c4 100644 --- a/tasks/assembly/task_bandage.wdl +++ b/tasks/assembly/task_bandage.wdl @@ -7,7 +7,7 @@ task bandage_plot { Int cpu = 2 Int memory = 4 Int disk_size = 10 - String docker = "staphb/bandage:0.8.1" + String docker = "us-docker.pkg.dev/general-theiagen/staphb/bandage:0.8.1" } command <<< bandage --version | tee VERSION diff --git a/tasks/assembly/task_dnaapler.wdl b/tasks/assembly/task_dnaapler.wdl index 14da4d5af..bf6ce5b9c 100644 --- a/tasks/assembly/task_dnaapler.wdl +++ b/tasks/assembly/task_dnaapler.wdl @@ -8,7 +8,7 @@ task dnaapler_all { Int threads = 4 Int disk_size = 100 Int memory = 16 - String docker # making Dockerfile for new dnaapler version + String docker = "us-docker.pkg.dev/general-theiagen/staphb/dnaapler:0.8.0" } command <<< From 89f7d3b105038626c89024e0f53523b47319901c Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Fri, 22 Nov 2024 16:01:53 -0600 Subject: [PATCH 11/58] update outdir medaka --- tasks/assembly/task_medaka.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tasks/assembly/task_medaka.wdl b/tasks/assembly/task_medaka.wdl index 9c2c4f367..760d2d15f 100644 --- a/tasks/assembly/task_medaka.wdl +++ b/tasks/assembly/task_medaka.wdl @@ -17,11 +17,11 @@ task medaka_polish { -i ~{read1} \ -d ~{assembly_fasta} \ -m ~{medaka_model} \ - -o medaka_out \ + -o . \ --threads ~{cpu} # Rename output files with sample name - mv medaka_out/consensus.fasta ~{samplename}.polished.fasta - mv medaka_out/consensus.vcf.gz ~{samplename}.polished.vcf.gz + mv consensus.fasta ~{samplename}.polished.fasta + mv consensus.vcf.gz ~{samplename}.polished.vcf.gz >>> output { File polished_fasta = "~{samplename}.polished.fasta" From 35982db7e833a3341b5f4d7e6a7a9473cfce0d23 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 25 Nov 2024 06:11:33 -0600 Subject: [PATCH 12/58] update medaka and dnaapler --- tasks/assembly/task_dnaapler.wdl | 10 ++++---- tasks/assembly/task_medaka.wdl | 40 ++++++++++++++++++++++---------- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/tasks/assembly/task_dnaapler.wdl b/tasks/assembly/task_dnaapler.wdl index bf6ce5b9c..a2a6c1e21 100644 --- a/tasks/assembly/task_dnaapler.wdl +++ b/tasks/assembly/task_dnaapler.wdl @@ -2,29 +2,27 @@ version 1.0 task dnaapler_all { input { - File input_fasta # Input genome assembly in FASTA format - String output_dir # Directory to store output + File input_fasta String prefix Int threads = 4 Int disk_size = 100 Int memory = 16 String docker = "us-docker.pkg.dev/general-theiagen/staphb/dnaapler:0.8.0" } - command <<< dnaapler --version | tee VERSION # Run DnaApler with the 'all' subcommand dnaapler all \ -i ~{input_fasta} \ - -o ~{output_dir} \ + -o . \ -p ~{prefix} \ -t ~{threads} >>> output { - File reoriented_fasta = "~{output_dir}/~{prefix}_reoriented.fasta" # Reoriented genome assembly - String dnaapler_version = read_string("VERSION") # DnaApler version + File reoriented_fasta = "~{prefix}_reoriented.fasta" + String dnaapler_version = read_string("VERSION") } runtime { docker: "~{docker}" diff --git a/tasks/assembly/task_medaka.wdl b/tasks/assembly/task_medaka.wdl index 760d2d15f..a6df0dd93 100644 --- a/tasks/assembly/task_medaka.wdl +++ b/tasks/assembly/task_medaka.wdl @@ -6,26 +6,42 @@ task medaka_polish { String samplename File read1 String medaka_model + Int polish_rounds = 1 Int cpu = 4 Int memory = 16 Int disk_size = 100 String docker = "staphb/medaka:2.0.1" } command <<< - medaka --version | tee VERSION - medaka_consensus \ - -i ~{read1} \ - -d ~{assembly_fasta} \ - -m ~{medaka_model} \ - -o . \ - --threads ~{cpu} - # Rename output files with sample name - mv consensus.fasta ~{samplename}.polished.fasta - mv consensus.vcf.gz ~{samplename}.polished.vcf.gz + medaka --version | tee VERSION + + # Initialize the input for polishing with the provided assembly FASTA + cp ~{assembly_fasta} polished_input.fasta + + # Perform Medaka polishing for the specified number of rounds + for i in $(seq 1 ~{polish_rounds}); do + echo "Starting Medaka polishing round $i" + polish_dir="polish_round_$i" + mkdir -p "$polish_dir" + + medaka_consensus \ + -i ~{read1} \ + -d polished_input.fasta \ + -o "$polish_dir" \ + -m ~{medaka_model} \ + --threads ~{cpu} + + # Update the input for the next round + cp "$polish_dir/consensus.fasta" polished_input.fasta + done + + # Rename final polished outputs with sample name + mv polished_input.fasta ~{samplename}.polished.fasta + mv polish_round_~{polish_rounds}/consensus.vcf.gz ~{samplename}.polished.vcf.gz >>> output { - File polished_fasta = "~{samplename}.polished.fasta" - File polished_vcf = "~{samplename}.polished.vcf.gz" + File medaka_fasta = "~{samplename}.polished.fasta" + File medaka_vcf = "~{samplename}.polished.vcf.gz" String medaka_version = read_string("VERSION") } runtime { From d738ab1f100285b48454b82d45ea838dcdba7eeb Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 25 Nov 2024 19:25:24 +0000 Subject: [PATCH 13/58] add polypolish and separate bwa mem -a tasks --- tasks/alignment/task_bwa.wdl | 38 +++++++++++++++++++- tasks/assembly/task_polypolish.wdl | 58 ++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 tasks/assembly/task_polypolish.wdl diff --git a/tasks/alignment/task_bwa.wdl b/tasks/alignment/task_bwa.wdl index 9aabb6cd7..b6929e91d 100644 --- a/tasks/alignment/task_bwa.wdl +++ b/tasks/alignment/task_bwa.wdl @@ -153,4 +153,40 @@ task bwa { preemptible: 0 maxRetries: 3 } -} \ No newline at end of file +} + +task bwa_all { + input { + File draft_assembly_fasta + File read1 + File read2 + String samplename + + Int cpu = 6 + Int disk_size = 100 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/bwa:0.7.18" + Int memory = 16 + } + command <<< + bwa &> BWA_HELP + grep "Version" BWA_HELP | cut -d" " -f2 > BWA_VERSION + + bwa mem -t ~{cpu} -a ~{draft_assembly_fasta} ~{read1} > ~{samplename}_R1.sam + bwa mem -t ~{cpu} -a ~{draft_assembly_fasta} ~{read2} > ~{samplename}_R2.sam + + >>> + output { + File read1_sam = "~{samplename}_R1.sam" + File read2_sam = "~{samplename}_R2.sam" + String bwa_version = read_string("BWA_VERSION") + } + runtime { + docker: "~{docker}" + memory: "~{memory} GB" + cpu: "~{cpu}" + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } +} diff --git a/tasks/assembly/task_polypolish.wdl b/tasks/assembly/task_polypolish.wdl new file mode 100644 index 000000000..4676415bc --- /dev/null +++ b/tasks/assembly/task_polypolish.wdl @@ -0,0 +1,58 @@ +version 1.0 + +task polypolish { + input { + String samplename + File assembly_fasta + File read1_sam # these files need to be aligned to the draft assembly with `-a` flag + File read2_sam # see also the task_bwa.wdl#bwa_all task + + String? pair_orientation # default: auto + Float? low_percentile_threshold # default: 0.1 + Float? high_percentile_threshold # default: 99.9 + + Float? fraction_invalid # default: 0.2 + Float? fraction_valid # default 0.5 + Int? maximum_errors # default: 10 + Int? minimum_depth # default: 5 + Boolean careful = false # ignore any reads with multiple alignments + + Int cpu = 1 # polypolish is single-threaded + Int disk_size = 100 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/polypolish:0.6.0" + Int memory = 4 + } + command <<< + polypolish --version | tee VERSION + + polypolish filter \ + --in1 ~{read1_sam} \ + --in2 ~{read2_sam} \ + --out1 ~{samplename}_filtered1.sam \ + --out2 ~{samplename}_filtered2.sam \ + ~{"--orientation " + pair_orientation} \ + ~{"--low " + low_percentile_threshold} \ + ~{"--high " + high_percentile_threshold} + + polypolish polish ~{assembly_fasta} ~{samplename}_filtered1.sam ~{samplename}_filtered2.sam \ + ~{"--fraction_invalid " + fraction_invalid} \ + ~{"--fraction_valid " + fraction_valid} \ + ~{"--max_errors " + maximum_errors} \ + ~{"--min_depth " + minimum_depth} \ + ~{true="--careful" false="" careful} \ + > ~{samplename}_polished.fasta + >>> + output { + String polypolish_version = read_string("VERSION") + File polished_assembly = "~{samplename}_polished.fasta" # i don't actually know what the otuput is called + } + runtime { + docker: "~{docker}" + memory: "~{memory} GB" + cpu: "~{cpu}" + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 1 + } +} \ No newline at end of file From 85a68dc6f8ee636acbc459729090c4633a1a02f2 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 25 Nov 2024 19:26:14 +0000 Subject: [PATCH 14/58] remove comment cruft --- tasks/assembly/task_polypolish.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/assembly/task_polypolish.wdl b/tasks/assembly/task_polypolish.wdl index 4676415bc..e2376f142 100644 --- a/tasks/assembly/task_polypolish.wdl +++ b/tasks/assembly/task_polypolish.wdl @@ -44,7 +44,7 @@ task polypolish { >>> output { String polypolish_version = read_string("VERSION") - File polished_assembly = "~{samplename}_polished.fasta" # i don't actually know what the otuput is called + File polished_assembly = "~{samplename}_polished.fasta" } runtime { docker: "~{docker}" From c7db697611d49de8a3cca6ac8ca744eb90829046 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 25 Nov 2024 16:41:47 -0600 Subject: [PATCH 15/58] initial commit bash contig filtering --- tasks/assembly/task_filtercontigs.wdl | 63 +++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 tasks/assembly/task_filtercontigs.wdl diff --git a/tasks/assembly/task_filtercontigs.wdl b/tasks/assembly/task_filtercontigs.wdl new file mode 100644 index 000000000..d8bddae89 --- /dev/null +++ b/tasks/assembly/task_filtercontigs.wdl @@ -0,0 +1,63 @@ +version 1.0 + +task contig_filter { + input { + File assembly_fasta + Int min_len + Float min_cov = 0.0 + Boolean filter_homopolymers = true + Int cpu = 4 + Int memory = 8 + Int disk_size = 50 + String docker = "biotools" + } + command <<< + set -e + echo "Starting contig filtering" + + # Define output file paths + output_filtered="filtered_contigs.fasta" + + # Perform filtering using seqkit and awk + seqkit fx2tab -l -i -H ~{assembly_fasta} | \ + awk -v min_len=~{min_len} -v min_cov=~{min_cov} -v filter_homopolymers=~{filter_homopolymers} ' + BEGIN { OFS="\t" } + { + # Parse length and optional coverage from headers + length = $2 + coverage = (match($1, /cov=([0-9.]+)/, arr) ? arr[1] : 0) + + # Filter criteria + if (length >= min_len && coverage >= min_cov) { + # Remove homopolymers if enabled + if (filter_homopolymers) { + if ($3 !~ /^(A+|T+|C+|G+)$/) { + print $1, $2, $3 + } + } else { + print $1, $2, $3 + } + } + }' | seqkit tab2fx -o $output_filtered + + # Ensure the output is not empty + if [ ! -s $output_filtered ]; then + echo "Error: No contigs passed filtering criteria!" >&2 + exit 1 + fi + + echo "Filtering complete. Filtered contigs written to $output_filtered." + >>> + output { + File filtered_fasta = "filtered_contigs.fasta" + } + runtime { + docker: "~{docker}" + cpu: cpu + memory: "~{memory} GB" + disks: "local-disk " + disk_size + " HDD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } +} From 037197268a38836b20935c4b0bd7acfbc7eb1252 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 25 Nov 2024 16:42:15 -0600 Subject: [PATCH 16/58] initial commit bash contig filtering --- tasks/assembly/task_filtercontigs.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/assembly/task_filtercontigs.wdl b/tasks/assembly/task_filtercontigs.wdl index d8bddae89..4d37edd21 100644 --- a/tasks/assembly/task_filtercontigs.wdl +++ b/tasks/assembly/task_filtercontigs.wdl @@ -55,7 +55,7 @@ task contig_filter { docker: "~{docker}" cpu: cpu memory: "~{memory} GB" - disks: "local-disk " + disk_size + " HDD" + disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" maxRetries: 3 preemptible: 0 From facd02f3b07998d12793cac6e98864e36e279c52 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Wed, 27 Nov 2024 12:02:52 -0600 Subject: [PATCH 17/58] update medaka docker image --- tasks/assembly/task_medaka.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/assembly/task_medaka.wdl b/tasks/assembly/task_medaka.wdl index a6df0dd93..faea441aa 100644 --- a/tasks/assembly/task_medaka.wdl +++ b/tasks/assembly/task_medaka.wdl @@ -10,7 +10,7 @@ task medaka_polish { Int cpu = 4 Int memory = 16 Int disk_size = 100 - String docker = "staphb/medaka:2.0.1" + String docker = "us-docker.pkg.dev/general-theiagen/staphb/medaka:2.0.1" } command <<< medaka --version | tee VERSION From d26fc957db7fd4be2b44e86989aadfb1351f10cb Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Wed, 27 Nov 2024 14:22:51 -0600 Subject: [PATCH 18/58] refactor assembly tasks and workflows for clarity and consistency --- ...{task_bandage.wdl => task_bandageplot.wdl} | 4 +- tasks/assembly/task_flye.wdl | 2 +- tasks/assembly/task_medaka.wdl | 3 +- workflows/utilities/wf_flye_consensus.wdl | 41 +++++++++++++++---- 4 files changed, 39 insertions(+), 11 deletions(-) rename tasks/assembly/{task_bandage.wdl => task_bandageplot.wdl} (85%) diff --git a/tasks/assembly/task_bandage.wdl b/tasks/assembly/task_bandageplot.wdl similarity index 85% rename from tasks/assembly/task_bandage.wdl rename to tasks/assembly/task_bandageplot.wdl index b7305f7c4..4cb558c1b 100644 --- a/tasks/assembly/task_bandage.wdl +++ b/tasks/assembly/task_bandageplot.wdl @@ -14,8 +14,8 @@ task bandage_plot { Bandage image ~{assembly_graph_gfa} ~{samplename}_bandage_plot.png >>> output { - File plot = "~{output_prefix}.png" - String version = read_string("VERSION") + File plot = "~{samplename}_bandage_plot.png" + String bandage_version = read_string("VERSION") } runtime { docker: "~{docker}" diff --git a/tasks/assembly/task_flye.wdl b/tasks/assembly/task_flye.wdl index 276802127..2ccc6c7ec 100644 --- a/tasks/assembly/task_flye.wdl +++ b/tasks/assembly/task_flye.wdl @@ -71,7 +71,7 @@ task flye { >>> output { - File assembly = "~{samplename}.assembly.fasta" + File assembly_fasta = "~{samplename}.assembly.fasta" File assembly_graph = "~{samplename}.assembly_graph.gfa" File assembly_info = "~{samplename}.assembly_info.txt" String flye_version = read_string("VERSION") diff --git a/tasks/assembly/task_medaka.wdl b/tasks/assembly/task_medaka.wdl index faea441aa..3088ec1cf 100644 --- a/tasks/assembly/task_medaka.wdl +++ b/tasks/assembly/task_medaka.wdl @@ -1,6 +1,6 @@ version 1.0 -task medaka_polish { +task medaka_consensus { input { File assembly_fasta String samplename @@ -17,6 +17,7 @@ task medaka_polish { # Initialize the input for polishing with the provided assembly FASTA cp ~{assembly_fasta} polished_input.fasta + cp ~{assembly_fasta} polished_input.fasta # Perform Medaka polishing for the specified number of rounds for i in $(seq 1 ~{polish_rounds}); do diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_consensus.wdl index 16dd9ed20..cd5e4fce1 100644 --- a/workflows/utilities/wf_flye_consensus.wdl +++ b/workflows/utilities/wf_flye_consensus.wdl @@ -1,19 +1,46 @@ version 1.0 -import "../../tasks/assembly/task_flye.wdl" as flye_task +import "../../tasks/assembly/task_flye.wdl" as task_flye +import "../../tasks/assembly/task_bandageplot.wdl" as task_bandage +import "../../tasks/assembly/task_medaka.wdl" as task_medaka workflow flye_consensus { - meta { - description: "This workflow runs either flye, miniasm, or raven to generate a consensus genome assembly from long reads." - } + # Input parameters input { File read1 String samplename + String medaka_model + Int polish_rounds } - call flye_task.flye { + # Step 1: Run Flye assembly + call task_flye.flye as flye { input: read1 = read1, + samplename = samplename, + } + # Step 2: Generate Bandage plot from Flye assembly graph + call task_bandage.bandage_plot as bandage { + input: + assembly_graph_gfa = flye.assembly_graph, samplename = samplename } - # placeholder -} \ No newline at end of file + # Step 3: Perform Medaka polishing on Flye assembly + call task_medaka.medaka_consensus as medaka { + input: + assembly_fasta = flye.assembly_fasta, + samplename = samplename, + read1 = read1, + medaka_model = medaka_model, + polish_rounds = polish_rounds, + } + # Outputs + output { + File final_assembly = medaka.medaka_fasta + File bandage_plot = bandage.plot + File medaka_vcf = medaka.medaka_vcf + String flye_version = flye.flye_version + String bandage_version = bandage.bandage_version + String medaka_version = medaka.medaka_version + } +} + From d45481e29d741bc132e353b9904c43def5e6eb4d Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Wed, 27 Nov 2024 14:26:28 -0600 Subject: [PATCH 19/58] add dnaapler to wf --- workflows/utilities/wf_flye_consensus.wdl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_consensus.wdl index cd5e4fce1..da186278b 100644 --- a/workflows/utilities/wf_flye_consensus.wdl +++ b/workflows/utilities/wf_flye_consensus.wdl @@ -3,6 +3,7 @@ version 1.0 import "../../tasks/assembly/task_flye.wdl" as task_flye import "../../tasks/assembly/task_bandageplot.wdl" as task_bandage import "../../tasks/assembly/task_medaka.wdl" as task_medaka +import "../../tasks/assembly/task_dnaapler.wdl" as task_dnaapler workflow flye_consensus { # Input parameters @@ -33,9 +34,16 @@ workflow flye_consensus { medaka_model = medaka_model, polish_rounds = polish_rounds, } + #final step dnaapler + call task_dnaapler.dnaapler as dnaapler { + input: + assembly_fasta = medaka.medaka_fasta, + samplename = samplename, + read1 = read1, + } # Outputs output { - File final_assembly = medaka.medaka_fasta + File final_assembly = dnaapler.reoriented_fasta File bandage_plot = bandage.plot File medaka_vcf = medaka.medaka_vcf String flye_version = flye.flye_version From eef27a9189216480bff22a031009588b5d76128e Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Wed, 27 Nov 2024 15:09:05 -0600 Subject: [PATCH 20/58] update racon --- tasks/assembly/task_racon_polish.wdl | 94 ++++++++++------------ workflows/utilities/wf_racon_polishing.wdl | 49 +++++++++++ 2 files changed, 90 insertions(+), 53 deletions(-) create mode 100644 workflows/utilities/wf_racon_polishing.wdl diff --git a/tasks/assembly/task_racon_polish.wdl b/tasks/assembly/task_racon_polish.wdl index 147b71a55..4ea2a5a12 100644 --- a/tasks/assembly/task_racon_polish.wdl +++ b/tasks/assembly/task_racon_polish.wdl @@ -1,55 +1,43 @@ version 1.0 -task racon_polish { - input { - File assembly_fasta - File read1 - Int racon_rounds = 1 - Int cpu = 4 - Int memory = 16 - Int disk_size = 100 - String samplename - String minimap2_docker = "staphb/minimap2:2.24" - String racon_docker = "staphb/racon:1.4.20" - } - - command <<< - # Initialize variables - cp ~{assembly_fasta} ~{samplename}.unpolished.fasta - POLISHED_FASTA=~{samplename}.unpolished.fasta - - # Run Racon for the specified number of rounds - for i in $(seq 1 ~{racon_rounds}); do - echo "Polishing with Racon - Round $i of ~{racon_rounds}" - mkdir -p racon_round_${i} - - # Step 1: Generate PAF alignments with Minimap2 - minimap2 -t ~{cpu} -x map-ont $POLISHED_FASTA ~{read1} \ - > racon_round_${i}/alignments.paf - - # Step 2: Run Racon using the generated PAF - racon -t ~{cpu} ~{read1} racon_round_${i}/alignments.paf $POLISHED_FASTA \ - > racon_round_${i}/consensus.fasta - - # Update the polished FASTA for the next round - POLISHED_FASTA=racon_round_${i}/consensus.fasta - done - - # Copy final polished FASTA to the home directory with sample name - cp $POLISHED_FASTA ~{samplename}.polished.fasta - >>> - - output { - File raconpolished_fasta = "~{samplename}.polished.fasta" - } - - runtime { - docker: "~{racon_docker}" - cpu: cpu - memory: "~{memory} GB" - disks: "local-disk " + disk_size + " HDD" - disk: disk_size + " GB" - maxRetries: 3 - preemptible: 0 - } -} \ No newline at end of file +task racon { + input { + File unpolished_fasta # Assembly (contigs) + File reads # Sequencing reads + File alignments # Minimap2 alignments (PAF/SAM) + Int cpu = 4 + Int memory = 16 + Int disk_size = 100 + String samplename # Output file name prefix + String docker = "staphb/racon:1.4.20" + } + + command <<< + # Capture version for reproducibility + racon --version | tee VERSION + + # Run Racon for polishing + racon \ + -t ~{cpu} \ + ~{reads} \ + ~{alignments} \ + ~{unpolished_fasta} \ + > ~{samplename}.polished.fasta + >>> + + output { + File polished_fasta = "~{samplename}.polished.fasta" + String racon_version = read_string("VERSION") + String racon_docker = "~{docker}" + } + + runtime { + docker: "~{docker}" + memory: memory + " GB" + cpu: cpu + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } +} diff --git a/workflows/utilities/wf_racon_polishing.wdl b/workflows/utilities/wf_racon_polishing.wdl new file mode 100644 index 000000000..870f00ca9 --- /dev/null +++ b/workflows/utilities/wf_racon_polishing.wdl @@ -0,0 +1,49 @@ +version 1.0 + +import "../../tasks/assembly/task_racon.wdl" as task_racon +import "../../tasks/assembly/task_minimap2.wdl" as task_minimap2 + +workflow racon_polishing_workflow { + input { + File assembly_fasta # Initial unpolished assembly + File read1 # Input sequencing reads + String samplename # Base name for output files + Int racon_rounds = 1 # Number of Racon polishing rounds + } + + # Step 1: Run Minimap2 to produce alignments + call minimap2 { + input: + query1 = read1, + reference = assembly_fasta, + samplename = samplename + "_alignments", + } + + # Step 2: Iteratively run Racon for polishing + scatter (round in range(racon_rounds)) { + if (round == 0) { + # First round uses the initial assembly and Minimap2 alignments + call racon { + input: + unpolished_fasta = assembly_fasta, + reads = read1, + alignments = minimap2.minimap2_out, + samplename = samplename + "_racon_round" + round, + } + } else { + # Subsequent rounds use the output from the previous Racon run + call racon { + input: + unpolished_fasta = racon.polished_fasta, + reads = read1, + alignments = minimap2.minimap2_out, + samplename = samplename + "_racon_round" + round, + } + } + } + + # Output the final polished assembly + output { + File final_polished_fasta = racon.polished_fasta + } +} From 0fee522e80750b8fa5008555cfd0e88644881086 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Wed, 27 Nov 2024 16:33:37 -0600 Subject: [PATCH 21/58] add polisher options to flye_consensus wf --- workflows/utilities/wf_flye_consensus.wdl | 28 +++++++++++++++++------ 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_consensus.wdl index da186278b..77d6b9ea5 100644 --- a/workflows/utilities/wf_flye_consensus.wdl +++ b/workflows/utilities/wf_flye_consensus.wdl @@ -3,6 +3,7 @@ version 1.0 import "../../tasks/assembly/task_flye.wdl" as task_flye import "../../tasks/assembly/task_bandageplot.wdl" as task_bandage import "../../tasks/assembly/task_medaka.wdl" as task_medaka +import "../../workflows/utilities/wf_racon_polishing.wdl" as wf_racon_polishing import "../../tasks/assembly/task_dnaapler.wdl" as task_dnaapler workflow flye_consensus { @@ -11,6 +12,7 @@ workflow flye_consensus { File read1 String samplename String medaka_model + String polisher = "medaka" Int polish_rounds } # Step 1: Run Flye assembly @@ -25,19 +27,31 @@ workflow flye_consensus { assembly_graph_gfa = flye.assembly_graph, samplename = samplename } - # Step 3: Perform Medaka polishing on Flye assembly - call task_medaka.medaka_consensus as medaka { + # Step 3: Perform polishing based on the selected polisher + if (polisher == "medaka") { + call task_medaka.medaka_consensus as medaka { + input: + assembly_fasta = flye.assembly_fasta, + samplename = samplename, + read1 = read1, + medaka_model = medaka_model, + polish_rounds = polish_rounds, + } + } + + if (polisher == "racon") { + call workflow_racon.racon_polishing_workflow as racon { input: assembly_fasta = flye.assembly_fasta, - samplename = samplename, read1 = read1, - medaka_model = medaka_model, - polish_rounds = polish_rounds, + samplename = samplename, + racon_rounds = polish_rounds, } +} #final step dnaapler call task_dnaapler.dnaapler as dnaapler { input: - assembly_fasta = medaka.medaka_fasta, + assembly_fasta = select_first([medaka.medaka_fasta, racon.final_polished_fasta]), samplename = samplename, read1 = read1, } @@ -45,7 +59,7 @@ workflow flye_consensus { output { File final_assembly = dnaapler.reoriented_fasta File bandage_plot = bandage.plot - File medaka_vcf = medaka.medaka_vcf + File polisher_output_vcf = select_first([medaka.medaka_vcf, racon.final_polished_fasta]) String flye_version = flye.flye_version String bandage_version = bandage.bandage_version String medaka_version = medaka.medaka_version From 17289d1b17b82c06f95396c173eeaae9708e2b51 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Fri, 29 Nov 2024 16:25:09 -0600 Subject: [PATCH 22/58] update workflow and tasks and altered racon with minimap in docker to add --- tasks/assembly/task_bandageplot.wdl | 5 ++- tasks/assembly/task_dnaapler.wdl | 5 ++- tasks/assembly/task_flye.wdl | 1 + tasks/assembly/task_medaka.wdl | 15 ++++--- tasks/assembly/task_racon.wdl | 42 +++++++++++++++++++ tasks/assembly/task_racon_polish.wdl | 43 ------------------- workflows/utilities/wf_flye_consensus.wdl | 46 ++++++++++---------- workflows/utilities/wf_racon_polishing.wdl | 49 ---------------------- 8 files changed, 79 insertions(+), 127 deletions(-) create mode 100644 tasks/assembly/task_racon.wdl delete mode 100644 tasks/assembly/task_racon_polish.wdl delete mode 100644 workflows/utilities/wf_racon_polishing.wdl diff --git a/tasks/assembly/task_bandageplot.wdl b/tasks/assembly/task_bandageplot.wdl index 4cb558c1b..23549a5be 100644 --- a/tasks/assembly/task_bandageplot.wdl +++ b/tasks/assembly/task_bandageplot.wdl @@ -10,7 +10,8 @@ task bandage_plot { String docker = "us-docker.pkg.dev/general-theiagen/staphb/bandage:0.8.1" } command <<< - bandage --version | tee VERSION + set -euo pipefail + Bandage --version | tee VERSION Bandage image ~{assembly_graph_gfa} ~{samplename}_bandage_plot.png >>> output { @@ -23,7 +24,7 @@ task bandage_plot { memory: "~{memory} GB" disks: "local-disk " + disk_size + " HDD" disk: disk_size + " GB" - maxRetries: 3 + maxRetries: 1 preemptible: 0 } } diff --git a/tasks/assembly/task_dnaapler.wdl b/tasks/assembly/task_dnaapler.wdl index a2a6c1e21..3f510dfa2 100644 --- a/tasks/assembly/task_dnaapler.wdl +++ b/tasks/assembly/task_dnaapler.wdl @@ -10,6 +10,7 @@ task dnaapler_all { String docker = "us-docker.pkg.dev/general-theiagen/staphb/dnaapler:0.8.0" } command <<< + set -euo pipefail dnaapler --version | tee VERSION # Run DnaApler with the 'all' subcommand @@ -19,7 +20,6 @@ task dnaapler_all { -p ~{prefix} \ -t ~{threads} >>> - output { File reoriented_fasta = "~{prefix}_reoriented.fasta" String dnaapler_version = read_string("VERSION") @@ -30,7 +30,8 @@ task dnaapler_all { memory: "~{memory} GB" disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" - maxRetries: 3 + maxRetries: 1 preemptible: 0 } +} diff --git a/tasks/assembly/task_flye.wdl b/tasks/assembly/task_flye.wdl index 2ccc6c7ec..69ac46c09 100644 --- a/tasks/assembly/task_flye.wdl +++ b/tasks/assembly/task_flye.wdl @@ -32,6 +32,7 @@ task flye { Int memory = 32 } command <<< + set -euo pipefail flye --version | tee VERSION # determine read type diff --git a/tasks/assembly/task_medaka.wdl b/tasks/assembly/task_medaka.wdl index 3088ec1cf..9b30182c7 100644 --- a/tasks/assembly/task_medaka.wdl +++ b/tasks/assembly/task_medaka.wdl @@ -5,7 +5,7 @@ task medaka_consensus { File assembly_fasta String samplename File read1 - String medaka_model + String medaka_model = "r1041_e82_400bps_sup_v5.0.0" Int polish_rounds = 1 Int cpu = 4 Int memory = 16 @@ -13,7 +13,8 @@ task medaka_consensus { String docker = "us-docker.pkg.dev/general-theiagen/staphb/medaka:2.0.1" } command <<< - medaka --version | tee VERSION + set -euo pipefail + medaka --version | tee MEDAKA_VERSION # Initialize the input for polishing with the provided assembly FASTA cp ~{assembly_fasta} polished_input.fasta @@ -30,20 +31,18 @@ task medaka_consensus { -d polished_input.fasta \ -o "$polish_dir" \ -m ~{medaka_model} \ - --threads ~{cpu} + -t ~{cpu} - # Update the input for the next round + # Updates the input for the next round cp "$polish_dir/consensus.fasta" polished_input.fasta done # Rename final polished outputs with sample name mv polished_input.fasta ~{samplename}.polished.fasta - mv polish_round_~{polish_rounds}/consensus.vcf.gz ~{samplename}.polished.vcf.gz >>> output { File medaka_fasta = "~{samplename}.polished.fasta" - File medaka_vcf = "~{samplename}.polished.vcf.gz" - String medaka_version = read_string("VERSION") + String medaka_version = read_string("MEDAKA_VERSION") } runtime { docker: "~{docker}" @@ -51,7 +50,7 @@ task medaka_consensus { memory: "~{memory} GB" disks: "local-disk " + disk_size + " HDD" disk: disk_size + " GB" - maxRetries: 3 + maxRetries: 1 preemptible: 0 } } diff --git a/tasks/assembly/task_racon.wdl b/tasks/assembly/task_racon.wdl new file mode 100644 index 000000000..b4d0eda7d --- /dev/null +++ b/tasks/assembly/task_racon.wdl @@ -0,0 +1,42 @@ +version 1.0 + +task racon { + input { + File unpolished_fasta + File reads + Int cpu = 4 + Int memory = 16 + Int disk_size = 100 + String samplename + String docker = "staphb/racon-minimap2:latest" + } + command <<< + set -euo pipefail + minimap2 --version | tee MINIMAP2_VERSION + racon --version | tee -a RACON_VERSION + # Generate alignments with Minimap2 + minimap2 -t ~{cpu} ~{unpolished_fasta} ~{reads} > ~{samplename}.paf + + # Run Racon for polishing + racon \ + -t ~{cpu} \ + ~{reads} \ + ~{samplename}.paf \ + ~{unpolished_fasta} \ + > ~{samplename}.polished.fasta + >>> + output { + File polished_fasta = "~{samplename}.polished.fasta" + File alignments = "~{samplename}.paf" + String racon_version = read_string("RACON_VERSION") + String minimap2_version = read_string("MINIMAP2_VERSION") + } + runtime { + docker: "~{docker}" + memory: memory + " GB" + cpu: cpu + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } +} diff --git a/tasks/assembly/task_racon_polish.wdl b/tasks/assembly/task_racon_polish.wdl deleted file mode 100644 index 4ea2a5a12..000000000 --- a/tasks/assembly/task_racon_polish.wdl +++ /dev/null @@ -1,43 +0,0 @@ -version 1.0 - -task racon { - input { - File unpolished_fasta # Assembly (contigs) - File reads # Sequencing reads - File alignments # Minimap2 alignments (PAF/SAM) - Int cpu = 4 - Int memory = 16 - Int disk_size = 100 - String samplename # Output file name prefix - String docker = "staphb/racon:1.4.20" - } - - command <<< - # Capture version for reproducibility - racon --version | tee VERSION - - # Run Racon for polishing - racon \ - -t ~{cpu} \ - ~{reads} \ - ~{alignments} \ - ~{unpolished_fasta} \ - > ~{samplename}.polished.fasta - >>> - - output { - File polished_fasta = "~{samplename}.polished.fasta" - String racon_version = read_string("VERSION") - String racon_docker = "~{docker}" - } - - runtime { - docker: "~{docker}" - memory: memory + " GB" - cpu: cpu - disks: "local-disk " + disk_size + " SSD" - disk: disk_size + " GB" - maxRetries: 3 - preemptible: 0 - } -} diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_consensus.wdl index 77d6b9ea5..54ecb8a6c 100644 --- a/workflows/utilities/wf_flye_consensus.wdl +++ b/workflows/utilities/wf_flye_consensus.wdl @@ -3,11 +3,14 @@ version 1.0 import "../../tasks/assembly/task_flye.wdl" as task_flye import "../../tasks/assembly/task_bandageplot.wdl" as task_bandage import "../../tasks/assembly/task_medaka.wdl" as task_medaka -import "../../workflows/utilities/wf_racon_polishing.wdl" as wf_racon_polishing +import "../../tasks/assembly/task_racon.wdl" as task_racon import "../../tasks/assembly/task_dnaapler.wdl" as task_dnaapler workflow flye_consensus { - # Input parameters + meta { + description: "Run Flye assembly, Bandage plot, and consensus polishing with Medaka or Racon from long reads" + } + input { File read1 String samplename @@ -15,19 +18,19 @@ workflow flye_consensus { String polisher = "medaka" Int polish_rounds } - # Step 1: Run Flye assembly + call task_flye.flye as flye { input: read1 = read1, - samplename = samplename, + samplename = samplename } - # Step 2: Generate Bandage plot from Flye assembly graph + call task_bandage.bandage_plot as bandage { input: assembly_graph_gfa = flye.assembly_graph, samplename = samplename } - # Step 3: Perform polishing based on the selected polisher + if (polisher == "medaka") { call task_medaka.medaka_consensus as medaka { input: @@ -35,34 +38,31 @@ workflow flye_consensus { samplename = samplename, read1 = read1, medaka_model = medaka_model, - polish_rounds = polish_rounds, + polish_rounds = polish_rounds } } if (polisher == "racon") { - call workflow_racon.racon_polishing_workflow as racon { - input: - assembly_fasta = flye.assembly_fasta, - read1 = read1, - samplename = samplename, - racon_rounds = polish_rounds, + call task_racon.racon as racon { + input: + unpolished_fasta = flye.assembly_fasta, + reads = read1, + samplename = samplename + } } -} - #final step dnaapler - call task_dnaapler.dnaapler as dnaapler { + call task_dnaapler.dnaapler_all as dnaapler { input: - assembly_fasta = select_first([medaka.medaka_fasta, racon.final_polished_fasta]), - samplename = samplename, - read1 = read1, + input_fasta = select_first([medaka.medaka_fasta, racon.polished_fasta]), + prefix = samplename, + threads = polish_rounds } - # Outputs + output { File final_assembly = dnaapler.reoriented_fasta File bandage_plot = bandage.plot - File polisher_output_vcf = select_first([medaka.medaka_vcf, racon.final_polished_fasta]) String flye_version = flye.flye_version String bandage_version = bandage.bandage_version - String medaka_version = medaka.medaka_version + String? medaka_version = medaka.medaka_version + String? racon_version = racon.racon_version } } - diff --git a/workflows/utilities/wf_racon_polishing.wdl b/workflows/utilities/wf_racon_polishing.wdl deleted file mode 100644 index 870f00ca9..000000000 --- a/workflows/utilities/wf_racon_polishing.wdl +++ /dev/null @@ -1,49 +0,0 @@ -version 1.0 - -import "../../tasks/assembly/task_racon.wdl" as task_racon -import "../../tasks/assembly/task_minimap2.wdl" as task_minimap2 - -workflow racon_polishing_workflow { - input { - File assembly_fasta # Initial unpolished assembly - File read1 # Input sequencing reads - String samplename # Base name for output files - Int racon_rounds = 1 # Number of Racon polishing rounds - } - - # Step 1: Run Minimap2 to produce alignments - call minimap2 { - input: - query1 = read1, - reference = assembly_fasta, - samplename = samplename + "_alignments", - } - - # Step 2: Iteratively run Racon for polishing - scatter (round in range(racon_rounds)) { - if (round == 0) { - # First round uses the initial assembly and Minimap2 alignments - call racon { - input: - unpolished_fasta = assembly_fasta, - reads = read1, - alignments = minimap2.minimap2_out, - samplename = samplename + "_racon_round" + round, - } - } else { - # Subsequent rounds use the output from the previous Racon run - call racon { - input: - unpolished_fasta = racon.polished_fasta, - reads = read1, - alignments = minimap2.minimap2_out, - samplename = samplename + "_racon_round" + round, - } - } - } - - # Output the final polished assembly - output { - File final_polished_fasta = racon.polished_fasta - } -} From 646b1161671ab1fe28a48421fa08f1cb60380fa7 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 2 Dec 2024 14:52:56 -0600 Subject: [PATCH 23/58] update dnapler and tody flye consensus wf --- tasks/assembly/task_dnaapler.wdl | 89 ++++++++++++++++------- workflows/utilities/wf_flye_consensus.wdl | 21 ++---- 2 files changed, 71 insertions(+), 39 deletions(-) diff --git a/tasks/assembly/task_dnaapler.wdl b/tasks/assembly/task_dnaapler.wdl index 3f510dfa2..9c396dea4 100644 --- a/tasks/assembly/task_dnaapler.wdl +++ b/tasks/assembly/task_dnaapler.wdl @@ -2,36 +2,73 @@ version 1.0 task dnaapler_all { input { - File input_fasta - String prefix - Int threads = 4 - Int disk_size = 100 - Int memory = 16 + File input_fasta + String samplename + Int threads = 4 + Int disk_size = 100 + Int memory = 16 String docker = "us-docker.pkg.dev/general-theiagen/staphb/dnaapler:0.8.0" - } + } command <<< - set -euo pipefail - dnaapler --version | tee VERSION - - # Run DnaApler with the 'all' subcommand - dnaapler all \ - -i ~{input_fasta} \ - -o . \ - -p ~{prefix} \ - -t ~{threads} + set -euo pipefail + + echo "Starting dnaapler task for sample: ~{samplename}" + echo "Input file: ~{input_fasta}" + + # Check input FASTA validity + echo "Validating input FASTA file..." + if ! grep -q "^>" ~{input_fasta}; then + echo "ERROR: Input file ~{input_fasta} is not in FASTA format." >&2 + exit 1 + else + echo "Input FASTA file is valid." + fi + + # dnaapler version + dnaapler --version | tee VERSION + + # Create a subdirectory for dnaapler outputs + output_dir="dnaapler_output" + mkdir -p "$output_dir" + echo "Output directory created: $output_dir" + + # Run dnaapler with the 'all' subcommand + echo "Running dnaapler..." + dnaapler all \ + -i ~{input_fasta} \ + -o "$output_dir" \ + -p ~{samplename} \ + -t ~{threads} \ + -f || { + echo "ERROR: dnaapler command failed. Check logs for details." >&2 + exit 1 + } + + echo "dnaapler command completed successfully." + + # Check if output FASTA file exists + if [[ ! -f "$output_dir"/~{samplename}_reoriented.fasta ]]; then + echo "ERROR: Expected output file not found: $output_dir/~{samplename}_reoriented.fasta" >&2 + exit 1 + fi + + # Move the final reoriented FASTA file to the task's working directory + echo "Moving output FASTA file to working directory..." + mv "$output_dir"/~{samplename}_reoriented.fasta . + + echo "dnaapler task completed successfully for sample: ~{samplename}" >>> output { - File reoriented_fasta = "~{prefix}_reoriented.fasta" - String dnaapler_version = read_string("VERSION") + File reoriented_fasta = "~{samplename}_reoriented.fasta" + String dnaapler_version = read_string("VERSION") } runtime { - docker: "~{docker}" - cpu: threads - memory: "~{memory} GB" - disks: "local-disk " + disk_size + " SSD" - disk: disk_size + " GB" - maxRetries: 1 - preemptible: 0 + docker: "~{docker}" + cpu: threads + memory: "~{memory} GB" + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + maxRetries: 1 + preemptible: 0 } -} - +} \ No newline at end of file diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_consensus.wdl index 54ecb8a6c..734c6de0b 100644 --- a/workflows/utilities/wf_flye_consensus.wdl +++ b/workflows/utilities/wf_flye_consensus.wdl @@ -5,12 +5,12 @@ import "../../tasks/assembly/task_bandageplot.wdl" as task_bandage import "../../tasks/assembly/task_medaka.wdl" as task_medaka import "../../tasks/assembly/task_racon.wdl" as task_racon import "../../tasks/assembly/task_dnaapler.wdl" as task_dnaapler +import "../../tasks/task_versioning.wdl" as versioning_task workflow flye_consensus { meta { description: "Run Flye assembly, Bandage plot, and consensus polishing with Medaka or Racon from long reads" } - input { File read1 String samplename @@ -18,19 +18,19 @@ workflow flye_consensus { String polisher = "medaka" Int polish_rounds } - + call versioning_task.version_capture { + input: + } call task_flye.flye as flye { input: read1 = read1, samplename = samplename } - call task_bandage.bandage_plot as bandage { input: assembly_graph_gfa = flye.assembly_graph, samplename = samplename } - if (polisher == "medaka") { call task_medaka.medaka_consensus as medaka { input: @@ -41,7 +41,6 @@ workflow flye_consensus { polish_rounds = polish_rounds } } - if (polisher == "racon") { call task_racon.racon as racon { input: @@ -53,16 +52,12 @@ workflow flye_consensus { call task_dnaapler.dnaapler_all as dnaapler { input: input_fasta = select_first([medaka.medaka_fasta, racon.polished_fasta]), - prefix = samplename, - threads = polish_rounds + samplename = samplename } - output { File final_assembly = dnaapler.reoriented_fasta File bandage_plot = bandage.plot - String flye_version = flye.flye_version - String bandage_version = bandage.bandage_version - String? medaka_version = medaka.medaka_version - String? racon_version = racon.racon_version + String flye_phb_version = version_capture.phb_version + String flye_analysis_date = version_capture.date } -} +} \ No newline at end of file From 8e56dc0f8069936998b6c304548498b0e76e9fef Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Tue, 3 Dec 2024 09:14:18 -0600 Subject: [PATCH 24/58] update filter contigs task initial attempt aowrking --- tasks/assembly/task_dnaapler.wdl | 5 +- tasks/assembly/task_filtercontigs.wdl | 98 +++++++++++++++------------ tasks/assembly/task_racon.wdl | 2 +- 3 files changed, 56 insertions(+), 49 deletions(-) diff --git a/tasks/assembly/task_dnaapler.wdl b/tasks/assembly/task_dnaapler.wdl index 9c396dea4..cc6c14c18 100644 --- a/tasks/assembly/task_dnaapler.wdl +++ b/tasks/assembly/task_dnaapler.wdl @@ -12,10 +12,7 @@ task dnaapler_all { command <<< set -euo pipefail - echo "Starting dnaapler task for sample: ~{samplename}" - echo "Input file: ~{input_fasta}" - - # Check input FASTA validity + # Check input FASTA is valid echo "Validating input FASTA file..." if ! grep -q "^>" ~{input_fasta}; then echo "ERROR: Input file ~{input_fasta} is not in FASTA format." >&2 diff --git a/tasks/assembly/task_filtercontigs.wdl b/tasks/assembly/task_filtercontigs.wdl index 4d37edd21..f46875b45 100644 --- a/tasks/assembly/task_filtercontigs.wdl +++ b/tasks/assembly/task_filtercontigs.wdl @@ -3,61 +3,71 @@ version 1.0 task contig_filter { input { File assembly_fasta - Int min_len - Float min_cov = 0.0 + Int min_len = 1000 Boolean filter_homopolymers = true - Int cpu = 4 - Int memory = 8 - Int disk_size = 50 - String docker = "biotools" + Int disk_size = 100 + Int memory = 16 + Int threads = 4 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/seqkit:2.8.2" } command <<< - set -e - echo "Starting contig filtering" - - # Define output file paths - output_filtered="filtered_contigs.fasta" - - # Perform filtering using seqkit and awk - seqkit fx2tab -l -i -H ~{assembly_fasta} | \ - awk -v min_len=~{min_len} -v min_cov=~{min_cov} -v filter_homopolymers=~{filter_homopolymers} ' - BEGIN { OFS="\t" } - { - # Parse length and optional coverage from headers - length = $2 - coverage = (match($1, /cov=([0-9.]+)/, arr) ? arr[1] : 0) - - # Filter criteria - if (length >= min_len && coverage >= min_cov) { - # Remove homopolymers if enabled - if (filter_homopolymers) { - if ($3 !~ /^(A+|T+|C+|G+)$/) { - print $1, $2, $3 - } - } else { - print $1, $2, $3 - } - } - }' | seqkit tab2fx -o $output_filtered - - # Ensure the output is not empty - if [ ! -s $output_filtered ]; then - echo "Error: No contigs passed filtering criteria!" >&2 - exit 1 + set -euo pipefail + echo "Filtering contigs from ~{assembly_fasta}" >&2 + + # Calculate initial metrics + total_contigs=$(grep -c "^>" ~{assembly_fasta}) + echo "Total contigs: $total_contigs" >&2 + + total_bases=$(awk '/^>/ {if (seqlen){print seqlen}; seqlen=0; next} {seqlen += length($0)} END {if (seqlen) print seqlen}' ~{assembly_fasta}) + echo "Total bases: $total_bases" >&2 + + # Filter by length + seqkit seq -m ~{min_len} ~{assembly_fasta} > length_filtered.fasta + echo "Length filtering complete. File size:" >&2 + ls -lh length_filtered.fasta >&2 + + # If homopolymer filtering is enabled, process further + if [ "~{filter_homopolymers}" = "true" ]; then + awk ' + BEGIN {RS=">"; ORS=""} + NR > 1 { + header = $1; seq = $2 + if (seq !~ /^(.)\1+$/) {print ">" header seq "\n"} + }' length_filtered.fasta > filtered_contigs.fasta + else + mv length_filtered.fasta filtered_contigs.fasta fi - echo "Filtering complete. Filtered contigs written to $output_filtered." + # Validate the final file + if [ ! -s filtered_contigs.fasta ]; then + echo "Error: No contigs passed filtering criteria!" >&2 + exit 1 + fi + + # Count final retained contigs + retained_contigs=$(grep -c "^>" filtered_contigs.fasta) + retained_bases=$(awk '/^>/ {if (seqlen){print seqlen}; seqlen=0; next} {seqlen += length($0)} END {if (seqlen) print seqlen}' filtered_contigs.fasta) + contigs_removed_homopolymers=$((total_contigs - retained_contigs)) + + # Write metrics to file + metrics_file="filtering_metrics.txt" + echo "Total contigs: $total_contigs" > $metrics_file + echo "Total bases: $total_bases" >> $metrics_file + echo "Contigs retained: $retained_contigs" >> $metrics_file + echo "Bases retained: $retained_bases" >> $metrics_file + echo "Contigs removed (short length): $((total_contigs - retained_contigs))" >> $metrics_file + echo "Contigs removed (homopolymers): $contigs_removed_homopolymers" >> $metrics_file + + cat $metrics_file >&2 >>> output { File filtered_fasta = "filtered_contigs.fasta" + File metrics = "filtering_metrics.txt" } runtime { docker: "~{docker}" - cpu: cpu - memory: "~{memory} GB" + cpu: threads + memory: "~{memory}G" disks: "local-disk " + disk_size + " SSD" - disk: disk_size + " GB" - maxRetries: 3 - preemptible: 0 } } diff --git a/tasks/assembly/task_racon.wdl b/tasks/assembly/task_racon.wdl index b4d0eda7d..10ea7788c 100644 --- a/tasks/assembly/task_racon.wdl +++ b/tasks/assembly/task_racon.wdl @@ -8,7 +8,7 @@ task racon { Int memory = 16 Int disk_size = 100 String samplename - String docker = "staphb/racon-minimap2:latest" + String docker = "staphb/racon-minimap2" # need to add minimap2 to staph-b docker-builds } command <<< set -euo pipefail From f37bf3bcc5247dadcc64d2187e90fa874f35204c Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Tue, 3 Dec 2024 15:50:57 -0600 Subject: [PATCH 25/58] updated flye consensus wf and filter contigs --- tasks/assembly/task_filtercontigs.wdl | 13 ++++++++++--- tasks/assembly/task_racon.wdl | 2 +- workflows/utilities/wf_flye_consensus.wdl | 12 +++++++++--- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/tasks/assembly/task_filtercontigs.wdl b/tasks/assembly/task_filtercontigs.wdl index f46875b45..57b4c0e10 100644 --- a/tasks/assembly/task_filtercontigs.wdl +++ b/tasks/assembly/task_filtercontigs.wdl @@ -31,8 +31,15 @@ task contig_filter { awk ' BEGIN {RS=">"; ORS=""} NR > 1 { - header = $1; seq = $2 - if (seq !~ /^(.)\1+$/) {print ">" header seq "\n"} + split($0, lines, "\n"); + header = lines[1]; + seq = ""; + for (i=2; i<=length(lines); i++) { + seq = seq lines[i]; + } + if (seq !~ /^(.)\1+$/) { + print ">" header "\n" seq "\n" + } }' length_filtered.fasta > filtered_contigs.fasta else mv length_filtered.fasta filtered_contigs.fasta @@ -62,7 +69,7 @@ task contig_filter { >>> output { File filtered_fasta = "filtered_contigs.fasta" - File metrics = "filtering_metrics.txt" + File assembly_filtering_metrics = "filtering_metrics.txt" } runtime { docker: "~{docker}" diff --git a/tasks/assembly/task_racon.wdl b/tasks/assembly/task_racon.wdl index 10ea7788c..1c37a23f8 100644 --- a/tasks/assembly/task_racon.wdl +++ b/tasks/assembly/task_racon.wdl @@ -8,7 +8,7 @@ task racon { Int memory = 16 Int disk_size = 100 String samplename - String docker = "staphb/racon-minimap2" # need to add minimap2 to staph-b docker-builds + String docker = "staphb/racon" } command <<< set -euo pipefail diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_consensus.wdl index 734c6de0b..b94e5bb45 100644 --- a/workflows/utilities/wf_flye_consensus.wdl +++ b/workflows/utilities/wf_flye_consensus.wdl @@ -6,6 +6,7 @@ import "../../tasks/assembly/task_medaka.wdl" as task_medaka import "../../tasks/assembly/task_racon.wdl" as task_racon import "../../tasks/assembly/task_dnaapler.wdl" as task_dnaapler import "../../tasks/task_versioning.wdl" as versioning_task +import "../../tasks/assembly/task_filtercontigs.wdl" as task_filtercontigs workflow flye_consensus { meta { @@ -14,9 +15,9 @@ workflow flye_consensus { input { File read1 String samplename - String medaka_model String polisher = "medaka" - Int polish_rounds + Int? polish_rounds + String? medaka_model } call versioning_task.version_capture { input: @@ -49,14 +50,19 @@ workflow flye_consensus { samplename = samplename } } + call task_filtercontigs.contig_filter as contig_filter { + input: + assembly_fasta = select_first([medaka.medaka_fasta, racon.polished_fasta]) + } call task_dnaapler.dnaapler_all as dnaapler { input: - input_fasta = select_first([medaka.medaka_fasta, racon.polished_fasta]), + input_fasta = contig_filter.filtered_fasta, samplename = samplename } output { File final_assembly = dnaapler.reoriented_fasta File bandage_plot = bandage.plot + File filtered_assembly = contig_filter.filtered_fasta String flye_phb_version = version_capture.phb_version String flye_analysis_date = version_capture.date } From 2865a2de9888e49fd75bfcf99f5a42d437cca3d0 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 9 Dec 2024 10:12:04 -0600 Subject: [PATCH 26/58] update docker images porechop and dnaapler --- tasks/assembly/task_dnaapler.wdl | 8 ++++---- tasks/assembly/task_filtercontigs.wdl | 2 +- tasks/assembly/task_porechop.wdl | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tasks/assembly/task_dnaapler.wdl b/tasks/assembly/task_dnaapler.wdl index cc6c14c18..45bcb3b6d 100644 --- a/tasks/assembly/task_dnaapler.wdl +++ b/tasks/assembly/task_dnaapler.wdl @@ -4,10 +4,10 @@ task dnaapler_all { input { File input_fasta String samplename - Int threads = 4 + Int cpu = 4 Int disk_size = 100 Int memory = 16 - String docker = "us-docker.pkg.dev/general-theiagen/staphb/dnaapler:0.8.0" + String docker = "us-docker.pkg.dev/general-theiagen/staphb/dnaapler:1.0.1" } command <<< set -euo pipefail @@ -35,7 +35,7 @@ task dnaapler_all { -i ~{input_fasta} \ -o "$output_dir" \ -p ~{samplename} \ - -t ~{threads} \ + -t ~{cpu} \ -f || { echo "ERROR: dnaapler command failed. Check logs for details." >&2 exit 1 @@ -61,7 +61,7 @@ task dnaapler_all { } runtime { docker: "~{docker}" - cpu: threads + cpu: cpu memory: "~{memory} GB" disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" diff --git a/tasks/assembly/task_filtercontigs.wdl b/tasks/assembly/task_filtercontigs.wdl index 57b4c0e10..5268e08a9 100644 --- a/tasks/assembly/task_filtercontigs.wdl +++ b/tasks/assembly/task_filtercontigs.wdl @@ -18,7 +18,7 @@ task contig_filter { total_contigs=$(grep -c "^>" ~{assembly_fasta}) echo "Total contigs: $total_contigs" >&2 - total_bases=$(awk '/^>/ {if (seqlen){print seqlen}; seqlen=0; next} {seqlen += length($0)} END {if (seqlen) print seqlen}' ~{assembly_fasta}) + total_bases=$(awk '/^>/ {if (seqlen){total+=seqlen}; seqlen=0; next} {seqlen += length($0)} END {total+=seqlen; print total}' ~{assembly_fasta}) echo "Total bases: $total_bases" >&2 # Filter by length diff --git a/tasks/assembly/task_porechop.wdl b/tasks/assembly/task_porechop.wdl index 23e59f387..42eba0d49 100644 --- a/tasks/assembly/task_porechop.wdl +++ b/tasks/assembly/task_porechop.wdl @@ -1,6 +1,6 @@ version 1.0 -workflow porechopabi_workflow { +task porechop { input { Array[File] raw_reads # Input raw reads (FASTQ format) String samplename From f9cbde4550fe8f4c943acd6e2fd71eb24042a434 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 9 Dec 2024 11:55:53 -0600 Subject: [PATCH 27/58] optional trim and polish tasks, update porechop and dnaapler mode --- tasks/assembly/task_dnaapler.wdl | 3 +- tasks/assembly/task_porechop.wdl | 43 +++++++++++++---------- workflows/utilities/wf_flye_consensus.wdl | 32 +++++++++++------ 3 files changed, 48 insertions(+), 30 deletions(-) diff --git a/tasks/assembly/task_dnaapler.wdl b/tasks/assembly/task_dnaapler.wdl index 45bcb3b6d..dce0a5045 100644 --- a/tasks/assembly/task_dnaapler.wdl +++ b/tasks/assembly/task_dnaapler.wdl @@ -4,6 +4,7 @@ task dnaapler_all { input { File input_fasta String samplename + String dmaapler_mode = "all" # The mode of reorientation to execute (default: 'all') Int cpu = 4 Int disk_size = 100 Int memory = 16 @@ -31,7 +32,7 @@ task dnaapler_all { # Run dnaapler with the 'all' subcommand echo "Running dnaapler..." - dnaapler all \ + dnaapler ~{dmaapler_mode} \ -i ~{input_fasta} \ -o "$output_dir" \ -p ~{samplename} \ diff --git a/tasks/assembly/task_porechop.wdl b/tasks/assembly/task_porechop.wdl index 42eba0d49..797a6808f 100644 --- a/tasks/assembly/task_porechop.wdl +++ b/tasks/assembly/task_porechop.wdl @@ -2,30 +2,35 @@ version 1.0 task porechop { input { - Array[File] raw_reads # Input raw reads (FASTQ format) - String samplename - Int cpu = 4 - Int memory = 8 - Int disk_size = 50 - String docker #need to create dockerfile + File read1 # Raw read input file + String samplename + String? trimopts # Optional trimming options + Int cpu = 4 + Int memory = 8 + Int disk_size = 50 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/porechop:0.2.4" } - command <<< - porechop_abi --version | tee VERSION + set -euo pipefail - # Combine input reads into a single FASTQ file (if more than one file) - cat ~{sep=" " raw_reads} > combined_raw_reads.fastq + echo "Porechop version:" + porechop --version | tee VERSION - # Run Porechop - porechop_abi \ - -input combined_raw_reads.fastq \ - -output ~{samplename}.trimmed.fastq \ - --threads ~{cpu} - >>> + echo "Trimming reads with Porechop..." + porechop \ + --input ~{read1} \ + --output ~{samplename}.trimmed.fastq \ + --threads ~{cpu} \ + ~{trimopts} || { + echo "Porechop failed."; exit 1; + } + echo "Compressing trimmed reads..." + gzip -f ~{samplename}.trimmed.fastq + >>> output { - File trimmed_reads = "~{samplename}.trimmed.fastq" # Cleaned reads - String porechopabi_version = read_string("VERSION") # Porechop version + File trimmed_reads = "~{samplename}.trimmed.fastq.gz" + String porechop_version = read_string("VERSION") } runtime { docker: "~{docker}" @@ -35,5 +40,5 @@ task porechop { disk: disk_size + " GB" maxRetries: 3 preemptible: 0 - } + } } diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_consensus.wdl index b94e5bb45..9124cd3ac 100644 --- a/workflows/utilities/wf_flye_consensus.wdl +++ b/workflows/utilities/wf_flye_consensus.wdl @@ -1,5 +1,6 @@ version 1.0 +import "../../tasks/assembly/task_porechop.wdl" as task_porechop import "../../tasks/assembly/task_flye.wdl" as task_flye import "../../tasks/assembly/task_bandageplot.wdl" as task_bandage import "../../tasks/assembly/task_medaka.wdl" as task_medaka @@ -10,21 +11,32 @@ import "../../tasks/assembly/task_filtercontigs.wdl" as task_filtercontigs workflow flye_consensus { meta { - description: "Run Flye assembly, Bandage plot, and consensus polishing with Medaka or Racon from long reads" + description: "Run Flye assembly, Bandage plot, and consensus polishing with Medaka or Racon from long reads and reorient contigs with dnaapler" } input { - File read1 + File read1 # raw input reads String samplename String polisher = "medaka" Int? polish_rounds String? medaka_model + Boolean trim_reads = false # Default: No trimming + Boolean no_polishing = false # Default: Polishing enabled } call versioning_task.version_capture { input: } + # Optional Porechop trimming before Flye + if (trim_reads) { + call task_porechop.porechop as porechop { + input: + read1 = read1, + samplename = samplename + } + } + # Call Flye using either trimmed reads or raw reads call task_flye.flye as flye { input: - read1 = read1, + read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available samplename = samplename } call task_bandage.bandage_plot as bandage { @@ -32,27 +44,27 @@ workflow flye_consensus { assembly_graph_gfa = flye.assembly_graph, samplename = samplename } - if (polisher == "medaka") { + if (!no_polishing && polisher == "medaka") { call task_medaka.medaka_consensus as medaka { input: assembly_fasta = flye.assembly_fasta, samplename = samplename, - read1 = read1, + read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available medaka_model = medaka_model, polish_rounds = polish_rounds } } - if (polisher == "racon") { + if (!no_polishing && polisher == "racon") { call task_racon.racon as racon { input: unpolished_fasta = flye.assembly_fasta, - reads = read1, + reads = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available samplename = samplename } } call task_filtercontigs.contig_filter as contig_filter { input: - assembly_fasta = select_first([medaka.medaka_fasta, racon.polished_fasta]) + assembly_fasta = select_first([medaka.medaka_fasta, racon.polished_fasta, flye.assembly_fasta]), #use Flye assembly if no polishing is true } call task_dnaapler.dnaapler_all as dnaapler { input: @@ -63,7 +75,7 @@ workflow flye_consensus { File final_assembly = dnaapler.reoriented_fasta File bandage_plot = bandage.plot File filtered_assembly = contig_filter.filtered_fasta - String flye_phb_version = version_capture.phb_version + String flye_phb_version = version_capture.phb_version String flye_analysis_date = version_capture.date } -} \ No newline at end of file +} From 4b97b30f7b3fcc863c2d5355a33a0c72b0df84d9 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 9 Dec 2024 13:18:03 -0600 Subject: [PATCH 28/58] incporporate hybrid assemblies with polypolish --- tasks/assembly/task_polypolish.wdl | 53 ++++++++++++------ workflows/utilities/wf_flye_consensus.wdl | 66 ++++++++++++++++++----- 2 files changed, 89 insertions(+), 30 deletions(-) diff --git a/tasks/assembly/task_polypolish.wdl b/tasks/assembly/task_polypolish.wdl index e2376f142..7907cfcc2 100644 --- a/tasks/assembly/task_polypolish.wdl +++ b/tasks/assembly/task_polypolish.wdl @@ -7,6 +7,7 @@ task polypolish { File read1_sam # these files need to be aligned to the draft assembly with `-a` flag File read2_sam # see also the task_bwa.wdl#bwa_all task + Int polypolish_rounds = 1 # Default: 1 round of polishing String? pair_orientation # default: auto Float? low_percentile_threshold # default: 0.1 Float? high_percentile_threshold # default: 99.9 @@ -23,28 +24,46 @@ task polypolish { Int memory = 4 } command <<< + set -euo pipefail + polypolish --version | tee VERSION - polypolish filter \ - --in1 ~{read1_sam} \ - --in2 ~{read2_sam} \ - --out1 ~{samplename}_filtered1.sam \ - --out2 ~{samplename}_filtered2.sam \ - ~{"--orientation " + pair_orientation} \ - ~{"--low " + low_percentile_threshold} \ - ~{"--high " + high_percentile_threshold} - - polypolish polish ~{assembly_fasta} ~{samplename}_filtered1.sam ~{samplename}_filtered2.sam \ - ~{"--fraction_invalid " + fraction_invalid} \ - ~{"--fraction_valid " + fraction_valid} \ - ~{"--max_errors " + maximum_errors} \ - ~{"--min_depth " + minimum_depth} \ - ~{true="--careful" false="" careful} \ - > ~{samplename}_polished.fasta + # Initial input for polishing + polished_assembly="~{assembly_fasta}" + + for i in $(seq 1 ~{polypolish_rounds}); do + echo "Starting Polypolish round $i..." + + # Filter SAM files + polypolish filter \ + --in1 ~{read1_sam} \ + --in2 ~{read2_sam} \ + --out1 ${polished_assembly}_filtered1.sam \ + --out2 ${polished_assembly}_filtered2.sam \ + ~{"--orientation " + pair_orientation} \ + ~{"--low " + low_percentile_threshold} \ + ~{"--high " + high_percentile_threshold} + + # Perform polishing + polypolish polish ${polished_assembly} \ + ${polished_assembly}_filtered1.sam \ + ${polished_assembly}_filtered2.sam \ + ~{"--fraction_invalid " + fraction_invalid} \ + ~{"--fraction_valid " + fraction_valid} \ + ~{"--max_errors " + maximum_errors} \ + ~{"--min_depth " + minimum_depth} \ + ~{true="--careful" false="" careful} \ + > "${polished_assembly}_round${i}.fasta" + + polished_assembly="${polished_assembly}_round${i}.fasta" + done + + # Final polished output + mv "${polished_assembly}" "~{samplename}_final_polished.fasta" >>> output { String polypolish_version = read_string("VERSION") - File polished_assembly = "~{samplename}_polished.fasta" + File polished_assembly = "~{samplename}_final_polished.fasta" } runtime { docker: "~{docker}" diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_consensus.wdl index 9124cd3ac..c6dd92cbc 100644 --- a/workflows/utilities/wf_flye_consensus.wdl +++ b/workflows/utilities/wf_flye_consensus.wdl @@ -8,23 +8,30 @@ import "../../tasks/assembly/task_racon.wdl" as task_racon import "../../tasks/assembly/task_dnaapler.wdl" as task_dnaapler import "../../tasks/task_versioning.wdl" as versioning_task import "../../tasks/assembly/task_filtercontigs.wdl" as task_filtercontigs +import "../../tasks/alignment/task_bwamem.wdl" as task_bwamem +import "../../tasks/assembly/task_polypolish.wdl" as task_polypolish workflow flye_consensus { meta { description: "Run Flye assembly, Bandage plot, and consensus polishing with Medaka or Racon from long reads and reorient contigs with dnaapler" } input { - File read1 # raw input reads + File read1 # raw input reads + File? illumina_read1 # Optional Illumina short-read R1 for hybrid assembly + File? illumina_read2 # Optional Illumina short-read R2 for hybrid assembly String samplename String polisher = "medaka" Int? polish_rounds String? medaka_model Boolean trim_reads = false # Default: No trimming Boolean no_polishing = false # Default: Polishing enabled + Boolean run_polypolish = false # Default: Do not run Polypolish } + # Capture versioning information call versioning_task.version_capture { input: } + # Optional Porechop trimming before Flye if (trim_reads) { call task_porechop.porechop as porechop { @@ -33,38 +40,71 @@ workflow flye_consensus { samplename = samplename } } + # Call Flye using either trimmed reads or raw reads call task_flye.flye as flye { input: read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available samplename = samplename } + + # Generate Bandage plot call task_bandage.bandage_plot as bandage { input: assembly_graph_gfa = flye.assembly_graph, samplename = samplename } - if (!no_polishing && polisher == "medaka") { - call task_medaka.medaka_consensus as medaka { + + # Hybrid Assembly Path: Polypolish + if (defined(illumina_read1) && defined(illumina_read2) && run_polypolish) { + call task_bwamem.bwa_index as bwa_index { + input: + fasta = flye.assembly_fasta + } + + call task_bwamem.bwa_all as bwa_all { + input: + index = bwa_index.index, + read1 = illumina_read1, + read2 = illumina_read2, + samplename = samplename + } + + call task_polypolish.polypolish as polypolish { input: assembly_fasta = flye.assembly_fasta, + read1_sam = bwa_all.sam1, + read2_sam = bwa_all.sam2, samplename = samplename, - read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available - medaka_model = medaka_model, - polish_rounds = polish_rounds + polypolish_rounds = select_default(1, polish_rounds) } } - if (!no_polishing && polisher == "racon") { - call task_racon.racon as racon { - input: - unpolished_fasta = flye.assembly_fasta, - reads = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available - samplename = samplename + + # ONT-only Polishing Path: Medaka or Racon + if (!defined(illumina_read1) || !defined(illumina_read2)) { + if (!no_polishing && polisher == "medaka") { + call task_medaka.medaka_consensus as medaka { + input: + assembly_fasta = flye.assembly_fasta, + samplename = samplename, + read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available + medaka_model = medaka_model, + polish_rounds = polish_rounds + } + } + + if (!no_polishing && polisher == "racon") { + call task_racon.racon as racon { + input: + unpolished_fasta = flye.assembly_fasta, + reads = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available + samplename = samplename + } } } call task_filtercontigs.contig_filter as contig_filter { input: - assembly_fasta = select_first([medaka.medaka_fasta, racon.polished_fasta, flye.assembly_fasta]), #use Flye assembly if no polishing is true + assembly_fasta = select_first([polypolish.polished_assembly,medaka.medaka_fasta, racon.polished_fasta, flye.assembly_fasta]), #use Flye assembly if no polishing is true } call task_dnaapler.dnaapler_all as dnaapler { input: From 1993791bef1f06ae09cd17b4470556d921559caa Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 9 Dec 2024 13:21:01 -0600 Subject: [PATCH 29/58] update meta wf description --- workflows/utilities/wf_flye_consensus.wdl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_consensus.wdl index c6dd92cbc..b865749f0 100644 --- a/workflows/utilities/wf_flye_consensus.wdl +++ b/workflows/utilities/wf_flye_consensus.wdl @@ -13,7 +13,7 @@ import "../../tasks/assembly/task_polypolish.wdl" as task_polypolish workflow flye_consensus { meta { - description: "Run Flye assembly, Bandage plot, and consensus polishing with Medaka or Racon from long reads and reorient contigs with dnaapler" + description: "This workflow assembles long-read sequencing data using Flye, optionally trims reads with Porechop, and generates an assembly graph visualization with Bandage. It supports consensus polishing with Medaka or Racon for long reads, or Polypolish for hybrid assemblies with Illumina short reads. The workflow concludes by reorienting contigs with Dnaapler for a final assembly." } input { File read1 # raw input reads @@ -27,7 +27,6 @@ workflow flye_consensus { Boolean no_polishing = false # Default: Polishing enabled Boolean run_polypolish = false # Default: Do not run Polypolish } - # Capture versioning information call versioning_task.version_capture { input: } From fa10d0c3af6c05c90f3538668cd5a36977e51d99 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 9 Dec 2024 15:52:44 -0600 Subject: [PATCH 30/58] start updating docs, remove run polypolish logic and update po0lypolish task --- .../genomic_characterization/theiaprok.md | 22 ++++++++----------- tasks/assembly/task_polypolish.wdl | 2 +- workflows/utilities/wf_flye_consensus.wdl | 12 ++++------ 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/docs/workflows/genomic_characterization/theiaprok.md b/docs/workflows/genomic_characterization/theiaprok.md index 3e9329a3b..de86e058b 100644 --- a/docs/workflows/genomic_characterization/theiaprok.md +++ b/docs/workflows/genomic_characterization/theiaprok.md @@ -145,19 +145,15 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | clean_check_reads | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | ONT, PE, SE | | clean_check_reads | **organism** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | | clean_check_reads | **workflow_series** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | -| dragonflye | **assembler** | String | The assembler to use in dragonflye. Three options: raven, miniasm, flye | flye | Optional | ONT | -| dragonflye | **assembler_options** | String | Enables extra assembler options in quote | | Optional | ONT | -| dragonflye | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | -| dragonflye | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | -| dragonflye | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/dragonflye:1.0.14--hdfd78af_0 | Optional | ONT | -| dragonflye | **illumina_polishing_rounds** | Int | Number of polishing rounds to conduct with Illumina data | 1 | Optional | ONT | -| dragonflye | **illumina_read1** | File | If Illumina reads are provided, Dragonflye will perform Illumina polishing | | Optional | ONT | -| dragonflye | **illumina_read2** | File | If Illumina reads are provided, Dragonflye will perform Illumina polishing | | Optional | ONT | -| dragonflye | **medaka_model** | String | The model of medaka to use for assembly | r941_min_hac_g507 | Optional | ONT | -| dragonflye | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | ONT | -| dragonflye | **polishing_rounds** | Int | The number of polishing rounds to conduct (without Illumina) | 1 | Optional | ONT | -| dragonflye | **use_pilon_illumina_polisher** | Boolean | Set to true to use Pilon to polish Illumina reads | FALSE | Optional | ONT | -| dragonflye | **use_racon** | Boolean | Set to true to use Racon to polish instead of Medaka | FALSE | Optional | ONT | +| flye_consensus | **illumina_polishing_rounds** | Int | Number of polishing rounds to conduct with Illumina data | 1 | Optional | ONT | +| flue_consensus | **illumina_read1** | File | If Illumina reads are provided, Dragonflye will perform Illumina polishing | | Optional | ONT | +| flye_consensus | **illumina_read2** | File | If Illumina reads are provided, flye_consensus worflow will perform Illumina polishing | | Optional | ONT | +| flye_consensus | **medaka_model** | String | The model of medaka to use for assembly | r1041_e82_400bps_sup_v5.0.0 | Optional | ONT | +| flye_consensus | **polisher** | String | The polishing tool to use for assembly | medaka | Optional | ONT | +| flye_consensus | **polishing_rounds** | Int | The number of polishing rounds to conduct for medaka or racon (without Illumina) | 1 | Optional | ONT | +| flye_consensus | **read1** | File | ONT read file in FASTQ file format (compression optional) | | Optional | ONT | +| flye_consensus | **trim_reads** | Boolean | If true, trims reads before assembly | FALSE | Optional | ONT | +| flye_consensus | **no_polishing** | Boolean | If true, skips polishing | FALSE | Optional | ONT | | export_taxon_tables | **asembly_fasta** | File | Internal component, do not modify | | Do not modify, Optional | FASTA | | export_taxon_tables | **bbduk_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT | | export_taxon_tables | **cg_pipeline_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT | diff --git a/tasks/assembly/task_polypolish.wdl b/tasks/assembly/task_polypolish.wdl index 7907cfcc2..8a1d55dc5 100644 --- a/tasks/assembly/task_polypolish.wdl +++ b/tasks/assembly/task_polypolish.wdl @@ -7,7 +7,7 @@ task polypolish { File read1_sam # these files need to be aligned to the draft assembly with `-a` flag File read2_sam # see also the task_bwa.wdl#bwa_all task - Int polypolish_rounds = 1 # Default: 1 round of polishing + Int illumina_polishing_rounds = 1 # Default: 1 round of polishing String? pair_orientation # default: auto Float? low_percentile_threshold # default: 0.1 Float? high_percentile_threshold # default: 99.9 diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_consensus.wdl index b865749f0..c864b90eb 100644 --- a/workflows/utilities/wf_flye_consensus.wdl +++ b/workflows/utilities/wf_flye_consensus.wdl @@ -24,13 +24,10 @@ workflow flye_consensus { Int? polish_rounds String? medaka_model Boolean trim_reads = false # Default: No trimming - Boolean no_polishing = false # Default: Polishing enabled - Boolean run_polypolish = false # Default: Do not run Polypolish - } + Boolean no_polishing = false # Default: Polishing enabled } call versioning_task.version_capture { input: } - # Optional Porechop trimming before Flye if (trim_reads) { call task_porechop.porechop as porechop { @@ -39,23 +36,21 @@ workflow flye_consensus { samplename = samplename } } - # Call Flye using either trimmed reads or raw reads call task_flye.flye as flye { input: read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available samplename = samplename } - # Generate Bandage plot call task_bandage.bandage_plot as bandage { input: assembly_graph_gfa = flye.assembly_graph, samplename = samplename } - # Hybrid Assembly Path: Polypolish - if (defined(illumina_read1) && defined(illumina_read2) && run_polypolish) { + # Hybrid Assembly Path: Polypolish + if (defined(illumina_read1) && defined(illumina_read2)) { call task_bwamem.bwa_index as bwa_index { input: fasta = flye.assembly_fasta @@ -101,6 +96,7 @@ workflow flye_consensus { } } } + call task_filtercontigs.contig_filter as contig_filter { input: assembly_fasta = select_first([polypolish.polished_assembly,medaka.medaka_fasta, racon.polished_fasta, flye.assembly_fasta]), #use Flye assembly if no polishing is true From 63c7e2e59efcf48d3654af5121c5f7eb44397e52 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 9 Dec 2024 16:23:39 -0600 Subject: [PATCH 31/58] update racon with polishing round logic with updated minimap2 in docker and naming logic of fasta in wf --- tasks/assembly/task_medaka.wdl | 5 +-- tasks/assembly/task_racon.wdl | 54 +++++++++++++++-------- workflows/utilities/wf_flye_consensus.wdl | 5 +-- 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/tasks/assembly/task_medaka.wdl b/tasks/assembly/task_medaka.wdl index 9b30182c7..985a13176 100644 --- a/tasks/assembly/task_medaka.wdl +++ b/tasks/assembly/task_medaka.wdl @@ -2,7 +2,7 @@ version 1.0 task medaka_consensus { input { - File assembly_fasta + File unpolished_fasta String samplename File read1 String medaka_model = "r1041_e82_400bps_sup_v5.0.0" @@ -17,8 +17,7 @@ task medaka_consensus { medaka --version | tee MEDAKA_VERSION # Initialize the input for polishing with the provided assembly FASTA - cp ~{assembly_fasta} polished_input.fasta - cp ~{assembly_fasta} polished_input.fasta + cp ~{unpolished_fasta} polished_input.fasta # Perform Medaka polishing for the specified number of rounds for i in $(seq 1 ~{polish_rounds}); do diff --git a/tasks/assembly/task_racon.wdl b/tasks/assembly/task_racon.wdl index 1c37a23f8..3d32ac51d 100644 --- a/tasks/assembly/task_racon.wdl +++ b/tasks/assembly/task_racon.wdl @@ -3,33 +3,48 @@ version 1.0 task racon { input { File unpolished_fasta - File reads - Int cpu = 4 - Int memory = 16 - Int disk_size = 100 + File read1 + Int polishing_rounds = 1 # Default: 1 polishing round + Int cpu = 4 + Int memory = 16 + Int disk_size = 100 String samplename - String docker = "staphb/racon" + String docker = "us-docker.pkg.dev/general-theiagen/staphb/racon:1.5.0-minimap2" } command <<< set -euo pipefail + minimap2 --version | tee MINIMAP2_VERSION racon --version | tee -a RACON_VERSION - # Generate alignments with Minimap2 - minimap2 -t ~{cpu} ~{unpolished_fasta} ~{reads} > ~{samplename}.paf - - # Run Racon for polishing - racon \ - -t ~{cpu} \ - ~{reads} \ - ~{samplename}.paf \ - ~{unpolished_fasta} \ - > ~{samplename}.polished.fasta + + # Start with the unpolished FASTA as the input + intermediate_fasta="~{unpolished_fasta}" + + for round in $(seq 1 ~{polishing_rounds}); do + echo "Starting Racon polishing round $round..." + + # Generate alignments with Minimap2 + minimap2 -t ~{cpu} "${intermediate_fasta}" "~{read1}" > "~{samplename}_round${round}.paf" + + # Run Racon for polishing + racon \ + -t ~{cpu} \ + "~{read1}" \ + "~{samplename}_round${round}.paf" \ + "${intermediate_fasta}" \ + > "~{samplename}_round${round}.polished.fasta" + + # Update current_fasta for the next round + intermediate_fasta="~{samplename}_round${round}.polished.fasta" + done + + # Move the final polished assembly to the output + mv "${intermediate_fasta}" "~{samplename}_final_polished.fasta" >>> output { - File polished_fasta = "~{samplename}.polished.fasta" - File alignments = "~{samplename}.paf" - String racon_version = read_string("RACON_VERSION") - String minimap2_version = read_string("MINIMAP2_VERSION") + File polished_fasta = "~{samplename}_final_polished.fasta" + String racon_version = read_string("RACON_VERSION") + String minimap2_version = read_string("MINIMAP2_VERSION") } runtime { docker: "~{docker}" @@ -40,3 +55,4 @@ task racon { preemptible: 0 } } + diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_consensus.wdl index c864b90eb..61f5d031b 100644 --- a/workflows/utilities/wf_flye_consensus.wdl +++ b/workflows/utilities/wf_flye_consensus.wdl @@ -49,7 +49,6 @@ workflow flye_consensus { samplename = samplename } # Hybrid Assembly Path: Polypolish - # Hybrid Assembly Path: Polypolish if (defined(illumina_read1) && defined(illumina_read2)) { call task_bwamem.bwa_index as bwa_index { input: @@ -79,7 +78,7 @@ workflow flye_consensus { if (!no_polishing && polisher == "medaka") { call task_medaka.medaka_consensus as medaka { input: - assembly_fasta = flye.assembly_fasta, + unpolished_fasta = flye.assembly_fasta, samplename = samplename, read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available medaka_model = medaka_model, @@ -96,7 +95,7 @@ workflow flye_consensus { } } } - + # Contig Filtering and Final Assembly orientation call task_filtercontigs.contig_filter as contig_filter { input: assembly_fasta = select_first([polypolish.polished_assembly,medaka.medaka_fasta, racon.polished_fasta, flye.assembly_fasta]), #use Flye assembly if no polishing is true From c92358b6250a49f9cf89e2f1fb7033e480cc0add Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 9 Dec 2024 20:57:53 -0600 Subject: [PATCH 32/58] additional comments for filtering contigs logic --- tasks/assembly/task_filtercontigs.wdl | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tasks/assembly/task_filtercontigs.wdl b/tasks/assembly/task_filtercontigs.wdl index 5268e08a9..d00480fe0 100644 --- a/tasks/assembly/task_filtercontigs.wdl +++ b/tasks/assembly/task_filtercontigs.wdl @@ -18,45 +18,50 @@ task contig_filter { total_contigs=$(grep -c "^>" ~{assembly_fasta}) echo "Total contigs: $total_contigs" >&2 + # Calculate total base pairs in all contigs total_bases=$(awk '/^>/ {if (seqlen){total+=seqlen}; seqlen=0; next} {seqlen += length($0)} END {total+=seqlen; print total}' ~{assembly_fasta}) echo "Total bases: $total_bases" >&2 - # Filter by length + # Filter contigs by length + # Retain only contigs that are >= minimum length and write to a new FASTA file seqkit seq -m ~{min_len} ~{assembly_fasta} > length_filtered.fasta echo "Length filtering complete. File size:" >&2 ls -lh length_filtered.fasta >&2 - # If homopolymer filtering is enabled, process further + # Optionally filter out contigs with homopolymers if [ "~{filter_homopolymers}" = "true" ]; then awk ' BEGIN {RS=">"; ORS=""} NR > 1 { + # Split each contig entry into header and sequence split($0, lines, "\n"); header = lines[1]; seq = ""; for (i=2; i<=length(lines); i++) { seq = seq lines[i]; } + # Exclude sequences composed entirely of a single repeated base if (seq !~ /^(.)\1+$/) { print ">" header "\n" seq "\n" } }' length_filtered.fasta > filtered_contigs.fasta else + # If homopolymer filtering is not enabled, use length-filtered file as final output mv length_filtered.fasta filtered_contigs.fasta fi - # Validate the final file + # Validate the final file is not empty if [ ! -s filtered_contigs.fasta ]; then echo "Error: No contigs passed filtering criteria!" >&2 exit 1 fi - # Count final retained contigs + # Calculate final metrics for the filtered contigs retained_contigs=$(grep -c "^>" filtered_contigs.fasta) retained_bases=$(awk '/^>/ {if (seqlen){print seqlen}; seqlen=0; next} {seqlen += length($0)} END {if (seqlen) print seqlen}' filtered_contigs.fasta) contigs_removed_homopolymers=$((total_contigs - retained_contigs)) - # Write metrics to file + # Write filtering metrics to a summary file metrics_file="filtering_metrics.txt" echo "Total contigs: $total_contigs" > $metrics_file echo "Total bases: $total_bases" >> $metrics_file From 420ed91bb86bbf41a476115bb6acff77c6e18a49 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Tue, 10 Dec 2024 08:36:43 -0600 Subject: [PATCH 33/58] add assembly stats output --- tasks/assembly/task_racon.wdl | 12 ++++---- .../basic_statistics/flye_assembly_stats.wdl | 28 +++++++++++++++++++ workflows/utilities/wf_flye_consensus.wdl | 8 ++++++ 3 files changed, 42 insertions(+), 6 deletions(-) create mode 100644 tasks/quality_control/basic_statistics/flye_assembly_stats.wdl diff --git a/tasks/assembly/task_racon.wdl b/tasks/assembly/task_racon.wdl index 3d32ac51d..416cd036c 100644 --- a/tasks/assembly/task_racon.wdl +++ b/tasks/assembly/task_racon.wdl @@ -20,22 +20,22 @@ task racon { # Start with the unpolished FASTA as the input intermediate_fasta="~{unpolished_fasta}" - for round in $(seq 1 ~{polishing_rounds}); do - echo "Starting Racon polishing round $round..." + for i in $(seq 1 ~{polishing_rounds}); do + echo "Starting Medaka polishing round $i" # Generate alignments with Minimap2 - minimap2 -t ~{cpu} "${intermediate_fasta}" "~{read1}" > "~{samplename}_round${round}.paf" + minimap2 -t ~{cpu} "${intermediate_fasta}" "~{read1}" > "~{samplename}_round${i}.paf" # Run Racon for polishing racon \ -t ~{cpu} \ "~{read1}" \ - "~{samplename}_round${round}.paf" \ + "~{samplename}_round${i}.paf" \ "${intermediate_fasta}" \ - > "~{samplename}_round${round}.polished.fasta" + > "~{samplename}_round${i}.polished.fasta" # Update current_fasta for the next round - intermediate_fasta="~{samplename}_round${round}.polished.fasta" + intermediate_fasta="~{samplename}_round${i}.polished.fasta" done # Move the final polished assembly to the output diff --git a/tasks/quality_control/basic_statistics/flye_assembly_stats.wdl b/tasks/quality_control/basic_statistics/flye_assembly_stats.wdl new file mode 100644 index 000000000..d640d854b --- /dev/null +++ b/tasks/quality_control/basic_statistics/flye_assembly_stats.wdl @@ -0,0 +1,28 @@ +version 1.0 + +task assembly_stats { + input { + File assembly_fasta + String samplename + Int cpu = 1 + Int memory = 4 + Int disk_size = 10 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/seqkit:2.8.2" + } + command <<< + set -euo pipefail + + seqkit stats --all --threads ~{cpu} ~{assembly_fasta} > ~{samplename}_assembly_stats.txt + >>> + output { + File assembly_stats_file = "~{samplename}_assembly_stats.txt" + } + runtime { + docker: "~{docker}" + memory: memory + " GB" + cpu: cpu + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } +} diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_consensus.wdl index 61f5d031b..e199f6d9e 100644 --- a/workflows/utilities/wf_flye_consensus.wdl +++ b/workflows/utilities/wf_flye_consensus.wdl @@ -10,6 +10,7 @@ import "../../tasks/task_versioning.wdl" as versioning_task import "../../tasks/assembly/task_filtercontigs.wdl" as task_filtercontigs import "../../tasks/alignment/task_bwamem.wdl" as task_bwamem import "../../tasks/assembly/task_polypolish.wdl" as task_polypolish +import "../../tasks/quality_control/basic_statistics/flye_assembly_stats.wdl" as task_assembly_stats workflow flye_consensus { meta { @@ -105,10 +106,17 @@ workflow flye_consensus { input_fasta = contig_filter.filtered_fasta, samplename = samplename } + # Assembly stats + call task_assembly_stats as assembly_stats { + input: + assembly_fasta = dnaapler.reoriented_fasta, + samplename = samplename + } output { File final_assembly = dnaapler.reoriented_fasta File bandage_plot = bandage.plot File filtered_assembly = contig_filter.filtered_fasta + File flye_assembly_stats = assembly_stats.assembly_stats_file String flye_phb_version = version_capture.phb_version String flye_analysis_date = version_capture.date } From 74d62a9c084c92b22c2d610a800aa8d7c5900e77 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Tue, 10 Dec 2024 08:39:46 -0600 Subject: [PATCH 34/58] update filter task metrics output --- tasks/assembly/task_filtercontigs.wdl | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tasks/assembly/task_filtercontigs.wdl b/tasks/assembly/task_filtercontigs.wdl index d00480fe0..147bae33c 100644 --- a/tasks/assembly/task_filtercontigs.wdl +++ b/tasks/assembly/task_filtercontigs.wdl @@ -63,12 +63,14 @@ task contig_filter { # Write filtering metrics to a summary file metrics_file="filtering_metrics.txt" - echo "Total contigs: $total_contigs" > $metrics_file - echo "Total bases: $total_bases" >> $metrics_file - echo "Contigs retained: $retained_contigs" >> $metrics_file - echo "Bases retained: $retained_bases" >> $metrics_file - echo "Contigs removed (short length): $((total_contigs - retained_contigs))" >> $metrics_file - echo "Contigs removed (homopolymers): $contigs_removed_homopolymers" >> $metrics_file + { + echo "Total contigs: $total_contigs" + echo "Total bases: $total_bases" + echo "Contigs retained: $retained_contigs" + echo "Bases retained: $retained_bases" + echo "Contigs removed (short length): $((total_contigs - retained_contigs))" + echo "Contigs removed (homopolymers): $contigs_removed_homopolymers" + } > $metrics_file cat $metrics_file >&2 >>> From ac3667fbb743a2445db5f0d5b1b2332a60596869 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Tue, 10 Dec 2024 11:06:19 -0600 Subject: [PATCH 35/58] per dev meeting update wf name, remove metrics output, re arrange folder structure tasks and rename to skip polish and skip trim, medaka single polish --- tasks/assembly/task_medaka.wdl | 55 ------------------- tasks/polishing/task_medaka.wdl | 43 +++++++++++++++ .../task_polypolish.wdl | 0 tasks/{assembly => polishing}/task_racon.wdl | 0 .../basic_statistics/flye_assembly_stats.wdl | 2 +- .../read_filtering}/task_filtercontigs.wdl | 0 .../read_filtering}/task_porechop.wdl | 0 ..._flye_consensus.wdl => wf_flye_denovo.wdl} | 29 ++++------ 8 files changed, 55 insertions(+), 74 deletions(-) delete mode 100644 tasks/assembly/task_medaka.wdl create mode 100644 tasks/polishing/task_medaka.wdl rename tasks/{assembly => polishing}/task_polypolish.wdl (100%) rename tasks/{assembly => polishing}/task_racon.wdl (100%) rename tasks/{assembly => quality_control/read_filtering}/task_filtercontigs.wdl (100%) rename tasks/{assembly => quality_control/read_filtering}/task_porechop.wdl (100%) rename workflows/utilities/{wf_flye_consensus.wdl => wf_flye_denovo.wdl} (81%) diff --git a/tasks/assembly/task_medaka.wdl b/tasks/assembly/task_medaka.wdl deleted file mode 100644 index 985a13176..000000000 --- a/tasks/assembly/task_medaka.wdl +++ /dev/null @@ -1,55 +0,0 @@ -version 1.0 - -task medaka_consensus { - input { - File unpolished_fasta - String samplename - File read1 - String medaka_model = "r1041_e82_400bps_sup_v5.0.0" - Int polish_rounds = 1 - Int cpu = 4 - Int memory = 16 - Int disk_size = 100 - String docker = "us-docker.pkg.dev/general-theiagen/staphb/medaka:2.0.1" - } - command <<< - set -euo pipefail - medaka --version | tee MEDAKA_VERSION - - # Initialize the input for polishing with the provided assembly FASTA - cp ~{unpolished_fasta} polished_input.fasta - - # Perform Medaka polishing for the specified number of rounds - for i in $(seq 1 ~{polish_rounds}); do - echo "Starting Medaka polishing round $i" - polish_dir="polish_round_$i" - mkdir -p "$polish_dir" - - medaka_consensus \ - -i ~{read1} \ - -d polished_input.fasta \ - -o "$polish_dir" \ - -m ~{medaka_model} \ - -t ~{cpu} - - # Updates the input for the next round - cp "$polish_dir/consensus.fasta" polished_input.fasta - done - - # Rename final polished outputs with sample name - mv polished_input.fasta ~{samplename}.polished.fasta - >>> - output { - File medaka_fasta = "~{samplename}.polished.fasta" - String medaka_version = read_string("MEDAKA_VERSION") - } - runtime { - docker: "~{docker}" - cpu: cpu - memory: "~{memory} GB" - disks: "local-disk " + disk_size + " HDD" - disk: disk_size + " GB" - maxRetries: 1 - preemptible: 0 - } -} diff --git a/tasks/polishing/task_medaka.wdl b/tasks/polishing/task_medaka.wdl new file mode 100644 index 000000000..f130e4660 --- /dev/null +++ b/tasks/polishing/task_medaka.wdl @@ -0,0 +1,43 @@ +version 1.0 + +task medaka_consensus { + input { + File unpolished_fasta + String samplename + File read1 + String medaka_model = "r1041_e82_400bps_sup_v5.0.0" + Int cpu = 4 + Int memory = 16 + Int disk_size = 100 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/medaka:2.0.1" + } + command <<< + set -euo pipefail + + medaka --version | tee MEDAKA_VERSION + + # Perform a single round of Medaka polishing + medaka_consensus \ + -i ~{read1} \ + -d ~{unpolished_fasta} \ + -o . \ + -m ~{medaka_model} \ + -t ~{cpu} + + # Rename final polished output with sample name + mv consensus.fasta ~{samplename}.polished.fasta + >>> + output { + File medaka_fasta = "~{samplename}.polished.fasta" + String medaka_version = read_string("MEDAKA_VERSION") + } + runtime { + docker: "~{docker}" + cpu: cpu + memory: "~{memory} GB" + disks: "local-disk " + disk_size + " HDD" + disk: disk_size + " GB" + maxRetries: 1 + preemptible: 0 + } +} diff --git a/tasks/assembly/task_polypolish.wdl b/tasks/polishing/task_polypolish.wdl similarity index 100% rename from tasks/assembly/task_polypolish.wdl rename to tasks/polishing/task_polypolish.wdl diff --git a/tasks/assembly/task_racon.wdl b/tasks/polishing/task_racon.wdl similarity index 100% rename from tasks/assembly/task_racon.wdl rename to tasks/polishing/task_racon.wdl diff --git a/tasks/quality_control/basic_statistics/flye_assembly_stats.wdl b/tasks/quality_control/basic_statistics/flye_assembly_stats.wdl index d640d854b..39d93deeb 100644 --- a/tasks/quality_control/basic_statistics/flye_assembly_stats.wdl +++ b/tasks/quality_control/basic_statistics/flye_assembly_stats.wdl @@ -6,7 +6,7 @@ task assembly_stats { String samplename Int cpu = 1 Int memory = 4 - Int disk_size = 10 + Int disk_size = 50 String docker = "us-docker.pkg.dev/general-theiagen/staphb/seqkit:2.8.2" } command <<< diff --git a/tasks/assembly/task_filtercontigs.wdl b/tasks/quality_control/read_filtering/task_filtercontigs.wdl similarity index 100% rename from tasks/assembly/task_filtercontigs.wdl rename to tasks/quality_control/read_filtering/task_filtercontigs.wdl diff --git a/tasks/assembly/task_porechop.wdl b/tasks/quality_control/read_filtering/task_porechop.wdl similarity index 100% rename from tasks/assembly/task_porechop.wdl rename to tasks/quality_control/read_filtering/task_porechop.wdl diff --git a/workflows/utilities/wf_flye_consensus.wdl b/workflows/utilities/wf_flye_denovo.wdl similarity index 81% rename from workflows/utilities/wf_flye_consensus.wdl rename to workflows/utilities/wf_flye_denovo.wdl index e199f6d9e..e5b5c45af 100644 --- a/workflows/utilities/wf_flye_consensus.wdl +++ b/workflows/utilities/wf_flye_denovo.wdl @@ -3,16 +3,15 @@ version 1.0 import "../../tasks/assembly/task_porechop.wdl" as task_porechop import "../../tasks/assembly/task_flye.wdl" as task_flye import "../../tasks/assembly/task_bandageplot.wdl" as task_bandage -import "../../tasks/assembly/task_medaka.wdl" as task_medaka -import "../../tasks/assembly/task_racon.wdl" as task_racon +import "../../tasks/polyshing/task_medaka.wdl" as task_medaka +import "../../tasks/polyshing/task_racon.wdl" as task_racon import "../../tasks/assembly/task_dnaapler.wdl" as task_dnaapler import "../../tasks/task_versioning.wdl" as versioning_task -import "../../tasks/assembly/task_filtercontigs.wdl" as task_filtercontigs +import "../../tasks/read_filtering/task_filtercontigs.wdl" as task_filtercontigs import "../../tasks/alignment/task_bwamem.wdl" as task_bwamem -import "../../tasks/assembly/task_polypolish.wdl" as task_polypolish -import "../../tasks/quality_control/basic_statistics/flye_assembly_stats.wdl" as task_assembly_stats +import "../../tasks/polyshing/task_polypolish.wdl" as task_polypolish -workflow flye_consensus { +workflow flye_denovo { meta { description: "This workflow assembles long-read sequencing data using Flye, optionally trims reads with Porechop, and generates an assembly graph visualization with Bandage. It supports consensus polishing with Medaka or Racon for long reads, or Polypolish for hybrid assemblies with Illumina short reads. The workflow concludes by reorienting contigs with Dnaapler for a final assembly." } @@ -24,13 +23,14 @@ workflow flye_consensus { String polisher = "medaka" Int? polish_rounds String? medaka_model - Boolean trim_reads = false # Default: No trimming - Boolean no_polishing = false # Default: Polishing enabled } + Boolean skip_trim_reads = false # Default: No trimming + Boolean skip_polishing = false # Default: Polishing enabled + } call versioning_task.version_capture { input: } # Optional Porechop trimming before Flye - if (trim_reads) { + if (skip_trim_reads) { call task_porechop.porechop as porechop { input: read1 = read1, @@ -76,7 +76,7 @@ workflow flye_consensus { # ONT-only Polishing Path: Medaka or Racon if (!defined(illumina_read1) || !defined(illumina_read2)) { - if (!no_polishing && polisher == "medaka") { + if (!skip_polishing && polisher == "medaka") { call task_medaka.medaka_consensus as medaka { input: unpolished_fasta = flye.assembly_fasta, @@ -87,7 +87,7 @@ workflow flye_consensus { } } - if (!no_polishing && polisher == "racon") { + if (!skip_polishing && polisher == "racon") { call task_racon.racon as racon { input: unpolished_fasta = flye.assembly_fasta, @@ -106,17 +106,10 @@ workflow flye_consensus { input_fasta = contig_filter.filtered_fasta, samplename = samplename } - # Assembly stats - call task_assembly_stats as assembly_stats { - input: - assembly_fasta = dnaapler.reoriented_fasta, - samplename = samplename - } output { File final_assembly = dnaapler.reoriented_fasta File bandage_plot = bandage.plot File filtered_assembly = contig_filter.filtered_fasta - File flye_assembly_stats = assembly_stats.assembly_stats_file String flye_phb_version = version_capture.phb_version String flye_analysis_date = version_capture.date } From c6064cb9c1e0787abd6f28adc9a6d10dc82e4a40 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Tue, 10 Dec 2024 11:41:47 -0600 Subject: [PATCH 36/58] update wf to pass miniwdl checks --- tasks/polishing/task_polypolish.wdl | 2 +- workflows/utilities/wf_flye_denovo.wdl | 41 ++++++++++---------------- 2 files changed, 17 insertions(+), 26 deletions(-) diff --git a/tasks/polishing/task_polypolish.wdl b/tasks/polishing/task_polypolish.wdl index 8a1d55dc5..8fc6f63af 100644 --- a/tasks/polishing/task_polypolish.wdl +++ b/tasks/polishing/task_polypolish.wdl @@ -31,7 +31,7 @@ task polypolish { # Initial input for polishing polished_assembly="~{assembly_fasta}" - for i in $(seq 1 ~{polypolish_rounds}); do + for i in $(seq 1 ~{illumina_polishing_rounds}); do echo "Starting Polypolish round $i..." # Filter SAM files diff --git a/workflows/utilities/wf_flye_denovo.wdl b/workflows/utilities/wf_flye_denovo.wdl index e5b5c45af..ab056ed8a 100644 --- a/workflows/utilities/wf_flye_denovo.wdl +++ b/workflows/utilities/wf_flye_denovo.wdl @@ -1,15 +1,15 @@ version 1.0 -import "../../tasks/assembly/task_porechop.wdl" as task_porechop +import "../../tasks/quality_control/read_filtering/task_porechop.wdl" as task_porechop import "../../tasks/assembly/task_flye.wdl" as task_flye import "../../tasks/assembly/task_bandageplot.wdl" as task_bandage -import "../../tasks/polyshing/task_medaka.wdl" as task_medaka -import "../../tasks/polyshing/task_racon.wdl" as task_racon +import "../../tasks/polishing/task_medaka.wdl" as task_medaka +import "../../tasks/polishing/task_racon.wdl" as task_racon import "../../tasks/assembly/task_dnaapler.wdl" as task_dnaapler import "../../tasks/task_versioning.wdl" as versioning_task -import "../../tasks/read_filtering/task_filtercontigs.wdl" as task_filtercontigs -import "../../tasks/alignment/task_bwamem.wdl" as task_bwamem -import "../../tasks/polyshing/task_polypolish.wdl" as task_polypolish +import "../../tasks/quality_control/read_filtering/task_filtercontigs.wdl" as task_filtercontigs +import "../../tasks/alignment/task_bwa.wdl" as task_bwamem +import "../../tasks/polishing/task_polypolish.wdl" as task_polypolish workflow flye_denovo { meta { @@ -21,7 +21,7 @@ workflow flye_denovo { File? illumina_read2 # Optional Illumina short-read R2 for hybrid assembly String samplename String polisher = "medaka" - Int? polish_rounds + Int polish_rounds = 1 # Default: 1 polishing round String? medaka_model Boolean skip_trim_reads = false # Default: No trimming Boolean skip_polishing = false # Default: Polishing enabled @@ -30,7 +30,7 @@ workflow flye_denovo { input: } # Optional Porechop trimming before Flye - if (skip_trim_reads) { + if (!skip_trim_reads) { call task_porechop.porechop as porechop { input: read1 = read1, @@ -49,31 +49,24 @@ workflow flye_denovo { assembly_graph_gfa = flye.assembly_graph, samplename = samplename } - # Hybrid Assembly Path: Polypolish + # Hybrid Assembly Path: Polypolish if (defined(illumina_read1) && defined(illumina_read2)) { - call task_bwamem.bwa_index as bwa_index { - input: - fasta = flye.assembly_fasta - } - call task_bwamem.bwa_all as bwa_all { input: - index = bwa_index.index, - read1 = illumina_read1, - read2 = illumina_read2, + draft_assembly_fasta = flye.assembly_fasta, + read1 = select_first([illumina_read1]), + read2 = select_first([illumina_read2]), samplename = samplename } - call task_polypolish.polypolish as polypolish { input: assembly_fasta = flye.assembly_fasta, - read1_sam = bwa_all.sam1, - read2_sam = bwa_all.sam2, + read1_sam = bwa_all.read1_sam, + read2_sam = bwa_all.read2_sam, samplename = samplename, - polypolish_rounds = select_default(1, polish_rounds) + illumina_polishing_rounds = polish_rounds } } - # ONT-only Polishing Path: Medaka or Racon if (!defined(illumina_read1) || !defined(illumina_read2)) { if (!skip_polishing && polisher == "medaka") { @@ -83,15 +76,13 @@ workflow flye_denovo { samplename = samplename, read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available medaka_model = medaka_model, - polish_rounds = polish_rounds } } - if (!skip_polishing && polisher == "racon") { call task_racon.racon as racon { input: unpolished_fasta = flye.assembly_fasta, - reads = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available + read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available samplename = samplename } } From 6878957c4487bdfb52a1f73f96ab817792e08811 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Tue, 10 Dec 2024 15:13:24 -0600 Subject: [PATCH 37/58] update medaka top use auto model selection or user provide overide --- tasks/polishing/task_medaka.wdl | 36 ++++++++++++++++--- .../basic_statistics/flye_assembly_stats.wdl | 28 --------------- workflows/utilities/wf_flye_denovo.wdl | 25 +++++++------ 3 files changed, 45 insertions(+), 44 deletions(-) delete mode 100644 tasks/quality_control/basic_statistics/flye_assembly_stats.wdl diff --git a/tasks/polishing/task_medaka.wdl b/tasks/polishing/task_medaka.wdl index f130e4660..655bdb4c3 100644 --- a/tasks/polishing/task_medaka.wdl +++ b/tasks/polishing/task_medaka.wdl @@ -5,26 +5,52 @@ task medaka_consensus { File unpolished_fasta String samplename File read1 - String medaka_model = "r1041_e82_400bps_sup_v5.0.0" + Boolean auto_model = true # Automatically resolve model by inspecting input + String? medaka_model_override # Optional user-specified model Int cpu = 4 Int memory = 16 Int disk_size = 100 String docker = "us-docker.pkg.dev/general-theiagen/staphb/medaka:2.0.1" } - command <<< + command <<< set -euo pipefail medaka --version | tee MEDAKA_VERSION - # Perform a single round of Medaka polishing + # Initialize selected_model variable + selected_model="" + + # Check if auto_model is enabled + if [[ "~{auto_model}" == "true" ]]; then + echo "Attempting automatic model selection..." + medaka tools resolve_model --auto_model consensus ~{read1} > auto_model.txt || true + auto_model=$(cat auto_model.txt || echo "") + + # Alert if automatic model selection fails + if [[ -z "$auto_model" ]]; then + echo "Warning: Automatic model selection failed. Defaulting to fallback model." + fi + fi + + # Use auto_model if detected, otherwise fallback to user-provided or default + if [[ -n "$auto_model" ]]; then + selected_model="$auto_model" + elif [[ -n "~{medaka_model_override}" ]]; then + selected_model="~{medaka_model_override}" + else + selected_model="r1041_e82_400bps_sup_v5.0.0" + fi + + echo "Using Medaka model for polishing: $selected_model" + + # Perform Medaka polishing medaka_consensus \ -i ~{read1} \ -d ~{unpolished_fasta} \ -o . \ - -m ~{medaka_model} \ + -m $selected_model \ -t ~{cpu} - # Rename final polished output with sample name mv consensus.fasta ~{samplename}.polished.fasta >>> output { diff --git a/tasks/quality_control/basic_statistics/flye_assembly_stats.wdl b/tasks/quality_control/basic_statistics/flye_assembly_stats.wdl deleted file mode 100644 index 39d93deeb..000000000 --- a/tasks/quality_control/basic_statistics/flye_assembly_stats.wdl +++ /dev/null @@ -1,28 +0,0 @@ -version 1.0 - -task assembly_stats { - input { - File assembly_fasta - String samplename - Int cpu = 1 - Int memory = 4 - Int disk_size = 50 - String docker = "us-docker.pkg.dev/general-theiagen/staphb/seqkit:2.8.2" - } - command <<< - set -euo pipefail - - seqkit stats --all --threads ~{cpu} ~{assembly_fasta} > ~{samplename}_assembly_stats.txt - >>> - output { - File assembly_stats_file = "~{samplename}_assembly_stats.txt" - } - runtime { - docker: "~{docker}" - memory: memory + " GB" - cpu: cpu - disk: disk_size + " GB" - maxRetries: 3 - preemptible: 0 - } -} diff --git a/workflows/utilities/wf_flye_denovo.wdl b/workflows/utilities/wf_flye_denovo.wdl index ab056ed8a..28c0f1a83 100644 --- a/workflows/utilities/wf_flye_denovo.wdl +++ b/workflows/utilities/wf_flye_denovo.wdl @@ -21,10 +21,11 @@ workflow flye_denovo { File? illumina_read2 # Optional Illumina short-read R2 for hybrid assembly String samplename String polisher = "medaka" - Int polish_rounds = 1 # Default: 1 polishing round - String? medaka_model - Boolean skip_trim_reads = false # Default: No trimming - Boolean skip_polishing = false # Default: Polishing enabled + Int? polish_rounds # Optional number of polishing rounds + String? medaka_model_override # Optional user-specified Medaka model + Boolean auto_medaka_model = true # Enable automatic Medaka model selection + Boolean skip_trim_reads = true # Default: No trimming + Boolean skip_polishing = false # Default: Polishing enabled } call versioning_task.version_capture { input: @@ -49,13 +50,13 @@ workflow flye_denovo { assembly_graph_gfa = flye.assembly_graph, samplename = samplename } - # Hybrid Assembly Path: Polypolish + # Hybrid Assembly Path: Polypolish if (defined(illumina_read1) && defined(illumina_read2)) { call task_bwamem.bwa_all as bwa_all { input: draft_assembly_fasta = flye.assembly_fasta, - read1 = select_first([illumina_read1]), - read2 = select_first([illumina_read2]), + read1 = select_first([illumina_read1]), + read2 = select_first([illumina_read2]), samplename = samplename } call task_polypolish.polypolish as polypolish { @@ -75,7 +76,8 @@ workflow flye_denovo { unpolished_fasta = flye.assembly_fasta, samplename = samplename, read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available - medaka_model = medaka_model, + medaka_model_override = medaka_model_override, + auto_model = auto_medaka_model } } if (!skip_polishing && polisher == "racon") { @@ -83,14 +85,15 @@ workflow flye_denovo { input: unpolished_fasta = flye.assembly_fasta, read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available - samplename = samplename + samplename = samplename, + polishing_rounds = polish_rounds } } } # Contig Filtering and Final Assembly orientation call task_filtercontigs.contig_filter as contig_filter { input: - assembly_fasta = select_first([polypolish.polished_assembly,medaka.medaka_fasta, racon.polished_fasta, flye.assembly_fasta]), #use Flye assembly if no polishing is true + assembly_fasta = select_first([polypolish.polished_assembly, medaka.medaka_fasta, racon.polished_fasta, flye.assembly_fasta]), # Use Flye assembly if no polishing } call task_dnaapler.dnaapler_all as dnaapler { input: @@ -104,4 +107,4 @@ workflow flye_denovo { String flye_phb_version = version_capture.phb_version String flye_analysis_date = version_capture.date } -} +} \ No newline at end of file From 1d2de80fd4b28b0ed51f8bf2a0e29a0e6e36c65b Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Wed, 11 Dec 2024 13:36:45 -0600 Subject: [PATCH 38/58] all local tests successful for each path now to add to theiaprok --- .../genomic_characterization/theiaprok.md | 66 ++++++++++++++----- tasks/alignment/task_bwa.wdl | 7 ++ tasks/assembly/task_flye.wdl | 2 +- tasks/polishing/task_polypolish.wdl | 4 +- .../read_filtering/task_filtercontigs.wdl | 27 +++++--- .../data_export/task_broad_terra_tools.wdl | 2 - workflows/theiaprok/wf_theiaprok_ont.wdl | 54 ++++++++------- workflows/utilities/wf_flye_denovo.wdl | 25 +++---- 8 files changed, 117 insertions(+), 70 deletions(-) diff --git a/docs/workflows/genomic_characterization/theiaprok.md b/docs/workflows/genomic_characterization/theiaprok.md index de86e058b..1bca4a97c 100644 --- a/docs/workflows/genomic_characterization/theiaprok.md +++ b/docs/workflows/genomic_characterization/theiaprok.md @@ -145,15 +145,16 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | clean_check_reads | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | ONT, PE, SE | | clean_check_reads | **organism** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | | clean_check_reads | **workflow_series** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | -| flye_consensus | **illumina_polishing_rounds** | Int | Number of polishing rounds to conduct with Illumina data | 1 | Optional | ONT | -| flue_consensus | **illumina_read1** | File | If Illumina reads are provided, Dragonflye will perform Illumina polishing | | Optional | ONT | -| flye_consensus | **illumina_read2** | File | If Illumina reads are provided, flye_consensus worflow will perform Illumina polishing | | Optional | ONT | -| flye_consensus | **medaka_model** | String | The model of medaka to use for assembly | r1041_e82_400bps_sup_v5.0.0 | Optional | ONT | -| flye_consensus | **polisher** | String | The polishing tool to use for assembly | medaka | Optional | ONT | -| flye_consensus | **polishing_rounds** | Int | The number of polishing rounds to conduct for medaka or racon (without Illumina) | 1 | Optional | ONT | -| flye_consensus | **read1** | File | ONT read file in FASTQ file format (compression optional) | | Optional | ONT | -| flye_consensus | **trim_reads** | Boolean | If true, trims reads before assembly | FALSE | Optional | ONT | -| flye_consensus | **no_polishing** | Boolean | If true, skips polishing | FALSE | Optional | ONT | +| flye_denovo | **auto_medaka_model** | Boolean | If true, medaka will automatically select the best Medaka model for assembly | TRUE | Optional | ONT | +| flye_denovo | **illumina_polishing_rounds** | Int | Number of polishing rounds to conduct with Illumina data | 1 | Optional | ONT | +| flye_denovo | **illumina_read1** | File | If Illumina reads are provided, flye_denovo subworkflow will perform Illumina polishing | | Optional | ONT | +| flye_denovo | **illumina_read2** | File | If Illumina reads are provided, flye_denovo subworflow will perform Illumina polishing | | Optional | ONT | +| flye_denovo | **medaka_model_override** | String | The model of medaka to use for assembly | r1041_e82_400bps_sup_v5.0.0 | Optional | ONT | +| flye_denovo | **polisher** | String | The polishing tool to use for assembly | medaka | Optional | ONT | +| flye_denovo | **polishing_rounds** | Int | The number of polishing rounds to conduct for medaka or racon (without Illumina) | 1 | Optional | ONT | +| flye_denovo | **read1** | File | ONT read file in FASTQ file format (compression optional) | | Optional | ONT | +| flye_denovo | **skip_trim_reads** | Boolean | If true, trims reads before assembly | FALSE | Optional | ONT | +| flye_denovo | **skip_polishing** | Boolean | If true, skips polishing | FALSE | Optional | ONT | | export_taxon_tables | **asembly_fasta** | File | Internal component, do not modify | | Do not modify, Optional | FASTA | | export_taxon_tables | **bbduk_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT | | export_taxon_tables | **cg_pipeline_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT | @@ -724,7 +725,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al **Estimated genome length**: - By default, an estimated genome length is set to 5 Mb, which is around 0.7 Mb higher than the average bacterial genome length, according to the information collated [here](https://github.com/CDCgov/phoenix/blob/717d19c19338373fc0f89eba30757fe5cfb3e18a/assets/databases/NCBI_Assembly_stats_20240124.txt). This estimate can be overwritten by the user, and is used by `RASUSA` and `dragonflye`. + By default, an estimated genome length is set to 5 Mb, which is around 0.7 Mb higher than the average bacterial genome length, according to the information collated [here](https://github.com/CDCgov/phoenix/blob/717d19c19338373fc0f89eba30757fe5cfb3e18a/assets/databases/NCBI_Assembly_stats_20240124.txt). This estimate can be overwritten by the user, and is used by `RASUSA`. **Plotting and quantifying long-read sequencing data:** `nanoplot` @@ -747,13 +748,46 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | Software Source Code | [fastq-scan](https://github.com/rpetit3/fastq-scan), [NanoPlot](https://github.com/wdecoster/NanoPlot), [RASUSA](https://github.com/mbhall88/rasusa), [tiptoft](https://github.com/andrewjpage/tiptoft), [nanoq](https://github.com/esteinig/nanoq) | | Original Publication(s) | [NanoPlot paper](https://academic.oup.com/bioinformatics/article/39/5/btad311/7160911)
[RASUSA paper](https://doi.org/10.21105/joss.03941)
[Nanoq Paper](https://doi.org/10.21105/joss.02991)
[Tiptoft paper](https://doi.org/10.21105/joss.01021) | -??? task "`dragonflye`: _De novo_ Assembly" - !!! techdetails "dragonflye Technical Details" +??? task "`Flye`: _De novo_ Assembly" + + `Flye` is a sub-workflow that performs _de novo_ assembly using Flye for ONT data and supports additional polishing and visualization steps. The detailed steps and tasks are as follows: + + ### Workflow Steps + + 1. **Version Capture**: + - Captures software and runtime versions for reproducibility. + + 2. **Optional Read Trimming**: + - Utilizes `task_porechop.wdl` to trim input reads if trimming is enabled. + + 3. **_De novo_ Assembly**: + - Uses `task_flye.wdl` for assembling long reads into contigs. + + 4. **Graph Visualization**: + - Employs `task_bandageplot.wdl` to visualize the assembly graph using Bandage. + + 5. **Polishing**: + - Supports polishing the assembly from Flye using: + - **Medaka** (`task_medaka.wdl`) for long reads. + - **Racon** (`task_racon.wdl`) for long reads. + - **Polypolish** (`task_polypolish.wdl`) for hybrid assemblies with Illumina data. + + 6. **Contig Filtering**: + - Filters contigs using `task_filtercontigs.wdl` based on user-defined criteria. + + 7. **Final Assembly Orientation**: + - Reorients contigs using `task_dnaapler.wdl` to start at a standard reference point. + + !!! techdetails "Flye_Denovo Technical Details" | | Links | | --- | --- | - | Task | [task_dragonflye.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_dragonflye.wdl) | - | Software Source Code | [dragonflye on GitHub](https://github.com/rpetit3/dragonflye) | - | Software Documentation | [dragonflye on GitHub](https://github.com/rpetit3/dragonflye) | + | Subworkflow | [wf_flye_denovo.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_flye_denovo.wdl) | + | Assembly Task | [task_flye.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_flye.wdl) | + | Polishing Tasks | [task_medaka.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/polishing/task_medaka.wdl), [task_racon.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/polishing/task_racon.wdl), [task_polypolish.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/polishing/task_polypolish.wdl) | + | Contig Filtering Task | [task_filtercontigs.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_filtercontigs.wdl) | + | Final Orientation Task | [task_dnaapler.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_dnaapler.wdl) | + | Software Source Code | [Flye on GitHub](https://github.com/fenderglass/Flye) | + | Software Documentation | [Flye Documentation](https://github.com/fenderglass/Flye/blob/flye/docs/USAGE.md) | #### Post-Assembly Tasks (performed for all taxa) @@ -1651,7 +1685,7 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | combined_mean_readlength_raw | Float | Mean read length for the combined raw reads | PE | | contigs_fastg | File | Assembly graph if megahit used for genome assembly | PE | | contigs_gfa | File | Assembly graph if spades used for genome assembly | ONT, PE, SE | -| contigs_lastgraph | File | Assembly graph if velvet used for genome assembly | PE | +| contigs_lastgraph | File | Assemb7ly graph if velvet used for genome assembly | PE | | dragonflye_version | String | Version of dragonflye used for de novo assembly | ONT | | ectyper_predicted_serotype | String | Serotype predicted by ECTyper | FASTA, ONT, PE, SE | | ectyper_results | File | TSV file of evidence for ECTyper predicted serotype (see https://github.com/phac-nml/ecoli_serotyping#report-format) | FASTA, ONT, PE, SE | diff --git a/tasks/alignment/task_bwa.wdl b/tasks/alignment/task_bwa.wdl index b6929e91d..9c908f3bb 100644 --- a/tasks/alignment/task_bwa.wdl +++ b/tasks/alignment/task_bwa.wdl @@ -171,6 +171,13 @@ task bwa_all { bwa &> BWA_HELP grep "Version" BWA_HELP | cut -d" " -f2 > BWA_VERSION + if [[ ! -f "~{draft_assembly_fasta}.bwt" ]]; then + echo "Indexing reference genome: ~{draft_assembly_fasta}" + bwa index ~{draft_assembly_fasta} + else + echo "Reference genome is already indexed: ~{draft_assembly_fasta}" + fi + bwa mem -t ~{cpu} -a ~{draft_assembly_fasta} ~{read1} > ~{samplename}_R1.sam bwa mem -t ~{cpu} -a ~{draft_assembly_fasta} ~{read2} > ~{samplename}_R2.sam diff --git a/tasks/assembly/task_flye.wdl b/tasks/assembly/task_flye.wdl index 69ac46c09..f2dfc7a44 100644 --- a/tasks/assembly/task_flye.wdl +++ b/tasks/assembly/task_flye.wdl @@ -73,7 +73,7 @@ task flye { >>> output { File assembly_fasta = "~{samplename}.assembly.fasta" - File assembly_graph = "~{samplename}.assembly_graph.gfa" + File assembly_graph_gfa = "~{samplename}.assembly_graph.gfa" File assembly_info = "~{samplename}.assembly_info.txt" String flye_version = read_string("VERSION") String flye_docker = "~{docker}" diff --git a/tasks/polishing/task_polypolish.wdl b/tasks/polishing/task_polypolish.wdl index 8fc6f63af..1f3f76360 100644 --- a/tasks/polishing/task_polypolish.wdl +++ b/tasks/polishing/task_polypolish.wdl @@ -21,7 +21,7 @@ task polypolish { Int cpu = 1 # polypolish is single-threaded Int disk_size = 100 String docker = "us-docker.pkg.dev/general-theiagen/staphb/polypolish:0.6.0" - Int memory = 4 + Int memory = 8 } command <<< set -euo pipefail @@ -72,6 +72,6 @@ task polypolish { disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" maxRetries: 3 - preemptible: 1 + preemptible: 0 } } \ No newline at end of file diff --git a/tasks/quality_control/read_filtering/task_filtercontigs.wdl b/tasks/quality_control/read_filtering/task_filtercontigs.wdl index 147bae33c..985ca7474 100644 --- a/tasks/quality_control/read_filtering/task_filtercontigs.wdl +++ b/tasks/quality_control/read_filtering/task_filtercontigs.wdl @@ -18,12 +18,12 @@ task contig_filter { total_contigs=$(grep -c "^>" ~{assembly_fasta}) echo "Total contigs: $total_contigs" >&2 - # Calculate total base pairs in all contigs - total_bases=$(awk '/^>/ {if (seqlen){total+=seqlen}; seqlen=0; next} {seqlen += length($0)} END {total+=seqlen; print total}' ~{assembly_fasta}) + # Calculate total base pairs in all contigs and output individual lengths + awk '/^>/ {if (seqlen) {print seqlen; total+=seqlen}; seqlen=0; next} {seqlen+=length($0)} END {if (seqlen) {print seqlen; total+=seqlen}; print "Total:" total}' ~{assembly_fasta} > contig_lengths_before.txt + total_bases=$(tail -n 1 contig_lengths_before.txt | cut -d ':' -f 2) echo "Total bases: $total_bases" >&2 # Filter contigs by length - # Retain only contigs that are >= minimum length and write to a new FASTA file seqkit seq -m ~{min_len} ~{assembly_fasta} > length_filtered.fasta echo "Length filtering complete. File size:" >&2 ls -lh length_filtered.fasta >&2 @@ -33,20 +33,17 @@ task contig_filter { awk ' BEGIN {RS=">"; ORS=""} NR > 1 { - # Split each contig entry into header and sequence split($0, lines, "\n"); header = lines[1]; seq = ""; for (i=2; i<=length(lines); i++) { seq = seq lines[i]; } - # Exclude sequences composed entirely of a single repeated base if (seq !~ /^(.)\1+$/) { print ">" header "\n" seq "\n" } }' length_filtered.fasta > filtered_contigs.fasta else - # If homopolymer filtering is not enabled, use length-filtered file as final output mv length_filtered.fasta filtered_contigs.fasta fi @@ -57,9 +54,13 @@ task contig_filter { fi # Calculate final metrics for the filtered contigs + awk '/^>/ {if (seqlen) {print seqlen; total+=seqlen}; seqlen=0; next} {seqlen+=length($0)} END {if (seqlen) {print seqlen; total+=seqlen}; print "Total:" total}' filtered_contigs.fasta > contig_lengths_after.txt retained_contigs=$(grep -c "^>" filtered_contigs.fasta) - retained_bases=$(awk '/^>/ {if (seqlen){print seqlen}; seqlen=0; next} {seqlen += length($0)} END {if (seqlen) print seqlen}' filtered_contigs.fasta) - contigs_removed_homopolymers=$((total_contigs - retained_contigs)) + retained_bases=$(tail -n 1 contig_lengths_after.txt | cut -d ':' -f 2) + + # Calculate removed contigs + contigs_removed_short=$((total_contigs - $(grep -c "^>" length_filtered.fasta))) + contigs_removed_homopolymers=$((total_contigs - retained_contigs - contigs_removed_short)) # Write filtering metrics to a summary file metrics_file="filtering_metrics.txt" @@ -68,15 +69,23 @@ task contig_filter { echo "Total bases: $total_bases" echo "Contigs retained: $retained_contigs" echo "Bases retained: $retained_bases" - echo "Contigs removed (short length): $((total_contigs - retained_contigs))" + echo "Contigs removed (short length): $contigs_removed_short" echo "Contigs removed (homopolymers): $contigs_removed_homopolymers" } > $metrics_file + # Output individual contig lengths before and after filtering + echo "Contig lengths before filtering:" >> $metrics_file + cat contig_lengths_before.txt >> $metrics_file + echo "Contig lengths after filtering:" >> $metrics_file + cat contig_lengths_after.txt >> $metrics_file + cat $metrics_file >&2 >>> output { File filtered_fasta = "filtered_contigs.fasta" File assembly_filtering_metrics = "filtering_metrics.txt" + File contig_lengths_before = "contig_lengths_before.txt" + File contig_lengths_after = "contig_lengths_after.txt" } runtime { docker: "~{docker}" diff --git a/tasks/utilities/data_export/task_broad_terra_tools.wdl b/tasks/utilities/data_export/task_broad_terra_tools.wdl index 3a3fba0fd..9b609ab7c 100644 --- a/tasks/utilities/data_export/task_broad_terra_tools.wdl +++ b/tasks/utilities/data_export/task_broad_terra_tools.wdl @@ -82,7 +82,6 @@ task export_taxon_tables { String? tiptoft_version File? assembly_fasta File? contigs_gfa - String? dragonflye_version String? shovill_pe_version String? shovill_se_version File? quast_report @@ -493,7 +492,6 @@ task export_taxon_tables { "tiptoft_version": "~{tiptoft_version}", "assembly_fasta": "~{assembly_fasta}", "contigs_gfa": "~{contigs_gfa}", - "dragonflye_version": "~{dragonflye_version}", "shovill_pe_version": "~{shovill_pe_version}", "shovill_se_version": "~{shovill_se_version}", "quast_report": "~{quast_report}", diff --git a/workflows/theiaprok/wf_theiaprok_ont.wdl b/workflows/theiaprok/wf_theiaprok_ont.wdl index fecfb744e..fb3d215e0 100644 --- a/workflows/theiaprok/wf_theiaprok_ont.wdl +++ b/workflows/theiaprok/wf_theiaprok_ont.wdl @@ -1,6 +1,5 @@ version 1.0 -import "../../tasks/assembly/task_dragonflye.wdl" as dragonflye_task import "../../tasks/gene_typing/annotation/task_bakta.wdl" as bakta_task import "../../tasks/gene_typing/annotation/task_prokka.wdl" as prokka_task import "../../tasks/gene_typing/drug_resistance/task_amrfinderplus.wdl" as amrfinderplus @@ -20,6 +19,7 @@ import "../../tasks/taxon_id/task_gambit.wdl" as gambit_task import "../../tasks/utilities/data_export/task_broad_terra_tools.wdl" as terra_tools_task import "../utilities/wf_merlin_magic.wdl" as merlin_magic_workflow import "../utilities/wf_read_QC_trim_ont.wdl" as read_qc_workflow +import "../utilities/wf_flye_denovo.wdl" as flye_workflow workflow theiaprok_ont { meta { @@ -97,16 +97,16 @@ workflow theiaprok_ont { expected_genome_length = genome_length } if (clean_check_reads.read_screen == "PASS") { - call dragonflye_task.dragonflye { - input: - read1 = read_qc_trim.read1_clean, - genome_length = select_first([genome_length, read_qc_trim.est_genome_length]), - samplename = samplename + call flye_workflow.flye_denovo { + input: + read1 = read_qc_trim.read1_clean, + genome_length = select_first([genome_length, read_qc_trim.est_genome_length]), + samplename = samplename } call quast_task.quast { - input: - assembly = dragonflye.assembly_fasta, - samplename = samplename + input: + assembly = flye_denovo.final_assembly, + samplename = samplename } # nanoplot for basic QC metrics call nanoplot_task.nanoplot as nanoplot_raw { @@ -123,73 +123,73 @@ workflow theiaprok_ont { } call busco_task.busco { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.final_assembly, samplename = samplename } if (perform_characterization) { call gambit_task.gambit { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.final_assembly, samplename = samplename } if (call_ani) { call ani_task.animummer as ani { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.final_assembly, samplename = samplename } } if (call_kmerfinder) { call kmerfinder_task.kmerfinder_bacteria as kmerfinder { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.final_assembly, samplename = samplename } } call amrfinderplus.amrfinderplus_nuc as amrfinderplus_task { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.final_assembly, samplename = samplename, organism = select_first([expected_taxon, gambit.gambit_predicted_taxon]) } if (call_resfinder) { call resfinder_task.resfinder as resfinder_task { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.final_assembly, samplename = samplename, organism = select_first([expected_taxon, gambit.gambit_predicted_taxon]) } } call ts_mlst_task.ts_mlst { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.final_assembly, samplename = samplename } if (genome_annotation == "prokka") { call prokka_task.prokka { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.final_assembly, samplename = samplename } } if (genome_annotation == "bakta") { call bakta_task.bakta { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.final_assembly, samplename = samplename } } if (call_plasmidfinder) { call plasmidfinder_task.plasmidfinder { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.final_assembly, samplename = samplename } } if (call_abricate) { call abricate_task.abricate { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.final_assembly, samplename = samplename, database = abricate_db } @@ -220,7 +220,7 @@ workflow theiaprok_ont { call merlin_magic_workflow.merlin_magic { input: merlin_tag = select_first([expected_taxon, gambit.merlin_tag]), - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.final_assembly, samplename = samplename, read1 = read_qc_trim.read1_clean, ont_data = true @@ -277,9 +277,8 @@ workflow theiaprok_ont { tiptoft_plasmid_replicon_fastq = read_qc_trim.tiptoft_plasmid_replicon_fastq, tiptoft_plasmid_replicon_genes = read_qc_trim.tiptoft_plasmid_replicon_genes, tiptoft_version = read_qc_trim.tiptoft_version, - assembly_fasta = dragonflye.assembly_fasta, - contigs_gfa = dragonflye.contigs_gfa, - dragonflye_version = dragonflye.dragonflye_version, + assembly_fasta = flye_denovo.final_assembly, + contigs_gfa = flye_denovo.contigs_gfa, quast_report = quast.quast_report, quast_version = quast.version, assembly_length = quast.genome_length, @@ -591,10 +590,9 @@ workflow theiaprok_ont { File? tiptoft_plasmid_replicon_fastq = read_qc_trim.tiptoft_plasmid_replicon_fastq String? tiptoft_plasmid_replicon_genes = read_qc_trim.tiptoft_plasmid_replicon_genes String? tiptoft_version = read_qc_trim.tiptoft_version - # Assembly - dragonflye outputs - File? assembly_fasta = dragonflye.assembly_fasta - File? contigs_gfa = dragonflye.contigs_gfa - String? dragonflye_version = dragonflye.dragonflye_version + # Assembly - flye_denovo outputs + File? assembly_fasta = flye_denovo.final_assembly + File? contigs_gfa = flye_denovo.contigs_gfa # Assembly QC - quast outputs File? quast_report = quast.quast_report String? quast_version = quast.version diff --git a/workflows/utilities/wf_flye_denovo.wdl b/workflows/utilities/wf_flye_denovo.wdl index 28c0f1a83..e588d8888 100644 --- a/workflows/utilities/wf_flye_denovo.wdl +++ b/workflows/utilities/wf_flye_denovo.wdl @@ -8,7 +8,7 @@ import "../../tasks/polishing/task_racon.wdl" as task_racon import "../../tasks/assembly/task_dnaapler.wdl" as task_dnaapler import "../../tasks/task_versioning.wdl" as versioning_task import "../../tasks/quality_control/read_filtering/task_filtercontigs.wdl" as task_filtercontigs -import "../../tasks/alignment/task_bwa.wdl" as task_bwamem +import "../../tasks/alignment/task_bwa.wdl" as task_bwaall import "../../tasks/polishing/task_polypolish.wdl" as task_polypolish workflow flye_denovo { @@ -21,7 +21,7 @@ workflow flye_denovo { File? illumina_read2 # Optional Illumina short-read R2 for hybrid assembly String samplename String polisher = "medaka" - Int? polish_rounds # Optional number of polishing rounds + Int? polish_rounds = 1 # Optional number of polishing rounds String? medaka_model_override # Optional user-specified Medaka model Boolean auto_medaka_model = true # Enable automatic Medaka model selection Boolean skip_trim_reads = true # Default: No trimming @@ -47,23 +47,23 @@ workflow flye_denovo { # Generate Bandage plot call task_bandage.bandage_plot as bandage { input: - assembly_graph_gfa = flye.assembly_graph, + assembly_graph_gfa = flye.assembly_graph_gfa, samplename = samplename } # Hybrid Assembly Path: Polypolish if (defined(illumina_read1) && defined(illumina_read2)) { - call task_bwamem.bwa_all as bwa_all { - input: - draft_assembly_fasta = flye.assembly_fasta, - read1 = select_first([illumina_read1]), - read2 = select_first([illumina_read2]), - samplename = samplename + call task_bwaall.bwa_all as bwa { + input: + draft_assembly_fasta = flye.assembly_fasta, + read1 = select_first([illumina_read1]), + read2 = select_first([illumina_read2]), + samplename = samplename } call task_polypolish.polypolish as polypolish { input: assembly_fasta = flye.assembly_fasta, - read1_sam = bwa_all.read1_sam, - read2_sam = bwa_all.read2_sam, + read1_sam = bwa.read1_sam, + read2_sam = bwa.read2_sam, samplename = samplename, illumina_polishing_rounds = polish_rounds } @@ -102,8 +102,9 @@ workflow flye_denovo { } output { File final_assembly = dnaapler.reoriented_fasta + #add into terra export table File bandage_plot = bandage.plot - File filtered_assembly = contig_filter.filtered_fasta + File contigs_gfa = flye.assembly_graph_gfa String flye_phb_version = version_capture.phb_version String flye_analysis_date = version_capture.date } From afeceb1966c73e77ef522851f479e0abce4ecaa0 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Wed, 11 Dec 2024 14:29:08 -0600 Subject: [PATCH 39/58] update flye call --- workflows/theiaprok/wf_theiaprok_ont.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/theiaprok/wf_theiaprok_ont.wdl b/workflows/theiaprok/wf_theiaprok_ont.wdl index fb3d215e0..404db79e8 100644 --- a/workflows/theiaprok/wf_theiaprok_ont.wdl +++ b/workflows/theiaprok/wf_theiaprok_ont.wdl @@ -100,7 +100,6 @@ workflow theiaprok_ont { call flye_workflow.flye_denovo { input: read1 = read_qc_trim.read1_clean, - genome_length = select_first([genome_length, read_qc_trim.est_genome_length]), samplename = samplename } call quast_task.quast { From cc5f96635d3d4d7c469edfbdb0e65b5f2f7b15ee Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Thu, 12 Dec 2024 10:45:56 -0600 Subject: [PATCH 40/58] add all tasks inputs to subworkflow for terra, increase CPU allocation in racon, update flye param names passed miniwdl check --- tasks/assembly/task_dragonflye.wdl | 96 ------------------ tasks/assembly/task_flye.wdl | 10 +- tasks/polishing/task_medaka.wdl | 2 +- tasks/polishing/task_racon.wdl | 4 +- workflows/utilities/wf_flye_denovo.wdl | 134 +++++++++++++++++++++++-- 5 files changed, 134 insertions(+), 112 deletions(-) delete mode 100644 tasks/assembly/task_dragonflye.wdl diff --git a/tasks/assembly/task_dragonflye.wdl b/tasks/assembly/task_dragonflye.wdl deleted file mode 100644 index 794c5888d..000000000 --- a/tasks/assembly/task_dragonflye.wdl +++ /dev/null @@ -1,96 +0,0 @@ -version 1.0 - -task dragonflye { - input { - File read1 # intended for ONT data only - String samplename - String? assembler # default is flye - String? assembler_options # default '' - String? genome_length # default autodetect - File? illumina_read1 # illumina read1 fastq to use for polishing - File? illumina_read2 # illumina read2 fastq to use for polishing - Boolean use_pilon_illumina_polisher = false # use polypolish by default - Int illumina_polishing_rounds = 1 # default 1 - Int polishing_rounds = 1 # default 1 - Boolean use_racon = false # use medaka polishing by default - String medaka_model = "r941_min_hac_g507" - String docker = "us-docker.pkg.dev/general-theiagen/biocontainers/dragonflye:1.0.14--hdfd78af_0" - Int disk_size = 100 - Int cpu = 4 - Int memory = 32 - } - command <<< - # get version information - dragonflye --version | tee VERSION - - # determine polishing parameter - if [ "~{use_racon}" = true ]; then - # --racon indicates number of polishing rounds to conduct with Racon - POLISHER="--racon ~{polishing_rounds}" - else - # --medaka indicates number of polishing rounds to conduct with medaka using medaka_model - POLISHER="--model ~{medaka_model} --medaka ~{polishing_rounds}" - fi - - # determine if illumina polishing will be performed based on presence of two files - if [ -f "~{illumina_read1}" ] && [ -f "~{illumina_read2}" ]; then - echo "Illumina reads provided; will perform Illumina polishing" - if [ "~{use_pilon_illumina_polisher}" = true ]; then - # use --pilon instead of polypolish as default - ILLUMINA_POLISHER="--R1 ~{illumina_read1} --R2 ~{illumina_read2} --pilon ~{illumina_polishing_rounds}" - else - # use --polypolish by default - ILLUMINA_POLISHER="--R1 ~{illumina_read1} --R2 ~{illumina_read2} --polypolish ~{illumina_polishing_rounds}" - fi - else - echo "Illumina reads were not provided." - ILLUMINA_POLISHER="" - fi - - # reduce requested memory to prevent out of memory errors - mem=$(( ~{memory}-1 )) - - # run dragonflye - # --reads for input nanopore fastq - # --depth 0 disables sub-sampling of reads (performed with rasusa already) - # --outdir indicates output directory - # --gsize indicates genome size; if this is not set it will autodetect size - # --cpus number of cpus to use - # --ram try to keep RAM usage below this number - # --nofilter disables read length filtering (performed with nanoq already) - # --assembler has three options: raven, miniasm, flye (default: flye) - # --opts enables extra assembler options in quotes - # see above for polisher input explanation - # see above for illumina polisher input explanation - dragonflye \ - --reads ~{read1} \ - --depth 0 \ - --outdir dragonflye \ - ~{'--gsize ' + genome_length} \ - --cpus ~{cpu} \ - --ram ${mem} \ - --nofilter \ - ~{'--assembler ' + assembler} \ - ~{'--opts "' + assembler_options + '"'} \ - ${POLISHER} \ - ${ILLUMINA_POLISHER} - - # rename final output file to have .fasta ending instead of .fa - mv dragonflye/contigs.fa ~{samplename}.fasta - mv dragonflye/*.gfa ~{samplename}_contigs.gfa - >>> - output { - File assembly_fasta = "~{samplename}.fasta" - File contigs_gfa = "~{samplename}_contigs.gfa" - String dragonflye_version = read_string("VERSION") - } - runtime { - docker: "~{docker}" - memory: "~{memory} GB" - cpu: cpu - disks: "local-disk " + disk_size + " SSD" - disk: disk_size + " GB" - maxRetries: 3 - preemptible: 0 - } -} \ No newline at end of file diff --git a/tasks/assembly/task_flye.wdl b/tasks/assembly/task_flye.wdl index f2dfc7a44..bbcfd084a 100644 --- a/tasks/assembly/task_flye.wdl +++ b/tasks/assembly/task_flye.wdl @@ -15,7 +15,7 @@ task flye { Int? genome_length # requires `asm_coverage` Int? asm_coverage # reduced coverage for initial disjointig assembly - Int polishing_iterations = 1 + Int flye_polishing_iterations = 1 Int? minimum_overlap Float? read_error_rate @@ -24,7 +24,7 @@ task flye { Boolean no_alt_contigs = false Boolean scaffold = false - String? additonal_parameters + String? additional_parameters # Any extra Flye-specific parameters Int cpu = 4 Int disk_size = 100 @@ -53,7 +53,7 @@ task flye { # genome size parameter requires asm_coverage flye \ ${READ_TYPE} ~{read1} \ - --iterations ~{polishing_iterations} \ + --iterations ~{flye_polishing_iterations} \ ~{"--min-overlap" + minimum_overlap} \ ~{if defined(asm_coverage) then "--genome-size " + genome_length else ""} \ ~{"--asm-coverage " + asm_coverage} \ @@ -62,7 +62,7 @@ task flye { ~{true="--keep-haplotypes" false="" keep_haplotypes} \ ~{true="--no-alt-contigs" false="" no_alt_contigs} \ ~{true="--scaffold" false="" scaffold} \ - ~{"--extra-params " + additonal_parameters} \ + ~{"--extra-params " + additional_parameters } \ --threads ~{cpu} \ --out-dir . @@ -82,7 +82,7 @@ task flye { docker: "~{docker}" cpu: cpu memory: "~{memory} GB" - disks: "local-disk " + disk_size + " HDD" + disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" maxRetries: 3 preemptible: 0 diff --git a/tasks/polishing/task_medaka.wdl b/tasks/polishing/task_medaka.wdl index 655bdb4c3..66b239a66 100644 --- a/tasks/polishing/task_medaka.wdl +++ b/tasks/polishing/task_medaka.wdl @@ -61,7 +61,7 @@ task medaka_consensus { docker: "~{docker}" cpu: cpu memory: "~{memory} GB" - disks: "local-disk " + disk_size + " HDD" + disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" maxRetries: 1 preemptible: 0 diff --git a/tasks/polishing/task_racon.wdl b/tasks/polishing/task_racon.wdl index 416cd036c..41d8fa9d8 100644 --- a/tasks/polishing/task_racon.wdl +++ b/tasks/polishing/task_racon.wdl @@ -5,7 +5,7 @@ task racon { File unpolished_fasta File read1 Int polishing_rounds = 1 # Default: 1 polishing round - Int cpu = 4 + Int cpu = 8 Int memory = 16 Int disk_size = 100 String samplename @@ -50,7 +50,7 @@ task racon { docker: "~{docker}" memory: memory + " GB" cpu: cpu - disk: disk_size + " GB" + disks: "local-disk " + disk_size + " SSD" maxRetries: 3 preemptible: 0 } diff --git a/workflows/utilities/wf_flye_denovo.wdl b/workflows/utilities/wf_flye_denovo.wdl index e588d8888..78275425e 100644 --- a/workflows/utilities/wf_flye_denovo.wdl +++ b/workflows/utilities/wf_flye_denovo.wdl @@ -21,9 +21,79 @@ workflow flye_denovo { File? illumina_read2 # Optional Illumina short-read R2 for hybrid assembly String samplename String polisher = "medaka" - Int? polish_rounds = 1 # Optional number of polishing rounds - String? medaka_model_override # Optional user-specified Medaka model + Int polish_rounds = 1 # Optional number of polishing rounds + + # Task-level inputs for Porechop + Int porechop_cpu = 4 + Int porechop_memory = 8 + Int porechop_disk_size = 50 + String? porechop_trimopts # Optional Porechop trimming options + + # Task-level parameters for Flye + Boolean ont_corrected = false + Boolean ont_high_quality = false + Boolean pacbio_raw = false + Boolean pacbio_corrected = false + Boolean pacbio_hifi = false + Int? genome_length # Requires `asm_coverage` + Int? asm_coverage # Reduced coverage for initial disjointig assembly + Int flye_polishing_iterations = 1 # Default polishing iterations + Int? minimum_overlap # Minimum overlap between reads + Float? read_error_rate # Maximum expected read error rate + Boolean uneven_coverage_mode = false + Boolean keep_haplotypes = false + Boolean no_alt_contigs = false + Boolean scaffold = false + String? additional_parameters # Any extra Flye-specific parameters + Int flye_cpu = 4 + Int flye_memory = 32 + Int flye_disk_size = 100 + + # Bandage-specific inputs + Int bandage_cpu = 2 + Int bandage_memory = 4 + Int bandage_disk_size = 10 + + #Task-level parameters for Polypolish + # Polypolish-specific inputs + String? pair_orientation + Float? low_percentile_threshold + Float? high_percentile_threshold + Float? fraction_invalid + Float? fraction_valid + Int? maximum_errors + Int? minimum_depth + Boolean careful = false + Int polypolish_cpu = 1 + Int polypolish_memory = 8 + Int polypolish_disk_size = 100 + + # Medaka-specific inputs Boolean auto_medaka_model = true # Enable automatic Medaka model selection + String? medaka_model_override # Optional user-specified Medaka model + Int medaka_cpu = 4 + Int medaka_memory = 16 + Int medaka_disk_size = 100 + + # Racon-specific inputs + Int racon_cpu = 8 + Int racon_memory = 16 + Int racon_disk_size = 100 + + # Contig filter-specific inputs + Int contig_filter_min_len = 1000 + Boolean contig_filter_homopolymers = true + Int contig_filter_cpu = 4 + Int contig_filter_memory = 16 + Int contig_filter_disk_size = 100 + + # Dnaapler-specific inputs + String dnaapler_mode = "all" + Int dnaapler_cpu = 4 + Int dnaapler_memory = 16 + Int dnaapler_disk_size = 100 + + #Other workdlow-level inputs Boolean skip_trim_reads = true # Default: No trimming Boolean skip_polishing = false # Default: Polishing enabled } @@ -35,20 +105,45 @@ workflow flye_denovo { call task_porechop.porechop as porechop { input: read1 = read1, - samplename = samplename + samplename = samplename, + trimopts = porechop_trimopts, + cpu = porechop_cpu, + memory = porechop_memory, + disk_size = porechop_disk_size } } # Call Flye using either trimmed reads or raw reads call task_flye.flye as flye { input: read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available - samplename = samplename + samplename = samplename, + ont_corrected = ont_corrected, + pacbio_corrected = pacbio_corrected, + pacbio_hifi = pacbio_hifi, + pacbio_raw = pacbio_raw, + ont_high_quality = ont_high_quality, + genome_length = genome_length, + asm_coverage = asm_coverage, + flye_polishing_iterations = flye_polishing_iterations, + minimum_overlap = minimum_overlap, + read_error_rate = read_error_rate, + uneven_coverage_mode = uneven_coverage_mode, + keep_haplotypes = keep_haplotypes, + no_alt_contigs = no_alt_contigs, + scaffold = scaffold, + additional_parameters = additional_parameters, + cpu = flye_cpu, + memory = flye_memory, + disk_size = flye_disk_size } # Generate Bandage plot call task_bandage.bandage_plot as bandage { input: assembly_graph_gfa = flye.assembly_graph_gfa, - samplename = samplename + samplename = samplename, + cpu = bandage_cpu, + memory = bandage_memory, + disk_size = bandage_disk_size } # Hybrid Assembly Path: Polypolish if (defined(illumina_read1) && defined(illumina_read2)) { @@ -65,7 +160,18 @@ workflow flye_denovo { read1_sam = bwa.read1_sam, read2_sam = bwa.read2_sam, samplename = samplename, - illumina_polishing_rounds = polish_rounds + illumina_polishing_rounds = polish_rounds, + pair_orientation = pair_orientation, + low_percentile_threshold = low_percentile_threshold, + high_percentile_threshold = high_percentile_threshold, + fraction_invalid = fraction_invalid, + fraction_valid = fraction_valid, + maximum_errors = maximum_errors, + minimum_depth = minimum_depth, + careful = careful, + cpu = polypolish_cpu, + memory = polypolish_memory, + disk_size = polypolish_disk_size } } # ONT-only Polishing Path: Medaka or Racon @@ -86,7 +192,10 @@ workflow flye_denovo { unpolished_fasta = flye.assembly_fasta, read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available samplename = samplename, - polishing_rounds = polish_rounds + polishing_rounds = polish_rounds, + cpu = racon_cpu, + memory = racon_memory, + disk_size = racon_disk_size } } } @@ -94,11 +203,20 @@ workflow flye_denovo { call task_filtercontigs.contig_filter as contig_filter { input: assembly_fasta = select_first([polypolish.polished_assembly, medaka.medaka_fasta, racon.polished_fasta, flye.assembly_fasta]), # Use Flye assembly if no polishing + min_len = contig_filter_min_len, + filter_homopolymers = contig_filter_homopolymers, + threads = contig_filter_cpu, + memory = contig_filter_memory, + disk_size = contig_filter_disk_size } call task_dnaapler.dnaapler_all as dnaapler { input: input_fasta = contig_filter.filtered_fasta, - samplename = samplename + samplename = samplename, + dmaapler_mode = dnaapler_mode, + cpu = dnaapler_cpu, + memory = dnaapler_memory, + disk_size = dnaapler_disk_size } output { File final_assembly = dnaapler.reoriented_fasta From 475e543809b9596ac95b35ddfae8252e7438d27d Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Thu, 12 Dec 2024 11:04:13 -0600 Subject: [PATCH 41/58] rename some input specific variables for terra users to know which task it relates to --- .../read_filtering/task_filtercontigs.wdl | 4 +- workflows/utilities/wf_flye_denovo.wdl | 116 +++++++++--------- 2 files changed, 61 insertions(+), 59 deletions(-) diff --git a/tasks/quality_control/read_filtering/task_filtercontigs.wdl b/tasks/quality_control/read_filtering/task_filtercontigs.wdl index 985ca7474..27ec30a46 100644 --- a/tasks/quality_control/read_filtering/task_filtercontigs.wdl +++ b/tasks/quality_control/read_filtering/task_filtercontigs.wdl @@ -7,7 +7,7 @@ task contig_filter { Boolean filter_homopolymers = true Int disk_size = 100 Int memory = 16 - Int threads = 4 + Int cpu = 4 String docker = "us-docker.pkg.dev/general-theiagen/staphb/seqkit:2.8.2" } command <<< @@ -89,7 +89,7 @@ task contig_filter { } runtime { docker: "~{docker}" - cpu: threads + cpu: cpu memory: "~{memory}G" disks: "local-disk " + disk_size + " SSD" } diff --git a/workflows/utilities/wf_flye_denovo.wdl b/workflows/utilities/wf_flye_denovo.wdl index 78275425e..01ca33d8e 100644 --- a/workflows/utilities/wf_flye_denovo.wdl +++ b/workflows/utilities/wf_flye_denovo.wdl @@ -29,22 +29,22 @@ workflow flye_denovo { Int porechop_disk_size = 50 String? porechop_trimopts # Optional Porechop trimming options - # Task-level parameters for Flye - Boolean ont_corrected = false - Boolean ont_high_quality = false - Boolean pacbio_raw = false - Boolean pacbio_corrected = false - Boolean pacbio_hifi = false - Int? genome_length # Requires `asm_coverage` - Int? asm_coverage # Reduced coverage for initial disjointig assembly + # Flye-specific inputs + Boolean flye_ont_corrected = false + Boolean flye_ont_high_quality = false + Boolean flye_pacbio_raw = false + Boolean flye_pacbio_corrected = false + Boolean flye_pacbio_hifi = false + Int? flye_genome_length # Requires `asm_coverage` + Int? flye_asm_coverage # Reduced coverage for initial disjointig assembly Int flye_polishing_iterations = 1 # Default polishing iterations - Int? minimum_overlap # Minimum overlap between reads - Float? read_error_rate # Maximum expected read error rate - Boolean uneven_coverage_mode = false - Boolean keep_haplotypes = false - Boolean no_alt_contigs = false - Boolean scaffold = false - String? additional_parameters # Any extra Flye-specific parameters + Int? flye_minimum_overlap # Minimum overlap between reads + Float? flye_read_error_rate # Maximum expected read error rate + Boolean flye_uneven_coverage_mode = false + Boolean flye_keep_haplotypes = false + Boolean flye_no_alt_contigs = false + Boolean flye_scaffold = false + String? flye_additional_parameters # Any extra Flye-specific parameters Int flye_cpu = 4 Int flye_memory = 32 Int flye_disk_size = 100 @@ -54,16 +54,15 @@ workflow flye_denovo { Int bandage_memory = 4 Int bandage_disk_size = 10 - #Task-level parameters for Polypolish # Polypolish-specific inputs - String? pair_orientation - Float? low_percentile_threshold - Float? high_percentile_threshold - Float? fraction_invalid - Float? fraction_valid - Int? maximum_errors - Int? minimum_depth - Boolean careful = false + String? polypolish_pair_orientation + Float? polypolish_low_percentile_threshold + Float? polypolish_high_percentile_threshold + Float? polypolish_fraction_invalid + Float? polypolish_fraction_valid + Int? polypolish_maximum_errors + Int? polypolish_minimum_depth + Boolean polypolish_careful = false Int polypolish_cpu = 1 Int polypolish_memory = 8 Int polypolish_disk_size = 100 @@ -81,11 +80,11 @@ workflow flye_denovo { Int racon_disk_size = 100 # Contig filter-specific inputs - Int contig_filter_min_len = 1000 - Boolean contig_filter_homopolymers = true - Int contig_filter_cpu = 4 - Int contig_filter_memory = 16 - Int contig_filter_disk_size = 100 + Int filtercontigs_min_len = 1000 + Boolean filtercontigs_homopolymers = true + Int filtercontigs_cpu = 4 + Int filtercontigs_memory = 16 + Int filtercontigs_disk_size = 100 # Dnaapler-specific inputs String dnaapler_mode = "all" @@ -117,21 +116,21 @@ workflow flye_denovo { input: read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available samplename = samplename, - ont_corrected = ont_corrected, - pacbio_corrected = pacbio_corrected, - pacbio_hifi = pacbio_hifi, - pacbio_raw = pacbio_raw, - ont_high_quality = ont_high_quality, - genome_length = genome_length, - asm_coverage = asm_coverage, + ont_corrected = flye_ont_corrected, + pacbio_corrected = flye_pacbio_corrected, + pacbio_hifi = flye_pacbio_hifi, + pacbio_raw = flye_pacbio_raw, + ont_high_quality = flye_ont_high_quality, + genome_length = flye_genome_length, + asm_coverage = flye_asm_coverage, flye_polishing_iterations = flye_polishing_iterations, - minimum_overlap = minimum_overlap, - read_error_rate = read_error_rate, - uneven_coverage_mode = uneven_coverage_mode, - keep_haplotypes = keep_haplotypes, - no_alt_contigs = no_alt_contigs, - scaffold = scaffold, - additional_parameters = additional_parameters, + minimum_overlap = flye_minimum_overlap, + read_error_rate = flye_read_error_rate, + uneven_coverage_mode = flye_uneven_coverage_mode, + keep_haplotypes = flye_keep_haplotypes, + no_alt_contigs = flye_no_alt_contigs, + scaffold = flye_scaffold, + additional_parameters = flye_additional_parameters, cpu = flye_cpu, memory = flye_memory, disk_size = flye_disk_size @@ -161,14 +160,14 @@ workflow flye_denovo { read2_sam = bwa.read2_sam, samplename = samplename, illumina_polishing_rounds = polish_rounds, - pair_orientation = pair_orientation, - low_percentile_threshold = low_percentile_threshold, - high_percentile_threshold = high_percentile_threshold, - fraction_invalid = fraction_invalid, - fraction_valid = fraction_valid, - maximum_errors = maximum_errors, - minimum_depth = minimum_depth, - careful = careful, + pair_orientation = polypolish_pair_orientation, + low_percentile_threshold = polypolish_low_percentile_threshold, + high_percentile_threshold = polypolish_high_percentile_threshold, + fraction_invalid = polypolish_fraction_invalid, + fraction_valid = polypolish_fraction_valid, + maximum_errors = polypolish_maximum_errors, + minimum_depth = polypolish_minimum_depth, + careful = polypolish_careful, cpu = polypolish_cpu, memory = polypolish_memory, disk_size = polypolish_disk_size @@ -183,7 +182,10 @@ workflow flye_denovo { samplename = samplename, read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available medaka_model_override = medaka_model_override, - auto_model = auto_medaka_model + auto_model = auto_medaka_model, + cpu = medaka_cpu, + memory = medaka_memory, + disk_size = medaka_disk_size } } if (!skip_polishing && polisher == "racon") { @@ -203,11 +205,11 @@ workflow flye_denovo { call task_filtercontigs.contig_filter as contig_filter { input: assembly_fasta = select_first([polypolish.polished_assembly, medaka.medaka_fasta, racon.polished_fasta, flye.assembly_fasta]), # Use Flye assembly if no polishing - min_len = contig_filter_min_len, - filter_homopolymers = contig_filter_homopolymers, - threads = contig_filter_cpu, - memory = contig_filter_memory, - disk_size = contig_filter_disk_size + min_len = filtercontigs_min_len, + filter_homopolymers = filtercontigs_homopolymers, + cpu = filtercontigs_cpu, + memory = filtercontigs_memory, + disk_size = filtercontigs_disk_size } call task_dnaapler.dnaapler_all as dnaapler { input: From e86e193ad71381fd09711ec0b63482d48524c652 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Thu, 12 Dec 2024 13:27:44 -0600 Subject: [PATCH 42/58] debugging racon terra --- tasks/polishing/task_racon.wdl | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tasks/polishing/task_racon.wdl b/tasks/polishing/task_racon.wdl index 41d8fa9d8..4591bdd44 100644 --- a/tasks/polishing/task_racon.wdl +++ b/tasks/polishing/task_racon.wdl @@ -17,16 +17,18 @@ task racon { minimap2 --version | tee MINIMAP2_VERSION racon --version | tee -a RACON_VERSION - # Start with the unpolished FASTA as the input + echo "Starting Racon polishing process..." + # Initialize the input assembly intermediate_fasta="~{unpolished_fasta}" - for i in $(seq 1 ~{polishing_rounds}); do - echo "Starting Medaka polishing round $i" + # Loop through polishing rounds + for i in $(seq 1 ~{polishing_rounds}); do + echo "Polishing round $i..." - # Generate alignments with Minimap2 - minimap2 -t ~{cpu} "${intermediate_fasta}" "~{read1}" > "~{samplename}_round${i}.paf" + # Align reads to the current assembly + minimap2 -x map-ont -t ~{cpu} "${intermediate_fasta}" "~{read1}" > "~{samplename}_round${i}.paf" - # Run Racon for polishing + # Run Racon to polish the assembly racon \ -t ~{cpu} \ "~{read1}" \ @@ -34,12 +36,13 @@ task racon { "${intermediate_fasta}" \ > "~{samplename}_round${i}.polished.fasta" - # Update current_fasta for the next round + # Update for the next round intermediate_fasta="~{samplename}_round${i}.polished.fasta" done - # Move the final polished assembly to the output + # Save the final polished assembly mv "${intermediate_fasta}" "~{samplename}_final_polished.fasta" + echo "Polishing complete. Final assembly saved as ~{samplename}_final_polished.fasta" >>> output { File polished_fasta = "~{samplename}_final_polished.fasta" From 97df2ececfff369b4db31426a9399bab09f21872 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Thu, 12 Dec 2024 14:37:21 -0600 Subject: [PATCH 43/58] debugging potential memory usage increase for racon --- tasks/polishing/task_racon.wdl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tasks/polishing/task_racon.wdl b/tasks/polishing/task_racon.wdl index 4591bdd44..64e373563 100644 --- a/tasks/polishing/task_racon.wdl +++ b/tasks/polishing/task_racon.wdl @@ -6,7 +6,7 @@ task racon { File read1 Int polishing_rounds = 1 # Default: 1 polishing round Int cpu = 8 - Int memory = 16 + Int memory = 32 Int disk_size = 100 String samplename String docker = "us-docker.pkg.dev/general-theiagen/staphb/racon:1.5.0-minimap2" @@ -18,6 +18,13 @@ task racon { racon --version | tee -a RACON_VERSION echo "Starting Racon polishing process..." + + #debugging system info + echo "CPU Info:" + lscpu + echo "Memory Info:" + free -h + # Initialize the input assembly intermediate_fasta="~{unpolished_fasta}" From 6309913d3139cebcea91c1ace3f21bcc64c790d9 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Thu, 12 Dec 2024 15:57:48 -0600 Subject: [PATCH 44/58] more debugging for racon failing terra only --- tasks/polishing/task_racon.wdl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tasks/polishing/task_racon.wdl b/tasks/polishing/task_racon.wdl index 64e373563..aadbe8177 100644 --- a/tasks/polishing/task_racon.wdl +++ b/tasks/polishing/task_racon.wdl @@ -25,6 +25,9 @@ task racon { echo "Memory Info:" free -h + echo "Checking CPU compatibility in Terra..." + lscpu | grep -E 'sse|avx' + # Initialize the input assembly intermediate_fasta="~{unpolished_fasta}" @@ -61,7 +64,7 @@ task racon { memory: memory + " GB" cpu: cpu disks: "local-disk " + disk_size + " SSD" - maxRetries: 3 + maxRetries: 1 preemptible: 0 } } From fac2a23725d00986b7cd8bcba32f1a031039504d Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Thu, 12 Dec 2024 16:20:44 -0600 Subject: [PATCH 45/58] trying new dockerfile with updated cmake command for cpu compatibility --- tasks/polishing/task_racon.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/polishing/task_racon.wdl b/tasks/polishing/task_racon.wdl index aadbe8177..c1f36d952 100644 --- a/tasks/polishing/task_racon.wdl +++ b/tasks/polishing/task_racon.wdl @@ -9,7 +9,7 @@ task racon { Int memory = 32 Int disk_size = 100 String samplename - String docker = "us-docker.pkg.dev/general-theiagen/staphb/racon:1.5.0-minimap2" + String docker = "us-docker.pkg.dev/general-theiagen/staphb/racon:1.5.0-minimap2-generic" } command <<< set -euo pipefail From bc8fafae41b483b053607e7f85a398a95a8800ac Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Thu, 12 Dec 2024 21:02:42 -0600 Subject: [PATCH 46/58] trying test dockerfile with updated cpu installs --- tasks/polishing/task_racon.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/polishing/task_racon.wdl b/tasks/polishing/task_racon.wdl index c1f36d952..2a4b9aff6 100644 --- a/tasks/polishing/task_racon.wdl +++ b/tasks/polishing/task_racon.wdl @@ -9,7 +9,7 @@ task racon { Int memory = 32 Int disk_size = 100 String samplename - String docker = "us-docker.pkg.dev/general-theiagen/staphb/racon:1.5.0-minimap2-generic" + String docker = "us-docker.pkg.dev/general-theiagen/staphb/racon:1.5.0-minimap2-generic-v2" } command <<< set -euo pipefail From a379f6b7ad89722bfbc2f099bcb6c2782da4f97c Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Fri, 13 Dec 2024 08:19:11 -0600 Subject: [PATCH 47/58] trying new racon build with flags for cpu optimization on terra --- tasks/polishing/task_racon.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/polishing/task_racon.wdl b/tasks/polishing/task_racon.wdl index 2a4b9aff6..d32a3ba05 100644 --- a/tasks/polishing/task_racon.wdl +++ b/tasks/polishing/task_racon.wdl @@ -9,7 +9,7 @@ task racon { Int memory = 32 Int disk_size = 100 String samplename - String docker = "us-docker.pkg.dev/general-theiagen/staphb/racon:1.5.0-minimap2-generic-v2" + String docker = "us-docker.pkg.dev/general-theiagen/staphb/racon:1.5.0-minimap2-generic-v3" } command <<< set -euo pipefail From 3938c186a1a6a84b00c75526614adad7a0a2df5e Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Fri, 13 Dec 2024 13:37:28 -0600 Subject: [PATCH 48/58] Increase maxRetries for dnaapler and contig_filter tasks; add bandage_plot output to workflows --- tasks/assembly/task_dnaapler.wdl | 2 +- tasks/polishing/task_racon.wdl | 9 --------- .../read_filtering/task_filtercontigs.wdl | 2 ++ tasks/utilities/data_export/task_broad_terra_tools.wdl | 2 ++ workflows/theiaprok/wf_theiaprok_ont.wdl | 1 + workflows/utilities/wf_flye_denovo.wdl | 3 +-- 6 files changed, 7 insertions(+), 12 deletions(-) diff --git a/tasks/assembly/task_dnaapler.wdl b/tasks/assembly/task_dnaapler.wdl index dce0a5045..c97210da9 100644 --- a/tasks/assembly/task_dnaapler.wdl +++ b/tasks/assembly/task_dnaapler.wdl @@ -66,7 +66,7 @@ task dnaapler_all { memory: "~{memory} GB" disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" - maxRetries: 1 + maxRetries: 3 preemptible: 0 } } \ No newline at end of file diff --git a/tasks/polishing/task_racon.wdl b/tasks/polishing/task_racon.wdl index d32a3ba05..21a9572e2 100644 --- a/tasks/polishing/task_racon.wdl +++ b/tasks/polishing/task_racon.wdl @@ -19,15 +19,6 @@ task racon { echo "Starting Racon polishing process..." - #debugging system info - echo "CPU Info:" - lscpu - echo "Memory Info:" - free -h - - echo "Checking CPU compatibility in Terra..." - lscpu | grep -E 'sse|avx' - # Initialize the input assembly intermediate_fasta="~{unpolished_fasta}" diff --git a/tasks/quality_control/read_filtering/task_filtercontigs.wdl b/tasks/quality_control/read_filtering/task_filtercontigs.wdl index 27ec30a46..1910c3308 100644 --- a/tasks/quality_control/read_filtering/task_filtercontigs.wdl +++ b/tasks/quality_control/read_filtering/task_filtercontigs.wdl @@ -92,5 +92,7 @@ task contig_filter { cpu: cpu memory: "~{memory}G" disks: "local-disk " + disk_size + " SSD" + maxRetries: 3 + preemptible: 0 } } diff --git a/tasks/utilities/data_export/task_broad_terra_tools.wdl b/tasks/utilities/data_export/task_broad_terra_tools.wdl index 9b609ab7c..e578624c0 100644 --- a/tasks/utilities/data_export/task_broad_terra_tools.wdl +++ b/tasks/utilities/data_export/task_broad_terra_tools.wdl @@ -81,6 +81,7 @@ task export_taxon_tables { String? tiptoft_plasmid_replicon_genes String? tiptoft_version File? assembly_fasta + File? bandage_plot File? contigs_gfa String? shovill_pe_version String? shovill_se_version @@ -491,6 +492,7 @@ task export_taxon_tables { "tiptoft_plasmid_replicon_genes": "~{tiptoft_plasmid_replicon_genes}", "tiptoft_version": "~{tiptoft_version}", "assembly_fasta": "~{assembly_fasta}", + "bandage_plot": "~{bandage_plot}", "contigs_gfa": "~{contigs_gfa}", "shovill_pe_version": "~{shovill_pe_version}", "shovill_se_version": "~{shovill_se_version}", diff --git a/workflows/theiaprok/wf_theiaprok_ont.wdl b/workflows/theiaprok/wf_theiaprok_ont.wdl index 404db79e8..71db65af9 100644 --- a/workflows/theiaprok/wf_theiaprok_ont.wdl +++ b/workflows/theiaprok/wf_theiaprok_ont.wdl @@ -592,6 +592,7 @@ workflow theiaprok_ont { # Assembly - flye_denovo outputs File? assembly_fasta = flye_denovo.final_assembly File? contigs_gfa = flye_denovo.contigs_gfa + File? bandage_plot = flye_denovo.bandage_plot # Assembly QC - quast outputs File? quast_report = quast.quast_report String? quast_version = quast.version diff --git a/workflows/utilities/wf_flye_denovo.wdl b/workflows/utilities/wf_flye_denovo.wdl index 01ca33d8e..66224a451 100644 --- a/workflows/utilities/wf_flye_denovo.wdl +++ b/workflows/utilities/wf_flye_denovo.wdl @@ -220,9 +220,8 @@ workflow flye_denovo { memory = dnaapler_memory, disk_size = dnaapler_disk_size } - output { + output { File final_assembly = dnaapler.reoriented_fasta - #add into terra export table File bandage_plot = bandage.plot File contigs_gfa = flye.assembly_graph_gfa String flye_phb_version = version_capture.phb_version From 01ebb7b6163758a56a02a05e1302734d5fe2ef8f Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Fri, 13 Dec 2024 13:57:21 -0600 Subject: [PATCH 49/58] docs update theiaprok --- docs/workflows/genomic_characterization/theiaprok.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/workflows/genomic_characterization/theiaprok.md b/docs/workflows/genomic_characterization/theiaprok.md index 1bca4a97c..bdfb282e7 100644 --- a/docs/workflows/genomic_characterization/theiaprok.md +++ b/docs/workflows/genomic_characterization/theiaprok.md @@ -146,7 +146,6 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | clean_check_reads | **organism** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | | clean_check_reads | **workflow_series** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | | flye_denovo | **auto_medaka_model** | Boolean | If true, medaka will automatically select the best Medaka model for assembly | TRUE | Optional | ONT | -| flye_denovo | **illumina_polishing_rounds** | Int | Number of polishing rounds to conduct with Illumina data | 1 | Optional | ONT | | flye_denovo | **illumina_read1** | File | If Illumina reads are provided, flye_denovo subworkflow will perform Illumina polishing | | Optional | ONT | | flye_denovo | **illumina_read2** | File | If Illumina reads are provided, flye_denovo subworflow will perform Illumina polishing | | Optional | ONT | | flye_denovo | **medaka_model_override** | String | The model of medaka to use for assembly | r1041_e82_400bps_sup_v5.0.0 | Optional | ONT | @@ -1668,11 +1667,12 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | bakta_summary | File | Bakta summary output TXT file | FASTA, ONT, PE, SE | | bakta_tsv | File | Annotations as simple human readable TSV | FASTA, ONT, PE, SE | | bakta_version | String | Bakta version used | FASTA, ONT, PE, SE | +| bandage_plot | File | Image file (PNG) visualizing the Flye assembly graph generated by Bandage | FASTA, ONT, PE, SE | | bbduk_docker | String | BBDuk docker image used | PE, SE | | busco_database | String | BUSCO database used | FASTA, ONT, PE, SE | | busco_docker | String | BUSCO docker image used | FASTA, ONT, PE, SE | | busco_report | File | A plain text summary of the results in BUSCO notation | FASTA, ONT, PE, SE | -| busco_results | String | BUSCO results (see https://www.notion.so/TheiaProk-Workflow-Series-68c34aca2a0240ef94fef0acd33651b9?pvs=21) | FASTA, ONT, PE, SE | +| busco_results | String | BUSCO results (see https://www.notion.so/TheiaProk-Workflow-Series-68c34aca2a0240ef94fef0acd33651b9?pvs=21) | ONT | | busco_version | String | BUSCO software version used | FASTA, ONT, PE, SE | | cg_pipeline_docker | String | Docker file used for running CG-Pipeline on cleaned reads | PE, SE | | cg_pipeline_report_clean | File | TSV file of read metrics from clean reads, including average read length, number of reads, and estimated genome coverage | PE, SE | @@ -1684,9 +1684,8 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | combined_mean_readlength_clean | Float | Mean read length for the combined clean reads | PE | | combined_mean_readlength_raw | Float | Mean read length for the combined raw reads | PE | | contigs_fastg | File | Assembly graph if megahit used for genome assembly | PE | -| contigs_gfa | File | Assembly graph if spades used for genome assembly | ONT, PE, SE | +| contigs_gfa | File | Assembly graph if spades or Flye is used for genome assembly | ONT, PE, SE | | contigs_lastgraph | File | Assemb7ly graph if velvet used for genome assembly | PE | -| dragonflye_version | String | Version of dragonflye used for de novo assembly | ONT | | ectyper_predicted_serotype | String | Serotype predicted by ECTyper | FASTA, ONT, PE, SE | | ectyper_results | File | TSV file of evidence for ECTyper predicted serotype (see https://github.com/phac-nml/ecoli_serotyping#report-format) | FASTA, ONT, PE, SE | | ectyper_version | String | Version of ECTyper used | FASTA, ONT, PE, SE | @@ -1717,6 +1716,7 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | fastqc_raw1_html | File | Graphical visualization of raw forward read quality from fastqc to open in an internet browser | PE, SE | | fastqc_raw2_html | File | Graphical visualization of raw reverse read qualityfrom fastqc to open in an internet browser | PE | | fastqc_version | String | Version of fastqc software used | PE, SE | +| final_assembly | File | Final assembly file output from flye_denovo assembly| FASTA, ONT, PE, SE | | gambit_closest_genomes | File | CSV file listing genomes in the GAMBIT database that are most similar to the query assembly | FASTA, ONT, PE, SE | | gambit_db_version | String | Version of GAMBIT used | FASTA, ONT, PE, SE | | gambit_docker | String | GAMBIT docker file used | FASTA, ONT, PE, SE | From 84df24f91d6571565f282ba8f1c057803739a158 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Fri, 13 Dec 2024 15:54:01 -0600 Subject: [PATCH 50/58] Refactor workflows to standardize assembly output variable names; increase maxRetries for Medaka task and capture selected model, update docs theiaprok --- .../genomic_characterization/theiaprok.md | 184 ++++++++++++++---- tasks/alignment/task_bwa.wdl | 2 +- tasks/polishing/task_medaka.wdl | 4 +- workflows/theiaprok/wf_theiaprok_ont.wdl | 30 +-- workflows/utilities/wf_flye_denovo.wdl | 18 +- 5 files changed, 175 insertions(+), 63 deletions(-) diff --git a/docs/workflows/genomic_characterization/theiaprok.md b/docs/workflows/genomic_characterization/theiaprok.md index bdfb282e7..21ea06229 100644 --- a/docs/workflows/genomic_characterization/theiaprok.md +++ b/docs/workflows/genomic_characterization/theiaprok.md @@ -146,15 +146,67 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | clean_check_reads | **organism** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | | clean_check_reads | **workflow_series** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | | flye_denovo | **auto_medaka_model** | Boolean | If true, medaka will automatically select the best Medaka model for assembly | TRUE | Optional | ONT | +| flye_denovo | **bandage_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | ONT | +| flye_denovo | **bandage_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 10 | Optional | ONT | +| flye_denovo | **bandage_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | ONT | +| flye_denovo | **dnaapler_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | +| flye_denovo | **dnaapler_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | +| flye_denovo | **dnaapler_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | ONT | +| flye_denovo | **dnaapler_mode** | String | Dnaapler-specific inputs | all | Optional | ONT | +| flye_denovo | **filtercontigs_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | +| flye_denovo | **filtercontigs_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | +| flye_denovo | **filtercontigs_homopolymers** | Boolean | If true, filters out homopolymers | TRUE | Optional | ONT | +| flye_denovo | **filtercontigs_min_len** | Int | Minimum contig length to keep | 1000 | Optional | ONT | +| flye_denovo | **filtercontigs_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | ONT | +| flye_denovo | **flye_ont_corrected** | File | If true, uses ONT corrected reads for assembly | FALSE | Optional | ONT | +| flye_denovo | **flye_ont_high_quality** | File | If true, uses ONT high-quality reads for assembly | FALSE | Optional | ONT | +| flye_denovo | **flye_pacbio_raw** | File | If true, uses PacBio raw reads for assembly | FALSE | Optional | ONT | +| flye_denovo | **flye_pacbio_corrected** | File | If true, uses PacBio corrected reads for assembly | FALSE | Optional | ONT | +| flye_denovo | **flye_pacbio_hifi** | File | If true, uses PacBio HiFi reads for assembly | FALSE | Optional | ONT | +| flye_denovo | **flye_genome_length** | Int | User-specified expected genome length to be used in genome statistics calculations | | Optional | ONT | +| flye_denovo | **flye_asm_coverage** | Int | Reduced coverage for initial disjointig assembly | | Optional | ONT | +| flye_denovo | **flye_polishing_iterations** | Int | Default polishing iterations | 1 | Optional | ONT | +| flye_denovo | **flye_minimum_overlap** | Int | Minimum overlap between reads | | Optional | ONT | +| flye_denovo | **flye_read_error_rate** | Float | Maximum expected read error rate | | Optional | ONT | +| flye_denovo | **flye_uneven_coverage_mode** | Boolean | | FALSE | Optional | ONT | +| flye_denovo | **flye_keep_haplotypes** | Boolean | If true keep haplotypes | FALSE | Optional | ONT | +| flye_denovo | **flye_no_alt_contigs** | Boolean | If true, do not generate alternative contigs | FALSE | Optional | ONT | +| flye_denovo | **flye_scaffold** | Boolean | If true, scaffold | FALSE | Optional | ONT | +| flye_denovo | **flye_additional_parameters** | String | Any extra Flye-specific parameters | | Optional | ONT | +| flye_denovo | **flye_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | +| flye_denovo | **flye_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | ONT | +| flye_denovo | **flye_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | | flye_denovo | **illumina_read1** | File | If Illumina reads are provided, flye_denovo subworkflow will perform Illumina polishing | | Optional | ONT | | flye_denovo | **illumina_read2** | File | If Illumina reads are provided, flye_denovo subworflow will perform Illumina polishing | | Optional | ONT | | flye_denovo | **medaka_model_override** | String | The model of medaka to use for assembly | r1041_e82_400bps_sup_v5.0.0 | Optional | ONT | +| flye_denovo | **medaka_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | +| flye_denovo | **medaka_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | ONT | +| flye_denovo | **medaka_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | | flye_denovo | **polisher** | String | The polishing tool to use for assembly | medaka | Optional | ONT | | flye_denovo | **polishing_rounds** | Int | The number of polishing rounds to conduct for medaka or racon (without Illumina) | 1 | Optional | ONT | +| flye_denovo | **porechop_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | ONT | +| flye_denovo | **porechop_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | +| flye_denovo | **porechop_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | ONT | +| flye_denovo | **porechop_trimopts** | String | Options to pass to Porechop for trimming | | Optional | ONT | +| flye_denovo | **polypolish_pair_orientation** | String | Polypolish-specific inputs | | Optional | ONT | +| flye_denovo | **polypolish_low_percentile_threshold** | Float | Polypolish-specific inputs | | Optional | ONT | +| flye_denovo | **polypolish_high_percentile_threshold** | Float | Polypolish-specific inputs | | Optional | ONT | +| flye_denovo | **polypolish_fraction_invalid** | Float | Polypolish-specific inputs | | Optional | ONT | +| flye_denovo | **polypolish_fraction_valid** | Float | Polypolish-specific inputs | | Optional | ONT | +| flye_denovo | **polypolish_maximum_errors** | Int | Polypolish-specific inputs | | Optional | ONT | +| flye_denovo | **polypolish_minimum_depth** | Int | Polypolish-specific inputs | | Optional | ONT | +| flye_denovo | **polypolish_careful** | Boolean | Polypolish-specific inputs | FALSE | Optional | ONT | +| flye_denovo | **polypolish_cpu** | Int | Polypolish cpu | 1 | Optional | ONT | +| flye_denovo | **polypolish_memory** | Int | Polypolish memory | 8 | Optional | ONT | +| flye_denovo | **polypolish_disk_size** | Int | Polypolish disk size | 100 | Optional | ONT | +| flye_denovo | **racon_cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | ONT | +| flye_denovo | **racon_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | ONT | +| flye_denovo | **racon_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | | flye_denovo | **read1** | File | ONT read file in FASTQ file format (compression optional) | | Optional | ONT | | flye_denovo | **skip_trim_reads** | Boolean | If true, trims reads before assembly | FALSE | Optional | ONT | | flye_denovo | **skip_polishing** | Boolean | If true, skips polishing | FALSE | Optional | ONT | | export_taxon_tables | **asembly_fasta** | File | Internal component, do not modify | | Do not modify, Optional | FASTA | +| export_taxon_tables | **bandage_plot** | File | Internal component, do not modify | | Do not modify, Optional | ONT | | export_taxon_tables | **bbduk_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT | | export_taxon_tables | **cg_pipeline_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT | | export_taxon_tables | **cg_pipeline_report_clean** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | @@ -167,7 +219,6 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | export_taxon_tables | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | FASTA, ONT, PE, SE | | export_taxon_tables | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | FASTA, ONT, PE, SE | | export_taxon_tables | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16 | Optional | FASTA, ONT, PE, SE | -| export_taxon_tables | **dragonflye_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | | export_taxon_tables | **emmtypingtool_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT, SE | | export_taxon_tables | **emmtypingtool_emm_type** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | | export_taxon_tables | **emmtypingtool_results_xml** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | @@ -749,44 +800,88 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al ??? task "`Flye`: _De novo_ Assembly" - `Flye` is a sub-workflow that performs _de novo_ assembly using Flye for ONT data and supports additional polishing and visualization steps. The detailed steps and tasks are as follows: - - ### Workflow Steps - - 1. **Version Capture**: - - Captures software and runtime versions for reproducibility. - - 2. **Optional Read Trimming**: - - Utilizes `task_porechop.wdl` to trim input reads if trimming is enabled. - - 3. **_De novo_ Assembly**: - - Uses `task_flye.wdl` for assembling long reads into contigs. - - 4. **Graph Visualization**: - - Employs `task_bandageplot.wdl` to visualize the assembly graph using Bandage. - - 5. **Polishing**: - - Supports polishing the assembly from Flye using: - - **Medaka** (`task_medaka.wdl`) for long reads. - - **Racon** (`task_racon.wdl`) for long reads. - - **Polypolish** (`task_polypolish.wdl`) for hybrid assemblies with Illumina data. - - 6. **Contig Filtering**: - - Filters contigs using `task_filtercontigs.wdl` based on user-defined criteria. - - 7. **Final Assembly Orientation**: - - Reorients contigs using `task_dnaapler.wdl` to start at a standard reference point. - - !!! techdetails "Flye_Denovo Technical Details" - | | Links | - | --- | --- | - | Subworkflow | [wf_flye_denovo.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_flye_denovo.wdl) | - | Assembly Task | [task_flye.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_flye.wdl) | - | Polishing Tasks | [task_medaka.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/polishing/task_medaka.wdl), [task_racon.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/polishing/task_racon.wdl), [task_polypolish.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/polishing/task_polypolish.wdl) | - | Contig Filtering Task | [task_filtercontigs.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_filtercontigs.wdl) | - | Final Orientation Task | [task_dnaapler.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_dnaapler.wdl) | - | Software Source Code | [Flye on GitHub](https://github.com/fenderglass/Flye) | - | Software Documentation | [Flye Documentation](https://github.com/fenderglass/Flye/blob/flye/docs/USAGE.md) | + `Flye_denovo` is a sub-workflow that performs _de novo_ assembly using Flye for ONT data and supports additional polishing and visualization steps. The detailed steps and tasks are as follows: + + ??? toggle "`Porechop`: Optional Read Trimming" + - Trims input reads if trimming is enabled using `task_porechop.wdl`. + + !!! techdetails "Technical Details: Porechop" + - **Purpose**: Ensures high-quality reads by removing adapters and other unwanted sequences. + - **WDL Task**: [task_porechop.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_porechop.wdl) + - **Software Source Code**: [Porechop on GitHub](https://github.com/rrwick/Porechop) + - **Software Documentation**: [Porechop Documentation](https://github.com/rrwick/Porechop#porechop) + + ??? toggle "`Flye`: De novo_ Assembly" + - Assembles long reads into contigs using `task_flye.wdl`. + + !!! techdetails "Technical Details: Flye" + - **Purpose**: Assembles raw reads into contiguous sequences (contigs) for further analysis. + - **WDL Task**: [task_flye.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_flye.wdl) + - **Software Source Code**: [Flye on GitHub](https://github.com/fenderglass/Flye) + - **Software Documentation**: [Flye Documentation](https://github.com/fenderglass/Flye/blob/flye/docs/USAGE.md) + + ??? toggle "`Bandage`: Graph Visualization" + - Visualizes the assembly graph output from `Flye` using `task_bandageplot.wdl`. + + !!! techdetails "Technical Details: Bandage" + - **Purpose**: Generates a graphical representation of the assembly to aid visualization and quality checks. + - **WDL Task**: [task_bandageplot.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_bandageplot.wdl) + - **Software Source Code**: [Bandage on GitHub](https://github.com/rrwick/Bandage) + - **Software Documentation**: [Bandage Documentation](https://github.com/rrwick/Bandage#bandage) + + ??? toggle "`BWA`: Index and align data for Hybrid Assembly Alignment ==_for ONT and Illumina data_==" + - Indexes and creates SAM files for input to Polypolish using `task_bwaall.wdl`. + + !!! techdetails "Technical Details: BWA" + - **Purpose**: Aligns Illumina reads to the draft assembly and produces SAM files for hybrid polishing. + - **WDL Task**: [task_bwaall.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/alignment/task_bwa.wdl) + - **Software Source Code**: [BWA on GitHub](https://github.com/lh3/bwa) + - **Software Documentation**: [BWA Documentation](http://bio-bwa.sourceforge.net/) + + ??? toggle "`Polypolish`: Hybrid Assembly Polishing ==_for ONT and Illumina data_==" + - Polishes assemblies with hybrid ONT and Illumina data using `task_polypolish.wdl`. + + !!! techdetails "Technical Details: Polypolish" + - **Purpose**: Enhances hybrid assemblies by leveraging short-read data for fine-tuning. + - **WDL Task**: [task_polypolish.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/polishing/task_polypolish.wdl) + - **Software Source Code**: [Polypolish on GitHub](https://github.com/rrwick/Polypolish) + - **Software Documentation**: [Polypolish Documentation](https://github.com/rrwick/Polypolish#polypolish) + + ??? toggle "`Medaka`: Polishing of Flye assembly" + - Polishes the Flye assembly using `task_medaka.wdl`. + + !!! techdetails "Technical Details: Medaka" + - **Purpose**: Refines assembly accuracy for ONT data by incorporating error corrections. + - **WDL Task**: [task_medaka.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/polishing/task_medaka.wdl) + - **Software Source Code**: [Medaka on GitHub](https://github.com/nanoporetech/medaka) + - **Software Documentation**: [Medaka Documentation](https://github.com/nanoporetech/medaka#medaka) + + ??? toggle "`Racon`: Polishing" + - Offers an alternative polishing approach using `task_racon.wdl`. + + !!! techdetails "Technical Details: Racon" + - **Purpose**: Improves assembly accuracy for long-read data using sequence alignments. + - **WDL Task**: [task_racon.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/polishing/task_racon.wdl) + - **Software Source Code**: [Racon on GitHub](https://github.com/lbcb-sci/racon) + - **Software Documentation**: [Racon Documentation](https://github.com/lbcb-sci/racon#racon) + + ??? toggle "`Filter Contigs`: filter contigs below a threshold length and remove homopolymer contigs" + - Filters contigs based on user-defined criteria using `task_filtercontigs.wdl`. + + !!! techdetails "Technical Details: Filter Contigs" + - **Purpose**: Removes low-quality or short contigs to improve downstream analyses. + - **WDL Task**: [task_filtercontigs.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_filtercontigs.wdl) + - **Software Source Code**: [FilterContigs on GitHub](https://github.com/your-organization/filtercontigs) (replace with actual link) + - **Software Documentation**: [FilterContigs Documentation](https://github.com/your-organization/filtercontigs#readme) (replace with actual link) + + ??? toggle "`DnaApler`: Final Assembly Orientation" + - Reorients contigs to start at a standard reference point using `task_dnaapler.wdl`. + + !!! techdetails "Technical Details: DnaApler" + - **Purpose**: Ensures consistent starting positions for assemblies based on reference standards. + - **WDL Task**: [task_dnaapler.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_dnaapler.wdl) + - **Software Source Code**: [DnaApler on GitHub](https://github.com/your-organization/dnaapler) (replace with actual link) + - **Software Documentation**: [DnaApler Documentation](https://github.com/your-organization/dnaapler#readme) (replace with actual link) #### Post-Assembly Tasks (performed for all taxa) @@ -1660,7 +1755,7 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | ani_mummer_version | String | Version of MUMmer used | FASTA, ONT, PE, SE | | ani_output_tsv | File | Full output TSV from ani-m | FASTA, ONT, PE, SE | | ani_top_species_match | String | Species of genome with highest ANI to query FASTA | FASTA, ONT, PE, SE | -| assembly_fasta | File | https://github.com/tseemann/shovill#contigsfa | ONT, PE, SE | +| assembly_fasta | File | Fasta file outputted from Flye or Shovill | ONT, PE, SE | | assembly_length | Int | Length of assembly (total contig length) as determined by QUAST | FASTA, ONT, PE, SE | | bakta_gbff | File | Genomic GenBank format annotation file | FASTA, ONT, PE, SE | | bakta_gff3 | File | Generic Feature Format Version 3 file | FASTA, ONT, PE, SE | @@ -1668,12 +1763,14 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | bakta_tsv | File | Annotations as simple human readable TSV | FASTA, ONT, PE, SE | | bakta_version | String | Bakta version used | FASTA, ONT, PE, SE | | bandage_plot | File | Image file (PNG) visualizing the Flye assembly graph generated by Bandage | FASTA, ONT, PE, SE | +| bandage_version | String | Version of Bandage used | FASTA, ONT, PE, SE | | bbduk_docker | String | BBDuk docker image used | PE, SE | | busco_database | String | BUSCO database used | FASTA, ONT, PE, SE | | busco_docker | String | BUSCO docker image used | FASTA, ONT, PE, SE | | busco_report | File | A plain text summary of the results in BUSCO notation | FASTA, ONT, PE, SE | | busco_results | String | BUSCO results (see https://www.notion.so/TheiaProk-Workflow-Series-68c34aca2a0240ef94fef0acd33651b9?pvs=21) | ONT | | busco_version | String | BUSCO software version used | FASTA, ONT, PE, SE | +| bwa_version | String | Version of BWA software used | FASTA, ONT, PE, SE | | cg_pipeline_docker | String | Docker file used for running CG-Pipeline on cleaned reads | PE, SE | | cg_pipeline_report_clean | File | TSV file of read metrics from clean reads, including average read length, number of reads, and estimated genome coverage | PE, SE | | cg_pipeline_report_raw | File | TSV file of read metrics from raw reads, including average read length, number of reads, and estimated genome coverage | PE, SE | @@ -1686,6 +1783,7 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | contigs_fastg | File | Assembly graph if megahit used for genome assembly | PE | | contigs_gfa | File | Assembly graph if spades or Flye is used for genome assembly | ONT, PE, SE | | contigs_lastgraph | File | Assemb7ly graph if velvet used for genome assembly | PE | +| dnaapler_version | String | Version of dnaAplER used | FASTA, ONT, PE, SE | | ectyper_predicted_serotype | String | Serotype predicted by ECTyper | FASTA, ONT, PE, SE | | ectyper_results | File | TSV file of evidence for ECTyper predicted serotype (see https://github.com/phac-nml/ecoli_serotyping#report-format) | FASTA, ONT, PE, SE | | ectyper_version | String | Version of ECTyper used | FASTA, ONT, PE, SE | @@ -1717,6 +1815,7 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | fastqc_raw2_html | File | Graphical visualization of raw reverse read qualityfrom fastqc to open in an internet browser | PE | | fastqc_version | String | Version of fastqc software used | PE, SE | | final_assembly | File | Final assembly file output from flye_denovo assembly| FASTA, ONT, PE, SE | +| flye_version | String | Version of Flye software used | FASTA, ONT, PE, SE | | gambit_closest_genomes | File | CSV file listing genomes in the GAMBIT database that are most similar to the query assembly | FASTA, ONT, PE, SE | | gambit_db_version | String | Version of GAMBIT used | FASTA, ONT, PE, SE | | gambit_docker | String | GAMBIT docker file used | FASTA, ONT, PE, SE | @@ -1774,6 +1873,8 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | lissero_results | File | TSV results file from LisSero (see https://github.com/MDU-PHL/LisSero#example-output) | FASTA, ONT, PE, SE | | lissero_serotype | String | Serotype predicted by LisSero | FASTA, ONT, PE, SE | | lissero_version | String | Version of LisSero used | FASTA, ONT, PE, SE | +| medaka_model | String | Model used by Medaka | ONT | +| medaka_version | String | Version of Medaka used | ONT | | meningotype_BAST | String | BAST type | FASTA, ONT, PE, SE | | meningotype_FetA | String | FetA type | FASTA, ONT, PE, SE | | meningotype_fHbp | String | fHbp type | FASTA, ONT, PE, SE | @@ -1846,11 +1947,13 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | plasmidfinder_plasmids | String | Names of plasmids identified by PlasmidFinder | FASTA, ONT, PE, SE | | plasmidfinder_results | File | Output file from PlasmidFinder in TSV format | FASTA, ONT, PE, SE | | plasmidfinder_seqs | File | Hit_in_genome_seq.fsa file produced by PlasmidFinder | FASTA, ONT, PE, SE | +| polypolish_version | String | Version of Polypolish used | FASTA, ONT, PE, SE | | poppunk_docker | String | PopPUNK docker image with GPSC database used | FASTA, ONT, PE, SE | | poppunk_gps_cluster | String | GPS cluster predicted by PopPUNK | FASTA, ONT, PE, SE | | poppunk_GPS_db_version | String | Version of GPSC database used | FASTA, ONT, PE, SE | | poppunk_gps_external_cluster_csv | File | GPSC v6 scheme designations | FASTA, ONT, PE, SE | | poppunk_version | String | Version of PopPUNK used | FASTA, ONT, PE, SE | +| porechop_version | String | Version of Porechop used | ONT | | prokka_gbk | File | GenBank file produced from Prokka annotation of input FASTA | FASTA, ONT, PE, SE | | prokka_gff | File | Prokka output GFF3 file containing sequence and annotation (you can view this in IGV) | FASTA, ONT, PE, SE | | prokka_sqn | File | A Sequin file for GenBank submission | FASTA, ONT, PE, SE | @@ -1867,6 +1970,7 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | r2_mean_q_raw | Float | Mean quality score of raw reverse reads | PE | | r2_mean_readlength_clean | Float | Mean read length of clean reverse reads | PE | | r2_mean_readlength_raw | Float | Mean read length of raw reverse reads | PE | +| racon_version | String | Version of Racon used | ONT | | rasusa_version | String | Version of RASUSA used for analysis | ONT | | read_screen_clean | String | PASS or FAIL result from clean read screening; FAIL accompanied by the reason for failure | ONT, PE, SE | | read_screen_raw | String | PASS or FAIL result from raw read screening; FAIL accompanied by thereason for failure | ONT, PE, SE | diff --git a/tasks/alignment/task_bwa.wdl b/tasks/alignment/task_bwa.wdl index 9c908f3bb..ec7177d4d 100644 --- a/tasks/alignment/task_bwa.wdl +++ b/tasks/alignment/task_bwa.wdl @@ -171,7 +171,7 @@ task bwa_all { bwa &> BWA_HELP grep "Version" BWA_HELP | cut -d" " -f2 > BWA_VERSION - if [[ ! -f "~{draft_assembly_fasta}.bwt" ]]; then + if [[ ! -f "~{draft_assembly_fasta}.bwt" ]]; then echo "Indexing reference genome: ~{draft_assembly_fasta}" bwa index ~{draft_assembly_fasta} else diff --git a/tasks/polishing/task_medaka.wdl b/tasks/polishing/task_medaka.wdl index 66b239a66..c07a92fd8 100644 --- a/tasks/polishing/task_medaka.wdl +++ b/tasks/polishing/task_medaka.wdl @@ -42,6 +42,7 @@ task medaka_consensus { fi echo "Using Medaka model for polishing: $selected_model" + echo ~[selected_model] > MEDAKA_MODEL # Perform Medaka polishing medaka_consensus \ @@ -56,6 +57,7 @@ task medaka_consensus { output { File medaka_fasta = "~{samplename}.polished.fasta" String medaka_version = read_string("MEDAKA_VERSION") + String medaka_model = read_string("MEDAKA_MODEL") } runtime { docker: "~{docker}" @@ -63,7 +65,7 @@ task medaka_consensus { memory: "~{memory} GB" disks: "local-disk " + disk_size + " SSD" disk: disk_size + " GB" - maxRetries: 1 + maxRetries: 3 preemptible: 0 } } diff --git a/workflows/theiaprok/wf_theiaprok_ont.wdl b/workflows/theiaprok/wf_theiaprok_ont.wdl index 71db65af9..1043fbc88 100644 --- a/workflows/theiaprok/wf_theiaprok_ont.wdl +++ b/workflows/theiaprok/wf_theiaprok_ont.wdl @@ -104,7 +104,7 @@ workflow theiaprok_ont { } call quast_task.quast { input: - assembly = flye_denovo.final_assembly, + assembly = flye_denovo.assembly_fasta, samplename = samplename } # nanoplot for basic QC metrics @@ -122,73 +122,73 @@ workflow theiaprok_ont { } call busco_task.busco { input: - assembly = flye_denovo.final_assembly, + assembly = flye_denovo.assembly_fasta, samplename = samplename } if (perform_characterization) { call gambit_task.gambit { input: - assembly = flye_denovo.final_assembly, + assembly = flye_denovo.assembly_fasta, samplename = samplename } if (call_ani) { call ani_task.animummer as ani { input: - assembly = flye_denovo.final_assembly, + assembly = flye_denovo.assembly_fasta, samplename = samplename } } if (call_kmerfinder) { call kmerfinder_task.kmerfinder_bacteria as kmerfinder { input: - assembly = flye_denovo.final_assembly, + assembly = flye_denovo.assembly_fasta, samplename = samplename } } call amrfinderplus.amrfinderplus_nuc as amrfinderplus_task { input: - assembly = flye_denovo.final_assembly, + assembly = flye_denovo.assembly_fasta, samplename = samplename, organism = select_first([expected_taxon, gambit.gambit_predicted_taxon]) } if (call_resfinder) { call resfinder_task.resfinder as resfinder_task { input: - assembly = flye_denovo.final_assembly, + assembly = flye_denovo.assembly_fasta, samplename = samplename, organism = select_first([expected_taxon, gambit.gambit_predicted_taxon]) } } call ts_mlst_task.ts_mlst { input: - assembly = flye_denovo.final_assembly, + assembly = flye_denovo.assembly_fasta, samplename = samplename } if (genome_annotation == "prokka") { call prokka_task.prokka { input: - assembly = flye_denovo.final_assembly, + assembly = flye_denovo.assembly_fasta, samplename = samplename } } if (genome_annotation == "bakta") { call bakta_task.bakta { input: - assembly = flye_denovo.final_assembly, + assembly = flye_denovo.assembly_fasta, samplename = samplename } } if (call_plasmidfinder) { call plasmidfinder_task.plasmidfinder { input: - assembly = flye_denovo.final_assembly, + assembly = flye_denovo.assembly_fasta, samplename = samplename } } if (call_abricate) { call abricate_task.abricate { input: - assembly = flye_denovo.final_assembly, + assembly = flye_denovo.assembly_fasta, samplename = samplename, database = abricate_db } @@ -219,7 +219,7 @@ workflow theiaprok_ont { call merlin_magic_workflow.merlin_magic { input: merlin_tag = select_first([expected_taxon, gambit.merlin_tag]), - assembly = flye_denovo.final_assembly, + assembly = flye_denovo.assembly_fasta, samplename = samplename, read1 = read_qc_trim.read1_clean, ont_data = true @@ -276,7 +276,7 @@ workflow theiaprok_ont { tiptoft_plasmid_replicon_fastq = read_qc_trim.tiptoft_plasmid_replicon_fastq, tiptoft_plasmid_replicon_genes = read_qc_trim.tiptoft_plasmid_replicon_genes, tiptoft_version = read_qc_trim.tiptoft_version, - assembly_fasta = flye_denovo.final_assembly, + assembly_fasta = flye_denovo.assembly_fasta, contigs_gfa = flye_denovo.contigs_gfa, quast_report = quast.quast_report, quast_version = quast.version, @@ -590,7 +590,7 @@ workflow theiaprok_ont { String? tiptoft_plasmid_replicon_genes = read_qc_trim.tiptoft_plasmid_replicon_genes String? tiptoft_version = read_qc_trim.tiptoft_version # Assembly - flye_denovo outputs - File? assembly_fasta = flye_denovo.final_assembly + File? assembly_fasta = flye_denovo.assembly_fasta File? contigs_gfa = flye_denovo.contigs_gfa File? bandage_plot = flye_denovo.bandage_plot # Assembly QC - quast outputs diff --git a/workflows/utilities/wf_flye_denovo.wdl b/workflows/utilities/wf_flye_denovo.wdl index 66224a451..db373d8be 100644 --- a/workflows/utilities/wf_flye_denovo.wdl +++ b/workflows/utilities/wf_flye_denovo.wdl @@ -96,9 +96,6 @@ workflow flye_denovo { Boolean skip_trim_reads = true # Default: No trimming Boolean skip_polishing = false # Default: Polishing enabled } - call versioning_task.version_capture { - input: - } # Optional Porechop trimming before Flye if (!skip_trim_reads) { call task_porechop.porechop as porechop { @@ -221,10 +218,19 @@ workflow flye_denovo { disk_size = dnaapler_disk_size } output { - File final_assembly = dnaapler.reoriented_fasta + #update this throughout theiprok + File assembly_fasta = dnaapler.reoriented_fasta File bandage_plot = bandage.plot File contigs_gfa = flye.assembly_graph_gfa - String flye_phb_version = version_capture.phb_version - String flye_analysis_date = version_capture.date + String? medaka_model = medaka.medaka_model + #add task versions + String? porechop_version = porechop.porechop_version + String flye_version = flye.flye_version + String bandage_version = bandage.bandage_version + String? medaka_version = medaka.medaka_version + String? racon_version = racon.racon_version + String? bwa_version = bwa.bwa_version + String? polypolish_version = polypolish.polypolish_version + String dnaapler_version = dnaapler.dnaapler_version } } \ No newline at end of file From 2ec4e8fb59e492972feea856fcc1b7623297c753 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 16 Dec 2024 07:59:06 -0600 Subject: [PATCH 51/58] add versions output to theiaprok --- workflows/theiaprok/wf_theiaprok_ont.wdl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/workflows/theiaprok/wf_theiaprok_ont.wdl b/workflows/theiaprok/wf_theiaprok_ont.wdl index 1043fbc88..7a40e8dd9 100644 --- a/workflows/theiaprok/wf_theiaprok_ont.wdl +++ b/workflows/theiaprok/wf_theiaprok_ont.wdl @@ -593,6 +593,14 @@ workflow theiaprok_ont { File? assembly_fasta = flye_denovo.assembly_fasta File? contigs_gfa = flye_denovo.contigs_gfa File? bandage_plot = flye_denovo.bandage_plot + String? porechop_version = flye_denovo.porechop_version + String flye_version = flye_denovo.flye_version + String bandage_version = flye_denovo.bandage_version + String? medaka_version = flye_denovo.medaka_version + String? racon_version = flye_denovo.racon_version + String? bwa_version = flye_denovo.bwa_version + String? polypolish_version = pflye_denovo.polypolish_version + String dnaapler_version = flye_denovo.dnaapler_version # Assembly QC - quast outputs File? quast_report = quast.quast_report String? quast_version = quast.version From 9ffc554cbb87f44c3fba1060ca990333f1afb86d Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 16 Dec 2024 08:41:34 -0600 Subject: [PATCH 52/58] update theiaprok wf --- workflows/theiaprok/wf_theiaprok_ont.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/theiaprok/wf_theiaprok_ont.wdl b/workflows/theiaprok/wf_theiaprok_ont.wdl index 7a40e8dd9..5bf102081 100644 --- a/workflows/theiaprok/wf_theiaprok_ont.wdl +++ b/workflows/theiaprok/wf_theiaprok_ont.wdl @@ -599,7 +599,7 @@ workflow theiaprok_ont { String? medaka_version = flye_denovo.medaka_version String? racon_version = flye_denovo.racon_version String? bwa_version = flye_denovo.bwa_version - String? polypolish_version = pflye_denovo.polypolish_version + String? polypolish_version = flye_denovo.polypolish_version String dnaapler_version = flye_denovo.dnaapler_version # Assembly QC - quast outputs File? quast_report = quast.quast_report From b088c74801ce0b001cf21ba0c277f2b76c08a886 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 16 Dec 2024 09:16:10 -0600 Subject: [PATCH 53/58] versions output --- workflows/theiaprok/wf_theiaprok_ont.wdl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/workflows/theiaprok/wf_theiaprok_ont.wdl b/workflows/theiaprok/wf_theiaprok_ont.wdl index 5bf102081..128049e8e 100644 --- a/workflows/theiaprok/wf_theiaprok_ont.wdl +++ b/workflows/theiaprok/wf_theiaprok_ont.wdl @@ -97,7 +97,7 @@ workflow theiaprok_ont { expected_genome_length = genome_length } if (clean_check_reads.read_screen == "PASS") { - call flye_workflow.flye_denovo { + call flye_workflow.flye_denovo as flye_denovo { input: read1 = read_qc_trim.read1_clean, samplename = samplename @@ -593,14 +593,15 @@ workflow theiaprok_ont { File? assembly_fasta = flye_denovo.assembly_fasta File? contigs_gfa = flye_denovo.contigs_gfa File? bandage_plot = flye_denovo.bandage_plot + String? medaka_model = flye_denovo.medaka_model String? porechop_version = flye_denovo.porechop_version - String flye_version = flye_denovo.flye_version - String bandage_version = flye_denovo.bandage_version + String? flye_version = flye_denovo.flye_version + String? bandage_version = flye_denovo.bandage_version String? medaka_version = flye_denovo.medaka_version String? racon_version = flye_denovo.racon_version String? bwa_version = flye_denovo.bwa_version String? polypolish_version = flye_denovo.polypolish_version - String dnaapler_version = flye_denovo.dnaapler_version + String? dnaapler_version = flye_denovo.dnaapler_version # Assembly QC - quast outputs File? quast_report = quast.quast_report String? quast_version = quast.version From abad81bf90f503fa2d649fbd1b677cedb7620fa7 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 16 Dec 2024 10:38:23 -0600 Subject: [PATCH 54/58] medaka model output --- tasks/polishing/task_medaka.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/polishing/task_medaka.wdl b/tasks/polishing/task_medaka.wdl index c07a92fd8..488a5ed4d 100644 --- a/tasks/polishing/task_medaka.wdl +++ b/tasks/polishing/task_medaka.wdl @@ -42,7 +42,7 @@ task medaka_consensus { fi echo "Using Medaka model for polishing: $selected_model" - echo ~[selected_model] > MEDAKA_MODEL + echo $selected_model > MEDAKA_MODEL # Perform Medaka polishing medaka_consensus \ From beaec7a45761c5c88328bc91f4110d4bd706586f Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 16 Dec 2024 13:52:42 -0600 Subject: [PATCH 55/58] update medaka model docs information for users --- docs/workflows/genomic_characterization/theiaprok.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/workflows/genomic_characterization/theiaprok.md b/docs/workflows/genomic_characterization/theiaprok.md index 21ea06229..bcdd7ba26 100644 --- a/docs/workflows/genomic_characterization/theiaprok.md +++ b/docs/workflows/genomic_characterization/theiaprok.md @@ -850,6 +850,12 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al ??? toggle "`Medaka`: Polishing of Flye assembly" - Polishes the Flye assembly using `task_medaka.wdl`. + - Automatic Model Selection: Automatically determines the most appropriate Medaka model based on the input data, ensuring optimal polishing results without manual intervention. + - User-Specified Model Override: Allows users to specify a particular Medaka model if automatic selection does not yield the desired outcome or for specialized use cases. + - Default Model: If both automatic model selection fails and no user-specified model is provided, Medaka defaults to the predefined fallback model `r1041_e82_400bps_sup_v5.0.0`. + + !! note "Medaka Model Resolution Process" Medaka's automatic model selection leverages the medaka tools `resolve_model` command to analyze the input reads and identify the most suitable model for polishing. This process examines the error profiles and characteristics of the sequencing data to ensure that the selected model aligns well with the data's specific needs. If the automatic selection fails to identify a suitable model, Medaka gracefully falls back to the default model to maintain workflow continuity. Users should verify the chosen model and consider specifying a model override if necessary. + !!! techdetails "Technical Details: Medaka" - **Purpose**: Refines assembly accuracy for ONT data by incorporating error corrections. - **WDL Task**: [task_medaka.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/polishing/task_medaka.wdl) From fcd39b92f03d03f303d420a08472532df58e7c87 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 16 Dec 2024 16:03:52 -0600 Subject: [PATCH 56/58] update medaka model selection order --- tasks/polishing/task_medaka.wdl | 38 +++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/tasks/polishing/task_medaka.wdl b/tasks/polishing/task_medaka.wdl index 488a5ed4d..3e5d08dc1 100644 --- a/tasks/polishing/task_medaka.wdl +++ b/tasks/polishing/task_medaka.wdl @@ -20,25 +20,31 @@ task medaka_consensus { # Initialize selected_model variable selected_model="" - # Check if auto_model is enabled - if [[ "~{auto_model}" == "true" ]]; then - echo "Attempting automatic model selection..." - medaka tools resolve_model --auto_model consensus ~{read1} > auto_model.txt || true - auto_model=$(cat auto_model.txt || echo "") + # Check if override model is provided + if [[ -n "~{medaka_model_override}" ]]; then + echo "Using user-provided Medaka model override: ~{medaka_model_override}" + selected_model="~{medaka_model_override}" + else + # Check if auto_model is enabled + if [[ "~{auto_model}" == "true" ]]; then + echo "Attempting automatic model selection..." + medaka tools resolve_model --auto_model consensus ~{read1} > auto_model.txt || true + auto_model=$(cat auto_model.txt || echo "") - # Alert if automatic model selection fails - if [[ -z "$auto_model" ]]; then - echo "Warning: Automatic model selection failed. Defaulting to fallback model." + # Alert if automatic model selection fails + if [[ -n "$auto_model" ]]; then + selected_model="$auto_model" + echo "Automatically selected Medaka model: $selected_model" + else + echo "Warning: Automatic model selection failed or returned empty. Using default model." + fi fi - fi - # Use auto_model if detected, otherwise fallback to user-provided or default - if [[ -n "$auto_model" ]]; then - selected_model="$auto_model" - elif [[ -n "~{medaka_model_override}" ]]; then - selected_model="~{medaka_model_override}" - else - selected_model="r1041_e82_400bps_sup_v5.0.0" + # If no model selected yet, use default + if [[ -z "$selected_model" ]]; then + selected_model="r1041_e82_400bps_sup_v5.0.0" + echo "Using default Medaka model: $selected_model" + fi fi echo "Using Medaka model for polishing: $selected_model" From 78d3a5f3d082c099350ce4c2d067bca9eb73f038 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Tue, 17 Dec 2024 10:03:35 -0600 Subject: [PATCH 57/58] update md sums for theiaprok after merge main --- tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml | 2 +- tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml index d9dfb5218..378f6bc5f 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml @@ -552,7 +552,7 @@ - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl - md5sum: 8c97c5bd65e2787239f12ef425d479ae + md5sum: 75fd0459900986c63ef63be2a3446cfe - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl md5sum: ac49217c129add7c000eedf38acee8f3 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml index 2b6580c82..1e6e9accb 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml @@ -520,7 +520,7 @@ - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl - md5sum: 8c97c5bd65e2787239f12ef425d479ae + md5sum: 75fd0459900986c63ef63be2a3446cfe - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_se.wdl md5sum: 5e735ae6cb60f86ec7983274f3baf9f8 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl From 123f7eeacf445d8187a3ceced23247c4c495ed3c Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Fri, 20 Dec 2024 09:30:10 -0600 Subject: [PATCH 58/58] remove versioning task from fle sub wf --- workflows/utilities/wf_flye_denovo.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/utilities/wf_flye_denovo.wdl b/workflows/utilities/wf_flye_denovo.wdl index db373d8be..c0bbfba25 100644 --- a/workflows/utilities/wf_flye_denovo.wdl +++ b/workflows/utilities/wf_flye_denovo.wdl @@ -6,7 +6,6 @@ import "../../tasks/assembly/task_bandageplot.wdl" as task_bandage import "../../tasks/polishing/task_medaka.wdl" as task_medaka import "../../tasks/polishing/task_racon.wdl" as task_racon import "../../tasks/assembly/task_dnaapler.wdl" as task_dnaapler -import "../../tasks/task_versioning.wdl" as versioning_task import "../../tasks/quality_control/read_filtering/task_filtercontigs.wdl" as task_filtercontigs import "../../tasks/alignment/task_bwa.wdl" as task_bwaall import "../../tasks/polishing/task_polypolish.wdl" as task_polypolish