diff --git a/docs/workflows/genomic_characterization/theiaprok.md b/docs/workflows/genomic_characterization/theiaprok.md index 41e11c51b..65dd16db0 100644 --- a/docs/workflows/genomic_characterization/theiaprok.md +++ b/docs/workflows/genomic_characterization/theiaprok.md @@ -153,20 +153,68 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | clean_check_reads | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | ONT, PE, SE | | clean_check_reads | **organism** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | | clean_check_reads | **workflow_series** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | -| dragonflye | **assembler** | String | The assembler to use in dragonflye. Three options: raven, miniasm, flye | flye | Optional | ONT | -| dragonflye | **assembler_options** | String | Enables extra assembler options in quote | | Optional | ONT | -| dragonflye | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | -| dragonflye | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | -| dragonflye | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/dragonflye:1.0.14--hdfd78af_0 | Optional | ONT | -| dragonflye | **illumina_polishing_rounds** | Int | Number of polishing rounds to conduct with Illumina data | 1 | Optional | ONT | -| dragonflye | **illumina_read1** | File | If Illumina reads are provided, Dragonflye will perform Illumina polishing | | Optional | ONT | -| dragonflye | **illumina_read2** | File | If Illumina reads are provided, Dragonflye will perform Illumina polishing | | Optional | ONT | -| dragonflye | **medaka_model** | String | The model of medaka to use for assembly | r941_min_hac_g507 | Optional | ONT | -| dragonflye | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | ONT | -| dragonflye | **polishing_rounds** | Int | The number of polishing rounds to conduct (without Illumina) | 1 | Optional | ONT | -| dragonflye | **use_pilon_illumina_polisher** | Boolean | Set to true to use Pilon to polish Illumina reads | FALSE | Optional | ONT | -| dragonflye | **use_racon** | Boolean | Set to true to use Racon to polish instead of Medaka | FALSE | Optional | ONT | +| flye_denovo | **auto_medaka_model** | Boolean | If true, medaka will automatically select the best Medaka model for assembly | TRUE | Optional | ONT | +| flye_denovo | **bandage_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | ONT | +| flye_denovo | **bandage_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 10 | Optional | ONT | +| flye_denovo | **bandage_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | ONT | +| flye_denovo | **dnaapler_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | +| flye_denovo | **dnaapler_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | +| flye_denovo | **dnaapler_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | ONT | +| flye_denovo | **dnaapler_mode** | String | Dnaapler-specific inputs | all | Optional | ONT | +| flye_denovo | **filtercontigs_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | +| flye_denovo | **filtercontigs_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | +| flye_denovo | **filtercontigs_homopolymers** | Boolean | If true, filters out homopolymers | TRUE | Optional | ONT | +| flye_denovo | **filtercontigs_min_len** | Int | Minimum contig length to keep | 1000 | Optional | ONT | +| flye_denovo | **filtercontigs_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | ONT | +| flye_denovo | **flye_ont_corrected** | File | If true, uses ONT corrected reads for assembly | FALSE | Optional | ONT | +| flye_denovo | **flye_ont_high_quality** | File | If true, uses ONT high-quality reads for assembly | FALSE | Optional | ONT | +| flye_denovo | **flye_pacbio_raw** | File | If true, uses PacBio raw reads for assembly | FALSE | Optional | ONT | +| flye_denovo | **flye_pacbio_corrected** | File | If true, uses PacBio corrected reads for assembly | FALSE | Optional | ONT | +| flye_denovo | **flye_pacbio_hifi** | File | If true, uses PacBio HiFi reads for assembly | FALSE | Optional | ONT | +| flye_denovo | **flye_genome_length** | Int | User-specified expected genome length to be used in genome statistics calculations | | Optional | ONT | +| flye_denovo | **flye_asm_coverage** | Int | Reduced coverage for initial disjointig assembly | | Optional | ONT | +| flye_denovo | **flye_polishing_iterations** | Int | Default polishing iterations | 1 | Optional | ONT | +| flye_denovo | **flye_minimum_overlap** | Int | Minimum overlap between reads | | Optional | ONT | +| flye_denovo | **flye_read_error_rate** | Float | Maximum expected read error rate | | Optional | ONT | +| flye_denovo | **flye_uneven_coverage_mode** | Boolean | | FALSE | Optional | ONT | +| flye_denovo | **flye_keep_haplotypes** | Boolean | If true keep haplotypes | FALSE | Optional | ONT | +| flye_denovo | **flye_no_alt_contigs** | Boolean | If true, do not generate alternative contigs | FALSE | Optional | ONT | +| flye_denovo | **flye_scaffold** | Boolean | If true, scaffold | FALSE | Optional | ONT | +| flye_denovo | **flye_additional_parameters** | String | Any extra Flye-specific parameters | | Optional | ONT | +| flye_denovo | **flye_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | +| flye_denovo | **flye_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | ONT | +| flye_denovo | **flye_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | +| flye_denovo | **illumina_read1** | File | If Illumina reads are provided, flye_denovo subworkflow will perform Illumina polishing | | Optional | ONT | +| flye_denovo | **illumina_read2** | File | If Illumina reads are provided, flye_denovo subworflow will perform Illumina polishing | | Optional | ONT | +| flye_denovo | **medaka_model_override** | String | The model of medaka to use for assembly | r1041_e82_400bps_sup_v5.0.0 | Optional | ONT | +| flye_denovo | **medaka_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | +| flye_denovo | **medaka_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | ONT | +| flye_denovo | **medaka_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | +| flye_denovo | **polisher** | String | The polishing tool to use for assembly | medaka | Optional | ONT | +| flye_denovo | **polishing_rounds** | Int | The number of polishing rounds to conduct for medaka or racon (without Illumina) | 1 | Optional | ONT | +| flye_denovo | **porechop_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | ONT | +| flye_denovo | **porechop_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | +| flye_denovo | **porechop_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | ONT | +| flye_denovo | **porechop_trimopts** | String | Options to pass to Porechop for trimming | | Optional | ONT | +| flye_denovo | **polypolish_pair_orientation** | String | Polypolish-specific inputs | | Optional | ONT | +| flye_denovo | **polypolish_low_percentile_threshold** | Float | Polypolish-specific inputs | | Optional | ONT | +| flye_denovo | **polypolish_high_percentile_threshold** | Float | Polypolish-specific inputs | | Optional | ONT | +| flye_denovo | **polypolish_fraction_invalid** | Float | Polypolish-specific inputs | | Optional | ONT | +| flye_denovo | **polypolish_fraction_valid** | Float | Polypolish-specific inputs | | Optional | ONT | +| flye_denovo | **polypolish_maximum_errors** | Int | Polypolish-specific inputs | | Optional | ONT | +| flye_denovo | **polypolish_minimum_depth** | Int | Polypolish-specific inputs | | Optional | ONT | +| flye_denovo | **polypolish_careful** | Boolean | Polypolish-specific inputs | FALSE | Optional | ONT | +| flye_denovo | **polypolish_cpu** | Int | Polypolish cpu | 1 | Optional | ONT | +| flye_denovo | **polypolish_memory** | Int | Polypolish memory | 8 | Optional | ONT | +| flye_denovo | **polypolish_disk_size** | Int | Polypolish disk size | 100 | Optional | ONT | +| flye_denovo | **racon_cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | ONT | +| flye_denovo | **racon_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | ONT | +| flye_denovo | **racon_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | +| flye_denovo | **read1** | File | ONT read file in FASTQ file format (compression optional) | | Optional | ONT | +| flye_denovo | **skip_trim_reads** | Boolean | If true, trims reads before assembly | FALSE | Optional | ONT | +| flye_denovo | **skip_polishing** | Boolean | If true, skips polishing | FALSE | Optional | ONT | | export_taxon_tables | **asembly_fasta** | File | Internal component, do not modify | | Do not modify, Optional | FASTA | +| export_taxon_tables | **bandage_plot** | File | Internal component, do not modify | | Do not modify, Optional | ONT | | export_taxon_tables | **bbduk_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT | | export_taxon_tables | **cg_pipeline_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT | | export_taxon_tables | **cg_pipeline_report_clean** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | @@ -179,7 +227,6 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | export_taxon_tables | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | FASTA, ONT, PE, SE | | export_taxon_tables | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | FASTA, ONT, PE, SE | | export_taxon_tables | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16 | Optional | FASTA, ONT, PE, SE | -| export_taxon_tables | **dragonflye_version** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, PE, SE | | export_taxon_tables | **emmtypingtool_docker** | String | The Docker container to use for the task | | Do not modify, Optional | FASTA, ONT, SE | | export_taxon_tables | **emmtypingtool_emm_type** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | | export_taxon_tables | **emmtypingtool_results_xml** | File | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | @@ -790,7 +837,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al **Estimated genome length**: - By default, an estimated genome length is set to 5 Mb, which is around 0.7 Mb higher than the average bacterial genome length, according to the information collated [here](https://github.com/CDCgov/phoenix/blob/717d19c19338373fc0f89eba30757fe5cfb3e18a/assets/databases/NCBI_Assembly_stats_20240124.txt). This estimate can be overwritten by the user, and is used by `Rasusa` and `dragonflye`. + By default, an estimated genome length is set to 5 Mb, which is around 0.7 Mb higher than the average bacterial genome length, according to the information collated [here](https://github.com/CDCgov/phoenix/blob/717d19c19338373fc0f89eba30757fe5cfb3e18a/assets/databases/NCBI_Assembly_stats_20240124.txt). This estimate can be overwritten by the user, and is used by `RASUSA`. **Plotting and quantifying long-read sequencing data:** `nanoplot` @@ -813,13 +860,96 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | Software Source Code | [fastq-scan](https://github.com/rpetit3/fastq-scan), [NanoPlot](https://github.com/wdecoster/NanoPlot), [RASUSA](https://github.com/mbhall88/rasusa), [tiptoft](https://github.com/andrewjpage/tiptoft), [nanoq](https://github.com/esteinig/nanoq) | | Original Publication(s) | [NanoPlot paper](https://academic.oup.com/bioinformatics/article/39/5/btad311/7160911)
[RASUSA paper](https://doi.org/10.21105/joss.03941)
[Nanoq Paper](https://doi.org/10.21105/joss.02991)
[Tiptoft paper](https://doi.org/10.21105/joss.01021) | -??? task "`dragonflye`: _De novo_ Assembly" - !!! techdetails "dragonflye Technical Details" - | | Links | - | --- | --- | - | Task | [task_dragonflye.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_dragonflye.wdl) | - | Software Source Code | [dragonflye on GitHub](https://github.com/rpetit3/dragonflye) | - | Software Documentation | [dragonflye on GitHub](https://github.com/rpetit3/dragonflye) | +??? task "`Flye`: _De novo_ Assembly" + + `Flye_denovo` is a sub-workflow that performs _de novo_ assembly using Flye for ONT data and supports additional polishing and visualization steps. The detailed steps and tasks are as follows: + + ??? toggle "`Porechop`: Optional Read Trimming" + - Trims input reads if trimming is enabled using `task_porechop.wdl`. + + !!! techdetails "Technical Details: Porechop" + - **Purpose**: Ensures high-quality reads by removing adapters and other unwanted sequences. + - **WDL Task**: [task_porechop.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_porechop.wdl) + - **Software Source Code**: [Porechop on GitHub](https://github.com/rrwick/Porechop) + - **Software Documentation**: [Porechop Documentation](https://github.com/rrwick/Porechop#porechop) + + ??? toggle "`Flye`: De novo_ Assembly" + - Assembles long reads into contigs using `task_flye.wdl`. + + !!! techdetails "Technical Details: Flye" + - **Purpose**: Assembles raw reads into contiguous sequences (contigs) for further analysis. + - **WDL Task**: [task_flye.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_flye.wdl) + - **Software Source Code**: [Flye on GitHub](https://github.com/fenderglass/Flye) + - **Software Documentation**: [Flye Documentation](https://github.com/fenderglass/Flye/blob/flye/docs/USAGE.md) + + ??? toggle "`Bandage`: Graph Visualization" + - Visualizes the assembly graph output from `Flye` using `task_bandageplot.wdl`. + + !!! techdetails "Technical Details: Bandage" + - **Purpose**: Generates a graphical representation of the assembly to aid visualization and quality checks. + - **WDL Task**: [task_bandageplot.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_bandageplot.wdl) + - **Software Source Code**: [Bandage on GitHub](https://github.com/rrwick/Bandage) + - **Software Documentation**: [Bandage Documentation](https://github.com/rrwick/Bandage#bandage) + + ??? toggle "`BWA`: Index and align data for Hybrid Assembly Alignment ==_for ONT and Illumina data_==" + - Indexes and creates SAM files for input to Polypolish using `task_bwaall.wdl`. + + !!! techdetails "Technical Details: BWA" + - **Purpose**: Aligns Illumina reads to the draft assembly and produces SAM files for hybrid polishing. + - **WDL Task**: [task_bwaall.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/alignment/task_bwa.wdl) + - **Software Source Code**: [BWA on GitHub](https://github.com/lh3/bwa) + - **Software Documentation**: [BWA Documentation](http://bio-bwa.sourceforge.net/) + + ??? toggle "`Polypolish`: Hybrid Assembly Polishing ==_for ONT and Illumina data_==" + - Polishes assemblies with hybrid ONT and Illumina data using `task_polypolish.wdl`. + + !!! techdetails "Technical Details: Polypolish" + - **Purpose**: Enhances hybrid assemblies by leveraging short-read data for fine-tuning. + - **WDL Task**: [task_polypolish.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/polishing/task_polypolish.wdl) + - **Software Source Code**: [Polypolish on GitHub](https://github.com/rrwick/Polypolish) + - **Software Documentation**: [Polypolish Documentation](https://github.com/rrwick/Polypolish#polypolish) + + ??? toggle "`Medaka`: Polishing of Flye assembly" + - Polishes the Flye assembly using `task_medaka.wdl`. + + - Automatic Model Selection: Automatically determines the most appropriate Medaka model based on the input data, ensuring optimal polishing results without manual intervention. + - User-Specified Model Override: Allows users to specify a particular Medaka model if automatic selection does not yield the desired outcome or for specialized use cases. + - Default Model: If both automatic model selection fails and no user-specified model is provided, Medaka defaults to the predefined fallback model `r1041_e82_400bps_sup_v5.0.0`. + + !! note "Medaka Model Resolution Process" Medaka's automatic model selection leverages the medaka tools `resolve_model` command to analyze the input reads and identify the most suitable model for polishing. This process examines the error profiles and characteristics of the sequencing data to ensure that the selected model aligns well with the data's specific needs. If the automatic selection fails to identify a suitable model, Medaka gracefully falls back to the default model to maintain workflow continuity. Users should verify the chosen model and consider specifying a model override if necessary. + + !!! techdetails "Technical Details: Medaka" + - **Purpose**: Refines assembly accuracy for ONT data by incorporating error corrections. + - **WDL Task**: [task_medaka.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/polishing/task_medaka.wdl) + - **Software Source Code**: [Medaka on GitHub](https://github.com/nanoporetech/medaka) + - **Software Documentation**: [Medaka Documentation](https://github.com/nanoporetech/medaka#medaka) + + ??? toggle "`Racon`: Polishing" + - Offers an alternative polishing approach using `task_racon.wdl`. + + !!! techdetails "Technical Details: Racon" + - **Purpose**: Improves assembly accuracy for long-read data using sequence alignments. + - **WDL Task**: [task_racon.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/polishing/task_racon.wdl) + - **Software Source Code**: [Racon on GitHub](https://github.com/lbcb-sci/racon) + - **Software Documentation**: [Racon Documentation](https://github.com/lbcb-sci/racon#racon) + + ??? toggle "`Filter Contigs`: filter contigs below a threshold length and remove homopolymer contigs" + - Filters contigs based on user-defined criteria using `task_filtercontigs.wdl`. + + !!! techdetails "Technical Details: Filter Contigs" + - **Purpose**: Removes low-quality or short contigs to improve downstream analyses. + - **WDL Task**: [task_filtercontigs.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_filtercontigs.wdl) + - **Software Source Code**: [FilterContigs on GitHub](https://github.com/your-organization/filtercontigs) (replace with actual link) + - **Software Documentation**: [FilterContigs Documentation](https://github.com/your-organization/filtercontigs#readme) (replace with actual link) + + ??? toggle "`DnaApler`: Final Assembly Orientation" + - Reorients contigs to start at a standard reference point using `task_dnaapler.wdl`. + + !!! techdetails "Technical Details: DnaApler" + - **Purpose**: Ensures consistent starting positions for assemblies based on reference standards. + - **WDL Task**: [task_dnaapler.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_dnaapler.wdl) + - **Software Source Code**: [DnaApler on GitHub](https://github.com/your-organization/dnaapler) (replace with actual link) + - **Software Documentation**: [DnaApler Documentation](https://github.com/your-organization/dnaapler#readme) (replace with actual link) #### Post-Assembly Tasks (performed for all taxa) @@ -1719,19 +1849,22 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | ani_mummer_version | String | Version of MUMmer used | FASTA, ONT, PE, SE | | ani_output_tsv | File | Full output TSV from ani-m | FASTA, ONT, PE, SE | | ani_top_species_match | String | Species of genome with highest ANI to query FASTA | FASTA, ONT, PE, SE | -| assembly_fasta | File | https://github.com/tseemann/shovill#contigsfa | ONT, PE, SE | +| assembly_fasta | File | Fasta file outputted from Flye or Shovill | ONT, PE, SE | | assembly_length | Int | Length of assembly (total contig length) as determined by QUAST | FASTA, ONT, PE, SE | | bakta_gbff | File | Genomic GenBank format annotation file | FASTA, ONT, PE, SE | | bakta_gff3 | File | Generic Feature Format Version 3 file | FASTA, ONT, PE, SE | | bakta_summary | File | Bakta summary output TXT file | FASTA, ONT, PE, SE | | bakta_tsv | File | Annotations as simple human readable TSV | FASTA, ONT, PE, SE | | bakta_version | String | Bakta version used | FASTA, ONT, PE, SE | +| bandage_plot | File | Image file (PNG) visualizing the Flye assembly graph generated by Bandage | FASTA, ONT, PE, SE | +| bandage_version | String | Version of Bandage used | FASTA, ONT, PE, SE | | bbduk_docker | String | BBDuk docker image used | PE, SE | | busco_database | String | BUSCO database used | FASTA, ONT, PE, SE | | busco_docker | String | BUSCO docker image used | FASTA, ONT, PE, SE | | busco_report | File | A plain text summary of the results in BUSCO notation | FASTA, ONT, PE, SE | -| busco_results | String | BUSCO results (see https://www.notion.so/TheiaProk-Workflow-Series-68c34aca2a0240ef94fef0acd33651b9?pvs=21) | FASTA, ONT, PE, SE | +| busco_results | String | BUSCO results (see https://www.notion.so/TheiaProk-Workflow-Series-68c34aca2a0240ef94fef0acd33651b9?pvs=21) | ONT | | busco_version | String | BUSCO software version used | FASTA, ONT, PE, SE | +| bwa_version | String | Version of BWA software used | FASTA, ONT, PE, SE | | cg_pipeline_docker | String | Docker file used for running CG-Pipeline on cleaned reads | PE, SE | | cg_pipeline_report_clean | File | TSV file of read metrics from clean reads, including average read length, number of reads, and estimated genome coverage | PE, SE | | cg_pipeline_report_raw | File | TSV file of read metrics from raw reads, including average read length, number of reads, and estimated genome coverage | PE, SE | @@ -1742,9 +1875,9 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | combined_mean_readlength_clean | Float | Mean read length for the combined clean reads | PE | | combined_mean_readlength_raw | Float | Mean read length for the combined raw reads | PE | | contigs_fastg | File | Assembly graph if megahit used for genome assembly | PE | -| contigs_gfa | File | Assembly graph if spades used for genome assembly | ONT, PE, SE | -| contigs_lastgraph | File | Assembly graph if velvet used for genome assembly | PE | -| dragonflye_version | String | Version of dragonflye used for de novo assembly | ONT | +| contigs_gfa | File | Assembly graph if spades or Flye is used for genome assembly | ONT, PE, SE | +| contigs_lastgraph | File | Assemb7ly graph if velvet used for genome assembly | PE | +| dnaapler_version | String | Version of dnaAplER used | FASTA, ONT, PE, SE | | ectyper_predicted_serotype | String | Serotype predicted by ECTyper | FASTA, ONT, PE, SE | | ectyper_results | File | TSV file of evidence for ECTyper predicted serotype (see https://github.com/phac-nml/ecoli_serotyping#report-format) | FASTA, ONT, PE, SE | | ectyper_version | String | Version of ECTyper used | FASTA, ONT, PE, SE | @@ -1779,6 +1912,8 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | fastqc_raw1_html | File | Graphical visualization of raw forward read quality from fastqc to open in an internet browser | PE, SE | | fastqc_raw2_html | File | Graphical visualization of raw reverse read qualityfrom fastqc to open in an internet browser | PE | | fastqc_version | String | Version of fastqc software used | PE, SE | +| final_assembly | File | Final assembly file output from flye_denovo assembly| FASTA, ONT, PE, SE | +| flye_version | String | Version of Flye software used | FASTA, ONT, PE, SE | | gambit_closest_genomes | File | CSV file listing genomes in the GAMBIT database that are most similar to the query assembly | FASTA, ONT, PE, SE | | gambit_db_version | String | Version of GAMBIT used | FASTA, ONT, PE, SE | | gambit_docker | String | GAMBIT docker file used | FASTA, ONT, PE, SE | @@ -1836,6 +1971,8 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | lissero_results | File | TSV results file from LisSero (see https://github.com/MDU-PHL/LisSero#example-output) | FASTA, ONT, PE, SE | | lissero_serotype | String | Serotype predicted by LisSero | FASTA, ONT, PE, SE | | lissero_version | String | Version of LisSero used | FASTA, ONT, PE, SE | +| medaka_model | String | Model used by Medaka | ONT | +| medaka_version | String | Version of Medaka used | ONT | | meningotype_BAST | String | BAST type | FASTA, ONT, PE, SE | | meningotype_FetA | String | FetA type | FASTA, ONT, PE, SE | | meningotype_fHbp | String | fHbp type | FASTA, ONT, PE, SE | @@ -1908,11 +2045,13 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | plasmidfinder_plasmids | String | Names of plasmids identified by PlasmidFinder | FASTA, ONT, PE, SE | | plasmidfinder_results | File | Output file from PlasmidFinder in TSV format | FASTA, ONT, PE, SE | | plasmidfinder_seqs | File | Hit_in_genome_seq.fsa file produced by PlasmidFinder | FASTA, ONT, PE, SE | +| polypolish_version | String | Version of Polypolish used | FASTA, ONT, PE, SE | | poppunk_docker | String | PopPUNK docker image with GPSC database used | FASTA, ONT, PE, SE | | poppunk_gps_cluster | String | GPS cluster predicted by PopPUNK | FASTA, ONT, PE, SE | | poppunk_GPS_db_version | String | Version of GPSC database used | FASTA, ONT, PE, SE | | poppunk_gps_external_cluster_csv | File | GPSC v6 scheme designations | FASTA, ONT, PE, SE | | poppunk_version | String | Version of PopPUNK used | FASTA, ONT, PE, SE | +| porechop_version | String | Version of Porechop used | ONT | | prokka_gbk | File | GenBank file produced from Prokka annotation of input FASTA | FASTA, ONT, PE, SE | | prokka_gff | File | Prokka output GFF3 file containing sequence and annotation (you can view this in IGV) | FASTA, ONT, PE, SE | | prokka_sqn | File | A Sequin file for GenBank submission | FASTA, ONT, PE, SE | @@ -1929,6 +2068,7 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | r2_mean_q_raw | Float | Mean quality score of raw reverse reads | PE | | r2_mean_readlength_clean | Float | Mean read length of clean reverse reads | PE | | r2_mean_readlength_raw | Float | Mean read length of raw reverse reads | PE | +| racon_version | String | Version of Racon used | ONT | | rasusa_version | String | Version of RASUSA used for analysis | ONT | | read_screen_clean | String | PASS or FAIL result from clean read screening; FAIL accompanied by the reason for failure | ONT, PE, SE | | read_screen_raw | String | PASS or FAIL result from raw read screening; FAIL accompanied by thereason for failure | ONT, PE, SE | diff --git a/tasks/alignment/task_bwa.wdl b/tasks/alignment/task_bwa.wdl index 9aabb6cd7..ec7177d4d 100644 --- a/tasks/alignment/task_bwa.wdl +++ b/tasks/alignment/task_bwa.wdl @@ -153,4 +153,47 @@ task bwa { preemptible: 0 maxRetries: 3 } -} \ No newline at end of file +} + +task bwa_all { + input { + File draft_assembly_fasta + File read1 + File read2 + String samplename + + Int cpu = 6 + Int disk_size = 100 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/bwa:0.7.18" + Int memory = 16 + } + command <<< + bwa &> BWA_HELP + grep "Version" BWA_HELP | cut -d" " -f2 > BWA_VERSION + + if [[ ! -f "~{draft_assembly_fasta}.bwt" ]]; then + echo "Indexing reference genome: ~{draft_assembly_fasta}" + bwa index ~{draft_assembly_fasta} + else + echo "Reference genome is already indexed: ~{draft_assembly_fasta}" + fi + + bwa mem -t ~{cpu} -a ~{draft_assembly_fasta} ~{read1} > ~{samplename}_R1.sam + bwa mem -t ~{cpu} -a ~{draft_assembly_fasta} ~{read2} > ~{samplename}_R2.sam + + >>> + output { + File read1_sam = "~{samplename}_R1.sam" + File read2_sam = "~{samplename}_R2.sam" + String bwa_version = read_string("BWA_VERSION") + } + runtime { + docker: "~{docker}" + memory: "~{memory} GB" + cpu: "~{cpu}" + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } +} diff --git a/tasks/assembly/task_bandageplot.wdl b/tasks/assembly/task_bandageplot.wdl new file mode 100644 index 000000000..23549a5be --- /dev/null +++ b/tasks/assembly/task_bandageplot.wdl @@ -0,0 +1,30 @@ +version 1.0 + +task bandage_plot { + input { + File assembly_graph_gfa + String samplename + Int cpu = 2 + Int memory = 4 + Int disk_size = 10 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/bandage:0.8.1" + } + command <<< + set -euo pipefail + Bandage --version | tee VERSION + Bandage image ~{assembly_graph_gfa} ~{samplename}_bandage_plot.png + >>> + output { + File plot = "~{samplename}_bandage_plot.png" + String bandage_version = read_string("VERSION") + } + runtime { + docker: "~{docker}" + cpu: cpu + memory: "~{memory} GB" + disks: "local-disk " + disk_size + " HDD" + disk: disk_size + " GB" + maxRetries: 1 + preemptible: 0 + } +} diff --git a/tasks/assembly/task_dnaapler.wdl b/tasks/assembly/task_dnaapler.wdl new file mode 100644 index 000000000..c97210da9 --- /dev/null +++ b/tasks/assembly/task_dnaapler.wdl @@ -0,0 +1,72 @@ +version 1.0 + +task dnaapler_all { + input { + File input_fasta + String samplename + String dmaapler_mode = "all" # The mode of reorientation to execute (default: 'all') + Int cpu = 4 + Int disk_size = 100 + Int memory = 16 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/dnaapler:1.0.1" + } + command <<< + set -euo pipefail + + # Check input FASTA is valid + echo "Validating input FASTA file..." + if ! grep -q "^>" ~{input_fasta}; then + echo "ERROR: Input file ~{input_fasta} is not in FASTA format." >&2 + exit 1 + else + echo "Input FASTA file is valid." + fi + + # dnaapler version + dnaapler --version | tee VERSION + + # Create a subdirectory for dnaapler outputs + output_dir="dnaapler_output" + mkdir -p "$output_dir" + echo "Output directory created: $output_dir" + + # Run dnaapler with the 'all' subcommand + echo "Running dnaapler..." + dnaapler ~{dmaapler_mode} \ + -i ~{input_fasta} \ + -o "$output_dir" \ + -p ~{samplename} \ + -t ~{cpu} \ + -f || { + echo "ERROR: dnaapler command failed. Check logs for details." >&2 + exit 1 + } + + echo "dnaapler command completed successfully." + + # Check if output FASTA file exists + if [[ ! -f "$output_dir"/~{samplename}_reoriented.fasta ]]; then + echo "ERROR: Expected output file not found: $output_dir/~{samplename}_reoriented.fasta" >&2 + exit 1 + fi + + # Move the final reoriented FASTA file to the task's working directory + echo "Moving output FASTA file to working directory..." + mv "$output_dir"/~{samplename}_reoriented.fasta . + + echo "dnaapler task completed successfully for sample: ~{samplename}" + >>> + output { + File reoriented_fasta = "~{samplename}_reoriented.fasta" + String dnaapler_version = read_string("VERSION") + } + runtime { + docker: "~{docker}" + cpu: cpu + memory: "~{memory} GB" + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } +} \ No newline at end of file diff --git a/tasks/assembly/task_dragonflye.wdl b/tasks/assembly/task_dragonflye.wdl deleted file mode 100644 index 794c5888d..000000000 --- a/tasks/assembly/task_dragonflye.wdl +++ /dev/null @@ -1,96 +0,0 @@ -version 1.0 - -task dragonflye { - input { - File read1 # intended for ONT data only - String samplename - String? assembler # default is flye - String? assembler_options # default '' - String? genome_length # default autodetect - File? illumina_read1 # illumina read1 fastq to use for polishing - File? illumina_read2 # illumina read2 fastq to use for polishing - Boolean use_pilon_illumina_polisher = false # use polypolish by default - Int illumina_polishing_rounds = 1 # default 1 - Int polishing_rounds = 1 # default 1 - Boolean use_racon = false # use medaka polishing by default - String medaka_model = "r941_min_hac_g507" - String docker = "us-docker.pkg.dev/general-theiagen/biocontainers/dragonflye:1.0.14--hdfd78af_0" - Int disk_size = 100 - Int cpu = 4 - Int memory = 32 - } - command <<< - # get version information - dragonflye --version | tee VERSION - - # determine polishing parameter - if [ "~{use_racon}" = true ]; then - # --racon indicates number of polishing rounds to conduct with Racon - POLISHER="--racon ~{polishing_rounds}" - else - # --medaka indicates number of polishing rounds to conduct with medaka using medaka_model - POLISHER="--model ~{medaka_model} --medaka ~{polishing_rounds}" - fi - - # determine if illumina polishing will be performed based on presence of two files - if [ -f "~{illumina_read1}" ] && [ -f "~{illumina_read2}" ]; then - echo "Illumina reads provided; will perform Illumina polishing" - if [ "~{use_pilon_illumina_polisher}" = true ]; then - # use --pilon instead of polypolish as default - ILLUMINA_POLISHER="--R1 ~{illumina_read1} --R2 ~{illumina_read2} --pilon ~{illumina_polishing_rounds}" - else - # use --polypolish by default - ILLUMINA_POLISHER="--R1 ~{illumina_read1} --R2 ~{illumina_read2} --polypolish ~{illumina_polishing_rounds}" - fi - else - echo "Illumina reads were not provided." - ILLUMINA_POLISHER="" - fi - - # reduce requested memory to prevent out of memory errors - mem=$(( ~{memory}-1 )) - - # run dragonflye - # --reads for input nanopore fastq - # --depth 0 disables sub-sampling of reads (performed with rasusa already) - # --outdir indicates output directory - # --gsize indicates genome size; if this is not set it will autodetect size - # --cpus number of cpus to use - # --ram try to keep RAM usage below this number - # --nofilter disables read length filtering (performed with nanoq already) - # --assembler has three options: raven, miniasm, flye (default: flye) - # --opts enables extra assembler options in quotes - # see above for polisher input explanation - # see above for illumina polisher input explanation - dragonflye \ - --reads ~{read1} \ - --depth 0 \ - --outdir dragonflye \ - ~{'--gsize ' + genome_length} \ - --cpus ~{cpu} \ - --ram ${mem} \ - --nofilter \ - ~{'--assembler ' + assembler} \ - ~{'--opts "' + assembler_options + '"'} \ - ${POLISHER} \ - ${ILLUMINA_POLISHER} - - # rename final output file to have .fasta ending instead of .fa - mv dragonflye/contigs.fa ~{samplename}.fasta - mv dragonflye/*.gfa ~{samplename}_contigs.gfa - >>> - output { - File assembly_fasta = "~{samplename}.fasta" - File contigs_gfa = "~{samplename}_contigs.gfa" - String dragonflye_version = read_string("VERSION") - } - runtime { - docker: "~{docker}" - memory: "~{memory} GB" - cpu: cpu - disks: "local-disk " + disk_size + " SSD" - disk: disk_size + " GB" - maxRetries: 3 - preemptible: 0 - } -} \ No newline at end of file diff --git a/tasks/assembly/task_flye.wdl b/tasks/assembly/task_flye.wdl new file mode 100644 index 000000000..bbcfd084a --- /dev/null +++ b/tasks/assembly/task_flye.wdl @@ -0,0 +1,90 @@ +version 1.0 + +task flye { + input { + File read1 + String samplename + + # data type options; by default, uses --nano-raw + Boolean ont_corrected = false + Boolean ont_high_quality = false + Boolean pacbio_raw = false + Boolean pacbio_corrected = false + Boolean pacbio_hifi = false + + Int? genome_length # requires `asm_coverage` + Int? asm_coverage # reduced coverage for initial disjointig assembly + + Int flye_polishing_iterations = 1 + Int? minimum_overlap + + Float? read_error_rate + Boolean uneven_coverage_mode = false + Boolean keep_haplotypes = false + Boolean no_alt_contigs = false + Boolean scaffold = false + + String? additional_parameters # Any extra Flye-specific parameters + + Int cpu = 4 + Int disk_size = 100 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/flye:2.9.4" + Int memory = 32 + } + command <<< + set -euo pipefail + flye --version | tee VERSION + + # determine read type + if ~{ont_corrected}; then + READ_TYPE="--nano-corr" + elif ~{ont_high_quality}; then + READ_TYPE="--nano-hq" + elif ~{pacbio_raw}; then + READ_TYPE="--pacbio-raw" + elif ~{pacbio_corrected}; then + READ_TYPE="--pacbio-corr" + elif ~{pacbio_hifi}; then + READ_TYPE="--pacbio-hifi" + else + READ_TYPE="--nano-raw" + fi + + # genome size parameter requires asm_coverage + flye \ + ${READ_TYPE} ~{read1} \ + --iterations ~{flye_polishing_iterations} \ + ~{"--min-overlap" + minimum_overlap} \ + ~{if defined(asm_coverage) then "--genome-size " + genome_length else ""} \ + ~{"--asm-coverage " + asm_coverage} \ + ~{"--read-error " + read_error_rate} \ + ~{true="--meta" false="" uneven_coverage_mode} \ + ~{true="--keep-haplotypes" false="" keep_haplotypes} \ + ~{true="--no-alt-contigs" false="" no_alt_contigs} \ + ~{true="--scaffold" false="" scaffold} \ + ~{"--extra-params " + additional_parameters } \ + --threads ~{cpu} \ + --out-dir . + + mv assembly.fasta ~{samplename}.assembly.fasta + mv assembly_info.txt ~{samplename}.assembly_info.txt + mv assembly_graph.gfa ~{samplename}.assembly_graph.gfa + + >>> + output { + File assembly_fasta = "~{samplename}.assembly.fasta" + File assembly_graph_gfa = "~{samplename}.assembly_graph.gfa" + File assembly_info = "~{samplename}.assembly_info.txt" + String flye_version = read_string("VERSION") + String flye_docker = "~{docker}" + } + runtime { + docker: "~{docker}" + cpu: cpu + memory: "~{memory} GB" + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } +} \ No newline at end of file diff --git a/tasks/polishing/task_medaka.wdl b/tasks/polishing/task_medaka.wdl new file mode 100644 index 000000000..3e5d08dc1 --- /dev/null +++ b/tasks/polishing/task_medaka.wdl @@ -0,0 +1,77 @@ +version 1.0 + +task medaka_consensus { + input { + File unpolished_fasta + String samplename + File read1 + Boolean auto_model = true # Automatically resolve model by inspecting input + String? medaka_model_override # Optional user-specified model + Int cpu = 4 + Int memory = 16 + Int disk_size = 100 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/medaka:2.0.1" + } + command <<< + set -euo pipefail + + medaka --version | tee MEDAKA_VERSION + + # Initialize selected_model variable + selected_model="" + + # Check if override model is provided + if [[ -n "~{medaka_model_override}" ]]; then + echo "Using user-provided Medaka model override: ~{medaka_model_override}" + selected_model="~{medaka_model_override}" + else + # Check if auto_model is enabled + if [[ "~{auto_model}" == "true" ]]; then + echo "Attempting automatic model selection..." + medaka tools resolve_model --auto_model consensus ~{read1} > auto_model.txt || true + auto_model=$(cat auto_model.txt || echo "") + + # Alert if automatic model selection fails + if [[ -n "$auto_model" ]]; then + selected_model="$auto_model" + echo "Automatically selected Medaka model: $selected_model" + else + echo "Warning: Automatic model selection failed or returned empty. Using default model." + fi + fi + + # If no model selected yet, use default + if [[ -z "$selected_model" ]]; then + selected_model="r1041_e82_400bps_sup_v5.0.0" + echo "Using default Medaka model: $selected_model" + fi + fi + + echo "Using Medaka model for polishing: $selected_model" + echo $selected_model > MEDAKA_MODEL + + # Perform Medaka polishing + medaka_consensus \ + -i ~{read1} \ + -d ~{unpolished_fasta} \ + -o . \ + -m $selected_model \ + -t ~{cpu} + + mv consensus.fasta ~{samplename}.polished.fasta + >>> + output { + File medaka_fasta = "~{samplename}.polished.fasta" + String medaka_version = read_string("MEDAKA_VERSION") + String medaka_model = read_string("MEDAKA_MODEL") + } + runtime { + docker: "~{docker}" + cpu: cpu + memory: "~{memory} GB" + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } +} diff --git a/tasks/polishing/task_polypolish.wdl b/tasks/polishing/task_polypolish.wdl new file mode 100644 index 000000000..1f3f76360 --- /dev/null +++ b/tasks/polishing/task_polypolish.wdl @@ -0,0 +1,77 @@ +version 1.0 + +task polypolish { + input { + String samplename + File assembly_fasta + File read1_sam # these files need to be aligned to the draft assembly with `-a` flag + File read2_sam # see also the task_bwa.wdl#bwa_all task + + Int illumina_polishing_rounds = 1 # Default: 1 round of polishing + String? pair_orientation # default: auto + Float? low_percentile_threshold # default: 0.1 + Float? high_percentile_threshold # default: 99.9 + + Float? fraction_invalid # default: 0.2 + Float? fraction_valid # default 0.5 + Int? maximum_errors # default: 10 + Int? minimum_depth # default: 5 + Boolean careful = false # ignore any reads with multiple alignments + + Int cpu = 1 # polypolish is single-threaded + Int disk_size = 100 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/polypolish:0.6.0" + Int memory = 8 + } + command <<< + set -euo pipefail + + polypolish --version | tee VERSION + + # Initial input for polishing + polished_assembly="~{assembly_fasta}" + + for i in $(seq 1 ~{illumina_polishing_rounds}); do + echo "Starting Polypolish round $i..." + + # Filter SAM files + polypolish filter \ + --in1 ~{read1_sam} \ + --in2 ~{read2_sam} \ + --out1 ${polished_assembly}_filtered1.sam \ + --out2 ${polished_assembly}_filtered2.sam \ + ~{"--orientation " + pair_orientation} \ + ~{"--low " + low_percentile_threshold} \ + ~{"--high " + high_percentile_threshold} + + # Perform polishing + polypolish polish ${polished_assembly} \ + ${polished_assembly}_filtered1.sam \ + ${polished_assembly}_filtered2.sam \ + ~{"--fraction_invalid " + fraction_invalid} \ + ~{"--fraction_valid " + fraction_valid} \ + ~{"--max_errors " + maximum_errors} \ + ~{"--min_depth " + minimum_depth} \ + ~{true="--careful" false="" careful} \ + > "${polished_assembly}_round${i}.fasta" + + polished_assembly="${polished_assembly}_round${i}.fasta" + done + + # Final polished output + mv "${polished_assembly}" "~{samplename}_final_polished.fasta" + >>> + output { + String polypolish_version = read_string("VERSION") + File polished_assembly = "~{samplename}_final_polished.fasta" + } + runtime { + docker: "~{docker}" + memory: "~{memory} GB" + cpu: "~{cpu}" + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } +} \ No newline at end of file diff --git a/tasks/polishing/task_racon.wdl b/tasks/polishing/task_racon.wdl new file mode 100644 index 000000000..21a9572e2 --- /dev/null +++ b/tasks/polishing/task_racon.wdl @@ -0,0 +1,62 @@ +version 1.0 + +task racon { + input { + File unpolished_fasta + File read1 + Int polishing_rounds = 1 # Default: 1 polishing round + Int cpu = 8 + Int memory = 32 + Int disk_size = 100 + String samplename + String docker = "us-docker.pkg.dev/general-theiagen/staphb/racon:1.5.0-minimap2-generic-v3" + } + command <<< + set -euo pipefail + + minimap2 --version | tee MINIMAP2_VERSION + racon --version | tee -a RACON_VERSION + + echo "Starting Racon polishing process..." + + # Initialize the input assembly + intermediate_fasta="~{unpolished_fasta}" + + # Loop through polishing rounds + for i in $(seq 1 ~{polishing_rounds}); do + echo "Polishing round $i..." + + # Align reads to the current assembly + minimap2 -x map-ont -t ~{cpu} "${intermediate_fasta}" "~{read1}" > "~{samplename}_round${i}.paf" + + # Run Racon to polish the assembly + racon \ + -t ~{cpu} \ + "~{read1}" \ + "~{samplename}_round${i}.paf" \ + "${intermediate_fasta}" \ + > "~{samplename}_round${i}.polished.fasta" + + # Update for the next round + intermediate_fasta="~{samplename}_round${i}.polished.fasta" + done + + # Save the final polished assembly + mv "${intermediate_fasta}" "~{samplename}_final_polished.fasta" + echo "Polishing complete. Final assembly saved as ~{samplename}_final_polished.fasta" + >>> + output { + File polished_fasta = "~{samplename}_final_polished.fasta" + String racon_version = read_string("RACON_VERSION") + String minimap2_version = read_string("MINIMAP2_VERSION") + } + runtime { + docker: "~{docker}" + memory: memory + " GB" + cpu: cpu + disks: "local-disk " + disk_size + " SSD" + maxRetries: 1 + preemptible: 0 + } +} + diff --git a/tasks/quality_control/read_filtering/task_filtercontigs.wdl b/tasks/quality_control/read_filtering/task_filtercontigs.wdl new file mode 100644 index 000000000..1910c3308 --- /dev/null +++ b/tasks/quality_control/read_filtering/task_filtercontigs.wdl @@ -0,0 +1,98 @@ +version 1.0 + +task contig_filter { + input { + File assembly_fasta + Int min_len = 1000 + Boolean filter_homopolymers = true + Int disk_size = 100 + Int memory = 16 + Int cpu = 4 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/seqkit:2.8.2" + } + command <<< + set -euo pipefail + echo "Filtering contigs from ~{assembly_fasta}" >&2 + + # Calculate initial metrics + total_contigs=$(grep -c "^>" ~{assembly_fasta}) + echo "Total contigs: $total_contigs" >&2 + + # Calculate total base pairs in all contigs and output individual lengths + awk '/^>/ {if (seqlen) {print seqlen; total+=seqlen}; seqlen=0; next} {seqlen+=length($0)} END {if (seqlen) {print seqlen; total+=seqlen}; print "Total:" total}' ~{assembly_fasta} > contig_lengths_before.txt + total_bases=$(tail -n 1 contig_lengths_before.txt | cut -d ':' -f 2) + echo "Total bases: $total_bases" >&2 + + # Filter contigs by length + seqkit seq -m ~{min_len} ~{assembly_fasta} > length_filtered.fasta + echo "Length filtering complete. File size:" >&2 + ls -lh length_filtered.fasta >&2 + + # Optionally filter out contigs with homopolymers + if [ "~{filter_homopolymers}" = "true" ]; then + awk ' + BEGIN {RS=">"; ORS=""} + NR > 1 { + split($0, lines, "\n"); + header = lines[1]; + seq = ""; + for (i=2; i<=length(lines); i++) { + seq = seq lines[i]; + } + if (seq !~ /^(.)\1+$/) { + print ">" header "\n" seq "\n" + } + }' length_filtered.fasta > filtered_contigs.fasta + else + mv length_filtered.fasta filtered_contigs.fasta + fi + + # Validate the final file is not empty + if [ ! -s filtered_contigs.fasta ]; then + echo "Error: No contigs passed filtering criteria!" >&2 + exit 1 + fi + + # Calculate final metrics for the filtered contigs + awk '/^>/ {if (seqlen) {print seqlen; total+=seqlen}; seqlen=0; next} {seqlen+=length($0)} END {if (seqlen) {print seqlen; total+=seqlen}; print "Total:" total}' filtered_contigs.fasta > contig_lengths_after.txt + retained_contigs=$(grep -c "^>" filtered_contigs.fasta) + retained_bases=$(tail -n 1 contig_lengths_after.txt | cut -d ':' -f 2) + + # Calculate removed contigs + contigs_removed_short=$((total_contigs - $(grep -c "^>" length_filtered.fasta))) + contigs_removed_homopolymers=$((total_contigs - retained_contigs - contigs_removed_short)) + + # Write filtering metrics to a summary file + metrics_file="filtering_metrics.txt" + { + echo "Total contigs: $total_contigs" + echo "Total bases: $total_bases" + echo "Contigs retained: $retained_contigs" + echo "Bases retained: $retained_bases" + echo "Contigs removed (short length): $contigs_removed_short" + echo "Contigs removed (homopolymers): $contigs_removed_homopolymers" + } > $metrics_file + + # Output individual contig lengths before and after filtering + echo "Contig lengths before filtering:" >> $metrics_file + cat contig_lengths_before.txt >> $metrics_file + echo "Contig lengths after filtering:" >> $metrics_file + cat contig_lengths_after.txt >> $metrics_file + + cat $metrics_file >&2 + >>> + output { + File filtered_fasta = "filtered_contigs.fasta" + File assembly_filtering_metrics = "filtering_metrics.txt" + File contig_lengths_before = "contig_lengths_before.txt" + File contig_lengths_after = "contig_lengths_after.txt" + } + runtime { + docker: "~{docker}" + cpu: cpu + memory: "~{memory}G" + disks: "local-disk " + disk_size + " SSD" + maxRetries: 3 + preemptible: 0 + } +} diff --git a/tasks/quality_control/read_filtering/task_porechop.wdl b/tasks/quality_control/read_filtering/task_porechop.wdl new file mode 100644 index 000000000..797a6808f --- /dev/null +++ b/tasks/quality_control/read_filtering/task_porechop.wdl @@ -0,0 +1,44 @@ +version 1.0 + +task porechop { + input { + File read1 # Raw read input file + String samplename + String? trimopts # Optional trimming options + Int cpu = 4 + Int memory = 8 + Int disk_size = 50 + String docker = "us-docker.pkg.dev/general-theiagen/staphb/porechop:0.2.4" + } + command <<< + set -euo pipefail + + echo "Porechop version:" + porechop --version | tee VERSION + + echo "Trimming reads with Porechop..." + porechop \ + --input ~{read1} \ + --output ~{samplename}.trimmed.fastq \ + --threads ~{cpu} \ + ~{trimopts} || { + echo "Porechop failed."; exit 1; + } + + echo "Compressing trimmed reads..." + gzip -f ~{samplename}.trimmed.fastq + >>> + output { + File trimmed_reads = "~{samplename}.trimmed.fastq.gz" + String porechop_version = read_string("VERSION") + } + runtime { + docker: "~{docker}" + cpu: cpu + memory: "~{memory} GB" + disks: "local-disk " + disk_size + " HDD" + disk: disk_size + " GB" + maxRetries: 3 + preemptible: 0 + } +} diff --git a/tasks/utilities/data_export/task_broad_terra_tools.wdl b/tasks/utilities/data_export/task_broad_terra_tools.wdl index 8a54d6bad..fcbaa6be0 100644 --- a/tasks/utilities/data_export/task_broad_terra_tools.wdl +++ b/tasks/utilities/data_export/task_broad_terra_tools.wdl @@ -85,8 +85,8 @@ task export_taxon_tables { String? tiptoft_plasmid_replicon_genes String? tiptoft_version File? assembly_fasta + File? bandage_plot File? contigs_gfa - String? dragonflye_version String? shovill_pe_version String? shovill_se_version File? quast_report @@ -501,8 +501,8 @@ task export_taxon_tables { "tiptoft_plasmid_replicon_genes": "~{tiptoft_plasmid_replicon_genes}", "tiptoft_version": "~{tiptoft_version}", "assembly_fasta": "~{assembly_fasta}", + "bandage_plot": "~{bandage_plot}", "contigs_gfa": "~{contigs_gfa}", - "dragonflye_version": "~{dragonflye_version}", "shovill_pe_version": "~{shovill_pe_version}", "shovill_se_version": "~{shovill_se_version}", "quast_report": "~{quast_report}", diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml index d9dfb5218..378f6bc5f 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml @@ -552,7 +552,7 @@ - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl - md5sum: 8c97c5bd65e2787239f12ef425d479ae + md5sum: 75fd0459900986c63ef63be2a3446cfe - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl md5sum: ac49217c129add7c000eedf38acee8f3 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml index 2b6580c82..1e6e9accb 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml @@ -520,7 +520,7 @@ - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl - md5sum: 8c97c5bd65e2787239f12ef425d479ae + md5sum: 75fd0459900986c63ef63be2a3446cfe - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_se.wdl md5sum: 5e735ae6cb60f86ec7983274f3baf9f8 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl diff --git a/workflows/theiaprok/wf_theiaprok_ont.wdl b/workflows/theiaprok/wf_theiaprok_ont.wdl index 867e67e0b..a873c5b76 100644 --- a/workflows/theiaprok/wf_theiaprok_ont.wdl +++ b/workflows/theiaprok/wf_theiaprok_ont.wdl @@ -1,6 +1,5 @@ version 1.0 -import "../../tasks/assembly/task_dragonflye.wdl" as dragonflye_task import "../../tasks/gene_typing/annotation/task_bakta.wdl" as bakta_task import "../../tasks/gene_typing/annotation/task_prokka.wdl" as prokka_task import "../../tasks/gene_typing/drug_resistance/task_amrfinderplus.wdl" as amrfinderplus @@ -20,6 +19,7 @@ import "../../tasks/taxon_id/task_gambit.wdl" as gambit_task import "../../tasks/utilities/data_export/task_broad_terra_tools.wdl" as terra_tools_task import "../utilities/wf_merlin_magic.wdl" as merlin_magic_workflow import "../utilities/wf_read_QC_trim_ont.wdl" as read_qc_workflow +import "../utilities/wf_flye_denovo.wdl" as flye_workflow workflow theiaprok_ont { meta { @@ -97,16 +97,15 @@ workflow theiaprok_ont { expected_genome_length = genome_length } if (clean_check_reads.read_screen == "PASS") { - call dragonflye_task.dragonflye { - input: - read1 = read_qc_trim.read1_clean, - genome_length = select_first([genome_length, read_qc_trim.est_genome_length]), - samplename = samplename + call flye_workflow.flye_denovo as flye_denovo { + input: + read1 = read_qc_trim.read1_clean, + samplename = samplename } call quast_task.quast { - input: - assembly = dragonflye.assembly_fasta, - samplename = samplename + input: + assembly = flye_denovo.assembly_fasta, + samplename = samplename } # nanoplot for basic QC metrics call nanoplot_task.nanoplot as nanoplot_raw { @@ -123,73 +122,73 @@ workflow theiaprok_ont { } call busco_task.busco { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.assembly_fasta, samplename = samplename } if (perform_characterization) { call gambit_task.gambit { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.assembly_fasta, samplename = samplename } if (call_ani) { call ani_task.animummer as ani { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.assembly_fasta, samplename = samplename } } if (call_kmerfinder) { call kmerfinder_task.kmerfinder_bacteria as kmerfinder { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.assembly_fasta, samplename = samplename } } call amrfinderplus.amrfinderplus_nuc as amrfinderplus_task { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.assembly_fasta, samplename = samplename, organism = select_first([expected_taxon, gambit.gambit_predicted_taxon]) } if (call_resfinder) { call resfinder_task.resfinder as resfinder_task { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.assembly_fasta, samplename = samplename, organism = select_first([expected_taxon, gambit.gambit_predicted_taxon]) } } call ts_mlst_task.ts_mlst { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.assembly_fasta, samplename = samplename } if (genome_annotation == "prokka") { call prokka_task.prokka { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.assembly_fasta, samplename = samplename } } if (genome_annotation == "bakta") { call bakta_task.bakta { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.assembly_fasta, samplename = samplename } } if (call_plasmidfinder) { call plasmidfinder_task.plasmidfinder { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.assembly_fasta, samplename = samplename } } if (call_abricate) { call abricate_task.abricate { input: - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.assembly_fasta, samplename = samplename, database = abricate_db } @@ -220,7 +219,7 @@ workflow theiaprok_ont { call merlin_magic_workflow.merlin_magic { input: merlin_tag = select_first([expected_taxon, gambit.merlin_tag]), - assembly = dragonflye.assembly_fasta, + assembly = flye_denovo.assembly_fasta, samplename = samplename, read1 = read_qc_trim.read1_clean, ont_data = true @@ -277,9 +276,8 @@ workflow theiaprok_ont { tiptoft_plasmid_replicon_fastq = read_qc_trim.tiptoft_plasmid_replicon_fastq, tiptoft_plasmid_replicon_genes = read_qc_trim.tiptoft_plasmid_replicon_genes, tiptoft_version = read_qc_trim.tiptoft_version, - assembly_fasta = dragonflye.assembly_fasta, - contigs_gfa = dragonflye.contigs_gfa, - dragonflye_version = dragonflye.dragonflye_version, + assembly_fasta = flye_denovo.assembly_fasta, + contigs_gfa = flye_denovo.contigs_gfa, quast_report = quast.quast_report, quast_version = quast.version, assembly_length = quast.genome_length, @@ -591,10 +589,19 @@ workflow theiaprok_ont { File? tiptoft_plasmid_replicon_fastq = read_qc_trim.tiptoft_plasmid_replicon_fastq String? tiptoft_plasmid_replicon_genes = read_qc_trim.tiptoft_plasmid_replicon_genes String? tiptoft_version = read_qc_trim.tiptoft_version - # Assembly - dragonflye outputs - File? assembly_fasta = dragonflye.assembly_fasta - File? contigs_gfa = dragonflye.contigs_gfa - String? dragonflye_version = dragonflye.dragonflye_version + # Assembly - flye_denovo outputs + File? assembly_fasta = flye_denovo.assembly_fasta + File? contigs_gfa = flye_denovo.contigs_gfa + File? bandage_plot = flye_denovo.bandage_plot + String? medaka_model = flye_denovo.medaka_model + String? porechop_version = flye_denovo.porechop_version + String? flye_version = flye_denovo.flye_version + String? bandage_version = flye_denovo.bandage_version + String? medaka_version = flye_denovo.medaka_version + String? racon_version = flye_denovo.racon_version + String? bwa_version = flye_denovo.bwa_version + String? polypolish_version = flye_denovo.polypolish_version + String? dnaapler_version = flye_denovo.dnaapler_version # Assembly QC - quast outputs File? quast_report = quast.quast_report String? quast_version = quast.version diff --git a/workflows/utilities/wf_flye_denovo.wdl b/workflows/utilities/wf_flye_denovo.wdl new file mode 100644 index 000000000..c0bbfba25 --- /dev/null +++ b/workflows/utilities/wf_flye_denovo.wdl @@ -0,0 +1,235 @@ +version 1.0 + +import "../../tasks/quality_control/read_filtering/task_porechop.wdl" as task_porechop +import "../../tasks/assembly/task_flye.wdl" as task_flye +import "../../tasks/assembly/task_bandageplot.wdl" as task_bandage +import "../../tasks/polishing/task_medaka.wdl" as task_medaka +import "../../tasks/polishing/task_racon.wdl" as task_racon +import "../../tasks/assembly/task_dnaapler.wdl" as task_dnaapler +import "../../tasks/quality_control/read_filtering/task_filtercontigs.wdl" as task_filtercontigs +import "../../tasks/alignment/task_bwa.wdl" as task_bwaall +import "../../tasks/polishing/task_polypolish.wdl" as task_polypolish + +workflow flye_denovo { + meta { + description: "This workflow assembles long-read sequencing data using Flye, optionally trims reads with Porechop, and generates an assembly graph visualization with Bandage. It supports consensus polishing with Medaka or Racon for long reads, or Polypolish for hybrid assemblies with Illumina short reads. The workflow concludes by reorienting contigs with Dnaapler for a final assembly." + } + input { + File read1 # raw input reads + File? illumina_read1 # Optional Illumina short-read R1 for hybrid assembly + File? illumina_read2 # Optional Illumina short-read R2 for hybrid assembly + String samplename + String polisher = "medaka" + Int polish_rounds = 1 # Optional number of polishing rounds + + # Task-level inputs for Porechop + Int porechop_cpu = 4 + Int porechop_memory = 8 + Int porechop_disk_size = 50 + String? porechop_trimopts # Optional Porechop trimming options + + # Flye-specific inputs + Boolean flye_ont_corrected = false + Boolean flye_ont_high_quality = false + Boolean flye_pacbio_raw = false + Boolean flye_pacbio_corrected = false + Boolean flye_pacbio_hifi = false + Int? flye_genome_length # Requires `asm_coverage` + Int? flye_asm_coverage # Reduced coverage for initial disjointig assembly + Int flye_polishing_iterations = 1 # Default polishing iterations + Int? flye_minimum_overlap # Minimum overlap between reads + Float? flye_read_error_rate # Maximum expected read error rate + Boolean flye_uneven_coverage_mode = false + Boolean flye_keep_haplotypes = false + Boolean flye_no_alt_contigs = false + Boolean flye_scaffold = false + String? flye_additional_parameters # Any extra Flye-specific parameters + Int flye_cpu = 4 + Int flye_memory = 32 + Int flye_disk_size = 100 + + # Bandage-specific inputs + Int bandage_cpu = 2 + Int bandage_memory = 4 + Int bandage_disk_size = 10 + + # Polypolish-specific inputs + String? polypolish_pair_orientation + Float? polypolish_low_percentile_threshold + Float? polypolish_high_percentile_threshold + Float? polypolish_fraction_invalid + Float? polypolish_fraction_valid + Int? polypolish_maximum_errors + Int? polypolish_minimum_depth + Boolean polypolish_careful = false + Int polypolish_cpu = 1 + Int polypolish_memory = 8 + Int polypolish_disk_size = 100 + + # Medaka-specific inputs + Boolean auto_medaka_model = true # Enable automatic Medaka model selection + String? medaka_model_override # Optional user-specified Medaka model + Int medaka_cpu = 4 + Int medaka_memory = 16 + Int medaka_disk_size = 100 + + # Racon-specific inputs + Int racon_cpu = 8 + Int racon_memory = 16 + Int racon_disk_size = 100 + + # Contig filter-specific inputs + Int filtercontigs_min_len = 1000 + Boolean filtercontigs_homopolymers = true + Int filtercontigs_cpu = 4 + Int filtercontigs_memory = 16 + Int filtercontigs_disk_size = 100 + + # Dnaapler-specific inputs + String dnaapler_mode = "all" + Int dnaapler_cpu = 4 + Int dnaapler_memory = 16 + Int dnaapler_disk_size = 100 + + #Other workdlow-level inputs + Boolean skip_trim_reads = true # Default: No trimming + Boolean skip_polishing = false # Default: Polishing enabled + } + # Optional Porechop trimming before Flye + if (!skip_trim_reads) { + call task_porechop.porechop as porechop { + input: + read1 = read1, + samplename = samplename, + trimopts = porechop_trimopts, + cpu = porechop_cpu, + memory = porechop_memory, + disk_size = porechop_disk_size + } + } + # Call Flye using either trimmed reads or raw reads + call task_flye.flye as flye { + input: + read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available + samplename = samplename, + ont_corrected = flye_ont_corrected, + pacbio_corrected = flye_pacbio_corrected, + pacbio_hifi = flye_pacbio_hifi, + pacbio_raw = flye_pacbio_raw, + ont_high_quality = flye_ont_high_quality, + genome_length = flye_genome_length, + asm_coverage = flye_asm_coverage, + flye_polishing_iterations = flye_polishing_iterations, + minimum_overlap = flye_minimum_overlap, + read_error_rate = flye_read_error_rate, + uneven_coverage_mode = flye_uneven_coverage_mode, + keep_haplotypes = flye_keep_haplotypes, + no_alt_contigs = flye_no_alt_contigs, + scaffold = flye_scaffold, + additional_parameters = flye_additional_parameters, + cpu = flye_cpu, + memory = flye_memory, + disk_size = flye_disk_size + } + # Generate Bandage plot + call task_bandage.bandage_plot as bandage { + input: + assembly_graph_gfa = flye.assembly_graph_gfa, + samplename = samplename, + cpu = bandage_cpu, + memory = bandage_memory, + disk_size = bandage_disk_size + } + # Hybrid Assembly Path: Polypolish + if (defined(illumina_read1) && defined(illumina_read2)) { + call task_bwaall.bwa_all as bwa { + input: + draft_assembly_fasta = flye.assembly_fasta, + read1 = select_first([illumina_read1]), + read2 = select_first([illumina_read2]), + samplename = samplename + } + call task_polypolish.polypolish as polypolish { + input: + assembly_fasta = flye.assembly_fasta, + read1_sam = bwa.read1_sam, + read2_sam = bwa.read2_sam, + samplename = samplename, + illumina_polishing_rounds = polish_rounds, + pair_orientation = polypolish_pair_orientation, + low_percentile_threshold = polypolish_low_percentile_threshold, + high_percentile_threshold = polypolish_high_percentile_threshold, + fraction_invalid = polypolish_fraction_invalid, + fraction_valid = polypolish_fraction_valid, + maximum_errors = polypolish_maximum_errors, + minimum_depth = polypolish_minimum_depth, + careful = polypolish_careful, + cpu = polypolish_cpu, + memory = polypolish_memory, + disk_size = polypolish_disk_size + } + } + # ONT-only Polishing Path: Medaka or Racon + if (!defined(illumina_read1) || !defined(illumina_read2)) { + if (!skip_polishing && polisher == "medaka") { + call task_medaka.medaka_consensus as medaka { + input: + unpolished_fasta = flye.assembly_fasta, + samplename = samplename, + read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available + medaka_model_override = medaka_model_override, + auto_model = auto_medaka_model, + cpu = medaka_cpu, + memory = medaka_memory, + disk_size = medaka_disk_size + } + } + if (!skip_polishing && polisher == "racon") { + call task_racon.racon as racon { + input: + unpolished_fasta = flye.assembly_fasta, + read1 = select_first([porechop.trimmed_reads, read1]), # Use trimmed reads if available + samplename = samplename, + polishing_rounds = polish_rounds, + cpu = racon_cpu, + memory = racon_memory, + disk_size = racon_disk_size + } + } + } + # Contig Filtering and Final Assembly orientation + call task_filtercontigs.contig_filter as contig_filter { + input: + assembly_fasta = select_first([polypolish.polished_assembly, medaka.medaka_fasta, racon.polished_fasta, flye.assembly_fasta]), # Use Flye assembly if no polishing + min_len = filtercontigs_min_len, + filter_homopolymers = filtercontigs_homopolymers, + cpu = filtercontigs_cpu, + memory = filtercontigs_memory, + disk_size = filtercontigs_disk_size + } + call task_dnaapler.dnaapler_all as dnaapler { + input: + input_fasta = contig_filter.filtered_fasta, + samplename = samplename, + dmaapler_mode = dnaapler_mode, + cpu = dnaapler_cpu, + memory = dnaapler_memory, + disk_size = dnaapler_disk_size + } + output { + #update this throughout theiprok + File assembly_fasta = dnaapler.reoriented_fasta + File bandage_plot = bandage.plot + File contigs_gfa = flye.assembly_graph_gfa + String? medaka_model = medaka.medaka_model + #add task versions + String? porechop_version = porechop.porechop_version + String flye_version = flye.flye_version + String bandage_version = bandage.bandage_version + String? medaka_version = medaka.medaka_version + String? racon_version = racon.racon_version + String? bwa_version = bwa.bwa_version + String? polypolish_version = polypolish.polypolish_version + String dnaapler_version = dnaapler.dnaapler_version + } +} \ No newline at end of file