From 39ece663024fe0b3a95ed1bbf8babf816400e0dc Mon Sep 17 00:00:00 2001 From: Michal-Babins Date: Fri, 8 Nov 2024 20:39:12 +0000 Subject: [PATCH 1/7] Add iqtree model extraction to augur_tree task and update workflow output --- .../augur/task_augur_tree.wdl | 15 +++++++++++++++ workflows/phylogenetics/wf_augur.wdl | 1 + 2 files changed, 16 insertions(+) diff --git a/tasks/phylogenetic_inference/augur/task_augur_tree.wdl b/tasks/phylogenetic_inference/augur/task_augur_tree.wdl index f16c73618..a8b4f30ea 100644 --- a/tasks/phylogenetic_inference/augur/task_augur_tree.wdl +++ b/tasks/phylogenetic_inference/augur/task_augur_tree.wdl @@ -19,6 +19,9 @@ task augur_tree { # capture version information augur version > VERSION + # Get alignment basename + FASTA_BASENAME=$(basename ~{aligned_fasta} .fasta) + AUGUR_RECURSION_LIMIT=10000 augur tree \ --alignment "~{aligned_fasta}" \ --output "~{build_name}_~{method}.nwk" \ @@ -28,10 +31,22 @@ task augur_tree { ~{"--tree-builder-args " + tree_builder_args} \ ~{true="--override-default-args" false="" override_default_args} \ --nthreads auto + + # If iqtree, get the model used + if [ "~{method}" == "iqtree" ]; then + if [ "~{substitution_model}" == "auto" ]; then + MODEL=$(grep "Best-fit model:" ${FASTA_BASENAME}-delim.fasta.log | sed 's|Best-fit model: ||g;s|chosen.*||' | tr -d '\n\r') + else + MODEL="~{substitution_model}" + fi + echo "$MODEL" > FINAL_MODEL.txt + fi >>> + output { File aligned_tree = "~{build_name}_~{method}.nwk" String augur_version = read_string("VERSION") + String? iqtree_model_used = read_string("FINAL_MODEL.txt") } runtime { docker: docker diff --git a/workflows/phylogenetics/wf_augur.wdl b/workflows/phylogenetics/wf_augur.wdl index 3398d430f..e77c20fb5 100644 --- a/workflows/phylogenetics/wf_augur.wdl +++ b/workflows/phylogenetics/wf_augur.wdl @@ -203,6 +203,7 @@ workflow augur { File? auspice_input_json = augur_export.auspice_json File? 
time_tree = augur_refine.refined_tree File distance_tree = augur_tree.aligned_tree + String? iqtree_model_used = augur_tree.iqtree_model_used File aligned_fastas = select_first([augur_align.aligned_fasta, alignment_fasta]) File combined_assemblies = filter_sequences_by_length.filtered_fasta File? metadata_merged = tsv_join.out_tsv From d181a1747fe503346f8d7463e1b78d828a08c162 Mon Sep 17 00:00:00 2001 From: Michal-Babins Date: Fri, 8 Nov 2024 21:17:07 +0000 Subject: [PATCH 2/7] Refactor augur_tree task to correctly derive FASTA basename and directory for iqtree model extraction --- tasks/phylogenetic_inference/augur/task_augur_tree.wdl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tasks/phylogenetic_inference/augur/task_augur_tree.wdl b/tasks/phylogenetic_inference/augur/task_augur_tree.wdl index a8b4f30ea..be6566595 100644 --- a/tasks/phylogenetic_inference/augur/task_augur_tree.wdl +++ b/tasks/phylogenetic_inference/augur/task_augur_tree.wdl @@ -19,9 +19,6 @@ task augur_tree { # capture version information augur version > VERSION - # Get alignment basename - FASTA_BASENAME=$(basename ~{aligned_fasta} .fasta) - AUGUR_RECURSION_LIMIT=10000 augur tree \ --alignment "~{aligned_fasta}" \ --output "~{build_name}_~{method}.nwk" \ @@ -35,7 +32,9 @@ task augur_tree { # If iqtree, get the model used if [ "~{method}" == "iqtree" ]; then if [ "~{substitution_model}" == "auto" ]; then - MODEL=$(grep "Best-fit model:" ${FASTA_BASENAME}-delim.fasta.log | sed 's|Best-fit model: ||g;s|chosen.*||' | tr -d '\n\r') + FASTA_BASENAME=$(basename ~{aligned_fasta} .fasta) + FASTA_DIR=$(dirname ~{aligned_fasta}) + MODEL=$(grep "Best-fit model:" ${FASTA_DIR}/${FASTA_BASENAME}-delim.iqtree.log | sed 's|Best-fit model: ||g;s|chosen.*||' | tr -d '\n\r') else MODEL="~{substitution_model}" fi From 2ac0c0d358ae92014d64e1d0224105e0da38f2a7 Mon Sep 17 00:00:00 2001 From: Michal-Babins Date: Tue, 12 Nov 2024 14:42:38 +0000 Subject: [PATCH 3/7] Add iqtree model used field 
to augur workflow documentation --- docs/workflows/phylogenetic_construction/augur.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/workflows/phylogenetic_construction/augur.md b/docs/workflows/phylogenetic_construction/augur.md index 7ccf78d56..7e885d945 100644 --- a/docs/workflows/phylogenetic_construction/augur.md +++ b/docs/workflows/phylogenetic_construction/augur.md @@ -290,6 +290,7 @@ The Nextstrain team hosts documentation surrounding the Augur workflow → Auspi | auspice_input_json | File | JSON file used as input to Auspice | | combined_assemblies | File | Concatenated FASTA file containing all samples | | distance_tree | File | The distance tree created in Newick (.nwk) format | +| iqtree_model_used | String | The iqtree model used during augur tree | | keep_list | File | A list of samples included in the phylogenetic tree | | metadata_merged | File | Tab-delimited text file of the merged augur_metadata input files from all samples | | snp_matrix | File | The SNP distance matrix for all samples used in the phylogenetic tree | From a9a4e65a54a1880c6ca6992cdf966a4674f3d29a Mon Sep 17 00:00:00 2001 From: Michal-Babins Date: Tue, 12 Nov 2024 14:53:32 +0000 Subject: [PATCH 4/7] Handle empty substitution model case in augur_tree task --- tasks/phylogenetic_inference/augur/task_augur_tree.wdl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tasks/phylogenetic_inference/augur/task_augur_tree.wdl b/tasks/phylogenetic_inference/augur/task_augur_tree.wdl index be6566595..af50fb9f9 100644 --- a/tasks/phylogenetic_inference/augur/task_augur_tree.wdl +++ b/tasks/phylogenetic_inference/augur/task_augur_tree.wdl @@ -39,6 +39,8 @@ task augur_tree { MODEL="~{substitution_model}" fi echo "$MODEL" > FINAL_MODEL.txt + else + echo "" > FINAL_MODEL.txt fi >>> From 064d6778c01bf7f00ebf18a87a2cb6738fabdb07 Mon Sep 17 00:00:00 2001 From: Michal-Babins Date: Tue, 12 Nov 2024 15:23:17 +0000 Subject: [PATCH 5/7] Ensure iqtree_model_used is a non-nullable String in 
augur_tree task and workflow --- tasks/phylogenetic_inference/augur/task_augur_tree.wdl | 2 +- workflows/phylogenetics/wf_augur.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/phylogenetic_inference/augur/task_augur_tree.wdl b/tasks/phylogenetic_inference/augur/task_augur_tree.wdl index af50fb9f9..22bd469e7 100644 --- a/tasks/phylogenetic_inference/augur/task_augur_tree.wdl +++ b/tasks/phylogenetic_inference/augur/task_augur_tree.wdl @@ -47,7 +47,7 @@ task augur_tree { output { File aligned_tree = "~{build_name}_~{method}.nwk" String augur_version = read_string("VERSION") - String? iqtree_model_used = read_string("FINAL_MODEL.txt") + String iqtree_model_used = read_string("FINAL_MODEL.txt") } runtime { docker: docker diff --git a/workflows/phylogenetics/wf_augur.wdl b/workflows/phylogenetics/wf_augur.wdl index e77c20fb5..3a6b64ba9 100644 --- a/workflows/phylogenetics/wf_augur.wdl +++ b/workflows/phylogenetics/wf_augur.wdl @@ -203,7 +203,7 @@ workflow augur { File? auspice_input_json = augur_export.auspice_json File? time_tree = augur_refine.refined_tree File distance_tree = augur_tree.aligned_tree - String? iqtree_model_used = augur_tree.iqtree_model_used + String iqtree_model_used = augur_tree.iqtree_model_used File aligned_fastas = select_first([augur_align.aligned_fasta, alignment_fasta]) File combined_assemblies = filter_sequences_by_length.filtered_fasta File? 
metadata_merged = tsv_join.out_tsv From 8e8f1f2a77a13bd13df6fa1835359ce8536573b1 Mon Sep 17 00:00:00 2001 From: Michal-Babins Date: Thu, 14 Nov 2024 16:03:48 +0000 Subject: [PATCH 6/7] Update augur workflow documentation to include model options for substitution model and rename iqtree model variable for clarity --- docs/workflows/phylogenetic_construction/augur.md | 2 +- workflows/phylogenetics/wf_augur.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/workflows/phylogenetic_construction/augur.md b/docs/workflows/phylogenetic_construction/augur.md index 7e885d945..24c53d8f7 100644 --- a/docs/workflows/phylogenetic_construction/augur.md +++ b/docs/workflows/phylogenetic_construction/augur.md @@ -220,7 +220,7 @@ This workflow runs on the set level. Please note that for every task, runtime pa | augur_tree | **exclude_sites** | File | File of one-based sites to exclude for raw tree building (BED format in .bed files, DRM format in tab-delimited files, or one position per line) | | Optional | | augur_tree | **method** | String | Which method to use to build the tree; options: "fasttree", "raxml", "iqtree" | iqtree | Optional | | augur_tree | **override_default_args** | Boolean | If true, override default tree builder arguments instead of augmenting them | FALSE | Optional | -| augur_tree | **substitution_model** | String | The substitution model to use; only available for iqtree. Specify "auto" to run ModelTest; options: "GTR" | GTR | Optional | +| augur_tree | **substitution_model** | String | The substitution model to use; only available for iqtree. Specify "auto" to run ModelTest; model options can be found [here](http://www.iqtree.org/doc/Substitution-Models) | GTR | Optional | | augur_tree | **tree_builder_args** | String | Additional tree builder arguments either augmenting or overriding the default arguments. FastTree defaults: "-nt -nosupport". RAxML defaults: "-f d -m GTRCAT -c 25 -p 235813". 
IQ-TREE defaults: "-ninit 2 -n 2 -me 0.05 -nt AUTO -redo" | | Optional | | sc2_defaults | **nextstrain_ncov_repo_commit** | String | The version (commit) of the Nextstrain ncov repository from which to draw default values for SARS-CoV-2. | `23d1243127e8838a61b7e5c1a72bc419bf8c5a0d` | Optional | | organism_parameters | **gene_locations_bed_file** | File | Use to provide locations of interest where average coverage will be calculated | Defaults are organism-specific. Please find default values for some organisms here: . For an organism without set defaults, an empty file is provided, "gs://theiagen-public-files/terra/theiacov-files/empty.bed", but will not be as useful as an organism specific gene locations bed file. | Optional | diff --git a/workflows/phylogenetics/wf_augur.wdl b/workflows/phylogenetics/wf_augur.wdl index 3a6b64ba9..bb003b705 100644 --- a/workflows/phylogenetics/wf_augur.wdl +++ b/workflows/phylogenetics/wf_augur.wdl @@ -203,7 +203,7 @@ workflow augur { File? auspice_input_json = augur_export.auspice_json File? time_tree = augur_refine.refined_tree File distance_tree = augur_tree.aligned_tree - String iqtree_model_used = augur_tree.iqtree_model_used + String augur_iqtree_model_used = augur_tree.iqtree_model_used File aligned_fastas = select_first([augur_align.aligned_fasta, alignment_fasta]) File combined_assemblies = filter_sequences_by_length.filtered_fasta File? 
metadata_merged = tsv_join.out_tsv From 548ca2696b04a90dd273bc944a6ff50c8d68ace5 Mon Sep 17 00:00:00 2001 From: Michal-Babins Date: Thu, 14 Nov 2024 19:07:26 +0000 Subject: [PATCH 7/7] Add augur_iqtree_model_used variable to documentation for clarity --- docs/workflows/phylogenetic_construction/augur.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/workflows/phylogenetic_construction/augur.md b/docs/workflows/phylogenetic_construction/augur.md index 24c53d8f7..d8eb10f9f 100644 --- a/docs/workflows/phylogenetic_construction/augur.md +++ b/docs/workflows/phylogenetic_construction/augur.md @@ -284,13 +284,13 @@ The Nextstrain team hosts documentation surrounding the Augur workflow → Auspi | **Variable** | **Type** | **Description** | | --- | --- | --- | | aligned_fastas | File | A FASTA file of the aligned genomes | +| augur_iqtree_model_used | String | The iqtree model used during augur tree | | augur_phb_analysis_date | String | The date the analysis was run | | augur_phb_version | String | The version of the Public Health Bioinformatics (PHB) repository used | | augur_version | String | Version of Augur used | | auspice_input_json | File | JSON file used as input to Auspice | | combined_assemblies | File | Concatenated FASTA file containing all samples | | distance_tree | File | The distance tree created in Newick (.nwk) format | -| iqtree_model_used | String | The iqtree model used during augur tree | | keep_list | File | A list of samples included in the phylogenetic tree | | metadata_merged | File | Tab-delimited text file of the merged augur_metadata input files from all samples | | snp_matrix | File | The SNP distance matrix for all samples used in the phylogenetic tree |