From 8c80feceb6f1afc5ef1d63cf12c14ede8c808931 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Fri, 15 Nov 2024 10:50:14 -0600 Subject: [PATCH] handle multiple SRR accessions as string version outputs --- .../fetch_srr_accession.md | 2 +- mkdocs.yml | 2 +- .../task_fetch_srr_accession.wdl | 32 +++++++++++-------- .../data_import/wf_fetch_srr_accession.wdl | 13 ++++++-- 4 files changed, 30 insertions(+), 19 deletions(-) diff --git a/docs/workflows/public_data_sharing/fetch_srr_accession.md b/docs/workflows/public_data_sharing/fetch_srr_accession.md index efd1dfae8..b0543972f 100644 --- a/docs/workflows/public_data_sharing/fetch_srr_accession.md +++ b/docs/workflows/public_data_sharing/fetch_srr_accession.md @@ -8,7 +8,7 @@ ## Fetch SRR Accession -This workflow is designed to retrieve the Sequence Read Archive (SRA) accession (SRR) associated with a given sample accession. The primary inputs are BioSample IDs (e.g., SAMN00000000) or SRA Experiment IDs (e.g., SRX000000), which link to sequencing data in the SRA repository. +This workflow retrieves the Sequence Read Archive (SRA) accession (SRR) associated with a given sample accession. The primary inputs are BioSample IDs (e.g., SAMN00000000) or SRA Experiment IDs (e.g., SRX000000), which link to sequencing data in the SRA repository. The workflow uses the fastq-dl tool to fetch metadata from SRA and specifically parses this metadata to extract the associated SRR accession and outputs the SRR accession. 
diff --git a/mkdocs.yml b/mkdocs.yml index 6f63c60a5..2b2bf4808 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -43,8 +43,8 @@ nav: - Samples_to_Ref_Tree: workflows/phylogenetic_placement/samples_to_ref_tree.md - Usher_PHB: workflows/phylogenetic_placement/usher.md - Public Data Sharing: + - Fetch_SRR_Accession: workflows/public_data_sharing/fetch_srr_accession.md - Mercury_Prep_N_Batch: workflows/public_data_sharing/mercury_prep_n_batch.md - - Retrieve_SRR_Metadata: workflows/public_data_sharing/retrieve_srr_metadata.md - Terra_2_GISAID: workflows/public_data_sharing/terra_2_gisaid.md - Terra_2_NCBI: workflows/public_data_sharing/terra_2_ncbi.md - Exporting Data from Terra: diff --git a/tasks/utilities/data_handling/task_fetch_srr_accession.wdl b/tasks/utilities/data_handling/task_fetch_srr_accession.wdl index 5c1a0044f..333b75afb 100644 --- a/tasks/utilities/data_handling/task_fetch_srr_accession.wdl +++ b/tasks/utilities/data_handling/task_fetch_srr_accession.wdl @@ -10,39 +10,43 @@ task fetch_srr_accession { } meta { volatile: true + version: "1.0" } command <<< - mkdir -p metadata_output date -u | tee DATE + fastq-dl --version | tee VERSION + # Debug output to show the sample being processed echo "Fetching metadata for sample accession: ${sample_accession}" - # Use fastq-dl to fetch metadata only - fastq-dl --accession ~{sample_accession} --outdir metadata_output --only-download-metadata --verbose - + # Use fastq-dl to fetch metadata only, outputting to the current directory + fastq-dl --accession ~{sample_accession} --only-download-metadata --verbose - if [[ -f metadata_output/fastq-run-info.tsv ]]; then + if [[ -f fastq-run-info.tsv ]]; then echo "Metadata written for ${sample_accession}:" echo "TSV content:" - cat metadata_output/fastq-run-info.tsv + cat fastq-run-info.tsv + + # Extract the SRR accessions and write them directly to srr_accession.txt + awk -F'\t' 'NR>1 {print $1}' fastq-run-info.tsv > srr_accession.txt - # Extract the SRR accession (It is 
typically in the first column) - SRR_accessions=$(awk -F'\t' 'NR>1 {print $1}' metadata_output/fastq-run-info.tsv) - if [[ -z "${SRR_accessions}" ]]; then - echo "No SRR accession found for ${sample_accession}" > metadata_output/srr_accession.txt + # Check if srr_accession.txt is empty (no SRR accessions found) + if [[ ! -s srr_accession.txt ]]; then + echo "No SRR accession found for ${sample_accession}" > srr_accession.txt else - echo "Extracted SRR accessions: ${SRR_accessions}" - echo "${SRR_accessions}" > metadata_output/srr_accession.txt + echo "Extracted SRR accessions:" + cat srr_accession.txt fi else echo "No metadata found for ${sample_accession}" - echo "No SRR accession found" > metadata_output/srr_accession.txt + echo "No SRR accession found" > srr_accession.txt fi >>> output { - String srr_accession = read_string("metadata_output/srr_accession.txt") + Array[String] srr_accession = read_lines("srr_accession.txt") + String fastq_dl_version = read_string("VERSION") } runtime { docker: docker diff --git a/workflows/utilities/data_import/wf_fetch_srr_accession.wdl b/workflows/utilities/data_import/wf_fetch_srr_accession.wdl index 966695f80..422597c1c 100644 --- a/workflows/utilities/data_import/wf_fetch_srr_accession.wdl +++ b/workflows/utilities/data_import/wf_fetch_srr_accession.wdl @@ -4,16 +4,23 @@ import "../../../tasks/utilities/data_handling/fetch_srr_accession.wdl" as srr_t workflow wf_retrieve_srr { meta { - description: "This workflow retrieves the Sequence Read Archive (SRA) accession (SRR) associated with a given sample accession. It uses the fastq-dl tool to fetch metadata from SRA and outputs the SRR accession that can be used for downstream analysis." + description: "This workflow retrieves the Sequence Read Archive (SRA) accession (SRR) associated with a given sample accession. It uses the fastq-dl tool to fetch metadata from SRA and outputs the SRR accession." 
} input { String sample_accession } - call srr_task.fetch_srr_metadata { + call versioning_task.version_capture { + input: + } + call srr_task.fetch_srr_accession { input: sample_accession = sample_accession } output { - String srr_accession = fetch_srr_metadata.srr_accession + Array[String] srr_accession = fetch_srr_accession.srr_accession + + # Version Captures + String phb_version = version_capture.phb_version + String fetch_srr_date = version_capture.date } } \ No newline at end of file