From 770233c91ba5c301571ff9dc1022aaf3a67134db Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Fri, 22 Nov 2024 12:59:31 -0600 Subject: [PATCH] update logic for no SRR accessions and invalid samples --- .../call-fetch_srr/command | 36 +++++++ .../call-fetch_srr/inputs.json | 3 + .../call-fetch_srr/outputs.json | 4 + .../call-fetch_srr/stderr.txt | 0 .../call-fetch_srr/stderr.txt.offset | 2 + .../call-fetch_srr/stdout.txt | 6 ++ .../call-fetch_srr/task.log | 17 ++++ .../call-fetch_srr/work/DATE | 1 + .../call-fetch_srr/work/VERSION | 1 + .../call-fetch_srr/work/srr_accession.txt | 1 + .../call-fetch_srr/work/stderr.log | 98 +++++++++++++++++++ .../call-version_capture/command | 6 ++ .../call-version_capture/inputs.json | 1 + .../call-version_capture/outputs.json | 4 + .../call-version_capture/stderr.txt | 0 .../call-version_capture/stderr.txt.offset | 2 + .../call-version_capture/stdout.txt | 0 .../call-version_capture/task.log | 11 +++ .../call-version_capture/work/PHB_VERSION | 1 + .../call-version_capture/work/TODAY | 1 + .../inputs.json | 3 + .../outputs.json | 5 + 20241122_125512_fetch_srr_accession/rerun | 1 + .../wdl/tasks/task_versioning.wdl | 30 ++++++ .../task_fetch_srr_accession.wdl | 65 ++++++++++++ .../data_import/wf_fetch_srr_accession.wdl | 26 +++++ .../workflow.log | 45 +++++++++ .../fetch_srr_accession.md | 8 +- .../task_fetch_srr_accession.wdl | 55 +++++------ .../data_import/wf_fetch_srr_accession.wdl | 12 +-- 30 files changed, 403 insertions(+), 42 deletions(-) create mode 100644 20241122_125512_fetch_srr_accession/call-fetch_srr/command create mode 100644 20241122_125512_fetch_srr_accession/call-fetch_srr/inputs.json create mode 100644 20241122_125512_fetch_srr_accession/call-fetch_srr/outputs.json create mode 100644 20241122_125512_fetch_srr_accession/call-fetch_srr/stderr.txt create mode 100644 20241122_125512_fetch_srr_accession/call-fetch_srr/stderr.txt.offset create mode 100644 20241122_125512_fetch_srr_accession/call-fetch_srr/stdout.txt create mode 100644 20241122_125512_fetch_srr_accession/call-fetch_srr/task.log create mode 100644 20241122_125512_fetch_srr_accession/call-fetch_srr/work/DATE create mode 100644 20241122_125512_fetch_srr_accession/call-fetch_srr/work/VERSION create mode 100644 20241122_125512_fetch_srr_accession/call-fetch_srr/work/srr_accession.txt create mode 100644 20241122_125512_fetch_srr_accession/call-fetch_srr/work/stderr.log create mode 100644 20241122_125512_fetch_srr_accession/call-version_capture/command create mode 100644 20241122_125512_fetch_srr_accession/call-version_capture/inputs.json create mode 100644 20241122_125512_fetch_srr_accession/call-version_capture/outputs.json create mode 100644 20241122_125512_fetch_srr_accession/call-version_capture/stderr.txt create mode 100644 20241122_125512_fetch_srr_accession/call-version_capture/stderr.txt.offset create mode 100644 20241122_125512_fetch_srr_accession/call-version_capture/stdout.txt create mode 100644 20241122_125512_fetch_srr_accession/call-version_capture/task.log create mode 100644 20241122_125512_fetch_srr_accession/call-version_capture/work/PHB_VERSION create mode 100644 20241122_125512_fetch_srr_accession/call-version_capture/work/TODAY create mode 100644 20241122_125512_fetch_srr_accession/inputs.json create mode 100644 20241122_125512_fetch_srr_accession/outputs.json create mode 100644 20241122_125512_fetch_srr_accession/rerun create mode 100644 20241122_125512_fetch_srr_accession/wdl/tasks/task_versioning.wdl create mode 100644 20241122_125512_fetch_srr_accession/wdl/tasks/utilities/data_handling/task_fetch_srr_accession.wdl create mode 100644 20241122_125512_fetch_srr_accession/wdl/workflows/utilities/data_import/wf_fetch_srr_accession.wdl create mode 100644 20241122_125512_fetch_srr_accession/workflow.log diff --git a/20241122_125512_fetch_srr_accession/call-fetch_srr/command b/20241122_125512_fetch_srr_accession/call-fetch_srr/command new file mode 100644 index 000000000..cc7887f5b --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-fetch_srr/command @@ -0,0 +1,36 @@ + +set -euo pipefail + +# Output the current date and fastq-dl version for debugging +date -u | tee DATE +fastq-dl --version | tee VERSION + +echo "Fetching metadata for accession: SAMD00010204" + +# Run fastq-dl and capture stderr +fastq-dl --accession SAMD00010204 --only-download-metadata -m 2 --verbose 2> stderr.log || true + +# Handle whether the ID/accession is valid and contains SRR metadata based on stderr +if grep -q "No results found for" stderr.log; then + echo "No SRR accession found" > srr_accession.txt + echo "No SRR accession found for accession: SAMD00010204" +elif grep -q "received an empty response" stderr.log; then + echo "No SRR accession found" > srr_accession.txt + echo "No SRR accession found for accession: SAMD00010204" +elif grep -q "is not a Study, Sample, Experiment, or Run accession" stderr.log; then + echo "Invalid accession: SAMD00010204" >&2 + exit 1 +elif [[ ! -f fastq-run-info.tsv ]]; then + echo "No metadata file found for accession: SAMD00010204" >&2 + exit 1 +else + # Extract SRR accessions from the TSV file if it exists + SRR_accessions=$(awk -F'\t' 'NR>1 {print $1}' fastq-run-info.tsv | paste -sd ',' -) + if [[ -z "${SRR_accessions}" ]]; then + echo "No SRR accession found" > srr_accession.txt + else + echo "Extracted SRR accessions: ${SRR_accessions}" + echo "${SRR_accessions}" > srr_accession.txt + fi +fi + \ No newline at end of file diff --git a/20241122_125512_fetch_srr_accession/call-fetch_srr/inputs.json b/20241122_125512_fetch_srr_accession/call-fetch_srr/inputs.json new file mode 100644 index 000000000..ad458cf3f --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-fetch_srr/inputs.json @@ -0,0 +1,3 @@ +{ + "sample_accession": "SAMD00010204" +} diff --git a/20241122_125512_fetch_srr_accession/call-fetch_srr/outputs.json b/20241122_125512_fetch_srr_accession/call-fetch_srr/outputs.json new file mode 100644 index 000000000..1115b53be --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-fetch_srr/outputs.json @@ -0,0 +1,4 @@ +{ + "fetch_srr_accession.fastq_dl_version": "fastq-dl, version 2.0.4", + "fetch_srr_accession.srr_accession": "No SRR accession found" +} diff --git a/20241122_125512_fetch_srr_accession/call-fetch_srr/stderr.txt b/20241122_125512_fetch_srr_accession/call-fetch_srr/stderr.txt new file mode 100644 index 000000000..e69de29bb diff --git a/20241122_125512_fetch_srr_accession/call-fetch_srr/stderr.txt.offset b/20241122_125512_fetch_srr_accession/call-fetch_srr/stderr.txt.offset new file mode 100644 index 000000000..c1e400b6c --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-fetch_srr/stderr.txt.offset @@ -0,0 +1,2 @@ +17043145 +0 diff --git a/20241122_125512_fetch_srr_accession/call-fetch_srr/stdout.txt b/20241122_125512_fetch_srr_accession/call-fetch_srr/stdout.txt new file mode 100644 index 000000000..0d8003536 --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-fetch_srr/stdout.txt @@ -0,0 +1,6 @@ +Fri Nov 22 18:55:14 UTC 2024 +fastq-dl, version 2.0.4 +Fetching metadata for accession: SAMD00010204 +No results found for SAMD00010204 +No results found for SAMD00010204 +No SRR accession found for accession: SAMD00010204 diff --git a/20241122_125512_fetch_srr_accession/call-fetch_srr/task.log b/20241122_125512_fetch_srr_accession/call-fetch_srr/task.log new file mode 100644 index 000000000..9eeb26f4a --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-fetch_srr/task.log @@ -0,0 +1,17 @@ +2024-11-22 12:55:12.808 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE task setup :: name: "fetch_srr_accession", source: "../../../tasks/utilities/data_handling/task_fetch_srr_accession.wdl", line: 3, column: 1, dir: "/home/frasc/bioinformatics_projects/public_health_bioinformatics/20241122_125512_fetch_srr_accession/call-fetch_srr", thread: 132198141265472 +2024-11-22 12:55:12.958 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE docker swarm resources :: workers: 1, max_cpus: 4, max_mem_bytes: 16767336448, total_cpus: 4, total_mem_bytes: 16767336448 +2024-11-22 12:55:12.958 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO input :: name: "sample_accession", value: "SAMD00010204" +2024-11-22 12:55:12.959 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO eval :: name: "memory", value: 8 +2024-11-22 12:55:12.960 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO eval :: name: "docker", value: "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0" +2024-11-22 12:55:12.961 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO eval :: name: "cpu", value: 2 +2024-11-22 12:55:12.962 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO eval :: name: "disk_size", value: 10 +2024-11-22 12:55:12.963 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO effective runtime :: docker: "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0", cpu: 2, memory_reservation: 8000000000, preemptible: 1 +2024-11-22 12:55:12.964 wdl.w:fetch_srr_accession.t:call-fetch_srr WARNING ignored runtime settings :: keys: ["disks", "disk"] +2024-11-22 12:55:12.978 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE docker image :: tag: "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0", id: "sha256:c6689b7f5754d89574331af9a748cdb84e89107ecfafe8855fcdc745d41f0674", RepoDigest: "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl@sha256:c0a1484561017e0f14e9cb8ceddfac2f28e3576a9bf1a8b743bd12183f4e38b4" +2024-11-22 12:55:14.613 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE docker task running :: service: "b752vpzdc7g1", task: "rl8xyzauey", node: "t2vz2h1tc6", message: "started" +2024-11-22 12:55:40.942 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO docker task complete :: service: "b752vpzdc7g1", task: "rl8xyzauey", node: "t2vz2h1tc6", message: "finished" +2024-11-22 12:55:40.942 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE docker task exit :: state: "complete", exit_code: 0 +2024-11-22 12:55:41.247 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO command stdout unused; consider output `File cmd_out = stdout()` or redirect command to stderr log >&2 :: stdout_file: "/home/frasc/bioinformatics_projects/public_health_bioinformatics/20241122_125512_fetch_srr_accession/call-fetch_srr/stdout.txt" +2024-11-22 12:55:41.247 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO output :: name: "srr_accession", value: "No SRR accession found" +2024-11-22 12:55:41.248 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO output :: name: "fastq_dl_version", value: "fastq-dl, version 2.0.4" +2024-11-22 12:55:41.250 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE done diff --git a/20241122_125512_fetch_srr_accession/call-fetch_srr/work/DATE b/20241122_125512_fetch_srr_accession/call-fetch_srr/work/DATE new file mode 100644 index 000000000..e176829c9 --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-fetch_srr/work/DATE @@ -0,0 +1 @@ +Fri Nov 22 18:55:14 UTC 2024 diff --git a/20241122_125512_fetch_srr_accession/call-fetch_srr/work/VERSION b/20241122_125512_fetch_srr_accession/call-fetch_srr/work/VERSION new file mode 100644 index 000000000..9d40c6f7d --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-fetch_srr/work/VERSION @@ -0,0 +1 @@ +fastq-dl, version 2.0.4 diff --git a/20241122_125512_fetch_srr_accession/call-fetch_srr/work/srr_accession.txt b/20241122_125512_fetch_srr_accession/call-fetch_srr/work/srr_accession.txt new file mode 100644 index 000000000..7d20bf94b --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-fetch_srr/work/srr_accession.txt @@ -0,0 +1 @@ +No SRR accession found diff --git a/20241122_125512_fetch_srr_accession/call-fetch_srr/work/stderr.log b/20241122_125512_fetch_srr_accession/call-fetch_srr/work/stderr.log new file mode 100644 index 000000000..26984750c --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-fetch_srr/work/stderr.log @@ -0,0 +1,98 @@ +2024-11-22 18:55:17 DEBUG 2024-11-22 18:55:17:root:DEBUG - fastq_dl.py:500 + Querying ENA for metadata (Attempt + 1 of 2) + DEBUG 2024-11-22 connectionpool.py:1048 + 18:55:17:urllib3.connectionp + ool:DEBUG - Starting new + HTTPS connection (1): + www.ebi.ac.uk:443 +2024-11-22 18:55:18 DEBUG 2024-11-22 connectionpool.py:546 + 18:55:18:urllib3.connectionpo + ol:DEBUG - + https://www.ebi.ac.uk:443 + "GET + /ena/portal/api/search?result + =read_run&format=tsv&query=%2 + 2(sample_accession=SAMD000102 + 04%20OR%20secondary_sample_ac + cession=SAMD00010204)%22&fiel + ds=all HTTP/1.1" 200 2973 + WARNING 2024-11-22 18:55:18:root:WARNING - fastq_dl.py:531 + Querying ENA was unsuccessful, + retrying after (10 seconds) +2024-11-22 18:55:28 DEBUG 2024-11-22 18:55:28:root:DEBUG - fastq_dl.py:504 + Querying SRA for metadata (Attempt + 1 of 2) + DEBUG 2024-11-22 connectionpool.py:1048 + 18:55:28:urllib3.connectionp + ool:DEBUG - Starting new + HTTPS connection (1): + www.ebi.ac.uk:443 +2024-11-22 18:55:29 DEBUG 2024-11-22 connectionpool.py:546 + 18:55:29:urllib3.connectionpo + ol:DEBUG - + https://www.ebi.ac.uk:443 + "GET + /ena/portal/api/search?result + =read_run&format=tsv&query=%2 + 2(sample_accession=SAMD000102 + 04%20OR%20secondary_sample_ac + cession=SAMD00010204)%22&fiel + ds=all HTTP/1.1" 200 2973 + DEBUG 2024-11-22 18:55:29:root:DEBUG - fastq_dl.py:514 + Failed to get metadata from ENA. + Trying SRA... + DEBUG 2024-11-22 connectionpool.py:1048 + 18:55:29:urllib3.connectionp + ool:DEBUG - Starting new + HTTPS connection (1): + eutils.ncbi.nlm.nih.gov:443 + DEBUG 2024-11-22 connectionpool.py:546 + 18:55:29:urllib3.connectionpo + ol:DEBUG - + https://eutils.ncbi.nlm.nih.g + ov:443 "POST + /entrez/eutils/esearch.fcgi + HTTP/1.1" 200 None + WARNING 2024-11-22 18:55:29:root:WARNING - fastq_dl.py:525 + Querying SRA was unsuccessful, + retrying after (10 seconds) +2024-11-22 18:55:39 DEBUG 2024-11-22 18:55:39:root:DEBUG - fastq_dl.py:504 + Querying SRA for metadata (Attempt + 2 of 2) + DEBUG 2024-11-22 connectionpool.py:1048 + 18:55:39:urllib3.connectionp + ool:DEBUG - Starting new + HTTPS connection (1): + www.ebi.ac.uk:443 + DEBUG 2024-11-22 connectionpool.py:546 + 18:55:39:urllib3.connectionpo + ol:DEBUG - + https://www.ebi.ac.uk:443 + "GET + /ena/portal/api/search?result + =read_run&format=tsv&query=%2 + 2(sample_accession=SAMD000102 + 04%20OR%20secondary_sample_ac + cession=SAMD00010204)%22&fiel + ds=all HTTP/1.1" 200 2973 + DEBUG 2024-11-22 connectionpool.py:1048 + 18:55:39:urllib3.connectionp + ool:DEBUG - Starting new + HTTPS connection (1): + eutils.ncbi.nlm.nih.gov:443 +2024-11-22 18:55:40 DEBUG 2024-11-22 connectionpool.py:546 + 18:55:40:urllib3.connectionpo + ol:DEBUG - + https://eutils.ncbi.nlm.nih.g + ov:443 "POST + /entrez/eutils/esearch.fcgi + HTTP/1.1" 200 None + ERROR 2024-11-22 18:55:40:root:ERROR - fastq_dl.py:519 + There was an issue querying ENA and + SRA, exiting... + ERROR 2024-11-22 18:55:40:root:ERROR - fastq_dl.py:520 + STATUS: 200 + ERROR 2024-11-22 18:55:40:root:ERROR - fastq_dl.py:521 + TEXT: Query was successful, but + received an empty response diff --git a/20241122_125512_fetch_srr_accession/call-version_capture/command b/20241122_125512_fetch_srr_accession/call-version_capture/command new file mode 100644 index 000000000..5e440a801 --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-version_capture/command @@ -0,0 +1,6 @@ + +PHB_Version="PHB v2.2.1" + +date +"%Y-%m-%d" > TODAY +echo "$PHB_Version" > PHB_VERSION + \ No newline at end of file diff --git a/20241122_125512_fetch_srr_accession/call-version_capture/inputs.json b/20241122_125512_fetch_srr_accession/call-version_capture/inputs.json new file mode 100644 index 000000000..0967ef424 --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-version_capture/inputs.json @@ -0,0 +1 @@ +{} diff --git a/20241122_125512_fetch_srr_accession/call-version_capture/outputs.json b/20241122_125512_fetch_srr_accession/call-version_capture/outputs.json new file mode 100644 index 000000000..59e141099 --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-version_capture/outputs.json @@ -0,0 +1,4 @@ +{ + "version_capture.date": "2024-11-22", + "version_capture.phb_version": "PHB v2.2.1" +} diff --git a/20241122_125512_fetch_srr_accession/call-version_capture/stderr.txt b/20241122_125512_fetch_srr_accession/call-version_capture/stderr.txt new file mode 100644 index 000000000..e69de29bb diff --git a/20241122_125512_fetch_srr_accession/call-version_capture/stderr.txt.offset b/20241122_125512_fetch_srr_accession/call-version_capture/stderr.txt.offset new file mode 100644 index 000000000..f475bf3a4 --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-version_capture/stderr.txt.offset @@ -0,0 +1,2 @@ +17043147 +0 diff --git a/20241122_125512_fetch_srr_accession/call-version_capture/stdout.txt b/20241122_125512_fetch_srr_accession/call-version_capture/stdout.txt new file mode 100644 index 000000000..e69de29bb diff --git a/20241122_125512_fetch_srr_accession/call-version_capture/task.log b/20241122_125512_fetch_srr_accession/call-version_capture/task.log new file mode 100644 index 000000000..193ce4da9 --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-version_capture/task.log @@ -0,0 +1,11 @@ +2024-11-22 12:55:12.807 wdl.w:fetch_srr_accession.t:call-version_capture NOTICE task setup :: name: "version_capture", source: "../../../tasks/task_versioning.wdl", line: 3, column: 1, dir: "/home/frasc/bioinformatics_projects/public_health_bioinformatics/20241122_125512_fetch_srr_accession/call-version_capture", thread: 132198151751232 +2024-11-22 12:55:12.960 wdl.w:fetch_srr_accession.t:call-version_capture INFO eval :: name: "docker", value: "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" +2024-11-22 12:55:12.960 wdl.w:fetch_srr_accession.t:call-version_capture INFO eval :: name: "timezone", value: null +2024-11-22 12:55:12.963 wdl.w:fetch_srr_accession.t:call-version_capture INFO effective runtime :: docker: "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0", cpu: 1, memory_reservation: 1000000000, preemptible: 1 +2024-11-22 12:55:12.964 wdl.w:fetch_srr_accession.t:call-version_capture WARNING ignored runtime settings :: keys: ["disks", "dx_instance_type"] +2024-11-22 12:55:12.980 wdl.w:fetch_srr_accession.t:call-version_capture NOTICE docker image :: tag: "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0", id: "sha256:e5b3b43b59e1cd3267788b867d9d4c84d4ffc8236278541b3cc6963784c57a5f", RepoDigest: "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash@sha256:f62289e07dea809f88322fbed3a42057f95177e44c8622a38baf22e8113d1ab0" +2024-11-22 12:55:15.058 wdl.w:fetch_srr_accession.t:call-version_capture INFO docker task complete :: service: "q7zvoncm26cc", task: "k6un27duii", node: "t2vz2h1tc6", message: "finished" +2024-11-22 12:55:15.058 wdl.w:fetch_srr_accession.t:call-version_capture NOTICE docker task exit :: state: "complete", exit_code: 0 +2024-11-22 12:55:15.412 wdl.w:fetch_srr_accession.t:call-version_capture INFO output :: name: "date", value: "2024-11-22" +2024-11-22 12:55:15.413 wdl.w:fetch_srr_accession.t:call-version_capture INFO output :: name: "phb_version", value: "PHB v2.2.1" +2024-11-22 12:55:15.415 wdl.w:fetch_srr_accession.t:call-version_capture NOTICE done diff --git a/20241122_125512_fetch_srr_accession/call-version_capture/work/PHB_VERSION b/20241122_125512_fetch_srr_accession/call-version_capture/work/PHB_VERSION new file mode 100644 index 000000000..84c16373f --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-version_capture/work/PHB_VERSION @@ -0,0 +1 @@ +PHB v2.2.1 diff --git a/20241122_125512_fetch_srr_accession/call-version_capture/work/TODAY b/20241122_125512_fetch_srr_accession/call-version_capture/work/TODAY new file mode 100644 index 000000000..3e78e2263 --- /dev/null +++ b/20241122_125512_fetch_srr_accession/call-version_capture/work/TODAY @@ -0,0 +1 @@ +2024-11-22 diff --git a/20241122_125512_fetch_srr_accession/inputs.json b/20241122_125512_fetch_srr_accession/inputs.json new file mode 100644 index 000000000..285328d34 --- /dev/null +++ b/20241122_125512_fetch_srr_accession/inputs.json @@ -0,0 +1,3 @@ +{ + "fetch_srr_accession.sample_accession": "SAMD00010204" +} diff --git a/20241122_125512_fetch_srr_accession/outputs.json b/20241122_125512_fetch_srr_accession/outputs.json new file mode 100644 index 000000000..0fede9882 --- /dev/null +++ b/20241122_125512_fetch_srr_accession/outputs.json @@ -0,0 +1,5 @@ +{ + "fetch_srr_accession.fetch_srr_accession_analysis_date": "2024-11-22", + "fetch_srr_accession.fetch_srr_accession_version": "PHB v2.2.1", + "fetch_srr_accession.srr_accession": "No SRR accession found" +} diff --git a/20241122_125512_fetch_srr_accession/rerun b/20241122_125512_fetch_srr_accession/rerun new file mode 100644 index 000000000..232b951d5 --- /dev/null +++ b/20241122_125512_fetch_srr_accession/rerun @@ -0,0 +1 @@ +pushd /home/frasc/bioinformatics_projects/public_health_bioinformatics && miniwdl run --verbose /home/frasc/bioinformatics_projects/public_health_bioinformatics/workflows/utilities/data_import/wf_fetch_srr_accession.wdl -- sample_accession=SAMD00010204; popd diff --git a/20241122_125512_fetch_srr_accession/wdl/tasks/task_versioning.wdl b/20241122_125512_fetch_srr_accession/wdl/tasks/task_versioning.wdl new file mode 100644 index 000000000..fab908614 --- /dev/null +++ b/20241122_125512_fetch_srr_accession/wdl/tasks/task_versioning.wdl @@ -0,0 +1,30 @@ +version 1.0 + +task version_capture { + input { + String? timezone + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" + } + meta { + volatile: true + } + command { + PHB_Version="PHB v2.2.1" + ~{default='' 'export TZ=' + timezone} + date +"%Y-%m-%d" > TODAY + echo "$PHB_Version" > PHB_VERSION + } + output { + String date = read_string("TODAY") + String phb_version = read_string("PHB_VERSION") + } + runtime { + memory: "1 GB" + cpu: 1 + docker: docker + disks: "local-disk 10 HDD" + dx_instance_type: "mem1_ssd1_v2_x2" + preemptible: 1 + } +} + diff --git a/20241122_125512_fetch_srr_accession/wdl/tasks/utilities/data_handling/task_fetch_srr_accession.wdl b/20241122_125512_fetch_srr_accession/wdl/tasks/utilities/data_handling/task_fetch_srr_accession.wdl new file mode 100644 index 000000000..6eae676ba --- /dev/null +++ b/20241122_125512_fetch_srr_accession/wdl/tasks/utilities/data_handling/task_fetch_srr_accession.wdl @@ -0,0 +1,65 @@ +version 1.0 + +task fetch_srr_accession { + input { + String sample_accession + String docker = "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0" + Int disk_size = 10 + Int cpu = 2 + Int memory = 8 + } + meta { + volatile: true + } + + command <<< + set -euo pipefail + + # Output the current date and fastq-dl version for debugging + date -u | tee DATE + fastq-dl --version | tee VERSION + + echo "Fetching metadata for accession: ~{sample_accession}" + + # Run fastq-dl and capture stderr + fastq-dl --accession ~{sample_accession} --only-download-metadata -m 2 --verbose 2> stderr.log || true + + # Handle whether the ID/accession is valid and contains SRR metadata based on stderr + if grep -q "No results found for" stderr.log; then + echo "No SRR accession found" > srr_accession.txt + echo "No SRR accession found for accession: ~{sample_accession}" + elif grep -q "received an empty response" stderr.log; then + echo "No SRR accession found" > srr_accession.txt + echo "No SRR accession found for accession: ~{sample_accession}" + elif grep -q "is not a Study, Sample, Experiment, or Run accession" stderr.log; then + echo "Invalid accession: ~{sample_accession}" >&2 + exit 1 + elif [[ ! -f fastq-run-info.tsv ]]; then + echo "No metadata file found for accession: ~{sample_accession}" >&2 + exit 1 + else + # Extract SRR accessions from the TSV file if it exists + SRR_accessions=$(awk -F'\t' 'NR>1 {print $1}' fastq-run-info.tsv | paste -sd ',' -) + if [[ -z "${SRR_accessions}" ]]; then + echo "No SRR accession found" > srr_accession.txt + else + echo "Extracted SRR accessions: ${SRR_accessions}" + echo "${SRR_accessions}" > srr_accession.txt + fi + fi + >>> + + output { + String srr_accession = read_string("srr_accession.txt") + String fastq_dl_version = read_string("VERSION") + } + + runtime { + docker: docker + memory: "~{memory} GB" + cpu: cpu + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + preemptible: 1 + } +} diff --git a/20241122_125512_fetch_srr_accession/wdl/workflows/utilities/data_import/wf_fetch_srr_accession.wdl b/20241122_125512_fetch_srr_accession/wdl/workflows/utilities/data_import/wf_fetch_srr_accession.wdl new file mode 100644 index 000000000..e40e54a0f --- /dev/null +++ b/20241122_125512_fetch_srr_accession/wdl/workflows/utilities/data_import/wf_fetch_srr_accession.wdl @@ -0,0 +1,26 @@ +version 1.0 + +import "../../../tasks/utilities/data_handling/task_fetch_srr_accession.wdl" as srr_task +import "../../../tasks/task_versioning.wdl" as versioning_task + +workflow fetch_srr_accession { + meta { + description: "This workflow retrieves the Sequence Read Archive (SRA) accession (SRR) associated with a given sample accession. It uses the fastq-dl tool to fetch metadata from SRA and outputs the SRR accession." + } + input { + String sample_accession + } + call versioning_task.version_capture { + input: + } + call srr_task.fetch_srr_accession as fetch_srr { + input: + sample_accession = sample_accession + } + output { + String srr_accession = fetch_srr.srr_accession + # Version Captures + String fetch_srr_accession_version = version_capture.phb_version + String fetch_srr_accession_analysis_date = version_capture.date + } +} diff --git a/20241122_125512_fetch_srr_accession/workflow.log b/20241122_125512_fetch_srr_accession/workflow.log new file mode 100644 index 000000000..76293479f --- /dev/null +++ b/20241122_125512_fetch_srr_accession/workflow.log @@ -0,0 +1,45 @@ +2024-11-22 12:55:12.797 wdl.w:fetch_srr_accession NOTICE workflow start :: name: "fetch_srr_accession", source: "/home/frasc/bioinformatics_projects/public_health_bioinformatics/workflows/utilities/data_import/wf_fetch_srr_accession.wdl", line: 6, column: 1, dir: "/home/frasc/bioinformatics_projects/public_health_bioinformatics/20241122_125512_fetch_srr_accession" +2024-11-22 12:55:12.801 wdl.w:fetch_srr_accession NOTICE miniwdl :: version: "v1.12.1", uname: "Linux fcombe-dev-vm 6.8.0-1018-gcp #20~22.04.1-Ubuntu SMP Thu Nov 7 18:30:15 UTC 2024 x86_64" +2024-11-22 12:55:12.801 wdl.w:fetch_srr_accession INFO task thread pool initialized :: task_concurrency: 4 +2024-11-22 12:55:12.806 wdl.w:fetch_srr_accession NOTICE ready :: job: "call-version_capture", callee: "version_capture" +2024-11-22 12:55:12.806 wdl.w:fetch_srr_accession INFO input :: job: "call-version_capture", values: {} +2024-11-22 12:55:12.806 wdl.w:fetch_srr_accession INFO visit :: node: "decl-sample_accession", values: {"sample_accession": "SAMD00010204"} +2024-11-22 12:55:12.807 wdl.w:fetch_srr_accession NOTICE ready :: job: "call-fetch_srr", callee: "fetch_srr_accession" +2024-11-22 12:55:12.807 wdl.w:fetch_srr_accession INFO input :: job: "call-fetch_srr", values: {"sample_accession": "SAMD00010204"} +2024-11-22 12:55:12.807 wdl.w:fetch_srr_accession.t:call-version_capture NOTICE task setup :: name: "version_capture", source: "../../../tasks/task_versioning.wdl", line: 3, column: 1, dir: "/home/frasc/bioinformatics_projects/public_health_bioinformatics/20241122_125512_fetch_srr_accession/call-version_capture", thread: 132198151751232 +2024-11-22 12:55:12.808 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE task setup :: name: "fetch_srr_accession", source: "../../../tasks/utilities/data_handling/task_fetch_srr_accession.wdl", line: 3, column: 1, dir: "/home/frasc/bioinformatics_projects/public_health_bioinformatics/20241122_125512_fetch_srr_accession/call-fetch_srr", thread: 132198141265472 +2024-11-22 12:55:12.958 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE docker swarm resources :: workers: 1, max_cpus: 4, max_mem_bytes: 16767336448, total_cpus: 4, total_mem_bytes: 16767336448 +2024-11-22 12:55:12.958 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO input :: name: "sample_accession", value: "SAMD00010204" +2024-11-22 12:55:12.959 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO eval :: name: "memory", value: 8 +2024-11-22 12:55:12.960 wdl.w:fetch_srr_accession.t:call-version_capture INFO eval :: name: "docker", value: "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" +2024-11-22 12:55:12.960 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO eval :: name: "docker", value: "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0" +2024-11-22 12:55:12.960 wdl.w:fetch_srr_accession.t:call-version_capture INFO eval :: name: "timezone", value: null +2024-11-22 12:55:12.961 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO eval :: name: "cpu", value: 2 +2024-11-22 12:55:12.962 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO eval :: name: "disk_size", value: 10 +2024-11-22 12:55:12.963 wdl.w:fetch_srr_accession.t:call-version_capture INFO effective runtime :: docker: "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0", cpu: 1, memory_reservation: 1000000000, preemptible: 1 +2024-11-22 12:55:12.963 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO effective runtime :: docker: "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0", cpu: 2, memory_reservation: 8000000000, preemptible: 1 +2024-11-22 12:55:12.964 wdl.w:fetch_srr_accession.t:call-version_capture WARNING ignored runtime settings :: keys: ["disks", "dx_instance_type"] +2024-11-22 12:55:12.964 wdl.w:fetch_srr_accession.t:call-fetch_srr WARNING ignored runtime settings :: keys: ["disks", "disk"] +2024-11-22 12:55:12.978 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE docker image :: tag: "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0", id: "sha256:c6689b7f5754d89574331af9a748cdb84e89107ecfafe8855fcdc745d41f0674", RepoDigest: "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl@sha256:c0a1484561017e0f14e9cb8ceddfac2f28e3576a9bf1a8b743bd12183f4e38b4" +2024-11-22 12:55:12.980 wdl.w:fetch_srr_accession.t:call-version_capture NOTICE docker image :: tag: "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0", id: "sha256:e5b3b43b59e1cd3267788b867d9d4c84d4ffc8236278541b3cc6963784c57a5f", RepoDigest: "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash@sha256:f62289e07dea809f88322fbed3a42057f95177e44c8622a38baf22e8113d1ab0" +2024-11-22 12:55:14.613 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE docker task running :: service: "b752vpzdc7g1", task: "rl8xyzauey", node: "t2vz2h1tc6", message: "started" +2024-11-22 12:55:15.058 wdl.w:fetch_srr_accession.t:call-version_capture INFO docker task complete :: service: "q7zvoncm26cc", task: "k6un27duii", node: "t2vz2h1tc6", message: "finished" +2024-11-22 12:55:15.058 wdl.w:fetch_srr_accession.t:call-version_capture NOTICE docker task exit :: state: "complete", exit_code: 0 +2024-11-22 12:55:15.412 wdl.w:fetch_srr_accession.t:call-version_capture INFO output :: name: "date", value: "2024-11-22" +2024-11-22 12:55:15.413 wdl.w:fetch_srr_accession.t:call-version_capture INFO output :: name: "phb_version", value: "PHB v2.2.1" +2024-11-22 12:55:15.415 wdl.w:fetch_srr_accession.t:call-version_capture NOTICE done +2024-11-22 12:55:15.415 wdl.w:fetch_srr_accession NOTICE finish :: job: "call-version_capture" +2024-11-22 12:55:15.416 wdl.w:fetch_srr_accession INFO output :: job: "call-version_capture", values: {"phb_version": "PHB v2.2.1", "date": "2024-11-22"} +2024-11-22 12:55:15.416 wdl.w:fetch_srr_accession INFO visit :: node: "output-fetch_srr_accession_analysis_date", values: {"fetch_srr_accession_analysis_date": "2024-11-22"} +2024-11-22 12:55:15.416 wdl.w:fetch_srr_accession INFO visit :: node: "output-fetch_srr_accession_version", values: {"fetch_srr_accession_version": "PHB v2.2.1"} +2024-11-22 12:55:40.942 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO docker task complete :: service: "b752vpzdc7g1", task: "rl8xyzauey", node: "t2vz2h1tc6", message: "finished" +2024-11-22 12:55:40.942 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE docker task exit :: state: "complete", exit_code: 0 +2024-11-22 12:55:41.247 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO command stdout unused; consider output `File cmd_out = stdout()` or redirect command to stderr log >&2 :: stdout_file: "/home/frasc/bioinformatics_projects/public_health_bioinformatics/20241122_125512_fetch_srr_accession/call-fetch_srr/stdout.txt" +2024-11-22 12:55:41.247 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO output :: name: "srr_accession", value: "No SRR accession found" +2024-11-22 12:55:41.248 wdl.w:fetch_srr_accession.t:call-fetch_srr INFO output :: name: "fastq_dl_version", value: "fastq-dl, version 2.0.4" +2024-11-22 12:55:41.250 wdl.w:fetch_srr_accession.t:call-fetch_srr NOTICE done +2024-11-22 12:55:41.250 wdl.w:fetch_srr_accession NOTICE finish :: job: "call-fetch_srr" +2024-11-22 12:55:41.250 wdl.w:fetch_srr_accession INFO output :: job: "call-fetch_srr", values: {"fastq_dl_version": "fastq-dl, version 2.0.4", "srr_accession": "No SRR accession found"} +2024-11-22 12:55:41.250 wdl.w:fetch_srr_accession INFO visit :: node: "output-srr_accession", values: {"srr_accession": "No SRR accession found"} +2024-11-22 12:55:41.251 wdl.w:fetch_srr_accession INFO visit :: node: "outputs", values: {"srr_accession": "No SRR accession found", "fetch_srr_accession_version": "PHB v2.2.1", "fetch_srr_accession_analysis_date": "2024-11-22"} +2024-11-22 12:55:41.251 wdl.w:fetch_srr_accession NOTICE done diff --git a/docs/workflows/public_data_sharing/fetch_srr_accession.md b/docs/workflows/public_data_sharing/fetch_srr_accession.md index b48be2bcb..eb9ccac22 100644 --- a/docs/workflows/public_data_sharing/fetch_srr_accession.md +++ b/docs/workflows/public_data_sharing/fetch_srr_accession.md @@ -16,11 +16,11 @@ The workflow uses the fastq-dl tool to fetch metadata from SRA and specifically | **Terra Task Name** | **Variable** | **Type** | **Description**| **Default Value** | **Terra Status** | | --- | --- | --- | --- | --- | --- | -| fetch_srr_metadata | **sample_accession** | String | SRA-compatible accession, such as a **BioSample ID** (e.g., "SAMN00000000") or **SRA Experiment ID** (e.g., "SRX000000"), used to retrieve SRR metadata. | | Required | -| fetch_srr_metadata | **docker**| String | Docker image for metadata retrieval. | `us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0` | Optional | -| fetch_srr_metadata | **disk_size** | Int | Disk space in GB allocated for the task. | 10 | Optional | | fetch_srr_metadata | **cpu** | Int | Number of CPUs allocated for the task. | 2 | Optional | +| fetch_srr_metadata | **disk_size** | Int | Disk space in GB allocated for the task. | 10 | Optional | +| fetch_srr_metadata | **docker**| String | Docker image for metadata retrieval. | `us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0` | Optional | | fetch_srr_metadata | **memory** | Int | Memory in GB allocated for the task. | 8 | Optional | +| fetch_srr_metadata | **sample_accession** | String | SRA-compatible accession, such as a **BioSample ID** (e.g., "SAMN00000000") or **SRA Experiment ID** (e.g., "SRX000000"), used to retrieve SRR metadata. | | Required | ### Workflow Tasks @@ -35,7 +35,7 @@ This workflow has a single task that performs metadata retrieval for the specifi | Task | [Task on GitHub](https://github.com/theiagen-org/phb-workflows/blob/main/tasks/utilities/data_handling/task_fetch_srr_metadata.wdl) | | Software Source Code | [fastq-dl Source](https://github.com/rvalieris/fastq-dl) | | Software Documentation | [fastq-dl Documentation](https://github.com/rvalieris/fastq-dl#documentation) | - | Original Publication | [fastq-dl Publication](https://doi.org/10.1186/s12859-021-04346-3) | + | Original Publication | [fastq-dl: A fast and reliable tool for downloading SRA metadata](https://doi.org/10.1186/s12859-021-04346-3) | ### Outputs diff --git a/tasks/utilities/data_handling/task_fetch_srr_accession.wdl b/tasks/utilities/data_handling/task_fetch_srr_accession.wdl index c8913587a..6eae676ba 100644 --- a/tasks/utilities/data_handling/task_fetch_srr_accession.wdl +++ b/tasks/utilities/data_handling/task_fetch_srr_accession.wdl @@ -19,50 +19,41 @@ task fetch_srr_accession { date -u | tee DATE fastq-dl --version | tee VERSION - # Fetch metadata for the sample accession - echo "Fetching metadata for valid biosample ID or SRA: ~{sample_accession}" - if fastq-dl --accession ~{sample_accession} --only-download-metadata --verbose 2> stderr; then - if [[ -f fastq-run-info.tsv ]]; then - echo "Metadata written for valid biosample ID or SRA: ~{sample_accession}" - cat fastq-run-info.tsv + echo "Fetching metadata for accession: ~{sample_accession}" - # Extract SRR accessions from the TSV file - SRR_accessions=$(awk -F'\t' 'NR>1 {print $1}' fastq-run-info.tsv | paste -sd ',' -) + # Run fastq-dl and capture stderr + fastq-dl --accession ~{sample_accession} --only-download-metadata -m 2 --verbose 2> stderr.log || true - if [[ -z "${SRR_accessions}" ]]; then - # Valid biosample ID or SRA, but no SRR accessions found - echo "No SRR accession found for valid biosample ID or SRA: ~{sample_accession}" > srr_accession.txt - else - # Valid biosample ID or SRA with SRR accessions - echo "Extracted SRR accessions: ${SRR_accessions}" - echo "${SRR_accessions}" > srr_accession.txt - fi - else - # No metadata file generated, treat as no SRRs found for valid biosample - echo "No metadata file found for valid biosample ID or SRA: ~{sample_accession}" - echo "No SRR accession found" > srr_accession.txt - fi + # Handle whether the ID/accession is valid and contains SRR metadata based on stderr + if grep -q "No results found for" stderr.log; then + echo "No SRR accession found" > srr_accession.txt + echo "No SRR accession found for accession: ~{sample_accession}" + elif grep -q "received an empty response" stderr.log; then + echo "No SRR accession found" > srr_accession.txt + echo "No SRR accession found for accession: ~{sample_accession}" + elif grep -q "is not a Study, Sample, Experiment, or Run accession" stderr.log; then + echo "Invalid accession: ~{sample_accession}" >&2 + exit 1 + elif [[ ! -f fastq-run-info.tsv ]]; then + echo "No metadata file found for accession: ~{sample_accession}" >&2 + exit 1 else - # Check stderr for specific error messages - if grep -q "Query was successful, but received an empty response" stderr; then - # Valid biosample ID or SRA, but no data found output No SRR accession found - echo "No SRR accession found for valid biosample ID or SRA: ~{sample_accession} -Query was successful, but received an empty response" > srr_accession.txt + # Extract SRR accessions from the TSV file if it exists + SRR_accessions=$(awk -F'\t' 'NR>1 {print $1}' fastq-run-info.tsv | paste -sd ',' -) + if [[ -z "${SRR_accessions}" ]]; then echo "No SRR accession found" > srr_accession.txt - elif grep -q "is not a Study, Sample, Experiment, or Run accession" stderr; then - # Invalid accession ID or SRA Fail workflow - echo "Invalid biosample ID or SRA: ~{sample_accession}" - exit 1 else - # Unexpected error - echo "fastq-dl failed for ~{sample_accession} due to an unknown error." - exit 1 + echo "Extracted SRR accessions: ${SRR_accessions}" + echo "${SRR_accessions}" > srr_accession.txt fi fi >>> + output { String srr_accession = read_string("srr_accession.txt") String fastq_dl_version = read_string("VERSION") } + runtime { docker: docker memory: "~{memory} GB" diff --git a/workflows/utilities/data_import/wf_fetch_srr_accession.wdl b/workflows/utilities/data_import/wf_fetch_srr_accession.wdl index d28d042e9..e40e54a0f 100644 --- a/workflows/utilities/data_import/wf_fetch_srr_accession.wdl +++ b/workflows/utilities/data_import/wf_fetch_srr_accession.wdl @@ -3,7 +3,7 @@ version 1.0 import "../../../tasks/utilities/data_handling/task_fetch_srr_accession.wdl" as srr_task import "../../../tasks/task_versioning.wdl" as versioning_task -workflow fetch_srr { +workflow fetch_srr_accession { meta { description: "This workflow retrieves the Sequence Read Archive (SRA) accession (SRR) associated with a given sample accession. It uses the fastq-dl tool to fetch metadata from SRA and outputs the SRR accession." } @@ -13,14 +13,14 @@ workflow fetch_srr { call versioning_task.version_capture { input: } - call srr_task.fetch_srr_accession { + call srr_task.fetch_srr_accession as fetch_srr { input: sample_accession = sample_accession } output { - String srr_accession = fetch_srr_accession.srr_accession + String srr_accession = fetch_srr.srr_accession # Version Captures - String phb_version = version_capture.phb_version - String fetch_srr_date = version_capture.date + String fetch_srr_accession_version = version_capture.phb_version + String fetch_srr_accession_analysis_date = version_capture.date } -} \ No newline at end of file +}