From 42ab935ddd1417e37d2a34ec8680f7e4681047c8 Mon Sep 17 00:00:00 2001
From: Sage Wright <40403716+sage-wright@users.noreply.github.com>
Date: Tue, 22 Aug 2023 10:10:07 -0400
Subject: [PATCH] Smw helix dev (#13)

* first try!!!!

* prep helix for launch

* add helix name fix

* update version
---
 .../helix-gisaid-data-workflow.yaml        | 73 +++++++++++++++++++
 .../launch-helix-gisaid-trigger.sh         |  7 ++
 google_workflows/launch-helix-workflow.sh  |  2 +
 google_workflows/standard-dashboard.sh     | 57 ++++++++++++---
 scripts/gisaid_metadata_cleanser.py        |  8 +-
 scripts/make_terra_set.sh                  | 34 +++++++++
 6 files changed, 169 insertions(+), 12 deletions(-)
 create mode 100644 google_workflows/helix-gisaid-data-workflow.yaml
 create mode 100644 google_workflows/launch-helix-gisaid-trigger.sh
 create mode 100644 google_workflows/launch-helix-workflow.sh
 create mode 100644 scripts/make_terra_set.sh

diff --git a/google_workflows/helix-gisaid-data-workflow.yaml b/google_workflows/helix-gisaid-data-workflow.yaml
new file mode 100644
index 0000000..32b3ce1
--- /dev/null
+++ b/google_workflows/helix-gisaid-data-workflow.yaml
@@ -0,0 +1,73 @@
+main:
+  params: [args]
+  steps:
+    - init:
+        assign:
+          - projectId: ${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")}
+          - region: "us-central1"
+          - batchApi: "batch.googleapis.com/v1"
+          - batchApiUrl: ${ "https://" + batchApi + "/projects/" + projectId + "/locations/" + region + "/jobs"}
+          - jobId: ${ "helix-gisaid-" + string(int(sys.now()))}
+          - newFile: ${args.data.name}
+    - logCreateBatchJob:
+        call: sys.log
+        args:
+          data: ${ "Creating and running the batch job " + jobId}
+    - createAndRunBatchJob:
+        call: http.post
+        args:
+          url: ${batchApiUrl}
+          query:
+            job_id: ${jobId}
+          headers:
+            Content-Type: application/json
+          auth:
+            type: OAuth2
+          body:
+            taskGroups:
+              - taskSpec:
+                  runnables:
+                    - container:
+                        imageUri: "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-08-08-2"
+                        entrypoint: "/bin/bash"
+                        volumes:
+                          - "/mnt/disks/cdph_helix_gisaid_staging:/data"
+                          - "/mnt/disks/cdph_helix_gisaid_staging/.config:/.config"
+                        commands:
+                          - "-c"
+                          - ${ "bash /data/standard-dashboard.sh -d gs://cdph_helix_gisaid_staging -s no -b /data/input_gisaid -o /data -t gs://cdph_helix_gisaid -g gs://fc-6f47810a-0cc3-4e68-b8f0-12cde24d5893 -r helix_gisaid -p cdc-terrabio-taborda-manual -w dataAnalysis_SARS-CoV-2_Helix -q no -m false -i " + newFile + " -k true -x true"}
+                  volumes:
+                    - gcs:
+                        remotePath: "cdph_helix_gisaid_staging"
+                      mountPath: "/mnt/disks/cdph_helix_gisaid_staging"
+                taskCount: 1
+            logsPolicy:
+              destination: CLOUD_LOGGING
+        result: createAndRunBatchJobResponse
+    - getJob:
+        call: http.get
+        args:
+          url: ${batchApiUrl + "/" + jobId}
+          auth:
+            type: OAuth2
+        result: getJobResult
+    - logState:
+        call: sys.log
+        args:
+          data: ${ "Current job state " + getJobResult.body.status.state}
+    - checkState:
+        switch:
+          - condition: ${getJobResult.body.status.state == "SUCCEEDED"}
+            next: returnResult
+          - condition: ${getJobResult.body.status.state == "FAILED"}
+            next: returnResult
+        next: sleep
+    - sleep:
+        call: sys.sleep
+        args:
+          seconds: 10
+        next: getJob
+    - returnResult:
+        return:
+          jobId: ${jobId}
+          status: "OK"
\ No newline at end of file
diff --git a/google_workflows/launch-helix-gisaid-trigger.sh b/google_workflows/launch-helix-gisaid-trigger.sh
new file mode 100644
index 0000000..d6b03bb
--- /dev/null
+++ b/google_workflows/launch-helix-gisaid-trigger.sh
@@ -0,0 +1,7 @@
+gcloud eventarc triggers create helix-gisaid-trigger \
+  --destination-workflow=helix-gisaid \
+  --destination-workflow-location=us-central1 \
+  --event-filters="type=google.cloud.storage.object.v1.finalized" \
+  --event-filters="bucket=cdph_helix_gisaid" \
+  --location=us \
+  --service-account="551108248392-compute@developer.gserviceaccount.com"
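The Eventarc trigger above fires the helix-gisaid workflow whenever an object is finalized in the cdph_helix_gisaid bucket. Because main reads only args.data.name from the event, the deployed workflow can also be smoke-tested by hand with a synthetic payload; a minimal sketch, using a placeholder tarball name:

    gcloud workflows run helix-gisaid \
      --location=us-central1 \
      --data='{"data":{"name":"gisaid_auspice_input_test.tar"}}'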
diff --git a/google_workflows/launch-helix-workflow.sh b/google_workflows/launch-helix-workflow.sh
new file mode 100644
index 0000000..6937f32
--- /dev/null
+++ b/google_workflows/launch-helix-workflow.sh
@@ -0,0 +1,2 @@
+gcloud workflows deploy helix-gisaid \
+  --source=helix-gisaid-data-workflow.yaml
\ No newline at end of file
diff --git a/google_workflows/standard-dashboard.sh b/google_workflows/standard-dashboard.sh
index 62db881..f5577f8 100755
--- a/google_workflows/standard-dashboard.sh
+++ b/google_workflows/standard-dashboard.sh
@@ -4,11 +4,11 @@ set -e
 
 # filename: standard_dashboard.sh
 # authors: Sage Wright, Kevin Libuit, Frank Ambrosio
 
-VERSION="Google Dashboarding v0.1"
+VERSION="Google Dashboarding v0.3"
 
 showHelp() {
 cat << EOF
-Google Dashboarding v0.1
+Google Dashboarding v0.3
 This script is configured to work within a Google Batch job managed by a Google Workflow and Trigger.
 The following variables need to be passed in as input parameters.
 CAUTION: The entire command length must be under 400 characters; using the short version of arguments is recommended
@@ -29,13 +29,14 @@ Usage: ./standard_dashboard.sh
 [ -q | --big-query-table-name ] the name of the big query table to upload to ("sars_cov_2_dashboard.workflow_la_state_gisaid_specimens_test")
 [ -m | --puerto-rico ] apply Puerto Rico-specific changes. available options: true or false
 [ -i | --input-tar-file ] the tar file given to the script by the Google Trigger
-
+[ -k | --skip-bq-load ] skips the bq load step. available options: true or false
+[ -x | --helix ] apply Helix-specific changes. available options: true or false
 
 Happy dashboarding!
 EOF
 }
 
 # use getopt to parse the input arguments
-PARSED_ARGUMENTS=$(getopt -n "standard-dashboard" -o "hvd:j:s:b:o:t:g:r:p:w:q:m:i:" -l "version,help,dashboard-gcp-uri:,dashboard-newline-json:,dashboard-schema:,gisaid-backup-dir:,output-dir:,trigger-bucket:,terra-gcp-uri:,terra-table-root-entity:,terra-project:,terra-workspace:,big-query-table-name:,puerto-rico:,input-tar-file:" -a -- "$@")
+PARSED_ARGUMENTS=$(getopt -n "standard-dashboard" -o "hvd:s:b:o:t:g:r:p:w:q:m:i:k:x:" -l "version,help,dashboard-gcp-uri:,dashboard-schema:,gisaid-backup-dir:,output-dir:,trigger-bucket:,terra-gcp-uri:,terra-table-root-entity:,terra-project:,terra-workspace:,big-query-table-name:,puerto-rico:,input-tar-file:,skip-bq-load:,helix:" -a -- "$@")
 
 eval set -- "$PARSED_ARGUMENTS"
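For reference, a hypothetical invocation using the short flags documented in the help text above; every value below is a placeholder, and the real call (with production buckets and workspace names) is the one embedded in helix-gisaid-data-workflow.yaml:

    bash standard-dashboard.sh \
      -d gs://example-dashboard-bucket \
      -s no \
      -b /data/input_gisaid \
      -o /data \
      -t gs://example-trigger-bucket \
      -g gs://example-terra-workspace-bucket \
      -r example_root_entity \
      -p example-terra-project \
      -w example-terra-workspace \
      -q no \
      -m false \
      -i gisaid_auspice_input_test.tar \
      -k true \
      -x true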
@@ -47,8 +48,6 @@ while true; do
       showHelp; exit 0;;
     -d|--dashboard-gcp-uri)
       dashboard_gcp_uri=$2; shift 2;;
-    -j|--dashboard-newline-json)
-      dashboard_newline_json=$2; shift 2;;
     -s|--dashboard-schema)
       dashboard_schema=$2; shift 2;;
     -b|--gisaid-backup-dir)
@@ -71,6 +70,10 @@ while true; do
       puerto_rico=$2; shift 2;;
     -i|--input-tar-file)
       input_tar_file=$2; shift 2;;
+    -k|--skip-bq-load)
+      skip_bq_load=$2; shift 2;;
+    -x|--helix)
+      helix=$2; shift 2;;
     --)
       shift; break ;;
     *) echo "Unexpected option: $1 -- this should not happen."; exit 1;;
   esac
@@ -100,7 +103,7 @@ make_directory ${output_dir}/backup_jsons
 
 # echo the variables that were provided
 echo -e "Dashboarding Automated System initiated at ${date_tag}\n" | tee ${output_dir}/automation_logs/dashboard-${date_tag}.log
-echo -e "Input variables:\ndashboard_gcp_uri: ${dashboard_gcp_uri},\ndashboard_newline_json: ${dashboard_newline_json},\ndashboard_bq_load_schema: ${dashboard_schema},\ngisaid_backup_dir: ${gisaid_backup_dir},\nmounted_output_dir: ${output_dir},\ntrigger_bucket_gcp_uri: ${trigger_bucket},\nterra_gcp_uri: ${terra_gcp_uri},\nterra_table_root_entity: ${terra_table_root_entity},\nterra_project: ${terra_project},\nterra_workspace: ${terra_workspace},\nbig_query_table_name: ${big_query_table_name}\n" >> ${output_dir}/automation_logs/dashboard-${date_tag}.log
+echo -e "Input variables:\ndashboard_gcp_uri: ${dashboard_gcp_uri},\ndashboard_bq_load_schema: ${dashboard_schema},\ngisaid_backup_dir: ${gisaid_backup_dir},\nmounted_output_dir: ${output_dir},\ntrigger_bucket_gcp_uri: ${trigger_bucket},\nterra_gcp_uri: ${terra_gcp_uri},\nterra_table_root_entity: ${terra_table_root_entity},\nterra_project: ${terra_project},\nterra_workspace: ${terra_workspace},\nbig_query_table_name: ${big_query_table_name}\n" >> ${output_dir}/automation_logs/dashboard-${date_tag}.log
 
 # take in file as input from trigger
 file=${trigger_bucket}/${input_tar_file}
@@ -131,17 +134,17 @@ if [[ "$file" == *"gisaid_auspice_input"*"tar" ]]; then
 \n
 # Create individual fasta files from GISAID multifasta
 \n
-python3 /scripts/gisaid_multifasta_parser.py ${gisaid_dir}/*.sequences.fasta ${gisaid_dir} ${puerto_rico}
+python3 /scripts/gisaid_multifasta_parser.py ${gisaid_dir}/*.sequences.fasta ${gisaid_dir} ${puerto_rico} ${helix}
 \n
 \n
 # Deposit individual fasta files into Terra GCP bucket
 \n
-gsutil -m cp ${gisaid_dir}/individual_gisaid_assemblies_$(date -I)/*.fasta ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_$(date -I)/
+gsutil -m cp ${gisaid_dir}/individual_gisaid_assemblies_${date_tag}/*.fasta ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_${date_tag}/
 \n
 \n
 # Create and import Terra Data table containing GCP pointers to deposited assemblies
 \n
-/scripts/terra_table_from_gcp_assemblies.sh ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_$(date -I) ${terra_project} ${terra_workspace} ${terra_table_root_entity} ${gisaid_dir} \".fasta\" $(date -I)
+/scripts/terra_table_from_gcp_assemblies.sh ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_${date_tag} ${terra_project} ${terra_workspace} ${terra_table_root_entity} ${gisaid_dir} \".fasta\" ${date_tag}
 \n
 \n
 # Capture, reformat, and prune GISAID metadata
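A note on the $(date -I) to ${date_tag} swap in the hunk above: capturing the date once and reusing the same variable keeps the upload path, the Terra table suffix, and the log names in lockstep even if a job runs across midnight, where repeated $(date -I) calls could return two different days. A minimal sketch of the pattern, assuming date_tag is assigned once near the top of the script:

    date_tag=$(date -I)    # captured a single time, e.g. 2023-08-22
    gsutil -m cp ${gisaid_dir}/individual_gisaid_assemblies_${date_tag}/*.fasta \
      ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_${date_tag}/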
@@ -154,6 +157,38 @@ if [[ "$file" == *"gisaid_auspice_input"*"tar" ]]; then
 python3 /scripts/import_large_tsv/import_large_tsv.py --project ${terra_project} --workspace ${terra_workspace} --tsv ${gisaid_dir}/gisaid_metadata_${date_tag}.tsv
 \n
 \n
+if ${skip_bq_load} ; then
+\n
+# Make a set table
+\n
+/scripts/make_terra_set.sh ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_${date_tag} ${terra_project} ${terra_workspace} ${terra_table_root_entity} ${gisaid_dir} \".fasta\" ${date_tag}
+\n
+\n
+# Run TheiaCoV_FASTA on the set
+\n
+TOKEN=`gcloud auth print-access-token`
+\n
+curl -X 'POST' \
+  'https://api.firecloud.org/api/workspaces/${terra_project}/${terra_workspace}/submissions' \
+  -H 'accept: */*' \
+  -H "Authorization: Bearer ${TOKEN}" \
+  -H 'Content-Type: application/json' \
+  -d \"{
+    \"methodConfigurationNamespace\": \"${terra_project}\",
+    \"methodConfigurationName\": \"TheiaCoV_FASTA_PHB\",
+    \"entityType\": \"${terra_table_root_entity}_set\",
+    \"entityName\": \"${date_tag}-set\",
+    \"expression\": \"this.${terra_table_root_entity}s\",
+    \"useCallCache\": true,
+    \"deleteIntermediateOutputFiles\": false,
+    \"useReferenceDisks\": false,
+    \"memoryRetryMultiplier\": 1,
+    \"workflowFailureMode\": \"NoNewCalls\",
+    \"userComment\": \"${date_tag}-set automatically launched\"
+  }\"
+\n
+\n
+else
 # Capture the entire Terra data table as a tsv
 \n
 python3 /scripts/export_large_tsv/export_large_tsv.py --project ${terra_project} --workspace ${terra_workspace} --entity_type ${terra_table_root_entity} --tsv_filename ${gisaid_dir}/full_${terra_table_root_entity}_terra_table_${date_tag}.tsv
@@ -175,6 +210,8 @@ if [[ "$file" == *"gisaid_auspice_input"*"tar" ]]; then
 \n
 bq load --ignore_unknown_values=true --replace=true --source_format=NEWLINE_DELIMITED_JSON ${big_query_table_name} ${dashboard_gcp_uri}/${terra_table_root_entity}.json ${dashboard_schema}
 \n
+fi
+\n
 \n
 "
 
 # write the commands that will be run to the automation log
diff --git a/scripts/gisaid_metadata_cleanser.py b/scripts/gisaid_metadata_cleanser.py
index 7267338..799123e 100755
--- a/scripts/gisaid_metadata_cleanser.py
+++ b/scripts/gisaid_metadata_cleanser.py
@@ -10,6 +10,7 @@ def get_opts():
     p.add_argument('out_file', help='Output file: required, must be a string.')
     p.add_argument('table_name', help='Terra table name: required, must be a string; do not include entity: or _id.')
     p.add_argument('puertorico', help='Perform Puerto Rico-specific actions')
+    p.add_argument('helix', help='Perform Helix-specific actions')
     args = p.parse_args()
     return args
 arguments = get_opts()
@@ -33,6 +34,11 @@ def get_opts():
 # remove any samples uploaded by PR
 meta_df1 = meta_df1[~meta_df1[table_name].str.contains("PR-CVL")]
 
+# perform Helix-specific actions:
+if arguments.helix == "true":
+    # strip the `hCoV-19/USA/CA-` prefix from virus names
+    meta_df1[table_name] = meta_df1[table_name].str.replace('hCoV-19/USA/CA-', '')
+
 # drop extraneous cols
 drop_list = []
 for i in meta_df1.columns.values:
@@ -60,11 +66,9 @@ def get_opts():
 meta_df1['collection_date'].replace('_', value='-', regex=True, inplace=True)
 meta_df1['date_submitted'].replace('_', value='-', regex=True, inplace=True)
-
 # remove the word 'years' from the age column
 meta_df1['age'].replace(' years', value='', regex=True, inplace=True)
-
 # age column cleaning
 # replace string inputs of age ranges with individual numerical age equivalent to the bottom of the bins
 age_range_replace_dict = {'0-4': 4, '5-17': 5, '18-49': 18, '50-64': 50}
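The curl POST in the skip-bq-load branch above returns a JSON body that, per the Firecloud API, includes a submissionId for the launched TheiaCoV_FASTA_PHB run. A hedged sketch for polling that submission afterwards; the workspace names and the SUBMISSION_ID variable are placeholders, not part of this patch:

    TOKEN=$(gcloud auth print-access-token)
    curl -s "https://api.firecloud.org/api/workspaces/example-project/example-workspace/submissions/${SUBMISSION_ID}" \
      -H "Authorization: Bearer ${TOKEN}" \
      -H 'accept: application/json'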
diff --git a/scripts/make_terra_set.sh b/scripts/make_terra_set.sh
new file mode 100644
index 0000000..70843cc
--- /dev/null
+++ b/scripts/make_terra_set.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+gcp_uri=$1
+terra_project=$2
+terra_workspace=$3
+root_entity=$4
+output_dir=$5
+alt_delimiter=$6
+
+date_tag=$7
+set_name=${date_tag}-set
+
+# set default for $alt_delimiter in case user does not specify one
+if [ -z "$alt_delimiter" ]; then
+  alt_delimiter="_"
+fi
+
+assembly_files=$(gsutil ls ${gcp_uri}/*.fasta | awk -F'/' '{ print $NF }')
+
+# make set table header
+echo -e "membership:${root_entity}_set_id\t${root_entity}" > ${output_dir}/${set_name}.tsv
+
+for assembly in $assembly_files; do
+  # capture samplename from assembly filename
+  samplename=$(echo ${assembly} | awk -F "${alt_delimiter}|.fasta" '{ print $1 }')
+  # write samplename to the set
+  echo -e "${set_name}\t${samplename}" >> ${output_dir}/${set_name}.tsv
+done
+
+# remove duplicates from tsv if samplename (column 2) is not unique
+awk '!a[$2]++' ${output_dir}/${set_name}.tsv > temp.tsv && mv temp.tsv ${output_dir}/${set_name}.tsv
+
+# Import Terra table to specified terra_workspace
+python3 /scripts/import_large_tsv/import_large_tsv.py --project ${terra_project} --workspace ${terra_workspace} --tsv ${output_dir}/${set_name}.tsv
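For completeness, a hypothetical run of the new script and the tab-separated set-membership table it writes; the bucket, project, workspace, and sample names below are all placeholders:

    bash make_terra_set.sh \
      gs://example-bucket/uploads/gisaid_individual_assemblies_2023-08-22 \
      example-terra-project example-terra-workspace \
      helix_gisaid /tmp "_" 2023-08-22

    # /tmp/2023-08-22-set.tsv then looks like (tab-separated):
    # membership:helix_gisaid_set_id    helix_gisaid
    # 2023-08-22-set                    USA-CA-CDPH-0001
    # 2023-08-22-set                    USA-CA-CDPH-0002

Terra treats a table whose first column is membership:<entity>_set_id as a set definition, so importing this TSV groups the listed helix_gisaid rows into the 2023-08-22-set entity referenced by the TheiaCoV_FASTA submission in standard-dashboard.sh.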