Smw helix dev (#13)
* first try!!!!

* prep helix for launch

* add helix name fix

* update version
sage-wright authored Aug 22, 2023
1 parent 8e7bcac commit 42ab935
Showing 6 changed files with 169 additions and 12 deletions.
73 changes: 73 additions & 0 deletions google_workflows/helix-gisaid-data-workflow.yaml
@@ -0,0 +1,73 @@
main:
  params: [args]
  steps:
    - init:
        assign:
          - projectId: ${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")}
          - region: "us-central1"
          - batchApi: "batch.googleapis.com/v1"
          - batchApiUrl: ${ "https://" + batchApi + "/projects/" + projectId + "/locations/" + region + "/jobs"}
          - jobId: ${ "helix-gisaid-" + string(int(sys.now()))}
          - newFile: ${args.data.name}
    - logCreateBatchJob:
        call: sys.log
        args:
          data: ${ "Creating and running the batch job " + jobId}
    - createAndRunBatchJob:
        call: http.post
        args:
          url: ${batchApiUrl}
          query:
            job_id: ${jobId}
          headers:
            Content-Type: application/json
          auth:
            type: OAuth2
          body:
            taskGroups:
              - taskSpec:
                  runnables:
                    - container:
                        imageUri: "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-08-08-2"
                        entrypoint: "/bin/bash"
                        volumes:
                          - "/mnt/disks/cdph_helix_gisaid_staging:/data"
                          - "/mnt/disks/cdph_helix_gisaid_staging/.config:/.config"
                        commands:
                          - "-c"
                          - ${ "bash /data/standard-dashboard.sh -d gs://cdph_helix_gisaid_staging -s no -b /data/input_gisaid -o /data -t gs://cdph_helix_gisaid -g gs://fc-6f47810a-0cc3-4e68-b8f0-12cde24d5893 -r helix_gisaid -p cdc-terrabio-taborda-manual -w dataAnalysis_SARS-CoV-2_Helix -q no -m false -i " + newFile + " -k true -x true"}
                  volumes:
                    - gcs:
                        remotePath: "cdph_helix_gisaid_staging"
                      mountPath: "/mnt/disks/cdph_helix_gisaid_staging"
                taskCount: 1
            logsPolicy:
              destination: CLOUD_LOGGING
        result: createAndRunBatchJobResponse
    - getJob:
        call: http.get
        args:
          url: ${batchApiUrl + "/" + jobId}
          auth:
            type: OAuth2
        result: getJobResult
    - logState:
        call: sys.log
        args:
          data: ${ "Current job state " + getJobResult.body.status.state}
    - checkState:
        switch:
          - condition: ${getJobResult.body.status.state == "SUCCEEDED"}
            next: returnResult
          - condition: ${getJobResult.body.status.state == "FAILED"}
            next: returnResult
        next: sleep
    - sleep:
        call: sys.sleep
        args:
          seconds: 10
        next: getJob
    - returnResult:
        return:
          jobId: ${jobId}
          status: "OK"
7 changes: 7 additions & 0 deletions google_workflows/launch-helix-gisaid-trigger.sh
@@ -0,0 +1,7 @@
gcloud eventarc triggers create helix-gisaid-trigger \
--destination-workflow=helix-gisaid \
--destination-workflow-location=us-central1 \
--event-filters="type=google.cloud.storage.object.v1.finalized" \
--event-filters="bucket=cdph_helix_gisaid" \
--location=us \
--service-account="[email protected]"
2 changes: 2 additions & 0 deletions google_workflows/launch-helix-workflow.sh
@@ -0,0 +1,2 @@
gcloud workflows deploy helix-gisaid \
--source=helix-gisaid-data-workflow.yaml
57 changes: 47 additions & 10 deletions google_workflows/standard-dashboard.sh
@@ -4,11 +4,11 @@ set -e
# filename: standard_dashboard.sh
# authors: Sage Wright, Kevin Libuit, Frank Ambrosio

VERSION="Google Dashboarding v0.1"
VERSION="Google Dashboarding v0.3"

showHelp() {
cat << EOF
Google Dashboarding v0.1
Google Dashboarding v0.3
This script is configured to work within a Google Batch job managed by a Google Workflow and Trigger.
The following variables need to be passed in as input parameters.
CAUTION: The entire command length must be under 400 characters; using the short version of arguments is recommended
@@ -29,13 +29,14 @@ Usage: ./standard_dashboard.sh
[ -q | --big-query-table-name ] the name of the big query table to upload to ("sars_cov_2_dashboard.workflow_la_state_gisaid_specimens_test")
[ -m | --puerto-rico ] apply Puerto Rico-specific changes. available options: true or false
[ -i | --input-tar-file ] the tar file given to the script by the Google Trigger
[ -k | --skip-bq-load ] skips the bq load step. available options: true or false
[ -x | --helix ] apply Helix-specific changes. available options: true or false
Happy dashboarding!
EOF
}

# use getopt to parse the input arguments
PARSED_ARGUMENTS=$(getopt -n "standard-dashboard" -o "hvd:j:s:b:o:t:g:r:p:w:q:m:i:" -l "version,help,dashboard-gcp-uri:,dashboard-newline-json:,dashboard-schema:,gisaid-backup-dir:,output-dir:,trigger-bucket:,terra-gcp-uri:,terra-table-root-entity:,terra-project:,terra-workspace:,big-query-table-name:,puerto-rico:,input-tar-file:" -a -- "$@")
PARSED_ARGUMENTS=$(getopt -n "standard-dashboard" -o "hvd:s:b:o:t:g:r:p:w:q:m:i:k:x:" -l "version,help,dashboard-gcp-uri:,dashboard-schema:,gisaid-backup-dir:,output-dir:,trigger-bucket:,terra-gcp-uri:,terra-table-root-entity:,terra-project:,terra-workspace:,big-query-table-name:,puerto-rico:,input-tar-file:,skip-bq-load:,helix:" -a -- "$@")

eval set -- "$PARSED_ARGUMENTS"

@@ -47,8 +48,6 @@ while true; do
showHelp; exit 0;;
-d|--dashboard-gcp-uri)
dashboard_gcp_uri=$2; shift 2;;
-j|--dashboard-newline-json)
dashboard_newline_json=$2; shift 2;;
-s|--dashboard-schema)
dashboard_schema=$2; shift 2;;
-b|--gisaid-backup-dir)
@@ -71,6 +70,10 @@ while true; do
puerto_rico=$2; shift 2;;
-i|--input-tar-file)
input_tar_file=$2; shift 2;;
-k|--skip-bq-load)
skip_bq_load=$2; shift 2;;
-x|--helix)
helix=$2; shift 2;;
--) shift; break ;;
*) echo "Unexpected option: $1 -- this should not happen."; exit 1;;
esac
@@ -100,7 +103,7 @@ make_directory ${output_dir}/backup_jsons

# echo the variables that were provided
echo -e "Dashboarding Automated System initiated at ${date_tag}\n" | tee ${output_dir}/automation_logs/dashboard-${date_tag}.log
echo -e "Input variables:\ndashboard_gcp_uri: ${dashboard_gcp_uri},\ndashboard_newline_json: ${dashboard_newline_json},\ndashboard_bq_load_schema: ${dashboard_schema},\ngisaid_backup_dir: ${gisaid_backup_dir},\nmounted_output_dir: ${output_dir},\ntrigger_bucket_gcp_uri: ${trigger_bucket},\nterra_gcp_uri: ${terra_gcp_uri},\nterra_table_root_entity: ${terra_table_root_entity},\nterra_project: ${terra_project},\nterra_workspace: ${terra_workspace},\nbig_query_table_name: ${big_query_table_name}\n" >> ${output_dir}/automation_logs/dashboard-${date_tag}.log
echo -e "Input variables:\ndashboard_gcp_uri: ${dashboard_gcp_uri},\ndashboard_bq_load_schema: ${dashboard_schema},\ngisaid_backup_dir: ${gisaid_backup_dir},\nmounted_output_dir: ${output_dir},\ntrigger_bucket_gcp_uri: ${trigger_bucket},\nterra_gcp_uri: ${terra_gcp_uri},\nterra_table_root_entity: ${terra_table_root_entity},\nterra_project: ${terra_project},\nterra_workspace: ${terra_workspace},\nbig_query_table_name: ${big_query_table_name}\n" >> ${output_dir}/automation_logs/dashboard-${date_tag}.log

# take in file as input from trigger
file=${trigger_bucket}/${input_tar_file}
@@ -131,17 +134,17 @@ if [[ "$file" == *"gisaid_auspice_input"*"tar" ]]; then
\n
# Create individual fasta files from GISAID multifasta
\n
python3 /scripts/gisaid_multifasta_parser.py ${gisaid_dir}/*.sequences.fasta ${gisaid_dir} ${puerto_rico}
python3 /scripts/gisaid_multifasta_parser.py ${gisaid_dir}/*.sequences.fasta ${gisaid_dir} ${puerto_rico} ${helix}
\n
\n
# Deposit individual fasta files into Terra GCP bucket
\n
gsutil -m cp ${gisaid_dir}/individual_gisaid_assemblies_$(date -I)/*.fasta ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_$(date -I)/
gsutil -m cp ${gisaid_dir}/individual_gisaid_assemblies_${date_tag}/*.fasta ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_${date_tag}/
\n
\n
# Create and import Terra Data table containing GCP pointers to deposited assemblies
\n
/scripts/terra_table_from_gcp_assemblies.sh ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_$(date -I) ${terra_project} ${terra_workspace} ${terra_table_root_entity} ${gisaid_dir} \".fasta\" $(date -I)
/scripts/terra_table_from_gcp_assemblies.sh ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_${date_tag} ${terra_project} ${terra_workspace} ${terra_table_root_entity} ${gisaid_dir} \".fasta\" ${date_tag}
\n
\n
# Capture, reformat, and prune GISAID metadata
@@ -154,6 +157,38 @@ if [[ "$file" == *"gisaid_auspice_input"*"tar" ]]; then
python3 /scripts/import_large_tsv/import_large_tsv.py --project ${terra_project} --workspace ${terra_workspace} --tsv ${gisaid_dir}/gisaid_metadata_${date_tag}.tsv
\n
\n
if ${skip_bq_load} ; then
\n
# Make a set table
\n
/scripts/make_set_table.sh ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_${date_tag} ${terra_project} ${terra_workspace} ${terra_table_root_entity} ${gisaid_dir} \".fasta\" ${date_tag}
\n
\n
# Run TheiaCoV_FASTA on the set
\n
TOKEN=`gcloud auth print-access-token`
\n
curl -X 'POST' \
'https://api.firecloud.org/api/workspaces/${terra_project}/${terra_workspace}/submissions' \
-H 'accept: */*' \
-H "Authorization: Bearer ${TOKEN}" \
-H 'Content-Type: application/json' \
-d \"{
\"methodConfigurationNamespace\": \"${terra_project}\",
\"methodConfigurationName\": \"TheiaCoV_FASTA_PHB\",
\"entityType\": \"${terra_table_root_entity}_set\",
\"entityName\": \"${date_tag}-set\",
\"expression\": \"this.${terra_table_root_entity}s\",
\"useCallCache\": true,
\"deleteIntermediateOutputFiles\": false,
\"useReferenceDisks\": false,
\"memoryRetryMultiplier\": 1,
\"workflowFailureMode\": \"NoNewCalls\",
\"userComment\": \"${date_tag}-set automatically launched\"
}\"
\n
\n
else
# Capture the entire Terra data table as a tsv
\n
python3 /scripts/export_large_tsv/export_large_tsv.py --project ${terra_project} --workspace ${terra_workspace} --entity_type ${terra_table_root_entity} --tsv_filename ${gisaid_dir}/full_${terra_table_root_entity}_terra_table_${date_tag}.tsv
Expand All @@ -175,6 +210,8 @@ if [[ "$file" == *"gisaid_auspice_input"*"tar" ]]; then
\n
bq load --ignore_unknown_values=true --replace=true --source_format=NEWLINE_DELIMITED_JSON ${big_query_table_name} ${dashboard_gcp_uri}/${terra_table_root_entity}.json ${dashboard_schema}
\n
fi
\n
\n
"
# write the commands that will be run to the automation log
8 changes: 6 additions & 2 deletions scripts/gisaid_metadata_cleanser.py
@@ -10,6 +10,7 @@ def get_opts():
    p.add_argument('out_file', help='Output file: required, must be a string.')
    p.add_argument('table_name', help='Terra table name: required, must be a string; do not include entity: or _id.')
    p.add_argument('puertorico', help='Perform Puerto Rico-specific actions')
    p.add_argument('helix', help='Perform Helix-specific actions')
    args = p.parse_args()
    return args
arguments = get_opts()
@@ -33,6 +34,11 @@ def get_opts():
# remove any samples uploaded by PR
meta_df1 = meta_df1[~meta_df1[table_name].str.contains("PR-CVL")]

# perform Helix-specific actions
if arguments.helix == "true":
    # remove the `hCoV-19/USA/CA-` prefix from the virus names
    meta_df1[table_name] = meta_df1[table_name].str.replace('hCoV-19/USA/CA-', '')

# drop extraneous cols
drop_list = []
for i in meta_df1.columns.values:
@@ -60,11 +66,9 @@ def get_opts():
meta_df1['collection_date'].replace('_', value='-', regex=True, inplace=True)
meta_df1['date_submitted'].replace('_', value='-', regex=True, inplace=True)


# remove the word 'years' from the age column
meta_df1['age'].replace(' years', value='', regex=True, inplace=True)


# age column cleaning
# replace string inputs of age ranges with individual numerical age equivalent to the bottom of the bins
age_range_replace_dict = {'0-4': 4, '5-17': 5, '18-49': 18, '50-64': 50}
34 changes: 34 additions & 0 deletions scripts/make_terra_set.sh
@@ -0,0 +1,34 @@
#!/bin/bash

gcp_uri=$1
terra_project=$2
terra_workspace=$3
root_entity=$4
output_dir=$5
alt_delimiter=$6

date_tag=$7
set_name=${date_tag}-set

# set default for $alt_delimiter in case user does not specify one
if [ -z $alt_delimiter ]; then
alt_delimiter="_"
fi

assembly_files=$(gsutil ls ${gcp_uri}/*.fasta | awk -F'/' '{ print $NF }')

# make set table header
echo -e "membership:${root_entity}_set_id\t${root_entity}" > ${output_dir}/${set_name}.tsv

for assembly in $assembly_files; do
# capture samplename from assembly filename
samplename=$(echo ${assembly} | awk -F "${alt_delimiter}|.fasta" '{ print $1 }')
# write samplename to the set
echo -e "${set_name}\t${samplename}" >> ${output_dir}/${set_name}.tsv
done

# remove duplicate rows from the tsv if a samplename is not unique (deduplicate on the samplename column)
awk -F'\t' '!a[$2]++' ${output_dir}/${set_name}.tsv > temp.tsv && mv temp.tsv ${output_dir}/${set_name}.tsv

# Import the Terra set table to the specified terra_workspace
python3 /scripts/import_large_tsv/import_large_tsv.py --project ${terra_project} --workspace ${terra_workspace} --tsv ${output_dir}/${set_name}.tsv
