diff --git a/google_workflows/launch-louisiana-dashboard-trigger.sh b/google_workflows/launch-louisiana-dashboard-trigger.sh
new file mode 100644
index 0000000..2334988
--- /dev/null
+++ b/google_workflows/launch-louisiana-dashboard-trigger.sh
@@ -0,0 +1,7 @@
+gcloud eventarc triggers create louisisana-gisaid-trigger-standard \
+  --destination-workflow=louisiana-dashboard-standard \
+  --destination-workflow-location=us-central1 \
+  --event-filters="type=google.cloud.storage.object.v1.finalized" \
+  --event-filters="bucket=louisiana-gisaid-data" \
+  --location=us \
+  --service-account="551108248392-compute@developer.gserviceaccount.com"
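Any object finalized in `louisiana-gisaid-data` fires this trigger; the filtering for GISAID auspice tarballs happens later in `standard-dashboard.sh`. A minimal end-to-end smoke test might look like the sketch below (the tarball name is a placeholder):

```bash
# Copying any object into the watched bucket fires the trigger; only names
# matching gisaid_auspice_input*tar are actually processed downstream.
gsutil cp gisaid_auspice_input_example.tar gs://louisiana-gisaid-data/
```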
diff --git a/google_workflows/launch-louisiana-workflow.sh b/google_workflows/launch-louisiana-workflow.sh
new file mode 100644
index 0000000..6a89a6b
--- /dev/null
+++ b/google_workflows/launch-louisiana-workflow.sh
@@ -0,0 +1,2 @@
+gcloud workflows deploy louisiana-dashboard-standard \
+  --source=louisiana-dashboard-workflow.yaml
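The workflow has to exist before the Eventarc trigger above can point at it, so this deploy script runs first. One way to sanity-check the deployment afterwards, assuming the workflow lands in us-central1 (the region the trigger's `--destination-workflow-location` expects):

```bash
# Hypothetical post-deploy check: confirm the workflow exists in the expected region
gcloud workflows describe louisiana-dashboard-standard --location=us-central1
```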
diff --git a/google_workflows/louisiana-dashboard-workflow.yaml b/google_workflows/louisiana-dashboard-workflow.yaml
new file mode 100644
index 0000000..2ccf4a1
--- /dev/null
+++ b/google_workflows/louisiana-dashboard-workflow.yaml
@@ -0,0 +1,61 @@
+main:
+  params: [args]
+  steps:
+    - init:
+        assign:
+          - projectId: ${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")}
+          - region: "us-central1"
+          - batchApi: "batch.googleapis.com/v1"
+          - batchApiUrl: ${ "https://" + batchApi + "/projects/" + projectId + "/locations/" + region + "/jobs"}
+          - jobId: ${ "louisiana-dashboard-" + string(int(sys.now()))}
+          - newFile: ${args.data.name}
+    - logCreateBatchJob:
+        call: sys.log
+        args:
+          data: ${ "Creating and running the batch job " + jobId}
+    - createAndRunBatchJob:
+        call: http.post
+        args:
+          url: ${batchApiUrl}
+          query:
+            job_id: ${jobId}
+          headers:
+            Content-Type: application/json
+          auth:
+            type: OAuth2
+          body:
+            taskGroups:
+              - taskSpec:
+                  runnables:
+                    - container:
+                        imageUri: "quay.io/theiagen/terra-tools:2023-02-13"
+                        entrypoint: "/bin/bash"
+                        volumes:
+                          - "/mnt/disks/louisiana-bucket:/data"
+                          - "/mnt/disks/louisiana-bucket/.config:/.config"
+                        commands:
+                          - "-c"
+                          - ${ "bash /data/standard-dashboard.sh -d gs://louisiana-dashboard-processing -j gisaid_louisiana_data.json -s /data/bq_schema/schema_LA_v6.json -b /data/input_gisaid -o /data -t gs://louisiana-gisaid-data -g gs://fc-6c0c9352-49f4-4673-a41c-71baddb16f42 -r gisaid_louisiana_data -p cdc-terra-la-phl -w CDC-COVID-LA-Dashboard-Test -q sars_cov_2_dashboard.workflow_test -i " + newFile}
+                  volumes:
+                    - gcs:
+                        remotePath: "louisiana-dashboard-processing"
+                      mountPath: "/mnt/disks/louisiana-bucket"
+                taskCount: 1
+            logsPolicy:
+              destination: CLOUD_LOGGING
+        result: createAndRunBatchJobResponse
+    - getJob:
+        call: http.get
+        args:
+          url: ${batchApiUrl + "/" + jobId}
+          auth:
+            type: OAuth2
+        result: getJobResult
+    - logState:
+        call: sys.log
+        args:
+          data: ${ "Current job state " + getJobResult.body.status.state}
+    - returnResult:
+        return:
+          jobId: ${jobId}
+          status: "OK"
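For debugging it can help to run the workflow by hand rather than waiting for a bucket event. A sketch, assuming the only event field the workflow reads is `data.name` and using a placeholder tar filename:

```bash
# Manually execute the workflow with a payload shaped like the GCS finalized-object event
gcloud workflows run louisiana-dashboard-standard \
  --location=us-central1 \
  --data='{"data": {"name": "gisaid_auspice_input_example.tar"}}'
```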
diff --git a/google_workflows/standard-dashboard.sh b/google_workflows/standard-dashboard.sh
new file mode 100755
index 0000000..6865f91
--- /dev/null
+++ b/google_workflows/standard-dashboard.sh
@@ -0,0 +1,192 @@
+#!/bin/bash
+set -e
+
+# filename: standard-dashboard.sh
+# authors: Sage Wright, Kevin Libuit, Frank Ambrosio
+
+VERSION="Google Dashboarding v0.1"
+
+showHelp() {
+cat << EOF
+Google Dashboarding v0.1
+This script is configured to work within a Google Batch job managed by a Google Workflow and Trigger.
+The following variables need to be passed in as input parameters.
+CAUTION: The entire command length must be under 400 characters; using the short version of arguments is recommended.
+
+Usage: ./standard-dashboard.sh
+  [ -v | --version ] display version and quit
+  [ -h | --help ] display this help message and quit
+  [ -d | --dashboard-gcp-uri ] the gcp bucket where dashboard processing will occur ("gs://louisiana-dashboard-processing")
+  [ -j | --dashboard-newline-json ] the name of the dashboard newline json file to be uploaded to Big Query ("gisaid_louisiana_data.json")
+  [ -s | --dashboard-schema ] the path in the mounted directory where the dashboard schema can be found ("/data/bq_schema/schema_LA_v6.json")
+  [ -b | --gisaid-backup-dir ] the path in the mounted directory where the gisaid data will be copied ("/data/input_gisaid")
+  [ -o | --output-dir ] the path in the mounted directory where the output files will be written ("/data")
+  [ -t | --trigger-bucket ] the gcp bucket that the trigger watches ("gs://louisiana-gisaid-data")
+  [ -g | --terra-gcp-uri ] the dashboard terra bucket ("gs://fc-6c0c9352-49f4-4673-a41c-71baddb16f42")
+  [ -r | --terra-table-root-entity ] the terra table where the data will be stored ("gisaid_louisiana_data")
+  [ -p | --terra-project ] the project hosting the terra workspace ("cdc-terra-la-phl")
+  [ -w | --terra-workspace ] the terra workspace ("CDC-COVID-LA-Dashboard-Test")
+  [ -q | --big-query-table-name ] the name of the big query table to upload to ("sars_cov_2_dashboard.workflow_la_state_gisaid_specimens_test")
+  [ -m | --metadata-parameters ] (optional) any additional metadata cleanser parameters (enclose in quotes); available options: "--puertorico"
+  [ -i | --input-tar-file ] the tar file given to the script by the Google Trigger
+
+Happy dashboarding!
+EOF
+}
+
+# use getopt to parse the input arguments
+PARSED_ARGUMENTS=$(getopt -n "standard-dashboard" -o "hvd:j:s:b:o:t:g:r:p:w:q:m::i:" -l "version,help,dashboard-gcp-uri:,dashboard-newline-json:,dashboard-schema:,gisaid-backup-dir:,output-dir:,trigger-bucket:,terra-gcp-uri:,terra-table-root-entity:,terra-project:,terra-workspace:,big-query-table-name:,metadata-parameters::,input-tar-file:" -a -- "$@")
+
+eval set -- "$PARSED_ARGUMENTS"
+
+while true; do
+  case "$1" in
+    -v|--version)
+      echo $VERSION; exit 0;;
+    -h|--help)
+      showHelp; exit 0;;
+    -d|--dashboard-gcp-uri)
+      dashboard_gcp_uri=$2; shift 2;;
+    -j|--dashboard-newline-json)
+      dashboard_newline_json=$2; shift 2;;
+    -s|--dashboard-schema)
+      dashboard_schema=$2; shift 2;;
+    -b|--gisaid-backup-dir)
+      gisaid_backup_dir=$2; shift 2;;
+    -o|--output-dir)
+      output_dir=$2; shift 2;;
+    -t|--trigger-bucket)
+      trigger_bucket=$2; shift 2;;
+    -g|--terra-gcp-uri)
+      terra_gcp_uri=$2; shift 2;;
+    -r|--terra-table-root-entity)
+      terra_table_root_entity=$2; shift 2;;
+    -p|--terra-project)
+      terra_project=$2; shift 2;;
+    -w|--terra-workspace)
+      terra_workspace=$2; shift 2;;
+    -q|--big-query-table-name)
+      big_query_table_name=$2; shift 2;;
+    -m|--metadata-parameters)
+      case "$2" in
+        "") metadata_cleanser_parameters=''; shift 2;;
+        *) metadata_cleanser_parameters=$2; shift 2;;
+      esac ;;
+    -i|--input-tar-file)
+      input_tar_file=$2; shift 2;;
+    --) shift; break ;;
+    *) echo "Unexpected option: $1 -- this should not happen."; exit 1;;
+  esac
+done
+
+### SET RE-USED FUNCTIONS
+
+# this function will make a directory if it does not already exist
+make_directory() {
+  if [ -e $1 ]; then
+    echo "Directory "$1" already exists"
+  else
+    mkdir -v $1
+  fi
+}
+
+### BEGIN DASHBOARD FUNCTION
+
+# Set date tag
+date_tag=$(date +"%Y-%m-%d-%Hh-%Mm-%Ss")
+
+# Create output subdirectories if they do not yet exist:
+make_directory ${output_dir}/automation_logs
+make_directory ${output_dir}/gisaid_processing
+make_directory ${output_dir}/backup_jsons
+
+# echo the variables that were provided
+echo -e "Dashboarding Automated System initiated at ${date_tag}\n" | tee ${output_dir}/automation_logs/dashboard-${date_tag}.log
+echo -e "Input variables:\ndashboard_gcp_uri: ${dashboard_gcp_uri},\ndashboard_newline_json: ${dashboard_newline_json},\ndashboard_bq_load_schema: ${dashboard_schema},\ngisaid_backup_dir: ${gisaid_backup_dir},\nmounted_output_dir: ${output_dir},\ntrigger_bucket_gcp_uri: ${trigger_bucket},\nterra_gcp_uri: ${terra_gcp_uri},\nterra_table_root_entity: ${terra_table_root_entity},\nterra_project: ${terra_project},\nterra_workspace: ${terra_workspace},\nbig_query_table_name: ${big_query_table_name}\n" >> ${output_dir}/automation_logs/dashboard-${date_tag}.log
+
+# take in file as input from trigger
+file=${trigger_bucket}/${input_tar_file}
+filename=${input_tar_file}
+
+# indicate that a file has been successfully passed to the script
+echo "The file '$filename' appeared in directory '$trigger_bucket'" >> ${output_dir}/automation_logs/dashboard-${date_tag}.log
+
+# copy the file to the gisaid_backup directory
+gsutil cp ${file} ${gisaid_backup_dir}/
+
+# if the created file is a gisaid_auspice input file, integrate into Terra and BQ
+if [[ "$file" == *"gisaid_auspice_input"*"tar" ]]; then
+  # indicate the new file is a gisaid file
+  echo -e "New gisaid file identified: $filename \n" >> ${output_dir}/automation_logs/dashboard-${date_tag}.log
+
+  # set up gisaid processing directory using the current date
+  gisaid_dir="${output_dir}/gisaid_processing/${date_tag}"
+
+  # run the following compilation of scripts:
+  SCRIPTS="
+  # decompress gisaid input tarball into specific date processing directory
+  \n
+  mkdir ${gisaid_dir}
+  \n
+  tar -xf ${gisaid_backup_dir}/$filename -C ${gisaid_dir}
+  \n
+  \n
+  # Create individual fasta files from GISAID multifasta
+  \n
+  python3 /scripts/gisaid_multifasta_parser.py ${gisaid_dir}/*.sequences.fasta ${gisaid_dir}
+  \n
+  \n
+  # Deposit individual fasta files into Terra GCP bucket
+  \n
+  gsutil -m cp ${gisaid_dir}/individual_gisaid_assemblies_$(date -I)/*.fasta ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_$(date -I)/
+  \n
+  \n
+  # Create and import Terra Data table containing GCP pointers to deposited assemblies
+  \n
+  /scripts/terra_table_from_gcp_assemblies.sh ${terra_gcp_uri}/uploads/gisaid_individual_assemblies_$(date -I) ${terra_project} ${terra_workspace} ${terra_table_root_entity} ${gisaid_dir} \".fasta\" $(date -I)
+  \n
+  \n
+  # Capture, reformat, and prune GISAID metadata
+  \n
+  python3 /scripts/gisaid_metadata_cleanser.py ${gisaid_dir}/*.metadata.tsv ${gisaid_dir}/gisaid_metadata_${date_tag}.tsv ${terra_table_root_entity} ${metadata_cleanser_parameters}
+  \n
+  \n
+  # Import formatted data table into Terra
+  \n
+  python3 /scripts/import_large_tsv/import_large_tsv.py --project ${terra_project} --workspace ${terra_workspace} --tsv ${gisaid_dir}/gisaid_metadata_${date_tag}.tsv
+  \n
+  \n
+  # Capture the entire Terra data table as a tsv
+  \n
+  python3 /scripts/export_large_tsv/export_large_tsv.py --project ${terra_project} --workspace ${terra_workspace} --entity_type ${terra_table_root_entity} --tsv_filename ${gisaid_dir}/full_${terra_table_root_entity}_terra_table_${date_tag}.tsv
+  \n
+  \n
+  # Convert the local Terra table tsv into a newline json
+  \n
+  python3 /scripts/tsv_to_newline_json.py ${gisaid_dir}/full_${terra_table_root_entity}_terra_table_${date_tag}.tsv ${gisaid_dir}/${terra_table_root_entity}_${date_tag}
+  \n
+  \n
+  # Push newline json to the dashboard GCP bucket and backup folder
+  \n
+  gsutil cp ${gisaid_dir}/${terra_table_root_entity}_${date_tag}.json ${dashboard_gcp_uri}/${terra_table_root_entity}.json
+  \n
+  gsutil cp ${gisaid_dir}/${terra_table_root_entity}_${date_tag}.json ${output_dir}/backup_jsons/
+  \n
+  \n
+  # Load newline json to Big Query
+  \n
+  bq load --ignore_unknown_values=true --replace=true --source_format=NEWLINE_DELIMITED_JSON ${big_query_table_name} ${dashboard_gcp_uri}/${terra_table_root_entity}.json ${dashboard_schema}
+  \n
+  \n
+  "
+  # write the commands that will be run to the automation log
+  echo -e "#### Capturing GISAID data into Dashboard (${date_tag}) ####\n" >> ${output_dir}/automation_logs/dashboard-${date_tag}.log
+  echo -e $SCRIPTS >> ${output_dir}/automation_logs/dashboard-${date_tag}.log
+
+  # run the scripts
+  echo -e $SCRIPTS | bash -x
+
+else
+  # display error message if the file is not a GISAID file
+  echo "The file was not recognized as a GISAID auspice tar file."
+fi
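The Batch job runs this script with the short flags built in louisiana-dashboard-workflow.yaml above. For reference, a hand-run equivalent would look like the sketch below (the tar filename is a placeholder, and the paths assume the processing bucket is mounted at /data as in the Batch job):

```bash
# Illustrative manual invocation mirroring the command assembled in the workflow YAML
bash /data/standard-dashboard.sh \
  -d gs://louisiana-dashboard-processing \
  -j gisaid_louisiana_data.json \
  -s /data/bq_schema/schema_LA_v6.json \
  -b /data/input_gisaid \
  -o /data \
  -t gs://louisiana-gisaid-data \
  -g gs://fc-6c0c9352-49f4-4673-a41c-71baddb16f42 \
  -r gisaid_louisiana_data \
  -p cdc-terra-la-phl \
  -w CDC-COVID-LA-Dashboard-Test \
  -q sars_cov_2_dashboard.workflow_test \
  -i gisaid_auspice_input_example.tar
```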
diff --git a/scripts/README.md b/scripts/README.md
index 660c8f4..be57350 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -35,3 +35,81 @@
 $ concatenate-across-lanes.sh . dry
 $ concatenate-across-lanes.sh . >concatenate-across-lanes.log 2>&1
 ```
+### gisaid_metadata_cleanser.py
+
+This python script will read in a tsv of sequence metadata, perform some reformatting and data sanitization, and then produce a tsv that can be uploaded to Terra.
+
+#### requirements
+Positional inputs required:
+ - tsv_meta_file (assumes a GISAID-generated tsv)
+ - out_file (output file name)
+ - table_name (the name of the terra table; do not include entity: or _id)
+
+Optional input parameters:
+ - `--puertorico`, which performs Puerto Rico-specific actions, like removing pango_lineage from the metadata and dropping all samples with PR-CVL in their name
+
+#### usage
+```bash
+$ python3 gisaid_metadata_cleanser.py <tsv_meta_file> <out_file> <table_name> [--puertorico]
+```
+
+### gisaid_multifasta_parser.py
+
+This python script will parse the multifasta file provided in the gisaid auspice tarball.
+
+#### requirements
+Two positional inputs required:
+ - gisaid_multifasta_file (the multifasta file from the auspice tarball)
+ - output_dir (the location of the output directory)
+
+#### usage
+```bash
+$ python3 gisaid_multifasta_parser.py <gisaid_multifasta_file> <output_dir>
+```
+
+
+### terra_table_from_gcp_assemblies.sh
+
+This shell script will create a Terra data table with sample names and GCP pointers to assemblies, and then import it to a Terra workspace.
+
+#### requirements
+Five positional arguments required:
+ - gcp_uri : the gcp_uri for the bucket containing the assembly files
+ - terra_project : the terra project that will host the incoming terra table
+ - terra_workspace : the terra workspace that will host the incoming terra table
+ - root_entity : the name of the terra table root entity; do not include entity: or _id
+ - output_dir : path to a local directory where a copy of the terra table will be saved
+
+Two optional arguments:
+ - alt_delimiter : filename delimiter used to pull the sample name from the file; an underscore is the default
+ - terra_upload_set : the name of the set applied to the data; by default the date is used
+
+#### usage
+```bash
+$ ./terra_table_from_gcp_assemblies.sh <gcp_uri> <terra_project> <terra_workspace> <root_entity> <output_dir> [alt_delimiter] [terra_upload_set]
+```
+
+### tsv_to_newline_json.py
+
+This python script converts a tsv file into a newline json.
+
+#### requirements
+Two positional inputs required:
+ - tsv_file : the input tsv file
+ - output_name : the name of the output file (do not include .json)
+
+#### usage
+```bash
+$ python3 tsv_to_newline_json.py <tsv_file> <output_name>
+```
+
+### standard-dashboard.sh
+
+This shell script performs all of the functions necessary to transform a given GISAID-generated auspice tarball into a Big Query upload.
+
+#### usage
+```bash
+# read the help message
+$ ./standard-dashboard.sh -h
+```
+
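tsv_to_newline_json.py produces the newline-delimited JSON that standard-dashboard.sh then hands to bq load. A compressed sketch of that hand-off, with placeholder file names and the table/schema names used in the examples above:

```bash
# Placeholder names; mirrors the tsv -> newline JSON -> BigQuery steps in standard-dashboard.sh
python3 tsv_to_newline_json.py full_gisaid_louisiana_data_terra_table.tsv gisaid_louisiana_data   # writes gisaid_louisiana_data.json
bq load --ignore_unknown_values=true --replace=true --source_format=NEWLINE_DELIMITED_JSON \
  sars_cov_2_dashboard.workflow_test gisaid_louisiana_data.json schema_LA_v6.json
```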
diff --git a/scripts/gisaid_metadata_cleanser.py b/scripts/gisaid_metadata_cleanser.py
index 8e7f199..70d3f3f 100755
--- a/scripts/gisaid_metadata_cleanser.py
+++ b/scripts/gisaid_metadata_cleanser.py
@@ -1,30 +1,38 @@
-#!/Users/frank/opt/anaconda3/bin/python
+#!/usr/bin/env python3
 
-# import sys
-# import csv
 import argparse
 import pandas as pd
+
 #argpase used to take in command line arguments
-# three positional arguments, argparse might be overkill, sys command included
 def get_opts():
-    p = argparse.ArgumentParser(description = 'This program reads in a tsv of sequence metadata and performs some reformatting and data sanitization then spits out a tsv to be uploaded to terra.bio', usage='[-h] metadata_cleanser.py ')
-    p.add_argument('csv_meta_file',
-                   help='tsv metadata file input')
-    p.add_argument('out_file',
-                   help='Output file: required, must be a string.')
-    args = p.parse_args()
-    return args
+    p = argparse.ArgumentParser(description = 'This program reads in a tsv of sequence metadata and performs some reformatting and data sanitization then spits out a tsv to be uploaded to terra.bio', usage='[-h] metadata_cleanser.py ')
+    p.add_argument('tsv_meta_file', help='tsv metadata file input')
+    p.add_argument('out_file', help='Output file: required, must be a string.')
+    p.add_argument('table_name', help='Terra table name: required, must be a string; do not include entity: or _id.')
+    p.add_argument('--puertorico', action='store_true', help='Perform Puerto Rico-specific actions')
+    args = p.parse_args()
+    return args
 
 arguments = get_opts()
 
 # read in metadata csv file
-meta_csv1 = arguments.csv_meta_file
+meta_csv1 = arguments.tsv_meta_file
 meta_df1 = pd.read_csv(meta_csv1, delimiter='\t', dtype={'strain': str, 'age': str})
+
+table_name = "entity:" + arguments.table_name + "_id"
+
 # input_headers = meta_df1.columns.values
-output_headers = ['entity:gisaid_louisiana_data_id', 'age', 'authors', 'country', 'country_exposure', 'date', 'date_submitted', 'division', 'division_exposure', 'GISAID_clade', 'gisaid_epi_isl', 'host', 'location', 'originating_lab', 'pangolin_lineage', 'region', 'region_exposure', 'segment', 'sex', 'submitting_lab', 'url', 'virus', 'gisaid_accession', 'nextclade_clade', 'gisaid_clade']
+output_headers = [table_name, 'age', 'authors', 'country', 'country_exposure', 'date_submitted', 'division', 'division_exposure', 'GISAID_clade', 'gisaid_epi_isl', 'host', 'location', 'originating_lab', 'pango_lineage', 'region', 'region_exposure', 'segment', 'sex', 'submitting_lab', 'url', 'virus', 'gisaid_accession', 'nextclade_clade', 'gisaid_clade', 'county', 'collection_date']
 
 # rename headers
-meta_df1.rename(columns={'strain': 'entity:gisaid_louisiana_data_id', 'GISAID_accession': 'gisaid_accession', 'Nextstrain_clade': 'nextclade_clade', 'vendor': 'sequencing_lab', 'zip': 'county', 'GISAID_clade': 'gisaid_clade', 'pangolin_lineage': 'pango_lineage'}, inplace=True)
+meta_df1.rename(columns={'strain': table_name, 'gisaid_epi_isl': 'gisaid_accession', 'Nextstrain_clade': 'nextclade_clade', 'vendor': 'sequencing_lab', 'location': 'county', 'GISAID_clade': 'gisaid_clade', 'pangolin_lineage': 'pango_lineage', 'date': 'collection_date'}, inplace=True)
+
+# perform PR specific actions:
+if arguments.puertorico:
+    # drop pangolin lineage column
+    meta_df1.drop('pango_lineage', axis='columns', inplace=True)
+    # remove any samples uploaded by PR
+    meta_df1 = meta_df1[~meta_df1[table_name].str.contains("PR-CVL")]
 
 # drop extraneous cols
 drop_list = []
@@ -40,7 +48,8 @@ def get_opts():
 meta_df1.replace("\n", value=' ', regex=True, inplace=True)
 
 # replace all forward slashes in first with underscores
-meta_df1['entity:gisaid_louisiana_data_id'].replace('/', value='_', regex=True, inplace=True)
+meta_df1[table_name].replace('/', value='_', regex=True, inplace=True)
+meta_df1[table_name].replace('\|', value='_', regex=True, inplace=True) # prevent accidental piping
 
 # replace all commas with spaces
 meta_df1.replace(',', value=' ', regex=True, inplace=True)
@@ -49,7 +58,7 @@ def get_opts():
 meta_df1.replace('Unknown', value='unknown', regex=True, inplace=True)
 
 # replace all '_' with '-' in collection date cols
-meta_df1['date'].replace('_', value='-', regex=True, inplace=True)
+meta_df1['collection_date'].replace('_', value='-', regex=True, inplace=True)
 meta_df1['date_submitted'].replace('_', value='-', regex=True, inplace=True)
 
 
@@ -63,7 +72,7 @@ def get_opts():
 meta_df1['age'].replace(age_range_replace_dict, inplace=True)
 
 # replace all NA values with numerical value 151
-meta_df1['age'] =pd.to_numeric(meta_df1['age'], errors ='coerce').fillna(151).astype('int')
+meta_df1['age'] = pd.to_numeric(meta_df1['age'], errors ='coerce').fillna(151).astype('int')
 
 # set bin boundaries
 bins1 = [0, 4, 17, 49, 64, 123, 1000000]
@@ -74,14 +83,14 @@ def get_opts():
 # perform binning
 meta_df1['age_bins'] = pd.cut(x=meta_df1['age'], bins=bins1, labels=labels1, include_lowest=True)
 
-# replace all values >122 with unknown
-meta_df1['age'].replace(122, 'unknown', inplace=True)
+# replace all values >151 with unknown
+meta_df1['age'].replace(151, 'unknown', inplace=True)
 
 # replace all NA values with unknown
 meta_df1['age_bins'] = meta_df1['age_bins'].fillna('unknown')
 
 # remove duplicate lines, keeping the first values
-meta_df1.drop_duplicates(subset='entity:gisaid_louisiana_data_id', keep='first', inplace=True)
+meta_df1.drop_duplicates(subset=table_name, keep='first', inplace=True)
 
 # Get outfile name
 out_file_name = arguments.out_file
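With the new third positional argument and the optional flag, the cleanser's command line now looks like the sketch below (filenames and the second table name are placeholders; the --puertorico branch additionally drops pango_lineage and any PR-CVL samples):

```bash
# Hypothetical invocations of the updated CLI
python3 gisaid_metadata_cleanser.py gisaid.metadata.tsv gisaid_metadata_cleaned.tsv gisaid_louisiana_data
python3 gisaid_metadata_cleanser.py gisaid.metadata.tsv gisaid_metadata_cleaned.tsv gisaid_pr_data --puertorico
```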
diff --git a/scripts/gisaid_multifasta_parser.py b/scripts/gisaid_multifasta_parser.py
index 4644f50..3ddb5b3 100755
--- a/scripts/gisaid_multifasta_parser.py
+++ b/scripts/gisaid_multifasta_parser.py
@@ -1,25 +1,25 @@
 #!/usr/bin/env python3
-
 import argparse
 import pyfaidx
 import time
 import os
 import sys
 
-# three positional inputs
-
+# two positional inputs
 def get_opts():
-    p = argparse.ArgumentParser(description = 'This program will parse the multifasta file provided in the gisaid tarball download of augur input files', usage='[-h] gisaid_multifasta_parser.py ')
+    p = argparse.ArgumentParser(description = 'This program will parse the multifasta file provided in the gisaid tarball download of augur input files', usage='[-h] gisaid_multifasta_parser.py ')
     p.add_argument('gisaid_multifasta_file',
                    help='multifasta input file: Enter a multifasta file containing DNA sequence.')
-    p.add_argument('output_dir_input',
+    p.add_argument('output_dir',
                    help='Location of output directory.')
     args = p.parse_args()
     return args
 
 arguments = get_opts()
+
 fasta1 = arguments.gisaid_multifasta_file
-output_dir_loc = arguments.output_dir_input
+output_dir_loc = arguments.output_dir
+
 # use pyfaidx to read in the fasta file to create a dictionary-like object and in the event of a duplicate sequence keey only take the first entry💪💪💪
 seqs1 = pyfaidx.Fasta(fasta1, duplicate_action="first")
 
@@ -34,6 +34,7 @@ def get_opts():
 no_slashes_seq_names_list = []
 for i in original_seq_names_list:
     j = i.replace('/','_')
+    j = j.replace('|',"_") # to prevent accidental piping
     no_slashes_seq_names_list.append(j)
 
 # zip sequences and new slashless names into dicitonary
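standard-dashboard.sh points the parser at the extracted *.sequences.fasta and at the dated processing directory, then uploads from an individual_gisaid_assemblies_$(date -I) folder it expects the parser to create there. A sketch with placeholder paths:

```bash
# Hypothetical run; per-sample fastas are expected under <output_dir>/individual_gisaid_assemblies_$(date -I)/
python3 gisaid_multifasta_parser.py 2023-02-13.sequences.fasta ./gisaid_processing/2023-02-13
```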
diff --git a/scripts/terra_table_from_gcp_assemblies.sh b/scripts/terra_table_from_gcp_assemblies.sh
index f9bb44b..f7c007d 100755
--- a/scripts/terra_table_from_gcp_assemblies.sh
+++ b/scripts/terra_table_from_gcp_assemblies.sh
@@ -5,19 +5,16 @@ Will create a terra data table with samplenames and gcp pointers to assemblies b
 
 For the Terra table to properly import into the user-defined workspace, gcloud authentication is required.
 
-
 Five positional arguments required, two optional arguments:
-
 terra_table_from_gcp_assemblies.sh {gcp_uri} {terra_project} {terra_workspace} {root_entity} {output_dir} {alt_delimiter}
 
-- {gcp_uri}: gcp_uri for the bucket containing assembly files; gcp_uri must end in foward slash, e.g. \"gs://my_gcp_bucket/\"
+- {gcp_uri}: gcp_uri for the bucket containing assembly files; gcp_uri cannot end in forward slash, e.g. \"gs://my_gcp_bucket\"
 - {terra_project}: terra project that will host the imported terra data table
 - {terra_workspace}: terra workspace taht will host the imported terra data table
 - {root_entity}: name of terra table root entity; root_entity should not contain the \"entity:\" prefix nor the \"_id\" suffix
 - {output_dir}: path to local directory to save a copy of the terra data table
 - {alt_delimiter}:(OPTIONAL) filename delimiter to pull sample name from file; if no alt_delimiter is provided, an underscore (\"_\") will be utilized
 - {terra_upload_set}: (OPTIONAL) name of the set which is applied in a third column called 'set' e.g. '2022-02-09-set' will be applied to all samples.
-
 "
 
 # If the user invokes the script with -h or any command line arguments, print some help.
@@ -50,7 +47,7 @@ fi
 date_tag=$(date +"%Y-%m-%d-%Hh-%Mm-%Ss")
 
 # Capture samplenames from existing assembleis in given gcp_uri
-assembly_files=$(gsutil ls ${gcp_uri}*.fasta | awk -F'/' '{ print $NF }')
+assembly_files=$(gsutil ls ${gcp_uri}/*.fasta | awk -F'/' '{ print $NF }')
 
 # Create Terra table with gcp pointers
 echo -e "entity:${root_entity}_id\tassembly_fasta\tterra_upload_set" > ${output_dir}/assembly_terra_table_${date_tag}.tsv
@@ -59,15 +56,12 @@ for assembly in $assembly_files; do
   # capture samplename from assembly filename
   samplename=$(echo ${assembly} | awk -F"${alt_delimiter}|.fasta" '{ print $1 }')
   # write samplename, gcp pointer, and terra_upload_set to terra data table
-  echo -e "${samplename}\t${gcp_uri}${assembly}\t${terra_upload_set}" >> ${output_dir}/assembly_terra_table_${date_tag}.tsv
+  echo -e "${samplename}\t${gcp_uri}/${assembly}\t${terra_upload_set}" >> ${output_dir}/assembly_terra_table_${date_tag}.tsv
 done
 
 # remove duplicates from tsv if samplename not unique
 awk '!a[$1]++' ${output_dir}/assembly_terra_table_${date_tag}.tsv > temp.tsv && mv temp.tsv ${output_dir}/assembly_terra_table_${date_tag}.tsv
 
 # Import Terra table to sepcified terra_workspace
-docker run --rm -v "$HOME"/.config:/.config -v ${output_dir}:/data broadinstitute/terra-tools:tqdm bash -c "cd data; python3 /scripts/import_large_tsv/import_large_tsv.py --project ${terra_project} --workspace ${terra_workspace} --tsv /data/assembly_terra_table_${date_tag}.tsv"
-
-echo "DONE"
-date
+python3 /scripts/import_large_tsv/import_large_tsv.py --project ${terra_project} --workspace ${terra_workspace} --tsv ${output_dir}/assembly_terra_table_${date_tag}.tsv
 
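With the docker wrapper removed, the script now calls import_large_tsv.py directly, so it has to run somewhere that path exists (for example, inside the terra-tools image used by the Batch job). An illustrative invocation with placeholder values, following the argument order documented in the help text and the way standard-dashboard.sh calls it:

```bash
# Hypothetical example; five positional arguments plus the two optional ones
./terra_table_from_gcp_assemblies.sh \
  gs://my_gcp_bucket/uploads/gisaid_individual_assemblies_2023-02-13 \
  cdc-terra-la-phl \
  CDC-COVID-LA-Dashboard-Test \
  gisaid_louisiana_data \
  ./gisaid_processing/2023-02-13 \
  ".fasta" \
  2023-02-13
```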