Skip to content

Commit

Permalink
changes to dorado basecall task: added logic for selecting model to b…
Browse files Browse the repository at this point in the history
…e used at runtime; improved logging of dorado STDERR to a file; parsed explicit model name from STDERR file or accept user input string; added dorado_log task output file
  • Loading branch information
kapsakcj committed Nov 18, 2024
1 parent dec6ef8 commit 3a6488b
Showing 1 changed file with 29 additions and 17 deletions.
46 changes: 29 additions & 17 deletions tasks/basecalling/task_dorado_basecall.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -17,54 +17,66 @@ task basecall {
# Capture Dorado version and log it
dorado --version > DORADO_VERSION 2>&1
echo "Captured Dorado version:" $(cat DORADO_VERSION)

# Define the model to use, substituting "sup" with the full model name if given
resolved_model="~{dorado_model}"
if [ "$resolved_model" = "sup" ]; then
resolved_model="[email protected]"

# Decide what to hand to `dorado basecaller` as its model argument:
#   - fast / hac / sup: pass the keyword through unchanged
#   - anything else (an explicit model name, for example "[email protected]"):
#     prefix it with the hardcoded directory in the container filesystem that has the dorado models
case "~{dorado_model}" in
  fast|hac|sup)
    dorado_model_variable="~{dorado_model}"
    ;;
  *)
    dorado_model_variable="/dorado_models/~{dorado_model}"
    ;;
esac
echo "DEBUG: dorado_model_variable is set to: $dorado_model_variable"

# Log the resolved model name
echo "Using Dorado model: $resolved_model"
echo "$resolved_model" > "DORADO_MODEL"

# Define a log file path to capture output
log_file="dorado_basecall.log"

# Create a unique output directory for each scatter job
base_name=$(basename "~{input_file}" .pod5)
sam_output="output/sam_${base_name}/"
mkdir -p "$sam_output"

echo "### Starting basecalling for ~{input_file} ###" | tee -a "$log_file"
echo "### Starting basecalling for ~{input_file} ###" | tee -a dorado_basecall.log

# Set SAM file path with unique naming based on POD5 basename
sam_file="$sam_output/${base_name}.sam"

echo "Processing ~{input_file}, expected output: $sam_file" | tee -a "$log_file"
echo "Processing ~{input_file}, expected output: $sam_file" | tee -a dorado_basecall.log

# Run Dorado basecaller and log output
# The "2> >(tee -a dorado_basecall.log >&2)" part redirects STDERR to the screen AND appends it to the dorado_basecall.log file.
# This is useful for troubleshooting in Terra and for parsing the log for important information.
dorado basecaller \
"~{dorado_model}" \
"${dorado_model_variable}" \
"~{input_file}" \
--kit-name ~{kit_name} \
--emit-sam \
--no-trim \
--output-dir "$sam_output" \
--verbose | tee -a "$log_file" || { echo "ERROR: Dorado basecaller failed for ~{input_file}"; exit 1; }
--verbose 2> >(tee -a dorado_basecall.log >&2) || { echo "ERROR: Dorado basecaller failed for ~{input_file}"; exit 1; }

# Log the resolved model name by writing it to the DORADO_MODEL file.
#   - fast / hac / sup: take the explicit model name from the first 'downloading'
#     line in dorado_basecall.log
#   - explicit model name: write the user-provided string as-is
# NOTE: each comparison needs its own [ ... ] test. Writing `||` inside a single
# `[ ... ]` makes the shell split the line into separate commands, so the
# condition always fails and the log is never parsed.
echo "DEBUG: Parsing model name from dorado log or capturing string from user input..."
if [ "~{dorado_model}" == "fast" ] || [ "~{dorado_model}" == "hac" ] || [ "~{dorado_model}" == "sup" ]; then
  echo "DEBUG: User provided either fast, hac, or sup as input for dorado_model variable, parsing log for explicit model name now..."
  # Keep only the text between "downloading " and " with" on the first matching line; drop the trailing newline
  grep -m 1 'downloading' dorado_basecall.log | sed -e 's/.*downloading //' -e 's/ with.*//' | tr -d '\n' | tee DORADO_MODEL

# (else) if user provides explicit model name, just output that string, no parsing involved
else
  echo "~{dorado_model}" | tee DORADO_MODEL
fi

# Rename the generated SAM file to the unique name based on input_file
generated_sam=$(find "$sam_output" -name "*.sam" | head -n 1)
mv "$generated_sam" "$sam_file"

echo "Basecalling completed for ~{input_file}. SAM file renamed to: $sam_file" | tee -a "$log_file"
echo "Basecalling completed for ~{input_file}. SAM file renamed to: $sam_file" | tee -a "dorado_basecall.log"
>>>

output {
Array[File] sam_files = glob("output/sam_*/*.sam")
String dorado_docker = docker
String dorado_version = read_string("DORADO_VERSION")
String dorado_model_used = read_string("DORADO_MODEL")
# keeping this dorado_log just for debugging purposes, not a wf output
File dorado_log = "dorado_basecall.log"
}

runtime {
Expand Down

0 comments on commit 3a6488b

Please sign in to comment.