Merge pull request #3 from DOH-HNH0303/jdj_dev

summary now functional
DOH-HNH0303 · Aug 21, 2024 · 006ac28 · 006ac28
2 parents 997a652 + 65cef8c
commit 006ac28
Show file tree

Hide file tree

Showing 6 changed files with 81 additions and 28 deletions.
diff --git a/bin/plot.R b/bin/plot.R
@@ -1,13 +1,10 @@
+#!/usr/bin/env Rscript
+
+
+install.packages("argparser")
 library(tidyverse)
-library(broom)
-library(ggplot2)
-library("magrittr")
-library(dplyr)
-library(readr)
-library(tidyr)
 library(argparser)
 
-
 p <- arg_parser("create visuals for Varcraft summary")
 # Add command line arguments
 
@@ -27,9 +24,9 @@ step<-argv$step
 df <- read_csv("summary.csv") %>%
   mutate(y = 100*n/length)
 
-cash <- read_tsv("results.txt",col_names = c("name", "year_a", "ps"))
+#cash <- read_tsv("results.txt",col_names = c("name", "year_a", "ps"))
 
-mash <- read_tsv("results.txt", col_names = c("sample", "mock", "true_ani")) %>%
+mash <- read_tsv("mash.txt", col_names = c("sample", "mock", "true_ani")) %>%
   mutate(x=100*(1-true_ani)) %>%
   select(sample, mock, true_ani, x) #list column name to select
 library(stringr)
@@ -66,7 +63,6 @@ mash <- mash %>%
 
 df <- merge(df, mash, by=c("sample","ani","rep")) %>% unique()
 
-
 p <- ggplot(df, aes(x=100*ani, y=true_ani))+
   geom_point()+
   labs(x="Predicted ANI (Mash)", y = "Actual ANI")+

diff --git a/bin/summary.R b/bin/summary.R
@@ -0,0 +1,63 @@
+#!/usr/bin/env Rscript
+
+library(tidyverse)
+library(broom)
+
+# load dataframe
+df <- read_csv("summary.csv") %>%
+  mutate(ani = 100*ani,
+         y = 100*n/length)
+
+mash <- read_tsv("mash.txt", col_names = c("sample", "mock", "true_ani")) %>%
+  separate_wider_delim(cols = c("mock"), delim = '_', names = c("id","ani","rep")) %>%
+  mutate(true_ani = as.numeric(true_ani),
+         sample = str_remove_all(id, pattern = '.ani'),
+         ani = str_remove_all(ani, pattern = '.rep'),
+         ani = 100*as.numeric(str_replace_all(ani, pattern = '-', replacement = '.')),
+         rep = str_remove_all(rep, pattern = ".fasta")
+         ) %>%
+  select(sample, ani, rep, true_ani)
+
+df <- merge(df, mash, by=c("sample","ani","rep"))
+
+p <- ggplot(df, aes(x=ani, y=true_ani))+
+  geom_point()+
+  labs(x="Predicted ANI (Mash)", y = "Actual ANI")+
+  theme_bw()
+ggsave(plot = p, filename = "pred-vs-ani_act.jpg", dpi = 300, width = 5, height = 4)
+# 
+p <- df %>%
+  mutate(diff = ani-true_ani) %>%
+  ggplot(aes(x=ani, y=diff))+
+  geom_point()+
+  labs(x="Predicted ANI", y = "Difference (Actual ANI - Predicted ANI)")+
+  theme_bw()
+ggsave(plot = p, filename = "ani_diff.jpg", dpi = 300, width = 5, height = 4)
+
+# https://douglas-watson.github.io/post/2018-09_exponential_curve_fitting/
+fit <- nls(y ~ SSasymp(true_ani, yf, y0, log_alpha), data = df)
+sink("summary.nls.txt")
+summary(fit)
+sink()
+
+df.fitted <- augment(fit, newdata = data.frame(true_ani=seq(from=min(df$ani), to=max(df$ani), by = 0.0001)))
+write.csv(x=df.fitted, file="df_fitted.csv", quote=F, row.names=F)
+
+threshold <- df.fitted %>%
+  filter( abs(`.fitted`) == min(abs(`.fitted`))) %>%
+  filter( true_ani == max(true_ani)) %>%
+  .$true_ani %>%
+  as.character()
+writeLines(threshold, "threshold.txt")
+# output funtion from r script
+
+# Threshold @ 0: 96.0936
+cat(paste0("Threshold @ 0: ",threshold, sep = "\n"))
+
+p <- ggplot(data = augment(fit), aes(x=as.numeric(true_ani), y=as.numeric(y)))+
+  geom_point()+
+  geom_line(data = df.fitted, aes(x=as.numeric(true_ani), y=as.numeric(.fitted)))+
+  geom_vline(xintercept = as.numeric(threshold), linetype="dashed", color = "red")+
+  theme_bw()+
+  labs(x="Sample-Reference % ANI", y = "% Ambiguous Bases (N) in Consensus")
+ggsave(plot = p, filename = "nls.jpg", dpi = 300, width = 5, height = 4)
diff --git a/conf/modules.config b/conf/modules.config
@@ -51,8 +51,7 @@ process {
         stageInMode = 'copy'
         publishDir = [
             [
-                path: { "${params.outdir}/" },
-                pattern: "none",
+                path: { "${params.outdir}/summary/" },
                 mode: 'copy'
             ],
         ]

diff --git a/modules/local/summary.nf b/modules/local/summary.nf
@@ -1,27 +1,23 @@
 process SUMMARY {
     label 'process_medium'
 
-    container 'docker.io/hnh0303/varcraft-tool:1.0'
+    container 'docker.io/jdj0303/epitome-base:1.0'
 
     input:
-    tuple val(sample), path(assemblies), path(ref), path(summary)
-    //path assemblies
-    //val sample
-    //path summary
+    path assemblies
+    path mash
 
     output:
-    path "${sample}_summary.csv", emit: summary
+    path "df_fitted.csv", emit: df_fitted
+    path "threshold.txt", emit: threshold
+    path "*.jpg",         emit: plots
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     """
-    cat ${summary}
-    ls ${assemblies} | wc -l > assembly_count.txt
-    cat ${assemblies} > all.fa
-    echo ${sample}
-    summary.sh all.fa
-    mv summary.csv ${sample}_summary.csv
+    summary.sh ${assemblies}
+    summary.R
     """
 }
diff --git a/nextflow.config b/nextflow.config
@@ -14,7 +14,7 @@ params {
     input                      = null
     max_ani                    = 0.2
     min_ani                    = 0
-    step                       = 0.1
+    step                       = 0.05
     rep                        = 1
 
     // References

diff --git a/workflows/varcraft.nf b/workflows/varcraft.nf
@@ -162,9 +162,8 @@ workflow VARCRAFT {
     IVAR_CONSENSUS.out.consensus.collect().view()
     SUMMARY (
 
-        IVAR_CONSENSUS.out.as_sample//.collect()
-        .combine(manifest.map{ sample, assembly, fastq_1, fastq_2 -> [ sample, assembly ] })
-        .combine(MASH.out.mash_summary).collect().view()
+        IVAR_CONSENSUS.out.consensus.splitFasta().collectFile(name: 'all.fa'),
+        MASH.out.mash_summary.map{ sample, summary -> summary }.splitText().collectFile(name: 'mash.txt')
 
 
     )