Skip to content

Commit

Permalink
Merge pull request #2 from DOH-JDJ0303/norm_ref
Browse files Browse the repository at this point in the history
Norm ref
  • Loading branch information
DOH-JDJ0303 authored Mar 1, 2024
2 parents 25120a0 + 0135479 commit a689514
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 26 deletions.
77 changes: 58 additions & 19 deletions bin/ref-select_accurate.R
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
#!/usr/bin/env Rscript

# check for required packages
list.of.packages <- c("readr", "dplyr","tidyr")
list.of.packages <- c("readr", "dplyr","tidyr","stringr","patchwork","ggplot2")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# load packages
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(patchwork)

# load args
args <- commandArgs(trailingOnly=T)
Expand All @@ -17,37 +20,73 @@ refs_comp <- args[2]
sample <- args[3]
gen_frac <- args[4]


# load alignment file
paf <- read_tsv(paf_file, col_names = F) %>%
rename(CONTIG = 6,
rename(TARGET = 6,
LENGTH = 7,
ALIGN = 11) %>%
mutate(ASSEMBLY=gsub(CONTIG, pattern="[0-9]$",replacement=""))
START = 8,
END = 9) %>%
mutate(ASSEMBLY=str_remove_all(TARGET, pattern="(\\d+$)"),
CONTIG = str_extract(TARGET, "(\\d+$)")) %>%
select(TARGET, ASSEMBLY, CONTIG, LENGTH, START, END)

# load reference contig lengths
contig_lengths <- read_tsv(refs_comp) %>%
rename(CONTIG = 1,
# determine missing targets & add back to paf
paf <- read_tsv(refs_comp) %>%
rename(TARGET = 1,
LENGTH = 2) %>%
mutate(ASSEMBLY=gsub(CONTIG, pattern="[0-9]$",replacement="")) %>%
group_by(ASSEMBLY) %>%
summarise(TOT_LENGTH = sum(LENGTH))
mutate(ASSEMBLY=str_remove_all(TARGET, pattern="(\\d+$)"),
CONTIG = str_extract(TARGET, "(\\d+$)"),
START = 0,
END = 0,
ALIGN = 0) %>%
select(TARGET, ASSEMBLY, CONTIG, LENGTH, START, END) %>%
filter(!(TARGET %in% paf$TARGET)) %>%
rbind(paf)

# get alignment depth across each target
expand_align <- function(index){
subdf <- paf[index,]
rslt <- data.frame("ASSEMBLY" = subdf$ASSEMBLY, "CONTIG" = subdf$CONTIG, SITE = subdf$START:subdf$END, "LENGTH" = subdf$LENGTH )
return(rslt)
}
depth <- do.call(rbind, lapply(1:nrow(paf), FUN = expand_align)) %>%
group_by(ASSEMBLY, CONTIG, SITE, LENGTH) %>%
count()

# make alignment plots
make_plot <- function(a){
subdf <- depth %>% filter(ASSEMBLY == a)
xscales <- subdf %>% select(ASSEMBLY, CONTIG, LENGTH) %>% unique()
p <- ggplot()+
geom_histogram(data = subdf, aes(x = SITE, y = n), stat = "identity", color = "black")+
geom_segment(data = xscales, aes(x = 0, xend = LENGTH, y = 0, yend = 0))+
facet_grid(ASSEMBLY~CONTIG, scales = "free")+
theme_bw()+
theme(strip.text.y = element_text(angle = 0))+
labs(x = "Site", y = "Depth")
return(p)
}
plots <- lapply(unique(depth$ASSEMBLY), FUN = make_plot)
p <- wrap_plots(plots, ncol = 1, nrow = length(unique(depth$ASSEMBLY)))
ggsave(plot = p, file = paste0(sample,".ref-genfrac.jpg"), dpi = 300, width = 50, height = 50, limitsize = F)

# sum alignments
align.summary <- paf %>%
# set max depth to 1 & determine genome fraction
ref.summary <- depth %>%
mutate(n_adj = case_when(n > 0 ~ 1,
TRUE ~ 0)) %>%
group_by(ASSEMBLY, LENGTH) %>%
summarize(ALIGN = sum(n_adj)-1) %>%
group_by(ASSEMBLY) %>%
summarise(TOT_ALIGN = sum(ALIGN)) %>%
merge(contig_lengths, by = "ASSEMBLY") %>%
mutate(GENOME_FRAC = TOT_ALIGN / TOT_LENGTH)
summarize(GENOME_FRAC = sum(ALIGN) / sum(LENGTH))

# select references
ref.list <- align.summary %>%
subset(GENOME_FRAC > gen_frac) %>%
ref.list <- ref.summary %>%
subset(as.numeric(GENOME_FRAC) > as.numeric(gen_frac)) %>%
select(ASSEMBLY)
if(nrow(ref.list) == 0){
ref.list <- data.frame(ASSEMBLY = "none_selected")
}

# write outputs
write.table(x= align.summary, file = paste0(sample,".ref-summary.csv"), quote = F, sep = ",", row.names = F)
write.table(x= ref.summary, file = paste0(sample,".ref-summary.csv"), quote = F, sep = ",", row.names = F)
write.table(x= ref.list, file = paste0(sample,".ref-list.csv"), quote = F, sep = ",", row.names = F, col.names = F)
14 changes: 10 additions & 4 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,16 @@ process {
ext.args = ""
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/${meta.id}/taxonomy/" },
pattern: "*ref-summary.csv"
[
path: { "${params.outdir}/${meta.id}/taxonomy/" },
pattern: "*ref-summary.csv",
mode: 'copy'
],
[
path: { "${params.outdir}/${meta.id}/taxonomy" },
pattern: "*.jpg",
mode: 'copy'
]
]
}
withName: 'BWA_MEM' {
Expand Down
8 changes: 5 additions & 3 deletions modules/local/summarize_taxa.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@ process SUMMARIZE_TAXA {
path refs_comp

output:
tuple val(meta), path("*.ref-summary.csv"), emit: ref_summary, optional: true
tuple val(meta), path("*.ref-list.csv"), emit: ref_list
tuple val(meta), path("*.taxa-summary.csv"), emit: sm_summary
tuple val(meta), path("*.ref-summary.csv"), emit: ref_summary, optional: true
tuple val(meta), path("*.ref-list.csv"), emit: ref_list
tuple val(meta), path("*.taxa-summary.csv"), emit: sm_summary
tuple val(meta), path("*.jpg"), emit: cov_plot, optional: true


when:
task.ext.when == null || task.ext.when
Expand Down

0 comments on commit a689514

Please sign in to comment.