Skip to content

Commit

Permalink
condense only occurs when more than 1 reference
Browse files Browse the repository at this point in the history
  • Loading branch information
DOH-JDJ0303 committed May 22, 2024
1 parent 3d52bc1 commit 3f73426
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 36 deletions.
80 changes: 45 additions & 35 deletions bin/condense.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,43 +35,53 @@ clusters.df <- read_csv(clusters_path) %>%
#---- LOAD SEQ LENGTHS ----#
len.df <- read_csv(lengths_path, col_names = c("seq","length"))

#---- LOAD PAIRWISE DISTANCES ----#
dist.df <- read_tsv(dist_path, col_names = c("ID1","ID2","DIST", "PVAL", "HASH")) %>%
select(ID1, ID2, DIST)
dist.mat <- dist.df %>%
pivot_wider(names_from="ID2", values_from="DIST") %>%
column_to_rownames(var="ID1") %>%
as.matrix() %>%
as.dist()
#---- CONDENSE CLOSELY RELATED REFERENCES ----#
# This is only performed when two or more references were generated
if(nrow(clusters.df) > 1){
#---- LOAD PAIRWISE DISTANCES ----#
dist.df <- read_tsv(dist_path, col_names = c("ID1","ID2","DIST", "PVAL", "HASH")) %>%
select(ID1, ID2, DIST)
dist.mat <- dist.df %>%
pivot_wider(names_from="ID2", values_from="DIST") %>%
column_to_rownames(var="ID1") %>%
as.matrix() %>%
as.dist()

#---- HIERARCHICAL CLUSTER ----#
# create tree
tree <- hclust(dist.mat, method = "complete") %>%
as.phylo()
# test that tree is ultrametric and rooted (required for cutree)
if(! is.ultrametric(tree)){
cat("Error: Tree is not ultrameric!")
q()
}
if(! is.rooted(tree)){
cat("Error: Tree is not rooted!")
q()
}

#---- CUT DENDROGRAM AT DISTANCE THRESHOLD ----#
clusters.refs.df <- cutree(as.hclust(tree), h = as.numeric(threshold)) %>%
data.frame() %>%
rownames_to_column(var = "seq") %>%
rename(cluster = 2) %>%
left_join(clusters.df, by = "seq") %>%
left_join(len.df, by = "seq") %>%
group_by(cluster) %>%
mutate(n2 = n()) %>%
filter(!(n < 10 & n2 > 1)) %>%
filter(length == max(length)) %>%
ungroup() %>%
select(seq,taxa,segment,cluster,n,n2,length)
#---- HIERARCHICAL CLUSTER ----#
# create tree
tree <- hclust(dist.mat, method = "complete") %>%
as.phylo()
# test that tree is ultrametric and rooted (required for cutree)
if(! is.ultrametric(tree)){
cat("Error: Tree is not ultrameric!")
q()
}
if(! is.rooted(tree)){
cat("Error: Tree is not rooted!")
q()
}

#---- CUT DENDROGRAM AT DISTANCE THRESHOLD ----#
clusters.refs.df <- cutree(as.hclust(tree), h = as.numeric(threshold)) %>%
data.frame() %>%
rownames_to_column(var = "seq") %>%
rename(cluster = 2) %>%
left_join(clusters.df, by = "seq") %>%
left_join(len.df, by = "seq") %>%
group_by(cluster) %>%
mutate(n2 = n()) %>%
filter(!(n < 10 & n2 > 1)) %>%
filter(length == max(length)) %>%
ungroup() %>%
select(seq,taxa,segment,cluster,n,n2,length)
}else{
clusters.refs.df <- clusters.df %>%
left_join(len.df, by = "seq") %>%
mutate(cluster = 1,
n2 = 1) %>%
select(seq,taxa,segment,cluster,n,n2,length)
}
#----- SAVE OUTPUT -----#
write.csv(x= clusters.refs.df, file = paste0(file.base,".condensed.csv"), quote = F, row.names = F)


Expand Down
4 changes: 3 additions & 1 deletion modules/local/condense.nf
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ process CONDENSE {
do
mv \${s}.fa.gz tmp/
done
rm *.fa.gz && mv tmp/*.fa.gz ./ && rm -r tmp
rm *.fa.gz || true
mv tmp/*.fa.gz ./ || true
rm -r tmp || true
# get min seq length - for FastANI
min_len=\$(cat ${prefix}.condensed.csv | cut -f 7 -d ',' | tail -n +2 | sort -n | sed -n 1p)
Expand Down

0 comments on commit 3f73426

Please sign in to comment.