forked from kapsakcj/centroid
-
Notifications
You must be signed in to change notification settings - Fork 0
/
centroid.py
96 lines (85 loc) · 3.04 KB
/
centroid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python3
#Author: Rich Stanton
#Author email: [email protected]
#super incredibly minor modifications by: Rachael St. Jacques
#simm email: [email protected]
#description: selects optimal reference genome given a set of fasta files
#Requires: mash
#Usage: python centroid.py /path/to/assemblies/
VERSION = "0.1.0"
import sys
import glob
import numpy
import operator
from operator import itemgetter
import subprocess
import argparse
def Mash_List(Mash_Index):
"""Takes in an index of Mash files and makes a list of the info from each"""
List1 = glob.glob(Mash_Index)
Combined = []
for files in List1:
f = open(files, 'r')
String1 = f.readline()
Combined.append(String1)
f.close()
Combined.sort()
return Combined
def Average_Mash(input_mash_list):
"""Takes in a Mash_List (made using Mash_List) and then makes a list with the average value for each entry"""
Average_List = []
List1 = input_mash_list[0].split('\t'.encode())
Entry = List1[0]
values = []
for entries in input_mash_list:
List1 = entries.split('\t'.encode())
if List1[0] != Entry:
average = numpy.mean(values)
Total = []
Total.append(Entry)
Total.append(average)
Average_List.append(Total)
Entry = List1[0]
values = []
values.append(float(List1[2]))
else:
values.append(float(List1[2]))
average = numpy.mean(values)
Total = []
Total.append(Entry)
Total.append(average)
Average_List.append(Total)
Average_List.sort(key=operator.itemgetter(1))
return Average_List
def Mash_List_Maker(input_assembly_list):
"""Makes a list of the all x all mash outputs"""
Output = []
for files in input_assembly_list:
for files2 in input_assembly_list:
String1 = subprocess.check_output('mash dist ' + files + ' ' + files2, shell=True)
Output.append(String1)
Output.sort()
# write TSV to file
with open("mash-results.tsv", 'w', newline='') as tsvfile:
for item in Output:
tsvfile.write(item.decode("utf-8"))
return Output
def Mash_Centroid(input_assembly_list):
"""Returns the name of the fasta with the lowest average mash index"""
List1 = Mash_List_Maker(input_assembly_list)
Averages = Average_Mash(List1)
Best = Averages[0][0]
return Best
parser = argparse.ArgumentParser(
description = '''Selects optimal reference genome given a set of fasta files''',
usage = 'centroid.py [options] /path/to/fasta_files/',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('folder', help='Folder containing *.fasta files', type=str)
parser.add_argument('--version', action='version', version=str(VERSION))
options = parser.parse_args()
Folder = options.folder
Assembly_List = glob.glob(Folder + '/*.fasta')
Centroid = Mash_Centroid(Assembly_List)
print(Centroid)
ref_genome = Centroid.decode().rsplit('/',1)[1]
print(ref_genome, file=open("centroid_out.txt", "w"))