-
Notifications
You must be signed in to change notification settings - Fork 1
/
straln.py
432 lines (386 loc) · 16.6 KB
/
straln.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
# Name: straln.py
# Language: python3
# Libraries:
# Description: Runs the structural alignments needed for HOMEP
# Author: Edoardo Sarti
# Date: Aug 17 2016
import os
import sys
import multiprocessing
import subprocess
import re
import time
import Bio
def repo_inspector(repo_filename):
repo_file = open(repo_filename, 'r')
text = repo_file.read().split('\n')
repo_file.close()
repo_info = {}
for line in text:
fields = line.split()
if line and fields[0] == 'BEGIN':
chain_1 = fields[2]
chain_2 = fields[4]
record = True
if not record and not line:
continue
if record:
recorded_text = line + '\n'
if fields[0] == 'END':
if chain_1 not in repo_info:
repo_info[chain_1] = {}
if chain_2 in repo_info[chain_1]:
print("WARNING: multiple occurrences of couple {0} {1} in file {2}".format(chain_1, chain_2, repo_filename))
repo_info[chain_1][chain_2] = recorded_text
record = False
return repo_info
def write_on_repo(repo_filename, textdict, append=False):
if append:
repo_file = open(repo_filename, 'a')
else:
repo_file = open(repo_filename, 'w')
for val1 in textdict:
for val2 in textdict[val1]:
repo_file.write("BEGIN\t\tCHAIN_1: " + chain_1 + "\tCHAIN_2: " + chain_2 + "\n" + textdict + "\nEND\n\n\n")
repo_file.close()
def FrTMjob(data):
locations, target, exelist = data
topologytype, topology, chain_1, straln_path = target
topology_path = locations['FSYSPATH'][topologytype] + topology + '/'
strfasta_filename = topology_path + locations['TREE']['straln'] + 'str_' + chain_1 + '_fasta.dat'
strpdb_filename = topology_path + locations['TREE']['straln'] + 'str_' + chain_1 + '_pdb.dat'
str_repo_path = locations['FSYSPATH']['repocstraln']
seq_repo_path = locations['FSYSPATH']['repocseqaln']
# Checks for needed locations:
# Superfamily path
if not os.path.exists(topology_path):
raise NameError("ERROR: Superfamily {0} not found in path {1}".format(topologytype+' '+topology, topology_path))
# structure/ folder
if not os.path.exists(topology_path + locations['TREE']['str']):
raise NameError("ERROR: {0} folder not found in path {1}".format(locations['TREE']['str'], topology_path))
# Main pdb file
if not os.path.exists(topology_path + locations['TREE']['str'] + chain_1 + '.pdb'):
raise NameError("ERROR: File {0} not found in {1}".format(chain_1 + '.pdb', topology_path + locations['TREE']['str']))
# Secondary pdb files
for chain_2 in exelist:
if not os.path.exists(topology_path + locations['TREE']['str'] + chain_2 + '.pdb'):
raise NameError("ERROR: File {0} not found in {1}".format(chain_2 + '.pdb', topology_path + locations['TREE']['str']))
# Creates, if needed, the alignment locations
for name, val in locations['TREE'].items():
if 'aln' in name and not os.path.exists(topology_path + val):
os.mkdir(topology_path + val)
# Creates, if needed, the repository locations
if not os.path.exists(seq_repo_path):
os.mkdir(seq_repo_path)
if not os.path.exists(str_repo_path):
os.mkdir(str_repo_path)
repo_info_str_pdb = repo_inspector(locations['FSYSPATH']['repocstraln'] + 'str_' + chain_1 + '_pdb.dat')
repo_info_str_fasta = repo_inspector(locations['FSYSPATH']['repocstraln'] + 'str_' + chain_1 + '_fasta.dat')
repo_info_seq_fasta = repo_inspector(locations['FSYSPATH']['repocseqaln'] + 'seq_' + chain_1 + '_fasta.dat')
"""
# Checks if the main sequence file already exists. If it does, checks again if none of the alignments in exelist
# is contained in the file. Cancels from exelist any found alignment.
if os.path.exists(fasta_path):
fasta_file = open(fasta_path, 'r')
text = fasta_file.read().split('\n')
for line in text:
if 'CHAIN_2:' in line and field[1] in exelist:
logmsg = "Update repository with {0}_{1} sequence alignment".format(exelist[2], field[1])
update_repository.append(field[1])
exelist.remove[field[1]]
fasta_file.close()
# Checks if the main structure file already exists. If it does, checks again if none of the alignments in exelist
# is contained in the file. Cancels from exelist any found alignment.
if os.path.exists(pdb_path):
pdb_file = open(pdb_path, 'r')
text = pdb_file.read().split('\n')
for line in text:
if 'CHAIN_2:' in line and field[1] in exelist:
logmsg = "Update repository with {0}_{1} alignment".format(exelist[2], field[1])
update_repository.append(field[1])
exelist.remove[field[1]]
pdb_file.close()
"""
# Creates the temporary folder for sequence alignments
fasta_tmpfolder_path = topology_path + locations['TREE']['stralns'] + 'tmp_' + chain_1 + '_fasta/'
if not os.path.exists(fasta_tmpfolder_path):
os.mkdir(fasta_tmpfolder_path)
# Creates the temporary folder for structure alignments
pdb_tmpfolder_path = topology_path + locations['TREE']['stralns'] + 'tmp_' + chain_1 + '_pdb/'
if not os.path.exists(pdb_tmpfolder_path):
os.mkdir(pdb_tmpfolder_path)
pdb1_filename = topology_path + locations['TREE']['str'] + chain_1 + '.pdb'
for chain_2 in exelist:
if repo_info_str_fasta[chain_1] and repo_info_str_fasta[chain_1][chain_2] and repo_info_str_pdb[chain_1] and repo_info_str_pdb[chain_1][chain_2]:
continue
# Defines filenames
pdb2_filename = topology_path + locations['TREE']['str'] + chain_2 + '.pdb'
fasta_output_filename = fasta_tmpfolder_path + 'straln_' + chain_1 + '_' + chain_2 + '_fasta.tmp'
pdb_output_filename = 'straln_' + chain_1 + '_' + chain_2 + '_pdb.tmp' # This filename is without path for a constraint on flags length of frtmalign (see below)
stdout_filename = fasta_tmpfolder_path + 'output_' + chain_1 + '_' + chain_2 + '.tmp'
# If sequence and structure temporary files are present, jump
if os.path.exists(fasta_output_filename) and os.path.exists(pdb_output_filename):
continue
# Fr-TM-align call
# The Fr-TM-align command cannot exceed a certain length. Thus, we are compelled to run it locally into structures/
# The stdout file from Fr-TM-align can go directly to the right temporary folder (but needs to be cleaned)
stdout_file = open(stdout_filename, 'w')
fnull = open(os.devnull, 'w')
print(straln_path, pdb1_filename[-10:], pdb2_filename[-10:], '-o', pdb_output_filename)
p = subprocess.Popen([straln_path, pdb1_filename[-10:], pdb2_filename[-10:], '-o', pdb_output_filename], stdout=stdout_file, stderr=fnull, cwd=topology_path+locations['TREE']['str'])
p.wait()
fnull.close()
stdout_file.close()
# Moves the Fr-TM-align output file into the structure temporary folder
os.rename(topology_path + locations['TREE']['str'] + pdb_output_filename, pdb_tmpfolder_path + pdb_output_filename)
# Reads and then removes the stdout file from Fr-TM-align
stdout_file = open(stdout_filename, 'r')
text = stdout_file.read().split('\n')
stdout_file.close()
os.remove(stdout_filename)
# From the stdout file from Fr-TM-align, it takes the RMSD, TM-score and the two aligned sequences
chkaln = -1000
for nl in range(len(text)):
if "Aligned length" in text[nl]:
fields = re.split('=|,|\s',text[nl])
fields = list(filter(None, fields))
RMSD = float(fields[4])
TMscore = float(fields[6])
elif chkaln+1 == nl:
seq_1 = text[nl]
elif chkaln+3 == nl:
seq_2 = text[nl]
elif "denotes the residue pairs of distance" in text[nl]:
chkaln = nl
# Creates a sequence temporary file already correctly formatted
fasta_output_file = open(fasta_output_filename, 'w')
fasta_output_file.write(">" + chain_1 + "\n" + seq_1.replace('\x00', '') + "\n>" + chain_2 + "\n" + seq_2.replace('\x00', '') + "\n\nRMSD\t{0:.2f}\nTM-score\t{1:.5f}\nstr_SEQID\t{2:.5f}\n\n".format(RMSD, TMscore, calculate_seqid((seq_1, seq_2))))
fasta_output_file.close()
fasta1_filename = topology_path + locations['TREE']['seq'] + chain_1 + '.fa'
fasta1_file = open(fasta1_filename, 'r')
text = fasta1_file.read().split('\n')
for line in text:
if line and line[0] != '>':
sequence_1 = line.strip()
break
fasta1_file.close()
seqfasta_file = open(seqfasta_filename, 'w')
for chain_2 in exelist:
seqfasta_file.write("BEGIN\t\tCHAIN_1: " + chain_1 + "\tCHAIN_2: " + chain_2 + "\n")
if not (repo_info_seq_fasta[chain_1] and repo_info_seq_fasta[chain_1][chain_2]):
fasta2_filename = topology_path + locations['TREE']['seq'] + chain_2 + '.fa'
fasta2_file = open(fasta2_filename, 'r')
text = fasta2_file.read().split('\n')
fasta2_file.close()
for line in text:
if line and line[0] != '>':
sequence_2 = line.strip()
break
result = Bio.pairwise2.align.globalds(sequence_1, sequence_2, Bio.SubsMat.MatrixInfo.blosum62, -10.0, -0.5)
seqaln = [result[0][0][0], result[0][1][0]]
text = "{0}\n{1}\n{2}\n{3}\n\nseq_SEQID: {4}\n".format(chain_1, seqaln[0], chain_2, seqaln[1], calculate_seqid(seqaln))
seqfasta_file.write(text)
repo_info_seq_fasta[chain_1][chain_2] = text
else:
for line in repo_info_seq_fasta[chain_1][chain_2]:
seqfasta_file.write(line+"\n")
seqfasta_file.write("END\n\n\n")
write_on_repo(locations['FSYSPATH']['repocseqaln'] + 'seq_' + chain_1 + '_fasta.dat', repo_info_seq_fasta)
# Writes on the main sequence file. Each alignment begins with a "BEGIN" line, followed by two lines reporting the two chain names
# (format: "CHAIN_X: chain_name", where X={1,2}), and ends with an "END" line.
strfasta_file = open(strfasta_filename, 'w')
for chain_2 in exelist:
strfasta_file.write("BEGIN\t\tCHAIN_1: " + chain_1 + "\tCHAIN_2: " + chain_2 + "\n")
if not (repo_info_str_fasta[chain_1] and repo_info_str_fasta[chain_1][chain_2]):
tmp_filename = fasta_tmpfolder_path + 'straln_' + chain_1 + '_' + chain_2 + '_fasta.tmp'
if not os.path.exists(tmp_filename):
continue
tmp_file = open(tmp_filename)
text = tmp_file.read().split('\n')
tmp_file.close()
for line in text:
strfasta_file.write(line+'\n')
repo_info_str_fasta[chain_1][chain_2] = text
os.remove(tmp_filename)
else:
for line in repo_info_str_fasta[chain_1][chain_2]:
strfasta_file.write(line + '\n')
strfasta_file.write("END\n\n\n")
write_on_repo(locations['FSYSPATH']['repocstraln'] + 'str_' + chain_1 + '_fasta.dat', repo_info_str_fasta)
time.sleep(1)
os.rmdir(strfasta_tmpfolder_path)
strfasta_file.close()
# Writes on the main structure file. Each alignment begins with a "BEGIN" line, followed by two lines reporting the two chain names
# (format: "CHAIN_X: chain_name", where X={1,2}), and ends with an "END" line.
strpdb_file = open(strpdb_filename, 'w')
for chain_2 in exelist:
strpdb_file.write("BEGIN\t\tCHAIN_1: " + chain_1 + "\tCHAIN_2: " + chain_2 + "\n")
if not (repo_info_str_pdb[chain_1] and repo_info_str_pdb[chain_1][chain_2]):
tmp_filename = pdb_tmpfolder_path + 'straln_' + chain_1 + '_' + chain_2 + '_pdb.tmp'
if not os.path.exists(tmp_filename):
continue
tmp_file = open(tmp_filename)
text = tmp_file.read().split('\n')
tmp_file.close()
for line in text:
strpdb_file.write(line+'\n')
repo_info_str_pdb[chain_1][chain_2] = text
os.remove(tmp_filename)
else:
for line in repo_info_str_pdb[chain_1][chain_2]:
strpdb_file.write(line + '\n')
strpdb_file.write("END\n\n\n")
write_on_repo(locations['FSYSPATH']['repocstraln'] + 'str_' + chain_1 + '_pdb.dat', repo_info_str_pdb[chain_1][chain_2])
time.sleep(1)
os.rmdir(pdb_tmpfolder_path)
strpdb_file.close()
return (repo_info_seq_fasta, repo_info_str_fasta, repo_info_str_pdb)
def calculate_seqid(alignment):
ntot = 0
naln = 0
for na in range(len(alignment[0])):
if alignment[0][na] != '-' and alignment[1][na] != '-':
ntot += 1
if alignment[0][na] == alignment[1][na]:
naln += 1
if ntot > 0:
return naln/ntot
else:
return 0
def make_new_table(locations, fasta_repos, external_filename):
seq_fasta_repo, str_fasta_repo = fasta_repos
names = {'alpha' : 'a', 'beta' : 'b'}
instructions_filename = locations['SYSFILES']['H_topologytype']
instructions_file = open(instructions_filename, 'r')
text = instructions_file.read().split('\n')
instructions_file.close()
instructions = {}
for line in text:
if not line:
continue
fields = line.split()
if fields[0] not in instructions:
instructions[fields[0]] = {}
instructions[fields[0]][fields[1]] = fields[2]
table_filename = locations['FSYSPATH']['main'] + external_filename
table_file = open(table_filename, 'w')
table = {}
for toptype in sorted(list(instructions.keys())):
table[toptype] = {}
for top in sorted(list(instructions[toptype].keys())):
table[toptype][top] = {}
for chain_1 in sorted(instructions[toptype][top]):
table[toptype][top][chain_1] = {}
for chain_2 in sorted(instructions[toptype][top]):
if chain_1 == chain_2:
continue
text = seq_fasta_repo[chain_1][chain_2].split('\n')
for line in text:
if not line:
continue
fields = line.split()
if fields[0] == 'seq_SEQID':
seq_seqid = float(fields[1])
text = str_fasta_repo[chain_1][chain_2].split('\n')
for line in text:
if not line:
continue
fields = line.split()
if fields[0] == 'str_SEQID':
str_seqid = float(fields[1])
elif fields[0] == 'TM-score':
TMscore = float(fields[1])
elif fields[0] == 'RMSD':
RMSD = float(fields[1])
table_file.write("{0}\t{1}\t{2}\t{3}\t{4:10.8f}\t{5:10.8f}\t{6:10.8f}\t{7:10.6f}\n".format(names[sf[0]], str(int(sf[1])).zfill(3), chain_1, chain_2, seq_seqid, str_seqid, TMscore, RMSD))
table[toptype][top][chain_1][chain_2] = (seq_seqid, str_seqid, TMscore, RMSD)
table_file.close()
return table
def structure_alignment(options, locations):
aligner_path = options['straln_path']
np = int(options['number_of_procs'])
external_filename = options['output_tab']
already_processed = []
ex_list = {}
hidden_repository_filename = locations['SYSFILES']['repocstraln']
if os.path.exists(hidden_repository_filename):
hidden_repository_file = open(hidden_repository_filename, 'r')
text = hidden_repository_file.read().split("\n")
for line in text:
if not line:
continue
fields = line.split()
already_processed.append((fields[2], fields[3]))
hidden_repository_file.close()
repository_filename = locations['FSYSPATH']['main'] + external_filename
if os.path.exists(repository_filename):
repository_file = open(repository_filename, 'r')
text = repository_file.read().split("\n")
for line in text:
if not line:
continue
fields = line.split()
if not (fields[2], fields[3]) in already_processed:
already_processed.append((fields[2], fields[3]))
external_filename = 'new_' + external_filename
repository_file.close()
topologies = []
for ss in 'alpha', 'beta':
for i in os.listdir(locations['FSYSPATH'][ss]):
if re.match('^\d*$', str(i)):
topologies.append((ss, i))
ex_check = {}
exelist_filename = locations['SYSFILES']['H_scheduledalns']
if os.path.exists(exelist_filename):
exelist_file = open(exelist_filename, 'r')
text = exelist_file.read().split('\n')
for line in text:
if not line:
continue
fields = line.split()
if fields[2] not in ex_list:
ex_list[fields[2]] = []
ex_list[fields[2]].append(fields[3])
exelist_file.close()
exelist = {}
exelist_file = open(exelist_filename, 'a')
for sf in topologies:
structs = [x[-10:-4] for x in os.listdir(locations['FSYSPATH'][sf[0]] + sf[1] + '/' + locations['TREE']['str']) if x[-4:] == '.pdb']
# print(structs)
if len(structs) > 1:
for s1 in structs:
exelist[s1] = {}
for s2 in structs:
if s1 == s2 or (s1, s2) in already_processed:
continue
# print("sf0", sf[0], "sf1", sf[1], "s1", s1, "s2", s2)
exelist[s1][s2] = (sf[0], sf[1], s1, s2)
if s1 not in ex_list or s2 not in ex_list[s1]:
exelist_file.write("{0}\t{1}\t{2}\t{3}\n".format(sf[0], sf[1], s1, s2))
exelist_file.close()
# print("exelist", sorted(list(exelist.keys())))
data = []
for i in sorted(list(exelist.keys())):
exesublist = []
if not list(exelist[i].keys()):
continue
for j in sorted(list(exelist[i].keys())):
exesublist.append(exelist[i][j][3])
jtmp = sorted(list(exelist[i].keys()))[0]
data.append((locations, (exelist[i][jtmp][0], exelist[i][jtmp][1], exelist[i][jtmp][2], aligner_path), exesublist))
pool = multiprocessing.Pool(processes=np)
pool_outputs = pool.map(FrTMjob, data)
pool.close()
pool.join()
repo_info_seq_fasta, repo_info_str_fasta, repo_info_str_pdb = pool_outputs[0]
tot_repo_info_seq_fasta = repo_info_seq_fasta.copy()
tot_repo_info_str_fasta = repo_info_str_fasta.copy()
tot_repo_info_str_pdb = repo_info_str_pdb.copy()
for triplet in pool_outputs[1:]:
repo_info_seq_fasta, repo_info_str_fasta, repo_info_str_pdb = triplet
tot_repo_info_seq_fasta.update(repo_info_seq_fasta)
tot_repo_info_str_fasta.update(repo_info_str_fasta)
tot_repo_info_str_pdb.update(repo_info_str_pdb)
fasta_repos = (tot_repo_info_seq_fasta, tot_repo_info_str_fasta)
table = make_new_table(locations, fasta_repos, external_filename)
return table