# Lib.py
# This file contains functions used by all the other files in this project.
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import os
import time
import re
import sys
import random
import math
import tensorflow as tf
# This section contains physical characteristics of the amino acids. Currently
# unused, though they may prove to be useful in the future.
#
# Each line below defines one group of small integer category codes. None of
# these names is referenced by the rest of the code visible in this file; the
# VOCAB table uses literal 0/1/-1 flags instead.
# NOTE(review): the meaning of each group is inferred from the names only --
# confirm before relying on it.
# Presumably the side-chain class of a residue.
NONE, POLAR, NONPOLAR, AROMATIC = 0, 1, 2, 3
# Presumably whether the side chain is aliphatic.
NONALIPHATIC, ALIPHATIC = 0, 1
# Presumably the electric charge of the side chain.
NEUTRAL, POSITIVE, NEGATIVE = 0, 1, 2
# Presumably the hydropathy class of the residue.
HYDRONEUTRAL, HYDROPHILIC, HYDROPHOBIC = 0, 1, 2
# Lookup table mapping each sequence character to a 12-element feature list.
# Keys are the 20 one-letter amino-acid codes plus two non-residue symbols:
#   ' ' (code 0)  -- the character read_cppd_file appends to every peptide
#   ';' (code 21) -- the character read_fasta_file appends to every peptide
# Both non-residue symbols carry -1 in every flag column and a mass of 0.
#
# Column layout (list index: meaning):
#   0: CODE        unique integer id of the character (0..21)
#   1: CHARGED     1 if flagged as charged, else 0
#   2: ALIPHATIC   1 if flagged as aliphatic, else 0
#   3: AROMATIC    1 if flagged as aromatic, else 0
#   4: POLAR       1 if flagged as polar, else 0
#   5: H.PHOBIC    1 if flagged as hydrophobic, else 0
#   6: POS.CHARGE  1 if flagged as positively charged, else 0
#   7: NEG.CHARGE  1 if flagged as negatively charged, else 0
#   8: TINY        1 if flagged as tiny, else 0
#   9: SMALL       1 if flagged as small, else 0
#  10: LARGE       1 if flagged as large, else 0
#  11: MASS        float; presumably the average residue mass in Daltons --
#                  TODO confirm the unit and source of these values
# NOTE(review): S, T, Y and H have POLAR = 0 here although they are commonly
# classified as polar -- confirm this is intended.
VOCAB = {
# CODE CHARGED ALIPHATIC AROMATIC POLAR H.PHOBIC POS.CHARGE NEG.CHARGE TINY SMALL LARGE MASS
' ': [0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0.0000],
'A': [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 71.0779],
'R': [2, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 156.1857],
'N': [3, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 114.1026],
'D': [4, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 115.0874],
'C': [5, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 103.1429],
'E': [6, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 129.1140],
'Q': [7, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 128.1292],
'G': [8, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 57.0513],
'H': [9, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 137.1393],
'I': [10, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 113.1576],
'L': [11, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 113.1576],
'K': [12, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 128.1723],
'M': [13, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 131.1961],
'F': [14, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 147.1739],
'P': [15, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 97.1152],
'S': [16, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 87.0773],
'T': [17, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 101.1039],
'W': [18, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 186.2099],
'Y': [19, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 163.1733],
'V': [20, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 99.1311],
';': [21, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0.0000]
}
# These functions contain code for reading the different kinds of input files.
def read_cppd_file(filename):
    """Collect the distinct peptides listed in a CPPD file.

    A line is accepted only when it consists of exactly four digits, some
    whitespace, and a single run of word characters; every other line is
    ignored. The word part is kept and a single space is appended to it.

    Args:
        filename: Path of the CPPD file to read.

    Returns:
        A list of the distinct peptides, each ending in ' '. The order is
        unspecified because duplicates are removed through a set.
    """
    with open(filename, "r") as handle:
        entry_re = re.compile(r'^(\d{4})\s+(\w+)$')
        hits = (entry_re.match(row) for row in handle)
        # Non-matching rows yield None and are filtered out here.
        unique = {hit.group(2) + ' ' for hit in hits if hit}
    return list(unique)
def read_fasta_file(filename):
    """Read the distinct sequences from a FASTA file.

    Lines starting with '>' are record headers. Every other line is stripped
    of surrounding whitespace and appended to the current record, so one
    sequence may span several lines. Each finished sequence is stored with a
    trailing ';'.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        A list of the distinct sequences, each ending in ';'. The order is
        unspecified because duplicates are removed through a set.
    """
    peptide_set = set()
    chunks = []
    with open(filename, "r") as fasta:
        for line in fasta:
            if line.startswith('>'):
                # A header closes the previous record, if it had any residues.
                if chunks:
                    peptide_set.add("".join(chunks) + ';')
                    chunks = []
            else:
                residues = line.strip()
                # Skip blank lines so `chunks` is non-empty only when the
                # record really contains sequence data.
                if residues:
                    chunks.append(residues)
    # The last record has no following header to close it, so flush it here;
    # without this the final sequence of every file would be dropped.
    if chunks:
        peptide_set.add("".join(chunks) + ';')
    return list(peptide_set)
def read_file_in_dataset(dataset, file):
    """Load the peptides from the dataset file whose name starts with `file`.

    The reader is chosen from the file name's ending: names ending in "cppd"
    go to read_cppd_file and names ending in "fasta" go to read_fasta_file.

    Args:
        dataset: Path of the directory that holds the dataset files.
        file: Prefix of the wanted file name (for example its stem).

    Returns:
        The list produced by the matching reader, or an empty list when no
        file matches the prefix or the matched file has an unknown format.
        In both of those cases a message is printed.
    """
    # os.listdir returns names in arbitrary order, so sort to make the choice
    # deterministic when several files share the prefix.
    candidates = sorted(f for f in os.listdir(dataset) if f.startswith(file))
    if not candidates:
        # Report a missing file the same way as an unknown format instead of
        # failing with a bare IndexError.
        print("No file starting with {} found in {}".format(file, dataset))
        return []
    input_file = os.path.join(dataset, candidates[0])
    if input_file.endswith("cppd"):
        return read_cppd_file(input_file)
    if input_file.endswith("fasta"):
        return read_fasta_file(input_file)
    print("Unknown file format for {}".format(input_file))
    return []