# Imports
import ml
import sys
import data
import kmers
import numpy
import matrix
import matplotlib.pyplot as plt
from operator import itemgetter
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, f1_score
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.model_selection import StratifiedKFold, cross_val_predict
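# Note: ml, data, kmers and matrix are not third-party packages; they appear to
# be helper modules shipped with this repository (SVM wrapper and scoring,
# FASTA loading, k-mer enumeration, and feature-matrix construction, respectively).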
# Function to extract the discriminating k-mers
def extract(parameters):
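    """
    Identify a discriminating set of k-mers by recursive feature elimination.

    Expected keys in `parameters`:
        T              -- threshold factor: the fraction of the best F1 score a
                          smaller feature subset must reach to be preferred
        k_min, k_max   -- range of k-mer lengths to explore
        k_mers_path    -- output path for the extracted k-mers
        training_fasta -- path of the training data loaded via data.loadData
    """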
    history = []
    # Initialize the maximum score
    max_score = 0
    # Initialize the best identified score
    best_score = 0
    # Initialize the best set of k-mers
    best_k_mers = []
    # Initialize the best length of k
    best_k_length = 0
    # Initialize the best number of features
    best_features_number = 0
    # Get the parameters
    T = float(parameters["T"])
    k_min = int(parameters["k_min"])
    k_max = int(parameters["k_max"])
    k_mers_path = str(parameters["k_mers_path"])
    file_path = str(parameters["training_fasta"])
    # Load the training data
    D = data.loadData(file_path)
    # Exit if the target values are not given
    if len(D[0]) != 3:
        print("Target values are not given")
        sys.exit(1)
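    # Each record in D is assumed to hold three fields, e.g. an identifier, the
    # sequence and its target value; the length check above relies on that shape.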
    # Iterate through the lengths of k to explore
    for k in range(k_min, k_max + 1):
        # Display the current analysis
        print("Analysis of the " + str(k) + "-mers")
        # Get the k-mers existing in the sequences
        print("Get k-mers...")
        K = kmers.getKmers(D, k)
        # Generate the samples matrix (X) and the target values (y)
        print("Compute matrices...")
        X, y = matrix.generateSamplesTargets(D, K, k)
        # Scale the features between 0 and 1
        print("Scaling...")
        X = ml.minMaxScaler(X)
        # Apply a variance filter if at least one feature passes it
        try:
            # Instantiate the filter method
            print("VarianceThreshold...")
            varianceThreshold = VarianceThreshold(threshold=0.01)
            # Apply the filter
            X = varianceThreshold.fit_transform(X)
            # Compute the list of k-mer indices to retain
            indices = [i for i, value in enumerate(varianceThreshold.get_support()) if value]
            # Update the list of k-mers
            K = dict.fromkeys(list(itemgetter(*indices)(list(K.keys()))), 0)
            # Clear the indices list
            indices.clear()
        except ValueError:
            # fit_transform raises ValueError when no feature meets the threshold
            pass
        print("Reducing number of features with SelectFromModel...")
        # Instantiate a linear SVM classifier
        clf = ml.svm()
        # Pre-select at most 100 features from the model to keep RFE tractable
        if len(X[0]) > 100:
            selectFromModel = SelectFromModel(estimator=clf, max_features=100).fit(X, y)
            indices = [i for i, value in enumerate(selectFromModel.get_support()) if value]
            X = X[:, indices]
            # Update the list of k-mers
            K = dict.fromkeys(list(itemgetter(*indices)(list(K.keys()))), 0)
            indices.clear()
        # List of scores related to each subset of features
        scores = []
        # List of feature counts related to each subset of features
        n_features = []
        # List of indices related to each subset of features
        selected_features = []
        # Initialize the empty list of indices
        indices = numpy.empty(0, int)
        # Apply recursive feature elimination
        rfe = RFE(estimator=clf, n_features_to_select=1, step=1).fit(X, y)
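        # With n_features_to_select=1 and step=1, RFE assigns each feature a
        # unique rank: ranking_ == 1 is the last feature eliminated (the most
        # informative one), ranking_ == 2 the next, and so on. Growing the
        # index set rank by rank therefore yields nested feature subsets
        # ordered from most to least important.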
        # Iterate through the subsets of features
        for i in range(1, rfe.ranking_.max()):
            # Merge the indices of this iteration into the list of indices
            indices = numpy.concatenate((indices, numpy.where(rfe.ranking_ == i)[0]), axis=None)
            # Save the indices related to the current subset of features
            selected_features.append(indices)
            # Split the data using the stratified K-fold cross-validator
            skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
            # Perform the cross-validation for the current subset of features
            cv_results = cross_validate(clf, X[:, indices], y, cv=skf, scoring={'f1_macro': make_scorer(f1_score, average='macro')}, n_jobs=-1)
            # Compute the F1 score of the current subset of features
            score = cv_results['test_f1_macro'].mean()
            # Save the score of the current subset of features
            scores.append(score)
            # Save the number of features of the current subset
            n_features.append(len(indices))
        # Compute the evaluation with all features and save the results
        skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
        y_pred = cross_val_predict(clf, X, y, cv=skf, n_jobs=-1)
        score = ml.compute_f1_score(y, y_pred)
        scores.append(score)
        n_features.append(X.shape[1])
        selected_features.append(numpy.where(rfe.ranking_ != 0)[0])
        # Get the best solution for this length of k-mers (higher score, then fewer features)
        if max(scores) > max_score:
            max_score = max(scores)
            best_score = max_score
            best_k_length = k
            best_features_number = n_features[scores.index(max_score)]
            best_k_mers = [list(K.keys())[i] for i in selected_features[n_features.index(best_features_number)]]
            # Get the best solution according to the threshold T
            for s, n in zip(scores, n_features):
                if s >= max_score * T and n <= best_features_number:
                    best_score = s
                    best_k_length = k
                    best_features_number = n
                    best_k_mers = [list(K.keys())[i] for i in selected_features[n_features.index(best_features_number)]]
        # Save the history
        history.append(scores)
    # Plot the history
    for i, h in enumerate(history):
        label = str(k_min + i) + "-mers"
        plt.plot(list(range(1, len(h) + 1))[0:100], h[0:100], label=label)
    plt.axvline(best_features_number, linestyle=':', color='r')
    plt.suptitle("Distribution of F1 scores according to the length of k and the number of features", fontsize=12)
    plt.title("Solution: F1 score = " + str(round(best_score, 2)) + ", Number of features = " + str(best_features_number) + ", Length of k = " + str(best_k_length), fontsize=10)
    plt.xlabel('Number of features', fontsize=10)
    plt.ylabel('F1 score', fontsize=10)
    plt.legend()
    plt.show()
    # Save the extracted k-mers
    kmers.saveExtractedKmers(k_mers_path, best_k_mers)
    # Display the solution
    print("\nIdentified solution:")
    print("Evaluation score (F1 score) =", best_score)
    print("Length of k =", best_k_length)
    print("Number of k-mers =", best_features_number)
    print("Extracted k-mers =", best_k_mers)
    print("Extracted k-mers saved at the path:", k_mers_path)