-
Notifications
You must be signed in to change notification settings - Fork 0
/
knn_classifier.m
171 lines (131 loc) · 5.55 KB
/
knn_classifier.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
function [posterior_probability_matrix,rate] = knn_classifier(train_dataset,test_dataset,classes)
%knn_classifier: k - nearest neighbors classifier
%   Holds out 20% of the training data to tune the k- parameter, then
%   classifies every row of test_dataset against the full training set.
%   Posterior prob.: (neighbors of a given class)/(total neighbors, k)
%   Return: posterior probability matrix and the number of test hits
%   NOTE(review): despite the name, `rate` is the raw hit count, not a
%   fraction — kept as-is for caller compatibility.
n_test = size(test_dataset,1);
n_classes = size(classes,1);
posterior_probability_matrix = zeros(n_test,n_classes);
% hold out 20% of the training rows as validation data for tuning k
holdout_fraction = 0.2;
[training_idx,validation_idx] = crossvalind('HoldOut',size(train_dataset,1),holdout_fraction);
[tuning_train,tuning_val] = split_data(train_dataset,training_idx,validation_idx);
% grid search for the best k in [1,10] on the validation split
best_k = get_classifier_gs(tuning_train,tuning_val,1,10,classes);
% classify each test example with the tuned k and count hits
true_labels = table2array(test_dataset(:,1));
count_hits = 0;
for row = 1:n_test
    features = table2array(test_dataset(row,2:end));
    dists = get_euclidian_distances(features,train_dataset);
    [predicted,neighbor_freq] = classify(best_k,train_dataset,dists,classes);
    posterior_probability_matrix(row,:) = get_posterior_probabilities(best_k,neighbor_freq,classes);
    if predicted == true_labels(row)
        count_hits = count_hits + 1;
    end
end
rate = count_hits;
end
%% FUNCTIONS
% SPLIT_DATA:
function [training_data,validation_data] = split_data(data,training_idx,validation_idx)
%split_data: Splits data into training and validation data
%   Uses crossvalind 'HoldOut' indicator vectors to select rows.
%   training_idx / validation_idx: logical (or 0/1) vectors with one
%   entry per row of `data`.
%   Return: the rows of `data` flagged by each indicator vector
% logical indexing directly — wrapping the comparison in find() was redundant
training_data = data(training_idx==1,:);
validation_data = data(validation_idx==1,:);
end
% GET_CLASSIFIER_GS:
function [best_k] = get_classifier_gs(training,validation,k_min,k_max,classes)
%get_classifier_gs: gets the best k value for knn classifier based on a
%grid search over [k_min,k_max]
%   Each candidate k is scored by its hit count on the validation split;
%   the search stops early after 3 consecutive k values without
%   improvement.
%   Return: knn classifier k- parameter
no_of_val_exemples = size(validation,1);
% hoisted out of the k loop: the true labels never change between iterations
real_classes = string(table2array(validation(:,1)));
% fall back to the smallest k: the original initialized best_k to 0, which
% would reach the caller (and mink) if no k ever produced a single hit
best_k = k_min;
best_hit = 0;
stagnation_test = 0;
for k = k_min:k_max
    if stagnation_test == 3
        break
    end
    predicted_classes = strings(no_of_val_exemples,1);
    % classification of each exemple on validation data
    for val_exemple = 1:no_of_val_exemples
        % compute distance from exemple to each case in training data
        exemple = table2array(validation(val_exemple,2:end));
        distances_vector = get_euclidian_distances(exemple,training);
        % knn classification (second output, the class frequencies, is unused here)
        [class,~] = classify(k,training,distances_vector,classes);
        predicted_classes(val_exemple) = class;
    end
    % compute the hits of the knn classifier for this k
    count_hits = sum(predicted_classes == real_classes);
    % check for improvment considering k value
    if count_hits > best_hit
        best_k = k;
        best_hit = count_hits;
        stagnation_test = 0;
    else
        stagnation_test = stagnation_test + 1;
    end
end
end
% GET_EUCLIDIAN_DISTANCES:
function [distances_vector] = get_euclidian_distances(exemple,training)
%get_euclidian_distances: Calculates euclidian distances between an exemple
% and a dataset of exemples (column 1 of `training` is the class label and
% is skipped).
%   Return: a 1-by-n row vector of euclidian distances
%   Fix: the original omitted the sqrt and returned SQUARED distances.
%   The k-NN ranking was unaffected (sqrt is monotonic), but the values
%   now actually match the function's name. The row loop is also replaced
%   by one vectorized expression.
features = table2array(training(:,2:end));
% implicit expansion subtracts `exemple` from every training row at once
distances_vector = sqrt(sum((features - exemple).^2,2))';
end
% CLASSIFY
function [predicted_class,classes_freq] = classify(k,training,distances_vector,classes)
%classify: predict the class of a new exemple with a k-NN majority vote
%   Return: the winning class and a 1-by-c vector with each class's
%   neighbor count, ordered like `classes`
% indexes of the k closest training cases
[~,nearest_idx] = mink(distances_vector,k);
% collect the class label of every neighbor
neighbors_classes = strings(1,k);
for j = 1:k
    neighbors_classes(j) = table2array(training(nearest_idx(j),1));
end
% count how many neighbors fall in each class; histcounts returns the
% counts alongside the category names, which we realign to `classes` order
c = size(classes,1);
[frequencies,Categories] = histcounts(categorical(neighbors_classes),categorical(classes));
cat_names = string(Categories);
classes_freq = zeros(1,c);
for class = 1:c
    classes_freq(class) = frequencies(cat_names == classes(class));
end
% majority vote: the class with the highest neighbor count wins
% (max breaks ties in favor of the first class, as before)
[~,winner] = max(classes_freq);
predicted_class = classes(winner);
end
% GET_POSTERIOR_PROBABILITIES
function [posterior_probabilities] = get_posterior_probabilities(k,classes_freq,classes)
%get_posterior_probabilities: calculates the posterior probability of all
%classes to an exemple of test data, as (neighbors of the class)/k
%   classes_freq: per-class neighbor counts (as produced by classify)
%   classes: kept in the signature for interface compatibility; only its
%   length is used and it must match the length of classes_freq
%   Return: 1-by-c row vector of posterior probabilities
c = size(classes,1);
% vectorized division replaces the original element-wise loop;
% reshape guarantees the same 1-by-c row orientation the loop produced
posterior_probabilities = reshape(classes_freq(1:c),1,c)/k;
end