// Bring the `catboost` crate into scope.
use catboost;
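
/// Converts a raw model score into a probability. For this binary classifier the raw
/// prediction is a log-odds value, so the sigmoid maps it into the [0, 1] range.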
fn sigmoid(x: f64) -> f64 {
    1. / (1. + (-x).exp())
}
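
/// Renders the binary classification verdict as a human-readable string.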
fn answer(makes_over_50k_a_year: bool) -> &'static str {
    if makes_over_50k_a_year {
        "makes over 50K a year"
    } else {
        "doesn't make over 50K a year"
    }
}

fn main() {
    // Load the "adult.cbm" model that we trained within the Jupyter Notebook.
    let model_path = "adult.cbm";
    let model = catboost::Model::load(model_path).unwrap();
    // You can also try to load your own model: just replace "adult.cbm" with the path to your
    // model that classifies data from the UCI Adult Dataset.

    println!("Adult dataset model metainformation\n");
    println!("tree count: {}", model.get_tree_count());
    // In our case we were solving a binary classification problem (whether a person makes over
    // 50K a year), so the dimension of the prediction will be 1: the model returns the
    // probability that the object belongs to the positive class. We had two classes, encoded as
    // "<=50K" and ">50K"; during data preprocessing (see `get_fixed_adult()` in the Notebook) we
    // encoded "<=50K" as 0 and ">50K" as 1, so ">50K" became the positive class. The probability
    // of the negative class ("<=50K") can be deduced as (1 - p), where p is the probability of
    // the positive class.
    //
    // In most cases the prediction dimension will be 1 (for regression and for ranking); it can
    // be N for multiclass classification, where N is the number of classes.
    println!("prediction dimension: {}", model.get_dimensions_count());
    println!("numeric feature count: {}", model.get_float_features_count());
    println!("categoric feature count: {}", model.get_cat_features_count());

    // OK, now let's try to use our model for prediction. We'll look at the test part of the
    // Adult dataset. You will need to download it [1] from the UCI repository. Look for
    // "adult.test"; "adult.names" will also be useful because it contains a human-readable
    // description of the dataset.
    //
    // The first line of the test part of the dataset is:
    //
    // "25, Private, 226802, 11th, 7, Never-married, Machine-op-inspct, Own-child, Black, Male, 0, 0, 40, United-States, <=50K."
    //
    // Based on "adult.names" we can recover its vectors of numeric and categoric features (in
    // our case all "continuous" features are numeric and all other features are categoric):
    //
    // numericFeatures: {25, 226802, 7, 0, 0, 40}
    // categoricFeatures: {"Private", "11th", "Never-married", "Machine-op-inspct", "Own-child", "Black", "Male", "United-States"}
    //
    // According to the dataset, this person doesn't make over 50K a year. Also note that the
    // order of numeric and categoric features in `numericFeatures` and `categoricFeatures` is
    // the same as in the source data; otherwise we can't apply the model (well, we can, but the
    // result of the prediction will be garbage).
    //
    // Now let's run it! We'll call this person "person A", to make variable names unique.
    //
    // [1]: https://archive.ics.uci.edu/ml/machine-learning-databases/adult/
    println!();
    let person_a_numeric_features = vec![25., 226_802., 7., 0., 0., 40.];
    let person_a_categoric_features = vec![
        String::from("Private"),
        String::from("11th"),
        String::from("Never-married"),
        String::from("Machine-op-inspct"),
        String::from("Own-child"),
        String::from("Black"),
        String::from("Male"),
        String::from("United-States"),
    ];
    let person_a_prediction = model
        .calc_model_prediction(
            vec![person_a_numeric_features.clone()],
            vec![person_a_categoric_features.clone()],
        )
        .unwrap();

    // Since we made a prediction for only one person and the prediction dimension is 1, the
    // probability of person A making over 50K will have index 0 in `person_a_prediction`.
    //
    // CatBoost doesn't compute a "probability"; to turn the raw CatBoost prediction into a
    // probability we need to apply the sigmoid function.
    let person_a_makes_over_50k_probability = sigmoid(person_a_prediction[0]);
    println!(
        "Person A makes over 50K a year with probability {}",
        person_a_makes_over_50k_probability
    );
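
    // As noted above, the probability of the negative class ("<=50K") is simply the
    // complement of the positive-class probability:
    println!(
        "Person A doesn't make over 50K a year with probability {}",
        1. - person_a_makes_over_50k_probability
    );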

    // When we were training CatBoost we used the default classification threshold for AUC,
    // which is equal to 0.5. This means that our formula is optimized for this threshold; we
    // could change the threshold to optimize some other metric on a different dataset, but we
    // won't do that in this tutorial.
    let classification_threshold = 0.5;
    let person_a_makes_over_50k = person_a_makes_over_50k_probability > classification_threshold;
    println!("Person A {}", answer(person_a_makes_over_50k));

    // Now let's find an example with missing features and income greater than 50K a year. At
    // line 40 of "adult.test" we can find the following line:
    //
    // "40, Private, 85019, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, Asian-Pac-Islander, Male, 0, 0, 45, ?, >50K."
    //
    // Let's call this person "Person B". The dataset is missing the "native-country" feature
    // for Person B (missing features are marked with "?"). When we were doing preprocessing in
    // `get_fixed_adult` we replaced missing categoric features with the string "nan", so when
    // we apply the trained model we must also use "nan" for missing features.
    //
    // Let's write out the feature vectors for Person B:
    //
    // numericFeatures = {40, 85019, 16, 0, 0, 45};
    // categoricFeatures = {"Private", "Doctorate", "Married-civ-spouse", "Prof-specialty", "Husband", "Asian-Pac-Islander", "Male", "nan"};
    //
    // According to the dataset, Person B makes more than 50K a year. OK, let's try to apply
    // the model to this example.
    println!();
    let person_b_numeric_features = vec![40., 85_019., 16., 0., 0., 45.];
    let person_b_categoric_features = vec![
        String::from("Private"),
        String::from("Doctorate"),
        String::from("Married-civ-spouse"),
        String::from("Prof-specialty"),
        String::from("Husband"),
        String::from("Asian-Pac-Islander"),
        String::from("Male"),
        String::from("nan"),
    ];
    let person_b_prediction = model
        .calc_model_prediction(
            vec![person_b_numeric_features.clone()],
            vec![person_b_categoric_features.clone()],
        )
        .unwrap();
    let person_b_makes_over_50k_probability = sigmoid(person_b_prediction[0]);
    let person_b_makes_over_50k = person_b_makes_over_50k_probability > classification_threshold;
    println!(
        "Person B makes over 50K a year with probability {}",
        person_b_makes_over_50k_probability
    );
    println!("Person B {}", answer(person_b_makes_over_50k));

    // Let's try to apply the model to Person A and Person B in one call.
    println!();
    let persons_ab_numeric_features = vec![person_a_numeric_features, person_b_numeric_features];
    let persons_ab_categoric_features = vec![person_a_categoric_features, person_b_categoric_features];
    let persons_ab_predictions = model
        .calc_model_prediction(persons_ab_numeric_features, persons_ab_categoric_features)
        .unwrap();
    let persons_ab_make_over_50k_probabilities =
        vec![sigmoid(persons_ab_predictions[0]), sigmoid(persons_ab_predictions[1])];
    let persons_ab_make_over_50k = vec![
        persons_ab_make_over_50k_probabilities[0] > classification_threshold,
        persons_ab_make_over_50k_probabilities[1] > classification_threshold,
    ];

    println!("Using batch interface");
    // The predictions should be the same as above.
    println!(
        "Person A makes over 50K a year with probability {}",
        persons_ab_make_over_50k_probabilities[0]
    );
    println!("Person A {}", answer(persons_ab_make_over_50k[0]));
    println!(
        "Person B makes over 50K a year with probability {}",
        persons_ab_make_over_50k_probabilities[1]
    );
    println!("Person B {}", answer(persons_ab_make_over_50k[1]));
}