Skip to content

Commit

Permalink
Add notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
susanli2016 authored Sep 15, 2021
1 parent f6da528 commit a6dbf6a
Showing 1 changed file with 313 additions and 0 deletions.
313 changes: 313 additions & 0 deletions cv_load_CatBoost.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,313 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"\n",
"dtype = {\n",
" 'Usage': 'category',\n",
" 'Description': 'category',\n",
" 'status': 'category',\n",
"}\n",
"df = pd.read_csv('data/reservations.csv.gz', dtype=dtype, parse_dates=['created', 'arrival', 'departure'])\n",
"df.drop(columns='Usage', inplace=True)\n",
"\n",
"df.loc[df['cancel_date'] == '0001-01-01T00:00:00', ['cancel_date']] = None\n",
"df['cancel_date'] = pd.to_datetime(df['cancel_date'])\n",
"\n",
"df['arrival_year'] = df['arrival'].dt.year"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"appearances = {}\n",
"for contract_id in df['contract_id'].unique():\n",
" subset_df = df.loc[df['contract_id'] == contract_id].sort_values(by=['arrival', 'created'])\n",
" # save the last known state\n",
" # fill in blanks for bad years\n",
" # handle case where cancel year might come after a series of misses\n",
" yearly_state = {arrival_year: status for (created, arrival_year, status) in subset_df[['created', 'arrival_year', 'status']].itertuples(index=False, name=None)}\n",
" earliest = subset_df['arrival_year'].min()\n",
" latest = min(subset_df['arrival_year'].max(), 2019)\n",
" activity = [(year, yearly_state.get(year, 'no-show')) for year in range(earliest, latest + 1)] \n",
" if activity:\n",
" resort_id = subset_df['resort_id'].values[0]\n",
" appearances[str(contract_id)] = [resort_id] + activity"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"rows = []\n",
"for r in list(appearances.values()):\n",
" resort_id, activity = r[0], r[1:]\n",
" row = [None] * 5\n",
" row[-len(activity):] = [s for year, s in activity]\n",
" rows.append([resort_id] + row)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(rows, columns=['resort_id', 'year_2015', 'year_2016', 'year_2017', 'year_2018', 'year_2019'])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"resort_id 0.000000\n",
"year_2015 0.515846\n",
"year_2016 0.205276\n",
"year_2017 0.098678\n",
"year_2018 0.048311\n",
"year_2019 0.000000\n",
"dtype: float64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum() / df.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"df.fillna('missing', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"resort_id 57\n",
"year_2015 4\n",
"year_2016 4\n",
"year_2017 4\n",
"year_2018 4\n",
"year_2019 3\n",
"dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.nunique()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"X = df.drop(\"year_2019\", axis=1)\n",
"y = df[\"year_2019\"]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0, 1, 2, 3, 4]\n"
]
}
],
"source": [
"cat_features = list(range(0, X.shape[1]))\n",
"print(cat_features)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0:\tlearn: 0.9089395\ttest: 0.9088238\tbest: 0.9088238 (0)\ttotal: 83.9ms\tremaining: 755ms\n",
"5:\tlearn: 0.6921539\ttest: 0.6930650\tbest: 0.6930650 (5)\ttotal: 219ms\tremaining: 146ms\n",
"9:\tlearn: 0.6811982\ttest: 0.6819978\tbest: 0.6819978 (9)\ttotal: 319ms\tremaining: 0us\n",
"\n",
"bestTest = 0.6819978385\n",
"bestIteration = 9\n",
"\n"
]
},
{
"data": {
"text/plain": [
"<catboost.core.CatBoostClassifier at 0x7f2c8c533390>"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from catboost import CatBoostClassifier\n",
"\n",
"clf = CatBoostClassifier(iterations=10, verbose=5, learning_rate=0.5)\n",
"clf.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_val, y_val))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"y_prob = clf.predict_proba(data=X_val)\n",
"y_pred = clf.predict(data=X_val)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.7176913425345044\n"
]
}
],
"source": [
"from sklearn.metrics import accuracy_score\n",
"\n",
"accuracy = accuracy_score(y_val, y_pred)\n",
"print('Accuracy:', accuracy)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AUC: 0.8310445038554471\n"
]
}
],
"source": [
"from sklearn.metrics import roc_auc_score\n",
"\n",
"auc = roc_auc_score(y_val, y_prob, multi_class=\"ovo\", average=\"macro\")\n",
"print('AUC:', auc)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" active 0.75 0.87 0.80 11582\n",
" cancelled 0.63 0.52 0.57 5562\n",
" no-show 0.73 0.52 0.61 3578\n",
"\n",
" accuracy 0.72 20722\n",
" macro avg 0.70 0.64 0.66 20722\n",
"weighted avg 0.71 0.72 0.71 20722\n",
"\n"
]
}
],
"source": [
"from sklearn.metrics import classification_report\n",
"\n",
"print(classification_report(y_val, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit a6dbf6a

Please sign in to comment.