From 179572da8dd598cabff912bee71ef230676bd3ec Mon Sep 17 00:00:00 2001 From: "Vadim \"Paddy" Date: Mon, 13 Nov 2017 20:25:44 +0100 Subject: [PATCH] Pandas example pandas example, regression and classification problems --- ...gression and classification problems.ipynb | 2201 +++++++++++++++++ 1 file changed, 2201 insertions(+) create mode 100644 notebooks/pandas + xgboost + regression and classification problems.ipynb diff --git a/notebooks/pandas + xgboost + regression and classification problems.ipynb b/notebooks/pandas + xgboost + regression and classification problems.ipynb new file mode 100644 index 00000000..4e2ab6e5 --- /dev/null +++ b/notebooks/pandas + xgboost + regression and classification problems.ipynb @@ -0,0 +1,2201 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import xgboost as xgb\n", + "\n", + "from sklearn.datasets import load_boston, load_breast_cancer\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "from eli5.explain import explain_prediction, explain_weights\n", + "from eli5 import show_prediction\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Regression example " + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "# load boston data set from sklearn \n", + "\n", + "boston = load_boston()\n", + "X, y = boston.data, boston.target\n", + "fn = boston.feature_names" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# create a pandas dataframe\n", + "df_reg = pd.DataFrame(X) \n", + "df_reg.columns = fn\n", + "df_reg['target'] = y" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATtarget
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.9824.0
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.1421.6
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.0334.7
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.9433.4
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.3336.2
\n", + "
" + ], + "text/plain": [ + " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", + "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", + "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", + "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n", + "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n", + "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n", + "\n", + " PTRATIO B LSTAT target \n", + "0 15.3 396.90 4.98 24.0 \n", + "1 17.8 396.90 9.14 21.6 \n", + "2 17.8 392.83 4.03 34.7 \n", + "3 18.7 394.63 2.94 33.4 \n", + "4 18.7 396.90 5.33 36.2 " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_reg.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "# split the data\n", + "X_train, X_test, y_train, y_test = train_test_split(df_reg.drop('target',1),\n", + " df_reg['target'], test_size=0.33, random_state=1111)" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# making a simple regression model \n", + "reg = xgb.XGBRegressor(learning_rate=0.1, max_depth=4, n_estimators=200).fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
WeightFeature
\n", + " 0.6139\n", + " \n", + " \n", + " LSTAT\n", + "
\n", + " 0.1499\n", + " \n", + " \n", + " RM\n", + "
\n", + " 0.0485\n", + " \n", + " \n", + " PTRATIO\n", + "
\n", + " 0.0390\n", + " \n", + " \n", + " DIS\n", + "
\n", + " 0.0338\n", + " \n", + " \n", + " NOX\n", + "
\n", + " 0.0323\n", + " \n", + " \n", + " TAX\n", + "
\n", + " 0.0224\n", + " \n", + " \n", + " RAD\n", + "
\n", + " 0.0156\n", + " \n", + " \n", + " AGE\n", + "
\n", + " 0.0119\n", + " \n", + " \n", + " CRIM\n", + "
\n", + " 0.0109\n", + " \n", + " \n", + " B\n", + "
\n", + " 0.0099\n", + " \n", + " \n", + " CHAS\n", + "
\n", + " 0.0089\n", + " \n", + " \n", + " INDUS\n", + "
\n", + " 0.0028\n", + " \n", + " \n", + " ZN\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "Explanation(estimator=\"XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,\\n learning_rate=0.1, max_delta_step=0, max_depth=4,\\n min_child_weight=1, missing=None, n_estimators=200, nthread=-1,\\n objective='reg:linear', reg_alpha=0, reg_lambda=1,\\n scale_pos_weight=1, seed=0, silent=True, subsample=1)\", description='\\nXGBoost feature importances; values are numbers 0 <= x <= 1;\\nall values sum to 1.\\n', error=None, method='feature importances', is_regression=True, targets=None, feature_importances=FeatureImportances(importances=[FeatureWeight(feature='LSTAT', weight=0.61387324, std=None, value=None), FeatureWeight(feature='RM', weight=0.14986457, std=None, value=None), FeatureWeight(feature='PTRATIO', weight=0.048537645, std=None, value=None), FeatureWeight(feature='DIS', weight=0.039025772, std=None, value=None), FeatureWeight(feature='NOX', weight=0.033829696, std=None, value=None), FeatureWeight(feature='TAX', weight=0.032349244, std=None, value=None), FeatureWeight(feature='RAD', weight=0.022447839, std=None, value=None), FeatureWeight(feature='AGE', weight=0.015569737, std=None, value=None), FeatureWeight(feature='CRIM', weight=0.011939988, std=None, value=None), FeatureWeight(feature='B', weight=0.010946172, std=None, value=None), FeatureWeight(feature='CHAS', weight=0.0098862341, std=None, value=None), FeatureWeight(feature='INDUS', weight=0.0089406818, std=None, value=None), FeatureWeight(feature='ZN', weight=0.0027891493, std=None, value=None)], remaining=0), decision_tree=None, highlight_spaces=None, transition_features=None)" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# explaing weights with eli5\n", + "res = explain_weights(reg)\n", + "res" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y\n", + " \n", + "\n", + "\n", + " \n", + " (score 21.635)\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Contribution?\n", + " Feature
\n", + " +22.634\n", + " \n", + " <BIAS>\n", + "
\n", + " +0.713\n", + " \n", + " PTRATIO\n", + "
\n", + " +0.482\n", + " \n", + " AGE\n", + "
\n", + " +0.465\n", + " \n", + " DIS\n", + "
\n", + " +0.452\n", + " \n", + " CRIM\n", + "
\n", + " +0.391\n", + " \n", + " NOX\n", + "
\n", + " +0.121\n", + " \n", + " RAD\n", + "
\n", + " +0.116\n", + " \n", + " B\n", + "
\n", + " +0.048\n", + " \n", + " INDUS\n", + "
\n", + " -0.009\n", + " \n", + " CHAS\n", + "
\n", + " -0.011\n", + " \n", + " ZN\n", + "
\n", + " -0.472\n", + " \n", + " TAX\n", + "
\n", + " -1.588\n", + " \n", + " LSTAT\n", + "
\n", + " -1.707\n", + " \n", + " RM\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "Explanation(estimator=\"XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,\\n learning_rate=0.1, max_delta_step=0, max_depth=4,\\n min_child_weight=1, missing=None, n_estimators=200, nthread=-1,\\n objective='reg:linear', reg_alpha=0, reg_lambda=1,\\n scale_pos_weight=1, seed=0, silent=True, subsample=1)\", description='\\nFeatures with largest coefficients.\\n\\nFeature weights are calculated by following decision paths in trees\\nof an ensemble. Each leaf has an output score, and expected scores can also be\\nassigned to parent nodes. Contribution of one feature on the decision path\\nis how much expected score changes from parent to child. Weights of all \\nfeatures sum to the output score of the estimator.\\n\\nCaveats:\\n1. Feature weights just show if the feature contributed positively or\\n negatively to the final score, and does show how increasing or decreasing\\n the feature value will change the prediction.\\n2. In some cases, feature weight can be close to zero for an important feature.\\n For example, in a single tree that computes XOR function, the feature at the\\n top of the tree will have zero weight because expected scores for both\\n branches are equal, so decision at the top feature does not change the\\n expected score. For an ensemble predicting XOR functions it might not be\\n a problem, but it is not reliable if most trees happen to choose the same\\n feature at the top.\\n', error=None, method='decision paths', is_regression=True, targets=[TargetExplanation(target='y', feature_weights=FeatureWeights(pos=[FeatureWeight(feature='', weight=22.633517121386614, std=None, value=1.0), FeatureWeight(feature='PTRATIO', weight=0.71346409995485705, std=None, value=17.800000000000001), FeatureWeight(feature='AGE', weight=0.48187521582714654, std=None, value=65.200000000000003), FeatureWeight(feature='DIS', weight=0.46489889117343264, std=None, value=2.7591999999999999), FeatureWeight(feature='CRIM', weight=0.45233942697540158, std=None, value=0.14476), FeatureWeight(feature='NOX', weight=0.39107394837889692, std=None, value=0.54700000000000004), FeatureWeight(feature='RAD', weight=0.12076196097746818, std=None, value=6.0), FeatureWeight(feature='B', weight=0.11581397951093525, std=None, value=391.5), FeatureWeight(feature='INDUS', weight=0.047550996258487771, std=None, value=10.01)], neg=[FeatureWeight(feature='RM', weight=-1.7074111557609595, std=None, value=5.7309999999999999), FeatureWeight(feature='LSTAT', weight=-1.5881134295995292, std=None, value=13.609999999999999), FeatureWeight(feature='TAX', weight=-0.47201327246835173, std=None, value=432.0), FeatureWeight(feature='ZN', weight=-0.010531127358105305, std=None, value=0.0), FeatureWeight(feature='CHAS', weight=-0.0085488562562867458, std=None, value=0.0)], pos_remaining=0, neg_remaining=0), proba=None, score=21.634677799000038, weighted_spans=None)], feature_importances=None, decision_tree=None, highlight_spaces=None, transition_features=None)" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "explain_me = X_test.iloc[1]\n", + "explain_prediction(reg,explain_me)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y\n", + " \n", + "\n", + "\n", + " \n", + " (score 22.653)\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Contribution?\n", + " Feature
\n", + " +22.634\n", + " \n", + " <BIAS>\n", + "
\n", + " +4.732\n", + " \n", + " LSTAT\n", + "
\n", + " +0.635\n", + " \n", + " PTRATIO\n", + "
\n", + " +0.335\n", + " \n", + " B\n", + "
\n", + " +0.087\n", + " \n", + " TAX\n", + "
\n", + " +0.027\n", + " \n", + " ZN\n", + "
\n", + " +0.006\n", + " \n", + " AGE\n", + "
\n", + " -0.009\n", + " \n", + " CHAS\n", + "
\n", + " -0.058\n", + " \n", + " NOX\n", + "
\n", + " -0.068\n", + " \n", + " INDUS\n", + "
\n", + " -0.177\n", + " \n", + " RAD\n", + "
\n", + " -0.395\n", + " \n", + " CRIM\n", + "
\n", + " -1.301\n", + " \n", + " DIS\n", + "
\n", + " -3.796\n", + " \n", + " RM\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "show_prediction(reg, X_test.iloc[44])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Classification example" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# load the breast cancer dataset from sklearn\n", + "bc_data = load_breast_cancer()\n", + "\n", + "X, y = bc_data.data, bc_data.target\n", + "fn = bc_data.feature_names\n", + "\n", + "df_cls = pd.DataFrame(X) \n", + "df_cls.columns = fn\n", + "df_cls['target'] = y" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(df_cls.drop('target',1),\n", + " df_cls['target'], test_size=0.33, random_state=1111)" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
WeightFeature
\n", + " 0.5162\n", + " \n", + " \n", + " worst radius\n", + "
\n", + " 0.1203\n", + " \n", + " \n", + " worst area\n", + "
\n", + " 0.0708\n", + " \n", + " \n", + " worst concave points\n", + "
\n", + " 0.0678\n", + " \n", + " \n", + " mean concave points\n", + "
\n", + " 0.0443\n", + " \n", + " \n", + " worst perimeter\n", + "
\n", + " 0.0234\n", + " \n", + " \n", + " concavity error\n", + "
\n", + " 0.0227\n", + " \n", + " \n", + " worst concavity\n", + "
\n", + " 0.0194\n", + " \n", + " \n", + " worst texture\n", + "
\n", + " 0.0139\n", + " \n", + " \n", + " mean texture\n", + "
\n", + " 0.0125\n", + " \n", + " \n", + " worst compactness\n", + "
\n", + " 0.0121\n", + " \n", + " \n", + " mean area\n", + "
\n", + " 0.0099\n", + " \n", + " \n", + " worst fractal dimension\n", + "
\n", + " 0.0085\n", + " \n", + " \n", + " area error\n", + "
\n", + " 0.0068\n", + " \n", + " \n", + " worst smoothness\n", + "
\n", + " 0.0065\n", + " \n", + " \n", + " mean compactness\n", + "
\n", + " 0.0059\n", + " \n", + " \n", + " concave points error\n", + "
\n", + " 0.0054\n", + " \n", + " \n", + " worst symmetry\n", + "
\n", + " 0.0046\n", + " \n", + " \n", + " compactness error\n", + "
\n", + " 0.0045\n", + " \n", + " \n", + " radius error\n", + "
\n", + " 0.0044\n", + " \n", + " \n", + " smoothness error\n", + "
\n", + " … 10 more …\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "Explanation(estimator=\"XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,\\n gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=4,\\n min_child_weight=1, missing=None, n_estimators=200, nthread=-1,\\n objective='binary:logistic', reg_alpha=0, reg_lambda=1,\\n scale_pos_weight=1, seed=0, silent=True, subsample=1)\", description='\\nXGBoost feature importances; values are numbers 0 <= x <= 1;\\nall values sum to 1.\\n', error=None, method='feature importances', is_regression=False, targets=None, feature_importances=FeatureImportances(importances=[FeatureWeight(feature='worst radius', weight=0.51623583, std=None, value=None), FeatureWeight(feature='worst area', weight=0.12025225, std=None, value=None), FeatureWeight(feature='worst concave points', weight=0.070798568, std=None, value=None), FeatureWeight(feature='mean concave points', weight=0.067776814, std=None, value=None), FeatureWeight(feature='worst perimeter', weight=0.044348486, std=None, value=None), FeatureWeight(feature='concavity error', weight=0.023369428, std=None, value=None), FeatureWeight(feature='worst concavity', weight=0.022670191, std=None, value=None), FeatureWeight(feature='worst texture', weight=0.01943423, std=None, value=None), FeatureWeight(feature='mean texture', weight=0.01385165, std=None, value=None), FeatureWeight(feature='worst compactness', weight=0.012482738, std=None, value=None), FeatureWeight(feature='mean area', weight=0.012116443, std=None, value=None), FeatureWeight(feature='worst fractal dimension', weight=0.0098966593, std=None, value=None), FeatureWeight(feature='area error', weight=0.0084517095, std=None, value=None), FeatureWeight(feature='worst smoothness', weight=0.0067674643, std=None, value=None), FeatureWeight(feature='mean compactness', weight=0.0065437648, std=None, value=None), FeatureWeight(feature='concave points error', weight=0.005850066, std=None, value=None), FeatureWeight(feature='worst symmetry', weight=0.0053554713, std=None, value=None), FeatureWeight(feature='compactness error', weight=0.0046069697, std=None, value=None), FeatureWeight(feature='radius error', weight=0.0044625001, std=None, value=None), FeatureWeight(feature='smoothness error', weight=0.0044323583, std=None, value=None)], remaining=10), decision_tree=None, highlight_spaces=None, transition_features=None)" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cls = xgb.XGBClassifier(learning_rate=0.1, max_depth=4, n_estimators=200).fit(X_train, y_train)\n", + "\n", + "explain_weights(cls)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=1\n", + " \n", + "\n", + "\n", + " \n", + " (probability 0.999, score 7.189)\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Contribution?\n", + " Feature
\n", + " +1.124\n", + " \n", + " worst area\n", + "
\n", + " +0.995\n", + " \n", + " area error\n", + "
\n", + " +0.965\n", + " \n", + " <BIAS>\n", + "
\n", + " +0.772\n", + " \n", + " worst concave points\n", + "
\n", + " +0.735\n", + " \n", + " worst concavity\n", + "
\n", + " +0.693\n", + " \n", + " worst radius\n", + "
\n", + " +0.448\n", + " \n", + " mean concave points\n", + "
\n", + " +0.379\n", + " \n", + " mean texture\n", + "
\n", + " +0.328\n", + " \n", + " worst texture\n", + "
\n", + " +0.326\n", + " \n", + " worst smoothness\n", + "
\n", + " +0.324\n", + " \n", + " worst perimeter\n", + "
\n", + " +0.311\n", + " \n", + " symmetry error\n", + "
\n", + " +0.168\n", + " \n", + " concave points error\n", + "
\n", + " +0.107\n", + " \n", + " mean smoothness\n", + "
\n", + " +0.069\n", + " \n", + " worst symmetry\n", + "
\n", + " +0.055\n", + " \n", + " worst compactness\n", + "
\n", + " +0.050\n", + " \n", + " mean concavity\n", + "
\n", + " +0.050\n", + " \n", + " texture error\n", + "
\n", + " +0.043\n", + " \n", + " radius error\n", + "
\n", + " +0.039\n", + " \n", + " fractal dimension error\n", + "
\n", + " +0.017\n", + " \n", + " smoothness error\n", + "
\n", + " +0.009\n", + " \n", + " mean area\n", + "
\n", + " +0.008\n", + " \n", + " perimeter error\n", + "
\n", + " -0.023\n", + " \n", + " mean symmetry\n", + "
\n", + " -0.218\n", + " \n", + " mean compactness\n", + "
\n", + " -0.584\n", + " \n", + " compactness error\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "Explanation(estimator=\"XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,\\n gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=4,\\n min_child_weight=1, missing=None, n_estimators=200, nthread=-1,\\n objective='binary:logistic', reg_alpha=0, reg_lambda=1,\\n scale_pos_weight=1, seed=0, silent=True, subsample=1)\", description='\\nFeatures with largest coefficients.\\n\\nFeature weights are calculated by following decision paths in trees\\nof an ensemble. Each leaf has an output score, and expected scores can also be\\nassigned to parent nodes. Contribution of one feature on the decision path\\nis how much expected score changes from parent to child. Weights of all \\nfeatures sum to the output score of the estimator.\\n\\nCaveats:\\n1. Feature weights just show if the feature contributed positively or\\n negatively to the final score, and does show how increasing or decreasing\\n the feature value will change the prediction.\\n2. In some cases, feature weight can be close to zero for an important feature.\\n For example, in a single tree that computes XOR function, the feature at the\\n top of the tree will have zero weight because expected scores for both\\n branches are equal, so decision at the top feature does not change the\\n expected score. For an ensemble predicting XOR functions it might not be\\n a problem, but it is not reliable if most trees happen to choose the same\\n feature at the top.\\n', error=None, method='decision paths', is_regression=False, targets=[TargetExplanation(target=1, feature_weights=FeatureWeights(pos=[FeatureWeight(feature='worst area', weight=1.1235545349517999, std=None, value=259.19999999999999), FeatureWeight(feature='area error', weight=0.99513999088252636, std=None, value=11.359999999999999), FeatureWeight(feature='', weight=0.9653413446432455, std=None, value=1.0), FeatureWeight(feature='worst concave points', weight=0.77224521211149366, std=None, value=0.0), FeatureWeight(feature='worst concavity', weight=0.73486851069418668, std=None, value=0.0), FeatureWeight(feature='worst radius', weight=0.69264272195294629, std=None, value=9.2620000000000005), FeatureWeight(feature='mean concave points', weight=0.44815761366602197, std=None, value=0.0), FeatureWeight(feature='mean texture', weight=0.37885852413839266, std=None, value=14.449999999999999), FeatureWeight(feature='worst texture', weight=0.32835835694758092, std=None, value=17.039999999999999), FeatureWeight(feature='worst smoothness', weight=0.32550012690326535, std=None, value=0.1162), FeatureWeight(feature='worst perimeter', weight=0.32374884425169465, std=None, value=58.359999999999999), FeatureWeight(feature='symmetry error', weight=0.31051591376352639, std=None, value=0.027109999999999999), FeatureWeight(feature='concave points error', weight=0.16783322462060404, std=None, value=0.0), FeatureWeight(feature='mean smoothness', weight=0.10733293085377055, std=None, value=0.091380000000000003), FeatureWeight(feature='worst symmetry', weight=0.069425421819121674, std=None, value=0.25919999999999999), FeatureWeight(feature='worst compactness', weight=0.055312077574221651, std=None, value=0.070569999999999994), FeatureWeight(feature='mean concavity', weight=0.050148272656335903, std=None, value=0.0), FeatureWeight(feature='texture error', weight=0.05010757558343841, std=None, value=0.7873), FeatureWeight(feature='radius error', weight=0.04271116338843281, std=None, value=0.22040000000000001), FeatureWeight(feature='fractal dimension error', weight=0.038998819719372718, std=None, value=0.0033990000000000001), FeatureWeight(feature='smoothness error', weight=0.017278769157235502, std=None, value=0.0091719999999999996), FeatureWeight(feature='mean area', weight=0.0088171407955835394, std=None, value=227.19999999999999), FeatureWeight(feature='perimeter error', weight=0.0078695603340078052, std=None, value=1.4350000000000001)], neg=[FeatureWeight(feature='compactness error', weight=-0.58416265286521285, std=None, value=0.0080070000000000002), FeatureWeight(feature='mean compactness', weight=-0.21836524322833473, std=None, value=0.042759999999999999), FeatureWeight(feature='mean symmetry', weight=-0.023108175315257103, std=None, value=0.17219999999999999)], pos_remaining=0, neg_remaining=0), proba=0.99924588, score=7.189130579999999, weighted_spans=None)], feature_importances=None, decision_tree=None, highlight_spaces=None, transition_features=None)" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "explain_prediction(cls,X_test.iloc[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=0\n", + " \n", + "\n", + "\n", + " \n", + " (probability 0.999, score -7.052)\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Contribution?\n", + " Feature
\n", + " +2.338\n", + " \n", + " worst area\n", + "
\n", + " +1.287\n", + " \n", + " worst concave points\n", + "
\n", + " +1.139\n", + " \n", + " worst radius\n", + "
\n", + " +1.020\n", + " \n", + " worst perimeter\n", + "
\n", + " +0.610\n", + " \n", + " mean concave points\n", + "
\n", + " +0.481\n", + " \n", + " worst concavity\n", + "
\n", + " +0.428\n", + " \n", + " mean texture\n", + "
\n", + " +0.390\n", + " \n", + " area error\n", + "
\n", + " +0.302\n", + " \n", + " worst texture\n", + "
\n", + " +0.185\n", + " \n", + " worst symmetry\n", + "
\n", + " +0.179\n", + " \n", + " fractal dimension error\n", + "
\n", + " +0.152\n", + " \n", + " mean concavity\n", + "
\n", + " +0.096\n", + " \n", + " concave points error\n", + "
\n", + " +0.060\n", + " \n", + " mean smoothness\n", + "
\n", + " +0.043\n", + " \n", + " mean area\n", + "
\n", + " +0.028\n", + " \n", + " worst compactness\n", + "
\n", + " +0.028\n", + " \n", + " perimeter error\n", + "
\n", + " +0.016\n", + " \n", + " mean fractal dimension\n", + "
\n", + " -0.023\n", + " \n", + " mean symmetry\n", + "
\n", + " -0.040\n", + " \n", + " worst fractal dimension\n", + "
\n", + " -0.053\n", + " \n", + " smoothness error\n", + "
\n", + " -0.094\n", + " \n", + " mean compactness\n", + "
\n", + " -0.114\n", + " \n", + " symmetry error\n", + "
\n", + " -0.179\n", + " \n", + " worst smoothness\n", + "
\n", + " -0.261\n", + " \n", + " compactness error\n", + "
\n", + " -0.965\n", + " \n", + " <BIAS>\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "show_prediction(cls, X_test.iloc[34])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}