diff --git a/Collaborative Filtering RecSys with Implicit Data_Hotel booking.ipynb b/Collaborative Filtering RecSys with Implicit Data_Hotel booking.ipynb new file mode 100644 index 0000000..b30117a --- /dev/null +++ b/Collaborative Filtering RecSys with Implicit Data_Hotel booking.ipynb @@ -0,0 +1,820 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import scipy.sparse as sparse\n", + "import numpy as np\n", + "from scipy.sparse.linalg import spsolve\n", + "from pandas.api.types import CategoricalDtype\n", + "import random\n", + "from sklearn import metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "booking = pd.read_csv('hotel_booking_update.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
consumer_idtotal_including_taxnameidorder_id
0345251.00Super8 By Wyndham West Kelowna Bc16316801
1379333.60The Sutton Place Hotel - Edmonton16366838
2379312.45The Sutton Place Hotel - Edmonton16366844
3387615.23Hotel Alma16396847
4389110.00Quality Inn& Conference Centre Downtown16416850
\n", + "
" + ], + "text/plain": [ + " consumer_id total_including_tax name \\\n", + "0 345 251.00 Super8 By Wyndham West Kelowna Bc \n", + "1 379 333.60 The Sutton Place Hotel - Edmonton \n", + "2 379 312.45 The Sutton Place Hotel - Edmonton \n", + "3 387 615.23 Hotel Alma \n", + "4 389 110.00 Quality Inn& Conference Centre Downtown \n", + "\n", + " id order_id \n", + "0 1631 6801 \n", + "1 1636 6838 \n", + "2 1636 6844 \n", + "3 1639 6847 \n", + "4 1641 6850 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "booking.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Best Western Plus Meridian Hotel, Lloydminster 4\n", + "Hotel Signature Quebec 3\n", + "Delta Hotels Ottawa City Centre 3\n", + "Holiday Inn Ottawa Parliament Hill 3\n", + "Econo Lodge Inn & Suites University 3\n", + " ..\n", + "Four Points By Sheraton Vancouver Airport 1\n", + "DoubleTree by Hilton Montreal 1\n", + "Days Inn by Wyndham Moose Jaw 1\n", + "BEST WESTERN WARREN HOTEL 1\n", + "HOLIDAY INN WEST 1\n", + "Name: name, Length: 379, dtype: int64" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "booking.name.value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The most booked hotel has only been booked 4 times." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "493 22\n", + "685 5\n", + "1764 4\n", + "3020 3\n", + "718 3\n", + " ..\n", + "2427 1\n", + "865 1\n", + "2914 1\n", + "1379 1\n", + "514 1\n", + "Name: consumer_id, Length: 368, dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "booking.consumer_id.value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The most frequent customers has made 22 bookings And below are his (or her) bookings. " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
consumer_idtotal_including_taxnameidorder_id
51493560.16Podollan Inn & Spa Grande Prairie22118771
193493114.44Holiday Inn Express Hotel & Suites Edmonton South22098769
198493144.40Hampton Inn by Hilton Lloydminster22108770
204493161.59Home2 Suites by Hilton Fort St. John, Fort St....261610393
205493420.74Best Western Plus Meridian Hotel, Lloydminster261910397
206493163.49Holiday Inn Express Hotel & Suites Edson, Edson262210401
207493327.01Best Western Plus Hinton Inn & Suites262510405
217493980.94Best Western Plus Sherwood Park Inn & Suites262010398
218493182.76Podollan Inn & Spa Grande Prairie, Grande Prairie262310402
219493248.92Best Western Plus Fox Creek, Fox Creek262610406
255493398.84Podollan Inn & Spa Grande Prairie, Grande Prairie261910399
294493980.94Best Western Plus Sherwood Park Inn & Suites262010396
295493365.52Podollan Inn & Spa Grande Prairie, Grande Prairie262310404
378493606.04Hampton Inn by Hilton Edmonton/Sherwood Park, ...298611623
379493155.47Paradise Inn & Suites Valleyview, Valleyview298711627
380493280.13Best Western Plus Meridian Hotel, Lloydminster261911629
381493106.93Microtel Inn & Suites By Wyndham Whitecourt, W...298911630
382493150.11Towneplace Suites by Marriott Red Deer, Red Deer299011632
383493280.13Best Western Plus Meridian Hotel, Lloydminster261911637
384493703.05Best Western Plus Meridian Hotel, Lloydminster261911639
385493612.85Crowne Plaza Houston River Oaks, Houston299411642
386493754.50Embassy Suites by Hilton Portland Downtown, Po...299911662
\n", + "
" + ], + "text/plain": [ + " consumer_id total_including_tax \\\n", + "51 493 560.16 \n", + "193 493 114.44 \n", + "198 493 144.40 \n", + "204 493 161.59 \n", + "205 493 420.74 \n", + "206 493 163.49 \n", + "207 493 327.01 \n", + "217 493 980.94 \n", + "218 493 182.76 \n", + "219 493 248.92 \n", + "255 493 398.84 \n", + "294 493 980.94 \n", + "295 493 365.52 \n", + "378 493 606.04 \n", + "379 493 155.47 \n", + "380 493 280.13 \n", + "381 493 106.93 \n", + "382 493 150.11 \n", + "383 493 280.13 \n", + "384 493 703.05 \n", + "385 493 612.85 \n", + "386 493 754.50 \n", + "\n", + " name id order_id \n", + "51 Podollan Inn & Spa Grande Prairie 2211 8771 \n", + "193 Holiday Inn Express Hotel & Suites Edmonton South 2209 8769 \n", + "198 Hampton Inn by Hilton Lloydminster 2210 8770 \n", + "204 Home2 Suites by Hilton Fort St. John, Fort St.... 2616 10393 \n", + "205 Best Western Plus Meridian Hotel, Lloydminster 2619 10397 \n", + "206 Holiday Inn Express Hotel & Suites Edson, Edson 2622 10401 \n", + "207 Best Western Plus Hinton Inn & Suites 2625 10405 \n", + "217 Best Western Plus Sherwood Park Inn & Suites 2620 10398 \n", + "218 Podollan Inn & Spa Grande Prairie, Grande Prairie 2623 10402 \n", + "219 Best Western Plus Fox Creek, Fox Creek 2626 10406 \n", + "255 Podollan Inn & Spa Grande Prairie, Grande Prairie 2619 10399 \n", + "294 Best Western Plus Sherwood Park Inn & Suites 2620 10396 \n", + "295 Podollan Inn & Spa Grande Prairie, Grande Prairie 2623 10404 \n", + "378 Hampton Inn by Hilton Edmonton/Sherwood Park, ... 2986 11623 \n", + "379 Paradise Inn & Suites Valleyview, Valleyview 2987 11627 \n", + "380 Best Western Plus Meridian Hotel, Lloydminster 2619 11629 \n", + "381 Microtel Inn & Suites By Wyndham Whitecourt, W... 2989 11630 \n", + "382 Towneplace Suites by Marriott Red Deer, Red Deer 2990 11632 \n", + "383 Best Western Plus Meridian Hotel, Lloydminster 2619 11637 \n", + "384 Best Western Plus Meridian Hotel, Lloydminster 2619 11639 \n", + "385 Crowne Plaza Houston River Oaks, Houston 2994 11642 \n", + "386 Embassy Suites by Hilton Portland Downtown, Po... 2999 11662 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "booking.loc[booking['consumer_id'] == 493]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "df = booking.groupby(['consumer_id', 'name', 'id'])['total_including_tax'].sum().reset_index()\n", + "df['count'] = df['name'].map(booking['name'].value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "hotel_lookup = df[['id', 'name']].drop_duplicates() \n", + "hotel_lookup['id'] = hotel_lookup.id.astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "df['consumer_id'] = df.consumer_id.astype(int) \n", + "df = df[['id', 'count', 'consumer_id']] \n", + "grouped_purchased = df.groupby(['consumer_id', 'id']).sum().reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "customers = list(np.sort(grouped_purchased['consumer_id'].unique()))\n", + "hotels = list(grouped_purchased['id'].unique())\n", + "quantity = list(grouped_purchased['count']) \n", + "rows = grouped_purchased['consumer_id'].astype(CategoricalDtype(categories = customers)).cat.codes \n", + "cols = grouped_purchased['id'].astype(CategoricalDtype(categories = hotels)).cat.codes\n", + "purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(hotels)))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<368x379 sparse matrix of type ''\n", + "\twith 405 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "purchases_sparse" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "99.70961913502352" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "matrix_size = purchases_sparse.shape[0]*purchases_sparse.shape[1] \n", + "num_purchases = len(purchases_sparse.nonzero()[0]) \n", + "sparsity = 100*(1 - (num_purchases/matrix_size))\n", + "sparsity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I learned that in generally, for collaborative filtering to work, the maximum sparsity you could get away with would probably be around 99.5% or so. We are at over 99.7%. So we exceed this limit. So we should not expect to get decent results. " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def make_train(ratings, pct_test = 0.2):\n", + " \n", + " test_set = ratings.copy() \n", + " test_set[test_set != 0] = 1 \n", + " training_set = ratings.copy() \n", + " nonzero_inds = training_set.nonzero() \n", + " nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) \n", + " random.seed(0) \n", + " num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) \n", + " samples = random.sample(nonzero_pairs, num_samples) \n", + " user_inds = [index[0] for index in samples] \n", + " item_inds = [index[1] for index in samples] \n", + " training_set[user_inds, item_inds] = 0 \n", + " training_set.eliminate_zeros() \n", + " return training_set, test_set, list(set(user_inds))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "hotel_train, hotel_test, hotel_users_altered = make_train(purchases_sparse, pct_test = 0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def implicit_weighted_ALS(training_set, lambda_val = 0.1, alpha = 40, iterations = 10, rank_size = 20, seed = 0):\n", + " \n", + " conf = (alpha*training_set) \n", + " num_user = conf.shape[0]\n", + " num_hotel = conf.shape[1] \n", + " \n", + " rstate = np.random.RandomState(seed)\n", + " \n", + " X = sparse.csr_matrix(rstate.normal(size = (num_user, rank_size))) \n", + " Y = sparse.csr_matrix(rstate.normal(size = (num_hotel, rank_size))) \n", + " \n", + " X_eye = sparse.eye(num_user)\n", + " Y_eye = sparse.eye(num_hotel)\n", + " lambda_eye = lambda_val * sparse.eye(rank_size) \n", + " \n", + " for iter_step in range(iterations): \n", + " \n", + " yTy = Y.T.dot(Y)\n", + " xTx = X.T.dot(X)\n", + " \n", + " for u in range(num_user):\n", + " conf_samp = conf[u,:].toarray() \n", + " pref = conf_samp.copy() \n", + " pref[pref != 0] = 1 \n", + " CuI = sparse.diags(conf_samp, [0]) \n", + " yTCuIY = Y.T.dot(CuI).dot(Y) \n", + " yTCupu = Y.T.dot(CuI + Y_eye).dot(pref.T) \n", + " X[u] = spsolve(yTy + yTCuIY + lambda_eye, yTCupu) \n", + " \n", + " for i in range(num_hotel):\n", + " conf_samp = conf[:,i].T.toarray() \n", + " pref = conf_samp.copy()\n", + " pref[pref != 0] = 1 \n", + " CiI = sparse.diags(conf_samp, [0]) \n", + " xTCiIX = X.T.dot(CiI).dot(X) \n", + " xTCiPi = X.T.dot(CiI + X_eye).dot(pref.T) \n", + " Y[i] = spsolve(xTx + xTCiIX + lambda_eye, xTCiPi)\n", + " \n", + " return X, Y.T" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "user_vecs, hotel_vecs = implicit_weighted_ALS(hotel_train, lambda_val = 0.1, alpha = 15, iterations = 1,\n", + " rank_size = 20)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0., 0., 0., 0., 0.])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_vecs[0,:].dot(hotel_vecs).toarray()[0,:5]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "This method is deprecated. Please use the AlternatingLeastSquares class instead\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2bc89c4642aa435f9abae1f99fdefa66", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=50), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "alpha = 15\n", + "user_vecs, hotel_vecs = implicit.alternating_least_squares((hotel_train*alpha).astype('double'), \n", + " factors=20, \n", + " regularization = 0.1, \n", + " iterations = 50)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "def auc_score(predictions, test):\n", + " \n", + " fpr, tpr, thresholds = metrics.roc_curve(test, predictions)\n", + " return metrics.auc(fpr, tpr)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def calc_mean_auc(training_set, altered_users, predictions, test_set):\n", + " \n", + " store_auc = [] \n", + " popularity_auc = [] \n", + " pop_hotels = np.array(test_set.sum(axis = 0)).reshape(-1) \n", + " hotel_vecs = predictions[1]\n", + " for user in altered_users: \n", + " training_row = training_set[user,:].toarray().reshape(-1) \n", + " zero_inds = np.where(training_row == 0) \n", + " user_vec = predictions[0][user,:]\n", + " pred = user_vec.dot(hotel_vecs).toarray()[0,zero_inds].reshape(-1)\n", + " actual = test_set[user,:].toarray()[0,zero_inds].reshape(-1) \n", + " pop = pop_hotels[zero_inds] \n", + " store_auc.append(auc_score(pred, actual)) \n", + " popularity_auc.append(auc_score(pop, actual)) \n", + " \n", + " return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.496, 0.51)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "calc_mean_auc(hotel_train, hotel_users_altered, \n", + " [sparse.csr_matrix(user_vecs), sparse.csr_matrix(hotel_vecs.T)], hotel_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our RecSys did not beat popularity. Our RecSys had a mean AUC of 0.496, while the popular hotel benchmark had a higher AUC of 0.51. So, simply put, at this moment, we are better off just recommend popular hotels to every one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}