From 40ec6ba06e8808e565fb11206efbdb5f32ebd118 Mon Sep 17 00:00:00 2001 From: Susan Li Date: Mon, 30 Aug 2021 13:08:46 -0400 Subject: [PATCH] Add notebook --- ...e Classification & Hotel Recommender.ipynb | 1561 +++++++++++++++++ 1 file changed, 1561 insertions(+) create mode 100644 Sentence Classification & Hotel Recommender.ipynb diff --git a/Sentence Classification & Hotel Recommender.ipynb b/Sentence Classification & Hotel Recommender.ipynb new file mode 100644 index 0000000..53a9031 --- /dev/null +++ b/Sentence Classification & Hotel Recommender.ipynb @@ -0,0 +1,1561 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import classification_report, accuracy_score\n", + "import numpy as np\n", + "from sklearn import preprocessing\n", + "from sklearn.model_selection import learning_curve\n", + "from sklearn.model_selection import ShuffleSplit\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.metrics import confusion_matrix\n", + "import itertools\n", + "import seaborn as sns\n", + "from nltk.corpus import stopwords\n", + "from xgboost import XGBClassifier\n", + "import string\n", + "from psycopg2 import connect\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('sentence_training_set_1.csv', encoding=\"latin-1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sentencelabel
0LocationSituated right in the city centre of B...0.0
1Located in the charming area of Albuquerque, t...0.0
2Hotel The Ein Kerem is an elegant boutique ho...1.0
3This hotel is located on Via del Tritone in th...0.0
4Surrounded by wonderful Cretan countryside, t...0.0
\n", + "
" + ], + "text/plain": [ + " sentence label\n", + "0 LocationSituated right in the city centre of B... 0.0\n", + "1 Located in the charming area of Albuquerque, t... 0.0\n", + "2 Hotel The Ein Kerem is an elegant boutique ho... 1.0\n", + "3 This hotel is located on Via del Tritone in th... 0.0\n", + "4 Surrounded by wonderful Cretan countryside, t... 0.0" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0 2777\n", + "0.0 2454\n", + "11.0 1\n", + "Name: label, dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.label.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sentence 0\n", + "label 2\n", + "dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "df.dropna(subset=['label'], inplace=True)\n", + "df = df[df.label != 11]\n", + "df = df.sample(frac=1).reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def print_plot(index):\n", + " example = df[df.index == index][['sentence', 'label']].values[0]\n", + " if len(example) > 0:\n", + " print(example[0])\n", + " print('label:', example[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Take NY-27 West/Sunrise Highway to the belt Parkway West ramp on the left to Verrazano Bridge\n", + "label: 1.0\n" + ] + } + ], + "source": [ + "print_plot(10)" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " All rooms are in good condition, and are traditionally furnished with burgundy features\n", + "label: 1.0\n" + ] + } + ], + "source": [ + "print_plot(100)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df['length'] = df['sentence'].apply(len)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compare sentence length of the different labels." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAagAAADQCAYAAABStPXYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAEt5JREFUeJzt3X+QnVd93/H3p5YNFKglmY3GI6kVGTShTmdQxNaYJuMJuDG2k4mciUNIM7HqaEaZqRvokNIq7UxJSH/gMq0bZ1JPnZhETkhixy1jDaU2qoDSdGIHGYR/YKgWB4+kka01thVcBojJt3/cs3C1rLV3tbves3ffr5ln7nnOc+6956z26LPPc5/7PKkqJEnqzV9b6Q5IkjQXA0qS1CUDSpLUJQNKktQlA0qS1CUDSpLUJQOqE0men2f7tiSPLPA1fzfJdYvrGSTZneRoW3a/SJuNSQ62NgeTbFjs+0rQ/dy4N8lzST5yljYvS3JnkqkkDyTZttj3XSsMKJ1Vko3Ae4E3AZcC732R8NkHHKqq7cChti6Nuw8APzdPmz3As1X1OuBm4KZl79WYMKA6k+RVSQ4l+UySh5PsGtq8LsmHkjyW5O4kf709541J/leSB5Pcl+TiJezS24CDVfVMVT0LHASumqPdLmB/K+8Hrl3CPkg9zg2q6hDw1XmaDc+Nu4ErkmQp+zGuDKj+fB34iaraCbwF+A9Dv8zfB/znqvrbwF8A/yjJ+cBvANdV1RuBDwL/5mxvkOQ9SY7MsdwyR/PNwLGh9eOtbrZNVXWylZ8ENo02XGlkvc2NUX17DlXVC8Bp4KJFvN6asW6lO6DvEuDfJrkc+CsGv9wz/9kfq6r/08q/D7wTuBf4O8DBNlfPA05yFlX1AQaHJpZFVVUSr6Glpbbq54YWxoDqz88CE8Abq+ovk3wZeHnbNvs//WIwaR+tqjeP+gZJ3tPeZ7ZPVdU7Z9WdAH54aH0L8Mk5nvtUkour6mQ7jHJq1P5II+ptbozqBLAVOJ5kHXAh8JVzfK01xUN8/bkQONUm4FuAvzW07W8mmZls/wD4E+CLwMRMfZLzk3z/2d6gqj5QVTvmWOaagPcBVybZ0E6OuLLVzXYAmDnDbzdwz4jjlUbV29wY1fDcuA74eHmV7pEYUP35EDCZ5GHgeuALQ9u+CNyY5DFgA3BrVX2TwS/9TUk+BxwB/t5SdaaqngF+Dfh0W97X6kjy20kmW9P3Az+S5Cjw99u6tJS6mhsASf438McMTnw4nuRtrf59SX6
8NbsduCjJFPBuPMN1ZDHIJUk9cg9KktQlA0qS1CUDSpLUJQNKktSlLgLqqquuKgbfW3BxGadl0ZwbLmO6jKSLgHr66adXugtSl5wbWsu6CChJkmYzoCRJXTKgJEldMqAkSV0yoCRJXTKgJEldGimgkqxvt1H+Qrul8puTbExyMMnR9rihtU2SW5JMJXkoyc7lHYKkuSRnLtJqM+oe1K8D91bV64E3AI8xuGT8oaraDhziO5eQvxrY3pa9wK1L2mNJ0powb0AluRC4nME9Taiqb1bVc8AuYH9rth+4tpV3AXfUwP3A+naHVUmSRjbKHtRrgWngd5J8tt2k7pXApqo62do8CWxq5c3AsaHnH291Z0iyN8nhJIenp6fPfQTSmHFuSAOjBNQ6YCeDO1T+APD/mHVHyHb74pGvr9Sec1tVTVbV5MTExEKeKo0154Y0MEpAHQeOV9UDbf1uBoH11Myhu/Z4qm0/AWwdev6WVidJ0sjmDaiqehI4luT7WtUVwOeBA8DuVrcbuKeVDwDXt7P5LgNODx0KlCRpJOtGbPeLwIeSXAA8DtzAINzuSrIHeAJ4e2v7UeAaYAr4WmsrSdKCjBRQVXUEmJxj0xVztC3gxkX2S5K0xnklCUlSlwwoSVKXDChJUpcMKElSl0Y9i0/SKjd8wdha0NfqpZXhHpQkqUsGlCSpSwaUJKlLBpQkqUsGlCSpSwaUJKlLBpQkqUsGlCSpSwaUJKlLIwVUki8neTjJkSSHW93GJAeTHG2PG1p9ktySZCrJQ0l2LucAJEnjaSF7UG+pqh1VNXNfqH3AoaraDhxq6wBXA9vbshe4dak6K0laOxZziG8XsL+V9wPXDtXfUQP3A+uTXLyI95EkrUGjBlQBH0vyYJK9rW5TVZ1s5SeBTa28GTg29Nzjre4MSfYmOZzk8PT09Dl0XRpPzg1pYNSA+qGq2sng8N2NSS4f3thu876g6yNX1W1VNVlVkxMTEwt5qjTWnBvSwEgBVVUn2uMp4MPApcBTM4fu2uOp1vwEsHXo6VtanSRJI5s3oJK8MsmrZ8rAlcAjwAFgd2u2G7inlQ8A17ez+S4DTg8dCpQkaSSj3LBwE/DhDO52tg74g6q6N8mngbuS7AGeAN7e2n8UuAaYAr4G3LDkvZYkjb15A6qqHgfeMEf9V4Ar5qgv4MYl6Z0kac3yShKSpC4ZUJKkLhlQkqQuGVCSpC4ZUJKkLhlQkqQuGVCSpC6N8kVdSavE4Pv00nhwD0qS1CUDSpLUJQNKktQlA0qS1CUDSpLUJQNKktSlkQMqyXlJPpvkI239tUkeSDKV5M4kF7T6l7X1qbZ92/J0/cX6+Z1FkrR6LWQP6l3AY0PrNwE3V9XrgGeBPa1+D/Bsq7+5tZMkaUFGCqgkW4AfBX67rQd4K3B3a7IfuLaVd7V12vYrWntJkkY26h7UfwL+GfBXbf0i4LmqeqGtHwc2t/Jm4BhA2366tT9Dkr1JDic5PD09fY7dl8aPc0MamDegkvwYcKqqHlzKN66q26pqsqomJyYmlvKlv83Po7QavRRzQ1oNRrkW3w8CP57kGuDlwN8Afh1Yn2Rd20vaApxo7U8AW4HjSdYBFwJfWfKeSzpnw3+0Va1cP6SzmXcPqqp+uaq2VNU24B3Ax6vqZ4FPANe1ZruBe1r5QFunbf941fJOAfeUJGn8LOZ7UP8ceHeSKQafMd3e6m8HLmr17wb2La6LkqS1aEG326iqTwKfbOXHgUvnaPN14KeWoG+SpDXMK0lIkrpkQEmSumRASZK6ZEBJkrpkQEmSumRASZK6ZEBJkrpkQEmSumRASZK6ZEBJkrpkQEmSumRASZK6ZEBJkrpkQEmSujTKLd9fnuTPknwuyaNJfrXVvzbJA0mmktyZ5IJW/7K2PtW2b1uOjnuTQkkab6PsQX0DeGtVvQHYAVyV5DLgJuDmqnod8Cywp7XfAzzb6m9u7SRJWpBRbvleVfV8Wz2/LQW8Fbi71e8Hrm3
lXW2dtv2KxP0cSdLCjPQZVJLzkhwBTgEHgS8Bz1XVC63JcWBzK28GjgG07acZ3BJ+9mvuTXI4yeHp6enFjUIaI84NaWCkgKqqb1XVDmALg9u8v36xb1xVt1XVZFVNTkxMLPblpLHh3JAGFnQWX1U9B3wCeDOwPsm6tmkLcKKVTwBbAdr2C4GvLElvJS254ROOPBivnoxyFt9EkvWt/ArgR4DHGATVda3ZbuCeVj7Q1mnbP15VtZSdliSNv3XzN+FiYH+S8xgE2l1V9ZEknwf+KMm/Bj4L3N7a3w78XpIp4BngHcvQb0nSmJs3oKrqIeAH5qh/nMHnUbPrvw781JL0TpK0ZnklCUlSlwwoSVKXRvkMaizMPjvJ0zYkqW/uQUmSumRASZK6ZEBJkrpkQEmSumRASZK6ZEBJkrpkQEmSumRASZK6ZEBJkrpkQEmSumRASZK6NMoNC7cm+USSzyd5NMm7Wv3GJAeTHG2PG1p9ktySZCrJQ0l2LvcgJEnjZ5Q9qBeAX6qqS4DLgBuTXALsAw5V1XbgUFsHuBrY3pa9wK1L3mtJ0tibN6Cq6mRVfaaVv8rgdu+bgV3A/tZsP3BtK+8C7qiB+4H1SS5e8p5Lksbagj6DSrKNwd11HwA2VdXJtulJYFMrbwaODT3teKub/Vp7kxxOcnh6enqB3ZbGl3NDGhg5oJK8CvivwD+pqr8Y3lZVBSzoDktVdVtVTVbV5MTExEKeKo0154Y0MFJAJTmfQTh9qKr+W6t+aubQXXs81epPAFuHnr6l1UmSNLJRzuILcDvwWFX9x6FNB4DdrbwbuGeo/vp2Nt9lwOmhQ4HdSL6zSJL6M8ot338Q+Dng4SRHWt2/AN4P3JVkD/AE8Pa27aPANcAU8DXghiXtsSRpTZg3oKrqT4AX28+4Yo72Bdy4yH7Nyb0dSVo7vJKEJKlLBpQkqUsGlCSpSwaUJKlLBpQkqUsGlCSpS6N8D0pSx/z6hcaVe1CSpC4ZUJKkLhlQkqQu+RmUpDMMf6ZVC7qJjrS03IOSJHXJgJIkdcmAkiR1aZQbFn4wyakkjwzVbUxyMMnR9rih1SfJLUmmkjyUZOdydl6SNL5G2YP6XeCqWXX7gENVtR041NYBrga2t2UvcOvSdFOStNbMG1BV9SngmVnVu4D9rbwfuHao/o4auB9Yn+TipeqsJGntONfPoDZV1clWfhLY1MqbgWND7Y63OkmSFmTRJ0m0W7wv+NsSSfYmOZzk8PT09GK7IY0N54Y0cK4B9dTMobv2eKrVnwC2DrXb0uq+S1XdVlWTVTU5MTFxjt2Qxo9zQxo414A6AOxu5d3APUP117ez+S4DTg8dCuxWMvciSVo5817qKMkfAj8MvCbJceC9wPuBu5LsAZ4A3t6afxS4BpgCvgbcsAx9liStAfMGVFX9zItsumKOtgXcuNhOSeqD1+XTSvJKEpKkLhlQkqQuGVCSpC4ZUJKkLhlQkqQuGVCSpC4ZUJKkLhlQkqQuzftF3bXMLylK0soxoCSNxD/Y9FLzEJ8kqUsGlCSpSx7iG5GHN6TvmH07GueEloMBdQ4MK0lafgaUpEXzjzYth2X5DCrJVUm+mGQqyb7leI9eeAdeSVoeSx5QSc4DfhO4GrgE+Jkklyz1+/TIsJLOnAdnW6T5LMchvkuBqap6HCDJHwG7gM8vw3t1a9QJuJjDIR5W0Wq22JCa/Ts/DvPBk0/OtBwBtRk4NrR+HHjT7EZJ9gJ72+rzSb54ltd8DfD0kvWwI3NM0nMa6yr9i3Rs/12be6vqqoU+aYFzA8b/5zjjjHGe7Xd+lc6HYa8Bnh6DcbyYkebGip0kUVW3AbeN0jbJ4aqaXOYudcGxaiFzA9bOz3GtjBPW1ljPZjlOkjgBbB1a39LqJEka2XIE1KeB7Ulem+QC4B3AgWV4H0nSGFvyQ3xV9UKSfwzcB5wHfLCqHl3ky458uGMMOFYt1Fr5Oa6VccLaGuuLSq3100QkSV3yYrGSpC4
ZUJKkLnUfUON22aQkX07ycJIjSQ63uo1JDiY52h43tPokuaWN/aEkO1e292eX5INJTiV5ZKhuwWNLsru1P5pk90qMZTUYt7kBzg/nxyxV1e3C4CSLLwHfC1wAfA64ZKX7tcgxfRl4zay6fw/sa+V9wE2tfA3wP4AAlwEPrHT/5xnb5cBO4JFzHRuwEXi8PW5o5Q0rPbbelnGcG21czo+zjG2tzY/e96C+fdmkqvomMHPZpHGzC9jfyvuBa4fq76iB+4H1SS5eiQ6Ooqo+BTwzq3qhY3sbcLCqnqmqZ4GDwIKvxrAGrJW5Ac6PNTs/eg+ouS6btHmF+rJUCvhYkgfbJW0ANlXVyVZ+EtjUyuMw/oWObRzG/FIY15+T82PA+YH3g1oJP1RVJ5J8D3AwyReGN1ZVJRnLc//HeWxaMs4PfVvve1Bjd9mkqjrRHk8BH2ZwqOapmUMT7fFUaz4O41/o2MZhzC+Fsfw5OT+cH8N6D6ixumxSklcmefVMGbgSeITBmGbOxtkN3NPKB4Dr2xk9lwGnhw4HrBYLHdt9wJVJNrQzmq5sdTrTWM0NcH7g/PhuK32WxnwLg7NZ/i+DM5b+5Ur3Z5Fj+V4GZ1t9Dnh0ZjzARcAh4CjwP4GNrT4Mbv74JeBhYHKlxzDP+P4QOAn8JYNj43vOZWzAzwNTbblhpcfV6zJOc6ONx/nh/Dhj8VJHkqQu9X6IT5K0RhlQkqQuGVCSpC4ZUJKkLhlQkqQuGVCrUJLnl+E1dyS5Zmj9V5L806V+H2k5OTfGiwGlGTsYfK9G0pmcGyvEgFrlkrwnyafbPWN+tdVtS/JYkt9K8miSjyV5Rdv2d1vbI0k+kOSRdiWC9wE/3ep/ur38JUk+meTxJO9coSFK58S5sfoZUKtYkiuB7QyuV7YDeGOSy9vm7cBvVtX3A88BP9nqfwf4haraAXwLoAa3a/hXwJ1VtaOq7mxtX8/g8v6XAu9Ncv5LMCxp0Zwb48GAWt2ubMtngc8wmDTb27Y/r6ojrfwgsC3JeuDVVfWnrf4P5nn9/15V36iqpxlcxHLTPO2lXjg3xoC321jdAvy7qvovZ1Qm24BvDFV9C3jFObz+7Nfw90WrhXNjDLgHtbrdB/x8klcBJNnc7qMzp6p6Dvhqkje1qncMbf4q8Opl66n00nJujAEDahWrqo8xOBTxp0keBu5m/om0B/itJEeAVwKnW/0nGHzwO/xBsLQqOTfGg1czX2OSvKqqnm/lfcDFVfWuFe6WtOKcG/3xuOna86NJfpnBv/0TwD9c2e5I3XBudMY9KElSl/wMSpLUJQNKktQlA0qS1CUDSpLUJQNKktSl/w+bvoS4XATWSAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "graph = sns.FacetGrid(data=df,col='label')\n", + "graph.map(plt.hist,'length',bins=50,color='blue');" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 The air-conditioned apartments at Mc Queen hot...\n", + "1 From our restaurant, where we serve our gener...\n", + "2 Our hotel in Flagstaff, AZ, brings you true r...\n", + "3 All of the rooms are suites are well-equipped ...\n", + "4 Guests will find themselves in close proximit...\n", + "Name: sentence, dtype: object\n", + "0 1.0\n", + "1 1.0\n", + "2 1.0\n", + "3 1.0\n", + "4 0.0\n", + "Name: label, dtype: float64\n" + ] + } + ], + "source": [ + "X = df['sentence']\n", + "y = df['label']\n", + "print(X.head())\n", + "print(y.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Clean sentence\n", + "def text_process(text):\n", + " nopunc = [char for char in text if char not in string.punctuation]\n", + " nopunc = ''.join(nopunc)\n", + " return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10819\n", + "The air-conditioned apartments at Mc Queen hotel are minimally decorated\n", + " (0, 3531)\t1\n", + " (0, 4381)\t1\n", + " (0, 5855)\t1\n", + " (0, 5960)\t1\n", + " (0, 6980)\t1\n", + " (0, 7959)\t1\n", + " (0, 8602)\t1\n" + ] + } + ], + "source": [ + "count_vect = CountVectorizer(analyzer=text_process, token_pattern=r'\\w{1,}', ngram_range=(1, 3)).fit(X)\n", + "print(len(count_vect.vocabulary_))\n", + "r0 = X[0]\n", + "print(r0)\n", + "vocab0 = count_vect.transform([r0])\n", + "print(vocab0)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "antiques\n", + "tax\n" + ] + } + ], + "source": [ + "print(count_vect.get_feature_names()[5950])\n", + "print(count_vect.get_feature_names()[10216])" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "CountVectorizer(analyzer=,\n", + " binary=False, decode_error='strict', dtype=,\n", + " encoding='utf-8', input='content', lowercase=True, max_df=1.0,\n", + " max_features=None, min_df=1, ngram_range=(1, 3), preprocessor=None,\n", + " stop_words=None, strip_accents=None, token_pattern='\\\\w{1,}',\n", + " tokenizer=None, vocabulary=None)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "count_vect" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of the sparse matrix: (5231, 10819)\n", + "Non-Zero occurences: 61369\n", + "Density of the matrix = 0.10843692803867197\n" + ] + } + ], + "source": [ + "X = count_vect.transform(X)\n", + "#Shape of the matrix:\n", + "print(\"Shape of the sparse matrix: \", X.shape)\n", + "#Non-zero occurences:\n", + "print(\"Non-Zero occurences: \",X.nnz)\n", + "\n", + "# DENSITY OF THE MATRIX\n", + "density = (X.nnz/(X.shape[0]*X.shape[1]))*100\n", + "print(\"Density of the matrix = \",density)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---Training Set Results---\n", + "Confusion Matrix for XGBoost Classifier:\n", + "[[1461 515]\n", + " [ 107 
2101]]\n", + "Score: 85.134\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.93 0.74 0.82 1976\n", + " 1.0 0.80 0.95 0.87 2208\n", + "\n", + " micro avg 0.85 0.85 0.85 4184\n", + " macro avg 0.87 0.85 0.85 4184\n", + "weighted avg 0.86 0.85 0.85 4184\n", + "\n" + ] + } + ], + "source": [ + "xgb = XGBClassifier(n_jobs=-1)\n", + "xgb.fit(X_train,y_train)\n", + "y_pred = xgb.predict(X_train)\n", + "print(\"---Training Set Results---\")\n", + "print(\"Confusion Matrix for XGBoost Classifier:\")\n", + "print(confusion_matrix(y_train, y_pred))\n", + "print(\"Score: \",round(accuracy_score(y_train,y_pred)*100,3))\n", + "print(\"Classification Report:\")\n", + "print(classification_report(y_train, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---Test Set Results---\n", + "Confusion Matrix for XGBoost Classifier:\n", + "[[334 144]\n", + " [ 33 536]]\n", + "Score: 83.095\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.91 0.70 0.79 478\n", + " 1.0 0.79 0.94 0.86 569\n", + "\n", + " micro avg 0.83 0.83 0.83 1047\n", + " macro avg 0.85 0.82 0.82 1047\n", + "weighted avg 0.84 0.83 0.83 1047\n", + "\n" + ] + } + ], + "source": [ + "pred = xgb.predict(X_test)\n", + "print(\"---Test Set Results---\")\n", + "print(\"Confusion Matrix for XGBoost Classifier:\")\n", + "print(confusion_matrix(y_test,pred))\n", + "print(\"Score: \",round(accuracy_score(y_test,pred)*100,3))\n", + "print(\"Classification Report:\")\n", + "print(classification_report(y_test,pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,\n", + " n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):\n", + " plt.figure()\n", + " plt.title(title)\n", + " if ylim is not None:\n", + 
" plt.ylim(*ylim)\n", + " plt.xlabel(\"Training examples\")\n", + " plt.ylabel(\"Score\")\n", + " train_sizes, train_scores, test_scores = learning_curve(\n", + " estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)\n", + " train_scores_mean = np.mean(train_scores, axis=1)\n", + " train_scores_std = np.std(train_scores, axis=1)\n", + " test_scores_mean = np.mean(test_scores, axis=1)\n", + " test_scores_std = np.std(test_scores, axis=1)\n", + " plt.grid()\n", + "\n", + " plt.fill_between(train_sizes, train_scores_mean - train_scores_std,\n", + " train_scores_mean + train_scores_std, alpha=0.1,\n", + " color=\"r\")\n", + " plt.fill_between(train_sizes, test_scores_mean - test_scores_std,\n", + " test_scores_mean + test_scores_std, alpha=0.1, color=\"g\")\n", + " plt.plot(train_sizes, train_scores_mean, 'o-', color=\"r\",\n", + " label=\"Training score\")\n", + " plt.plot(train_sizes, test_scores_mean, 'o-', color=\"g\",\n", + " label=\"Cross-validation score\")\n", + "\n", + " plt.legend(loc=\"best\")\n", + " return plt" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYsAAAEWCAYAAACXGLsWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJztvXmcFOW1//8+PfvGjmwioFFZBWFAiRpBo+ISDei9ajDRRCUa8cbrN0aMxng1JCTfLGr05xqXKFdU3Ei+JAYNY1wDI6IGBERc2JRNltmnu8/vj6eqp7qnZ3qGmZ5p4Lxfr3pV1fM8VXWqpud86tlOiapiGIZhGM0R6mwDDMMwjMzHxMIwDMNIiYmFYRiGkRITC8MwDCMlJhaGYRhGSkwsDMMwjJSYWBj7NSLyVxG5uLPtyCREZLiIlIuIdOA1+4jIByKS11HXNNoXEwsjLYjIJyLy9c62Q1VPV9VH03FuEekiIreLyGciUiEiH3n7vdJxvXbkNuA3qqoiUuz9rab7mSJS4t3TeYG0UhH5i4h8KSI7RWSliMwWke5e/iUiEvGeQ4WIrBORK/3jVfULYDEwowPv02hHTCyMfRYRye7Ea+cCLwMjgClAF2AisB2YsBfn65B7EZF+wGTgeQBVrQC+D9wuIr29Yr8GylV1vnfMV4Ey4HVgqKp2w91zGBgdOP2bqlqsqsXAucCvReToQP5c71rGvoiq2mJLuy/AJ8DXm8g7C1gO7ATeAI4K5M0CPgL2ACuBqYG8S3AO6/c4p/xzL+014DfAl8DHwOmBY8qAywLHN1d2CPBP79ovAXcDjzdxD5cBXwDFzTwDBb4S2H8E+Lm3PQnYAFwPfA48BnwAnBUonw1sBcZ6+8d6z2sn8C4wKeHZrPNs/xiY3oRN3wFeSpL+CPCEZ9d2oG8g7zXgDyn+3pcAryWkLQG+lXA/VcCgzv592tL6xWoWRofivWk+hHvD7AncBywItGV/BJwAdAX+B3jcexv2OQbnFPsAswNpq4FeuLfiPzbTHt9c2f/FObiewC3At5u5la8Df1P3Zr639AV6AINwzTNPABcG8k8DtqnqMhEZAPw/nED2AH4EPCMivUWkCLgTJ3wlwFdxYpyMUbj7T+S/cUIxH/iRqn4O4J17IvBMa25MRMYDRwDlfpqqhoG1xNdGjH0EEwujo5kB3Keq/1LViLr+hFrcWzOq+rSqblLVqKo+CXxIfLPOJlX9g6qGVbXaS/tUVR9Q1QjwKNAPJybJSFpWRA4BxgM3q2qdqr4GLGjmPnoCm/fqCTQQBX6mqrXevfwvcLaIFHr538IJCMBFwEJVXeg9m0U4R3xG4FwjRaRAVTer6oomrtkNV/uIQ1W/BFYAhcCzgazuOD/xuZ8gIr/2+i0qReSmQNljvfQ9ONF9DPf3C7LHs8HYxzCxMDqaQcD/8ZzKThHZCQwE+gOIyHdEZHkgbySuFuCzPsk5Y45MVau8zeImrt9U2f7AjkBaU9fy2Y4TmrawVVVrAvasxTVFfcMTjLNxAgLuuf1HwnM7HuinqpXA+cAVwGYR+X8iMrSJa34JlCQmishFwGBc89uvEspHCdyrqv5YXb/Fc7imJZ+3VLWbV7vpi+vP+UXCpUpwzWjGPoaJhdHRrAdme07FXwpV9QkRGQQ8AMwEenoO6d9AsEkpXWGSNwM9Am/14ESsKV4CTvOaaZqiCvem7tM3IT/ZvfhNUecAKz0BAffcHkt4bkWqOgdAVV9U1VNwTn0V7jkm4z1c81AMETkI1w90Oa558D9F5ATvvJXAv4BpzdxnI9SNfnoG+EbgOtnAV3D9LcY+homFkU5yRCQ/sGTjnNgVInKMOIpE5EwRKQGKcA50K4CIfBdXs0g7qvoprlnnFhHJFZGJBBxdEh7DOfBnRGSoiIREpKeI/ERE/Kah5cC3RCRLRKYAJ7bAlHnAqcCVNNQqAB7H1ThO886XLyKTRORgbw7DOZ5w1QI
VuNpAMhYBY0UkP5B2F/C8qi5W1c3Aj4EHAv1IPwa+JyKzPGFBRA7GDQhIioj0BKbimrZ8JgCfeM/a2McwsTDSyUKgOrDcoqrluDfYu3BNHGtxI2lQ1ZXAb4E3cSONRuFGP3UU02kY/vpz4Emc822EqtbiOrlX4Rzwblw7fS/cmzjAD3GCs9M79/OpDPCc9Zu4TuonA+nrcbWNn+DEdD1wHe5/OARcC2wCduBE6UqS4L3x/8M7FyLyTVxz1nWBMg9657rZ238NOAn4GrDGawL7G26k2R8Cp5/oz7PANadtBa4O5E8H7k31DIzMRFTt40eGkQwReRJYpao/62xb2hMRGY7r3J+gHeQAvBrJK8DRwX4aY9/BxMIwPLzhnjtw8xROxdUEJqrqO51qmGFkAJ02A9YwMpC+uGGjPXET5q40oTAMh9UsDMMwjJRYB7dhGIaRkv2mGapXr146ePDgdj9vZWUlRUXNDaXvfDLdxky3DzLfRrOv7WS6jZ1l39tvv71NVXunLNjZwanaaxk3bpymg8WLF6flvO1JptuY6fapZr6NZl/byXQbO8s+XIRhCyRoGIZhtB0TC8MwDCMlaRMLEXlIRLaIyL+byBcRuVNE1orIeyIyNpB3sYh86C32SUzDMIxOJp0d3I/gQjr8qYn804HDveUY4B7gGBHpAfwMKMXFCXpbRBaoC6FsGAckIsLHH39MTU1mTn7u2rUrH3zwQWeb0SyZbmO67cvPz+fggw8mJydnr45Pm1io6j9FZHAzRc4B/uR1sLwlIt28j9xMAhap6g4AEVmE+4TjE02eyTD2c4qKiigpKWHw4ME0/V2nzmPPnj2UlDSKfJ5RZLqN6bRPVdm+fTsbNmxgyJAm4z82S2cOnR1A/PcCNnhpTaU3QkRm4H0Avk+fPpSVlbW7kRUVFWk5b3uS6TZmun2Q+TaWlJSQm5tLRUVbPsyXPiKRCHv2NPqmUkaR6Tam277c3Fx27ty517/zfXqehareD9wPUFpaqpMmTWr3a5SVlZGO87YnmW5jptsHmW/jO++8Q5cuXTrbjCbJ9Ld2yHwbO8K+/Px8jj766L06tjNHQ20k/uMyB3tpTaUbhmEYnURnisUC4DveqKhjgV3qYvm/CJwqIt1FpDsu+ueLnWinYRzwbN++nTFjxjBmzBj69u3LgAEDYvt1dXUtOsd3v/tdVq9e3WyZu+++m7lz57aHyUY7k7ZmKBF5AtdZ3UtENuBGOOUAqOq9uA/jnIH7+E0V8F0vb4eI3AYs9U51q9/ZbRhGC5k7F268ET77DA45BGbPhunT9/p0PXv2ZPny5QDccsstFBcX86Mf/SiWX1tb2zDTN5T8HfThhx9OeZ2rrrpqr21MJ6nu7UAgbXeuqheqaj9VzVHVg1X1j6p6rycUeDPNr1LVw1R1lLovqPnHPqSqX/GW1L8wwzAamDsXZsyATz8FVbeeMcOltzNr165l+PDhXHrppYwYMYLNmzczY8YMSktLGTFiBLfeemus7PHHH8/y5csJh8N069aNWbNmMXr0aCZOnMiWLVsAuOmmm7j99ttj5WfNmsWECRM48sgjeeONNwAXQ+ncc89l+PDhnHfeeZSWlsaELMh1113H8OHDOeqoo7j++usB+PzzzznnnHM46qijGD16NP/6l/uo4a9//WtGjhzJyJEj+cMf/hB3b9OnT4/d21//+lcmTpzI2LFjOf/886msrGz3Z5qp7NMd3IZxQHLNNZDEOcZ46y2oTfgabFUVXHopPPBA8mPGjAHPSbeWVatWcc8993Diie4T43PmzKFHjx6Ew2EmT57Meeedx/Dhw+OO2bVrFyeeeCJz5szh2muv5aGHHmLWrFmNzq2qLFmyhAULFnDrrbfyt7/9jT/84Q/07duXZ555hnfffZexY8c2Ou6LL75g4cKFrFixAhFh586dgKu5nHLKKcycOZNwOExVVRX/+te/mDt3LkuXLiUcDjNhwgQmTZpEQUEBq1at4k9/+hO
lpaVs2bKFOXPm8PLLL1NYWMjs2bO54447+MlPfrJXz21f48CtUxnG/kqiUKRKbyOHHXZYnMN+4oknGDt2LGPHjuWDDz5g5cqVjY4pKCjg9NNPB2DcuHF88sknSc89bdq0RmVee+01LrjgAgBGjx7NiBEjGh3Xo0cPQqEQl19+Oc8991wsmmtZWRnf//73AcjOzqZLly689tprnHvuuRQUFFBSUsI3v/lNXn311di9lZaWAvDGG2+wcuVKvvrVrzJmzBjmzp3bpN37I1azMIx9jVQ1gMGDXdNTIoMGQRrmkgTDan/44YfccccdLFmyhG7dunHRRRclnXWem5sb287KyiIcDic9d15eXsoyycjJyaG8vJxFixbx9NNPc8899/DMM88AtGpSY/DeVJUpU6bw2GOPtfj4/QmrWRjG/sbs2VBYGJ9WWOjS08zu3bspKSmhS5cubN68mRdfbP+BjMcddxxPPfUUAO+//37SmsuePXvYvXs3Z511Fr///e955x33ddzJkydz7733Am4S3O7duznhhBN47rnnqK6upqKighdeeIETTjih0Tm/+tWv8sorr7Bu3TrA9Z18+OGH7X5/mYrVLAxjf8Mf9dSOo6FaytixYxk+fDhDhw5l0KBBHHfcce1+jauvvprvfOc7DB8+PLZ07do1rsyuXbuYNm0atbW1RKNRfve73wFw1113cfnll3PfffeRnZ3Nfffdx4QJE7jwwgsZP348AFdeeSWjRo1i7dq1cefs06cPf/zjHzn//PNjw4V/8YtfcPjhh7f7PWYkLfnoxb6w2MePMpdMt081821ctmxZZ5vQLLt37+6wa9XX12t1dbWqqq5Zs0YHDx6s9fX1KY/rSBv3ho6wb+XKlY3SaOHHj6xmYRjGPkVFRQUnn3wy4XAYVY3VEoz0Yk/YMIx9im7duvH22293thkHHNbBbRiGYaTExMIwDMNIiYmFYRiGkRITC8MwDCMlJhaGYbSIzz//nAsuuIDDDjuMcePGccYZZ7BmzZrONispgwcPZtu2bYCbTJeMSy65hPnz5zd7nkceeYRNmzbF9i+77LKkkwAPBEwsDGM/ZO77cxl8+2BC/xNi8O2Dmft+2yLOqipTp05l0qRJfPTRR7z99tv88pe/5Isvvogr15qQHB2FH612b0gUiwcffLBRUMRMoCOeu4mFYexnzH1/LjP+PINPd32Kony661Nm/HlGmwRj8eLF5OTkcMUVV8TSRo8ezQknnEBZWRmnnXYaZ599dsyR/u53v4uF/PZDjldWVnLmmWcyevRoRo4cyZNPPgnArFmzYqHEg9/I8Ln33nu57rrrYvuPPPIIM2fOBOCb3/wm48aNY8SIEdx///1JbS8uLgac4M2cOZMjjzySr3/967Gw6AC33nor48ePZ+TIkcyYMQNVZf78+ZSXlzN9+nTGjBlDdXU1kyZNorzcfU3hiSeeYNSoUYwcOTIWAt2/3o033sjo0aM59thjGwkqwCuvvBL7eNTRRx8d+/b2r371K0aNGsXo0aNjUXiXL1/Osccey1FHHcXUqVP58ssvAZg0aRLXXHMNpaWl3HHHHWzdupVzzz2X8ePHM378eF5//fWm/6B7Q0tm7u0Li83gzlwy3T7VzLcxOIP7h3/9oZ748IlNLnm35Sm30GjJuy2vyWN++NcfNnv9O+64Q6+55pqkeYsXL9bCwkJdt26dqqqWl5fryJEjtaKiQvfs2aPDhw/XZcuW6fz58/Wyyy6LHbdz507dtm2bHnHEERqNRlVV9csvv2x0/i1btuhhhx0W258yZYq++uqrqqq6fft2VVWtqqrSESNG6LZt21RVddCgQbp161ZVVS0qKlJV1ccff1y//vWvazgc1o0bN2rXrl316aefjjuPqupFF12kCxYsUFXVE088UZcuXRrL8/c3btyoAwcO1C1btmh9fb1OnjxZn3vuOVVVBWLHX3fddXrbbbc1uqezzjpLX3vtNVVV3bNnj9bX1+v8+fN
14sSJWllZGWfTqFGjtKysTFVVf/rTn+oPf/jDmC1XXnll7JwXXnhh7Ll8+umnOnTo0EbXbcsMbqtZGMZ+Rm0keSjyptLbg3HjxjFkyBDAhRCfOnUqRUVFFBcXM23aNF599VVGjRrFokWLuP7663n11Vfp2rUrXbt2JT8/n0svvZRnn32WwsQAiEDv3r059NBDeeutt9i+fTurVq2KxZy68847Y2/w69evbzaw3+uvv86FF15IVlYW/fv356STTorlLV68mGOOOYZRo0bxj3/8gxUrVjR7v0uXLmXSpEn07t2b7Oxspk+fzj//+U/ARdQ966yzYs8lWRjz4447jmuvvZY777yTnTt3kp2dTVlZGd/97ndjz6BHjx7s2rWLnTt3xr4VcvHFF8euA3D++efHtl966SVmzpzJmDFjOPvss9m9ezcVFRXN3kdrsBnchrGPcfuU5kOUD759MJ/uahyifFDXQZRdUrZX1xwxYkSzncHJnHwiRxxxBMuWLWPhwoXcdNNNnHzyydx8880sWbKEl19+mfnz53PXXXexaNEixo0bB8DZZ5/NrbfeygUXXMBTTz3F0KFDmTp1KiJCWVkZL730Em+++SaFhYVMmjQpaTj0VNTU1PCDH/yA8vJyBg4cyC233LJX5/HJycmJhUFvKrT6rFmzOPPMM1m4cCHHHXfcXkfnDYZQj0ajvPXWW+Tn5++d4SmwmoVh7GfMPnk2hTnxzrswp5DZJ+99iPKTTjqJ2trauH6B9957L/aRoCAnnHACzz//PFVVVVRWVvLcc89xwgknsGnTJgoLC7nooou47rrrWLZsGRUVFezatYszzjiD3//+97z77rtkZWWxfPlyli9fHvss69SpU3nhhRd44oknYh8+2rVrF927d6ewsJBVq1bx1ltvNXsPxx13HE8++SSRSITNmzezePFigJgw9OrVi4qKijhRLCkpifUnBJkwYQKvvPIK27ZtIxKJ8MQTT8Te/lvCRx99xKhRo7j++usZP348q1atYvLkyTz88MNUVVUBsGPHDrp27Ur37t1jz/mxxx5r8jqnnnpq7JOwQNJPzbYFq1kYxn7G9FEuFPmNL9/IZ7s+45CuhzD75Nmx9L1BRHjuuee45ppr+NWvfkV+fj6DBw/m9ttvZ+PGjXFlx44dyyWXXMKECRMAN9z06KOP5sUXX+S6664jFAqRk5PDPffcw549ezjnnHOoqalBVWOhxBPp3r07w4YNY+XKlbHzTpkyhXvvvZdhw4Zx5JFHcuyxxzZ7D9/4xjd48803GT58OIcccggTJ04EXKypyy+/nJEjR9K3b99YqHJww2uvuOIKCgoKePPNN2Pp/fr1Y86cOUyePBlV5cwzz+Scc85p8fO8/fbbWbx4MaFQiBEjRnD66adTV1fHmjVrKC0tJTc3lzPOOINf/OIXPProo1xxxRVUVVVx6KGH8vDDDyc955133slVV13FUUcdRTgc5mtf+1rs2x3tgbj+jX2f0tJS9UcptCdlZWVMmjSp3c/bnmS6jZluH2S+je+88w5HH310Z5vRJHv27KGkpKSzzWiWTLexI+z74IMPGDZsWFyaiLytqqWpjrVmKMMwDCMlJhaGYRhGSkwsDGMfYX9pMjY6h7b+fkwsDGMfIBKJsH37dhMMY69QVbZv396mYbU2Gsow9gEqKyvZs2cPW7du7WxTklJTU5O28f3tRabbmG778vPzOfjgg/f6+LSKhYhMAe4AsoAHVXVOQv4g4CGgN7ADuEhVN3h5EeB9r+hnqnp2Om01jExGVWMzpDORsrKyjB6tBZlvY6bblzaxEJEs4G7gFGADsFREFqhqML7vb4A/qeqjInIS8Evg215etaqOSZd9hmEYRstJZ5/FBGCtqq5T1TpgHpA4a2U48A9ve3GSfMMwDCMDSNukPBE5D5iiqpd5+98GjlHVmYEy/wv8S1XvEJFpwDNAL1XdLiJhYDkQBuao6vNJrjEDmAHQp0+fcfPmzWv3+6i
oqIiFOM5UMt3GTLcPMt9Gs6/tZLqNnWXf5MmTWzQpL20hw4HzcP0U/v63gbsSyvQHngXewfVtbAC6eXkDvPWhwCfAYc1dz0KUZy6Zbp9q5tto9rWdTLexs+yjhSHK09nBvREYGNg/2EuLoaqbgGkAIlIMnKuqO728jd56nYiUAUcDH6XRXsMwDKMJ0tlnsRQ4XESGiEgucAGwIFhARHqJiG/DDbiRUYhIdxHJ88sAxwEH5odvDcMwMoC0iYWqhoGZwIvAB8BTqrpCRG4VEX8Y7CRgtYisAfoAfgzlYUC5iLyL6/ieo/GjqAzDMIwOJK3zLFR1IbAwIe3mwPZ8oNEXVVT1DWBUOm0zDMMwWo6F+zAMwzBSYmJhGIZhpMTEwjAMw0iJiYVhGIaREhMLwzAMIyUmFoZhGEZKTCwMwzCMlJhYGIZhGCkxsTAMwzBSYmJhGIZhpMTEwjAMw0iJiYVhGIaREhMLwzAMIyUmFoZhGEZKTCwMwzCMlJhYGIZhGCkxsTAMwzBSYmJhGIZhpMTEwjAMw0iJiYVhGIaREhMLwzAMIyUmFoZhGEZKTCwMwzCMlJhYGIZhGCkxsTAMwzBSYmJhGIZhpCStYiEiU0RktYisFZFZSfIHicjLIvKeiJSJyMGBvItF5ENvuTiddhqGYRjNkzaxEJEs4G7gdGA4cKGIDE8o9hvgT6p6FHAr8Evv2B7Az4BjgAnAz0Ske7psNQzDMJonnTWLCcBaVV2nqnXAPOCchDLDgX9424sD+acBi1R1h6p+CSwCpqTRVsMwDKMZ0ikWA4D1gf0NXlqQd4Fp3vZUoEREerbwWMMwDKODyO7k6/8IuEtELgH+CWwEIi09WERmADMA+vTpQ1lZWbsbWFFRkZbztieZbmOm2weZb6PZ13Yy3cZMtw9VTcsCTAReDOzfANzQTPliYIO3fSFwXyDvPuDC5q43btw4TQeLFy9Oy3nbk0y3MdPtU818G82+tpPpNnaWfUC5tsCnp7MZailwuIgMEZFc4AJgQbCAiPQSEd+GG4CHvO0XgVNFpLvXsX2ql2YYhmF0AmkTC1UNAzNxTv4D4ClVXSEit4rI2V6xScBqEVkD9AFme8fuAG7DCc5S4FYvzTAMw+gE0tpnoaoLgYUJaTcHtucD85s49iEaahqGYRhGJ2IzuA3DMIyUmFgYhmEYKTGxAFCF2lqIRjvbEsMwjIyks+dZZAY1NfDppxAKQV4eFBVBQQHk5na2ZYZhGBmBiYVPKATFxVBfD7t2wfbtIOJqHBs3QmEh5OdDTg5k22MzDOPAwrxeIjk5bvEJhZyAbNvmmqtUXX5hoVtyc90SshY9wzD2X0wsWoIvCD6RCFRVwe7d8WWCzVc5Oa5mYhiGsR9gYrE3ZGW5JUg47MTjyy9d7SMUcs1WxcWuHyQ315qvDMPYZzHv1V5kZ8eLgaprvtq+3dVERJyAFBW55qu8PFf7SBQdwzCMDMQa2ufOhSOPhKFDYcIEePbZ9jmviKtNFBZCSYmrYeTnu5FXX3zhRl999BF8/DFs2QIVFa4z3QVONAzDyCgO7JrF3LkwY4brfwA36unHP3bb06Y1fdze4g/NzctrSAuHYc8e13zl93Hk5zf0fyR2uBuGYXQCB7ZY3Hhjg1D4VFfDnDnpEYtkJGu+CoedeGzf7vazspxwFBc3dLZb85VhGB3IgS0Wn32WPH3jRrjkEjj0UPoVFLiO68MOg9690z/CSaRxbSIahbo611zlzzLPyWno/1B16TZ81zCMNHFgi8Uhh7i+g0Ty82HDBnj1VY6sqYHbb3fpxcVw6KFOOILrQw91jjtdJGu+ikRcP8euXU5I1q6Nn33uC44N3zUMox04sMVi9uz4PgtwjvbXv3bNUNEob5aVMTErC9atcx3S69bB0qXw/PPxndF9+zYIR1BMBg5Mz5BZv2kKmp59LtIwedBmnxuG0QZa7DlE5HjgcFV9WER6A8Wq+nH6TOsApk936xtucDWJ/v1h1qy
G/opQiNo+fWDECDjxxPhjq6vhk0/iReSjj+Avf4GdOxvK5eTAoEHJayS9erXvm39i85U/fNdmnxuG0UZaJBYi8jOgFDgSeBjIAR4HjkufaR3E9OlOHNavd2/nLaWgAIYNc0siO3Y0CEhQTMrKXJORT5cuDbWRxGatwsI231ps+G5w9nk0arPPDcNoNS2tWUwFjgaWAajqJhEpSZtV+zo9erhl/Pj49EjEdZ4HayLr1sFbbzWe39GvX+OayGGHwcEHt20klD+zPIjNPjcMIwUt9QB1qqoiogAiksbe3P2YrCzXqX7IITB5cnxedbWboJcoJC+84PohfHJzYfDgOBHpEgq5PpMePfauRrA3s8+zs12a3zdiGMZ+TUvF4ikRuQ/oJiKXA98DHkifWQcgBQUwfLhbgqjGN2sF1y+/DPX1jPXLdusGQ4Y0rpEMGdLQGd4Smmq+qqlxEwiTEQq5JSurYTsUcqISibhaSzDdFxlfiIL7fpphGBlDi8RCVX8jIqcAu3H9Fjer6qK0WmY4RKBnT7dMmBCfFw7Dhg28t3gxR4XDDSLy+uswf3582QEDGgQkKCb9+7esWSvZ8N0gfgd6NOqWSKRhPxxumGDYknAmqg2C4QuJH7wxUYiCZZoSncQ0wzBaTUqxEJEs4CVVnQyYQGQS2dkweDA7JkxwI7aCVFXF10T85Zln4msHeXmu5hHsaPeFpEePxtd89lk3w33TpvjRY83VCEKhveuw98XFF536+vi04NLU8UFx8GfDB8XFF51wGLZujReh5kQncd8w9nNSioWqRkQkKiJdVXVXqvL7JP4/fkWF2w86mWjU9SckNrNkOoWFMHKkW4KouqG0iaO11qyBRYucQ/bp1i2+FrJ1Kzz+uAt4COmPpRV0xO0V3iRRZCIRJxTRqOvkTyZAyUQnUSCaaoIL7jfVDGciZOwDtLTPogJ4X0QWAZV+oqr+V1qs6mjy8uArX4lvQvG31693Q1zDYedI6+rcNiR3GllZDf/0wbfYTEHEhS3p3RuOPTY+Lxx295vYP/Lqq/D008nPV10N11wDd9/tnmN+fkNzlbd9RHW164DPz2/IT1IuaX7iuq1C3ZQTFmldv04i0Wh8LSgScaLaJq7UAAAgAElEQVT6wgvwm9/A5s1uhNu118I3vpH8HE2JUCjkfneffdZYkBJfYloqQpn0mzT2CVoqFs96y/5LsF08OLEtK8s51kQSRSUSaXhL9YWlvr6hTNARBNvk/esG//E7i+xs1yQ1ZEjjvIoKF8Y9WZNPJOJqHzU1bqmsdH0UtbVQW0uvigpXpqYmfp7J3pCb27TQNCU4LSjXbfNmZ5+fVlAQf2x2dvMONtnf7dlnXbDK6mq3v2kT/PSn7pwtrYkFazn+KLXE2o//+2ruHMlsTyY2yUTIfwFqToR8O0yI9lta2sH9qIjkAkd4SatVtb65Y/Z7WuPY/eaOxJpLUFTC4fhaS/DY4D9msuaMjqC42PVRbNzYOG/AAHig6cFxb6xYwSS/TyUadSJSUxO/9rf9JZiWuF1d3TjNX+/Y0fR5/CCMSRiT6v79uSdBAQkKTjJBeu65BqHwqa6Gm292Djh4bHAJpufmurUfYLI98WtBwdpQa/qEgtTWuvhkwefVnAglDk5oajRcUHwSa0UmSh1KS2dwTwIeBT4BBBgoIher6j9THDcFuAPIAh5U1TkJ+Yd45+3mlZmlqgtFZDDwAbDaK/qWql7RslvKQERaPqktOKIoKCyRSLyw+Gtw+Xv2xP/zBJvD/H/atv5zzZrl+iiCDrCgwKW3lFDIHdOWJp+9wQ/9HhQQX3Rqali+ejVj+vWLF5+mBC1xnaQ2FUtLxpdfwg9+0Crzv5aTE1/baUpcgiLTEjFKTPfFKTEtVZ+RH58s+Lz3Roj+/Gf43e8aN9ulqh1B4+a2YF4o1DAqL3hcS0Up2X6yvP2YljZD/RY4VVVXA4jIEcATwLimDvBGUd0NnAJsAJaKyAJVXRkodhPwlKr
eIyLDgYXAYC/vI1VN+cK33xFsDmsJ0ahr3hg8uEFg/KYwf6mrc0tTb9bB0UHBN8FE/KaTZKOhMh3/zTwnx325MIGd+fmNR5S1lQkTktfE+vaFefMaRKWurkFkEsXJWzZs3MghxcWN0mOLH304WV5NTdvvJSenWdE5qq7ODe9uiSA1JWxvvAF33tkwgGLTJrjpJve7nTq1YfJo8LeZ2EyXOHghmBeJNMRta02tqTUk1vgTa0+JAhbcj0QaRiq2VLDmzXPPaP16N9l39uyGmHftTEvFIscXCgBVXSMiqerEE4C1qroOQETmAecAQbFQoIu33RXY1EJ7DB//h9bU/Icgwbe8YM0lsZ/Fbw5L9jZ32mlw+unxP9ra2sZNA4lrX6gOpCaEpmpiN94Ihx/eqlOtW7GCQ/ZWzFRbJEjNLs2Vr6khq7YWPv88/jrB2tfeOuWaGvcM/VF34H7z/vfr/UjKifv+Etg/qqYGunePz09SrsX7ya7v5/vbiWt/O1nZcNjVqFoy+g5cLeymmxpeBj791EXRhrQIRkvFolxEHsQFDwSYDpSnOGYAsD6wvwE4JqHMLcDfReRqoAj4eiBviIi8g5sIeJOqvtpCW42maE2tpbnmMF9cfAHw1001M/jOKli2mf6D2LlEmv7HaWqd6v6TrX17/H+6YHoqEWzunJlSExNpfkJlO/BOsF8qEb8JMJXwXHhh0xe48cb4l5pgzdkfSBJsog3ue0tWba1rhmrNeTqQE/3ab0tFcOXKxgNGqqrcs0qDWIi2QPFFJA+4CjjeS3oV+P9UtbaZY84DpqjqZd7+t4FjVHVmoMy1ng2/FZGJwB+BkbiotsWqul1ExgHPAyNUdXfCNWYAMwD69Okzbt68eS287ZZTUVFBcWui0XYCmW5jm+xrzRtpqrLNTN6rqK6m2O9HaW6S395ctx2oqK2luC3OvqU1ub2s8VXU1FCcGKCylRx70UXkb9nSKL3moIN46/HHkxzROlpto/fCFIpEkHAYCaxD/r6f5qcHy3rbjcp626HAcRKJUF9TQ55IwzkSyyZcr0d5Ocn+WirCK//4R4tvc/LkyW+rammqci0ViyKgRlUj3n4WkKeqVc0cMxG4RVVP8/ZvAFDVXwbKrMAJynpvfx1wrKpuSThXGfAjVW2yNlNaWqrl5akqO62nrKyMSZMmtft525NMtzHT7YM02BhsK2/L2tsue+stJh1zTPIyibW1VDW+lh7TVHoSQSn7+GMmJRtynYymaoULFrihxcE+lvx8+PnP0YS5KZpgQqIX00CKv/3auk85/rDBxHyeSFw5d974Ezc6b1wlN/5ZxM4lQpxfFSGq8TXpqMSfORqNsnztBsZ85WA0waooDceqauzcvSedSfamzTRi0CD3rZ0WIiItEouWNkO9jGsi8qY4UwD8HfhqM8csBQ4XkSHARuAC4FsJZT4DTgYeEZFhQD6w1fu40g5v9vihwOHAuhbaahidT3v3zfhRfzsYVee6/DWes1KNxqdt3EDVwL5xeagS1ShRjRCNujVANBIhihKNRoj659Eo0QunUJgXpsev7iR70+eE+/dh23Uz2XPOicDuBs+t2lizohpns2i8WAhQR5iPI9sDxyQ0hXr3JsH39YCTVxRJvA5CzDAloZbpC65bx+yNRuOvAQhCOFLPzi83ewUVvDKJvyD/2B1XXkKv2b8jVBNo4CksdJ3caaClYpGvqr5QoKoVItJssB9VDYvITOBF3LDYh1R1hYjcCpSr6gLg/wAPiMh/457OJV4o9K8Bt4pIPRAFrlDVHa2/PcPYt4k5ZCAcDcelBddA0jTnrBsWP6+pNFUlSpRobEACMb8Vc45BX6buunXRejZUfh6XJiIIElv7SLafnu1dwpXJRqi/+CK2XPxtl+551/ZqYA1tqKC4/+B2Ols7o0po90oKjhge209WJkj4qsPZdVBfSm77NVkbN8HAgcgvftHpo6EqRWSsqi4DEJFSoDrFMajqQtxw2GDazYH
tlST52p6qPgM800LbDKNDSeasg043uI54b8/BBZI48eC5kjjr2kgt63asiznhRGedmOb75qTOOokTFxGyJbtRWksJSYjivMztN8t4mhso0QzVF55H9YXnUVFXweE9Dk/rKMOWisU1wNMi4g9t7Qecnx6TDKP9aMqRJ6ZFNcqX1V82cuLhaDjOoTf11h1rIklME+dIm3PWIpI0LYg5Y6OzaVYsRGQ8sF5Vl4rIUOD7wDTgb8DHHWCfcYAQdNqJb+eN3ryTOPKoRolohEg0Eu/Um2g+SUyrj9azrWobIXHzVoJOOyShuLfu1rxxG0Y6efaDZ5nz2hw27dnEwK4D+cXJv2D6qM5phrqPhrkPE4GfAFfjQuncD5yXFquMjCcSjTTpyJM59S8qvmj01h6JRuKcelNNKolpjd7KA+vs0N459ZCEKMq1rwV3NkHn17+kP7OOn8W0YftAhIBO4NkPnuXHi35Mddj1CHy26zNm/NlNykuHYKQSi6xAx/L5wP1+f4KILG93a4yMJBwNUxepozZcS1V9FdXhatf23lTTi5fmO+yIRqisr2zk3HOzc+1N3YiR6Pw27tnIjxe5mdudLRiqSkQjhKNhIlFv7e37NVx/OxKNENZAuWik2WP98us/X8/y95Y3Ojas3jWi0bi8x997PPasfKrqq7jx5Rs7RyxEJFtVw7ghrjNacayxDxKOhqmP1FMXqaOyvpKq+qrYm78g5GTlUJBd0Oq39vzstk3Y2lc50N6UVTX2chFcaiO1sd9VoyVaR124jpvLbm7k/KrD1fzk5Z/w4Y4PY04yqUP2miCT5fnOfM/uPeSuyW2U5x8bdMyJTj1xnkTa+LD57CzJIjuUTVYoi6r65NPcPtv1WRoMS+3wnwBeEZFtuNFPrwKIyFeA/fOreQcQkWgk9g9bVV9FdX019VEX4iAkIbJD2eRn58fa8Y3Wkc43ZVWlPtqE823CSTfnsPc2z1+q66qJvOF+T4kT3drKnro93L3k7piTzA5lNzhNyWpIS8jLDmXHfsdZoSzys/LpVtAtvrxkEwq5MtmS3SgvO+TlB/KC+VmhrDgHnpiX1EbJjivv27ju/XUMGzMsLi/uHiQr7iVtwgMT2LincaDKQ7oe0q7P36dZsVDV2SLyMm7009+1YVpiCNd3YewjRKIR51zCThiq6qsIazjWfJQTyiE3O5d8OTBrAHtLJBqhqr6KirqKWE3M3/7Z4p8lfVO+4aUbeHvT2w2O2Pu7xN6yI+5NO5YXqaOyuhIt1zgH3Z5kh7LJCeWQl5VHbnau+z1k5Tba75rXlZysQF5WLjlZOVRsq6Bf/35xecnKNZf3H0//B59XfN7ItgElA1hy+ZI23+OKpSsYMb6dIwu3I5X5lfQr6dfi8rOOnxX3MgJQmFPI7JM7aVKeqr6VJG1NWqwx2oWoRqmL1FEfqaeqvorK+krqIw01hqxQFjlZOeSHDixhiGqU6vrqOMdeWVdJRV0Fq7eu5v1/v9+QV+eeW3C/or6Cyrp4QagJtz78d0V9Bc+vfj7OUSY61KKCorj9qh1VHNT3oAZn64l7UkecmBfKIS+7YTsxLzcrl6xQ275x3h6O+MYTbmzk/AqyC5h1fCu+l3IA4ddOM2U0lJHh+KOPKuoqYs6vPlIf62zOznJvjO3VZ9BRbfCqSk24prHD9rabc+CNHL2X11Qbb4xVDZvZoWyKc4opyi1yS45b9yzoSVFOEYW5hbH8wpxCinOLY2X89fde+B5fVH7R6DJ786ac6W/F7UGi8zsQ+njayrRh05g2bFpsUl46B4uYWOxDRDVKfaSe+mg91fXVVNZVUhuppS5Sx6bdm2I1hrzs9ISibrINXuGMI86Ie1MPOuk1X6xh6fKlVNZVttjRV9ZXtrhTMSShRs66MKeQ/iX9G9J8J57g0H1Hv3n1ZkaPHR1Lz83KbfPzuulrN9mbcivxnZ+ReZhYZCjBDsyaeveGXRupjc07yA5lk5OVQ0l2SYfM7t1RvYN
bym5J2gZ/9d+u5uq/pejC8houBYlz1r4DP6jooDhHH+f8m3D0/jovK6/Nb1QrNqzg4C4Ht+kcidibsrE/YWKRAfjCUB+pjzW91IRrYp3PWeJqDMW5HRPuQVVZt3Md5RvLWbJxCUs3LeWjLz9q9phZx89K7tBzi9j0wSbGjBtDcW4x+dn5B9S8CntTNvYXTCw6mOA49NpILZV1Thj8Wc/+cLminKIOc6q14Vre3/I+5ZvKWbpxKUs3LWV7tQvl3C2/G6X9S/nPEf/Jg8seZGvV1kbHDygZwNUTmqlZfAK9i3qnyXrDaF+CUXyb2m8uL+5cTZSLm9Dq7UejUSrqKhrl+6Fp/EgGifso5GSl+sp12zGxSDN+H0Nt2AlDdbja+wYAsfHdhTmFHfq2/WX1l5RvLqd8YzlLNy1l+efLXRMXMLjbYE4achLj+49n/IDxfKXHV2LzLPqX9Lc2eKNZWuJIE8OqB/Ni52miTHNOMy7elzvQEfsMfJSK2oom8/39kPdd+xANccKA2P9BsnWjgJCBuGLJjvHPGYzwuyFrAwO7DGwyv7n9kITS7kNMLNqRZGExIlH3wRd/kk1HC4Oq8umuT1mycYmrOWxayprtrgMhO5TNqINGcfGYi5049B/fbA3A2uDTT5MO1quRtvSttjVvtHGxtxLy/P1kb7PBsr4jjnO03qz/ljjZ5soE1y1xok3lbcrexKE9Dk3pgDuLkIQoyCnoVBuaw8RiL0lHWIz2oD5Sz7+3/Julm5bGmpT8pqMueV0o7VfK1KFTGd9/PGP6jmn1j/NAboMPBk1MFh036MiD35RIGsYckjpj/+008a0WXJnWOtjEss0527Y64iN6HdHaR9rhZIfM5e0t9uRaQCaHxdhVs4slO5bwl9f/wtKNS3nn83diE8UO6XoIJww6gQkDJjC+/3iO6HnEARW6ozXO3W8vDjZlJIYxzwplEZJQLGR5VlZg3wvL4C+JIc5b6oyb4tOsTxnUbVAan5ZhNI+JRQLBsBiV9ZXURercSKAMCIuhqqzfvT5WYyjfVM6qbatQlCzJYuRBI5k+ajrjB7gmpb7FfTvcxraQLMR5UueeGO22ie1QKESWOIeeJVlkZ2Unde7B9uJE557o6A3jQMXEAtektKN6BxV1FY3CYgjSYUNWk9m1cuvK2PDV8o3lfF7pYucU5xZT2q+UM484k957ejNt0jQKc5r9LHq701LnHtex2ELnHpIQOVk5Dc7eC9jmO/emHLrfRt5a557p7cWG0dmYWODa+bdXbY/NA4ijA18o99TuYdnmZTFxeOfzd2IhKgaUDGDiwImU9i9l/IDxDO05NBbPZ8XSFWkXClWlNlJLOBKOc+7+W3uicw++vW/I2sCALgPimmeSbRuGkbmYWHhkhbLaHEyttWzcvTHWEb1k0xJWbVtFVKOEJMTw3sO5YMQFlA4oZXz/8fQv6d+htqm6CKf+CJxQKERRThHFhcXkZbtgdS118PYVOsPY9zGx6CAi0QgfbPsg1t+wZOMSNldsBqAop4ix/cZyzTHXMH7AeMb2G9spTV/B0NeCUJhTSM+CnrGIpfb2bxgHLiYWaaKyrpK3N78dm9uwbPOy2OzMvsV9YyOUxvcfz7DewzplSJ8/9FfVjZ3Pz86nT1Ef8rLz2iXekmEY+w8mFu3E5j2b4+Y2rNy6kohGEIShvYZy7rBzY7OiB5QM6BRH7E8a9L+fnZedR6/CXuRn55OXnXdADas1DKN1mFjsBZFohNXbV8eJw4bdGwAX/uLofkdz9YSrY01KXfK6dJqddZE6N4vcG/bbLa8bhbmF5GXldXgfjWEY+y4mFi2gqr6Kdza/E5vbUL6pnD11ewDoU9SH0v6lXDb2Mib0n8Dw3sM7JKhXMqIapTZcS0Qj7sNHoWy65HWhMKeQvOw8m71qGMZek1bvISJTgDuALOBBVZ2TkH8I8CjQzSszS1UXenk3AJcCEeC/VPXFdNg49/253PDSDWzYvSEW6+i4gce
5WsOmpby65lU+ev0jwtEwgnBkzyM5Z+g5jO8/ngkDJsQmcnUG/nBWf/ZxSEKU5JbEvvHQWaJlGMb+R9rEQkSygLuBU4ANwFIRWaCqKwPFbgKeUtV7RGQ4sBAY7G1fAIwA+gMvicgRqhppTxvnvj+XGX+eEZvLsHHPRv7rr/8VCwGRn5XPEcVHcGXplYzvP55x/cfRLb9be5rQKhKHs4q4CYM5WTkM7ja4VcNZDcMwWkM6axYTgLWqug5AROYB5wBBsVDAb9DvCmzyts8B5qlqLfCxiKz1zvdmexp448s3Nvous6J0zevK49MeZ+RBI/lw2Yed+u3jukgd9ZH6mIAV5RTRo6AH+dn5seGsq2V1u3wG1DAMoynSKRYDgPWB/Q3AMQllbgH+LiJXA0XA1wPHvpVw7ID2NvCzXZ8lTd9du5ux/ca29+VahP/9i2jUfX+6IKeAboXdyM9x4mAjlgzD6Aw6u8fzQuARVf2tiEwEHhORkS09WERmADMA+vTpQ1lZWasuflDeQXxR+0Wj9N55vVmxdAUANZU1se10kBjWWkRcdFNaHgKjoqKi1ffekWS6fZD5Npp9bSfTbcx0+9IpFhuBgYH9g720IJcCUwBU9U0RyQd6tfBYVPV+4H6A0tJSnTRpUqsM/G3P38b1WYAb+vrTk37KiGGu6WnF0hXt2gwVHM6qKLlZuZTklrRpOGtZWRmtvfeOJNPtg8y30exrO5luY6bbl06xWAocLiJDcI7+AuBbCWU+A04GHhGRYUA+sBVYAPyviPwO18F9OLCkvQ2cPmo6QKPRUO35cZ+oRuM6pXNCOZTklVCUU2TDWQ3D2GdIm6dS1bCIzARexA2LfUhVV4jIrUC5qi4A/g/wgIj8N66z+xJ134FcISJP4TrDw8BV7T0Symf6qOlMGzqN9bvXt0s8Jn/Ekt8pnRXKojinmOK8YhvOahjGPktaX2u9ORMLE9JuDmyvBI5r4tjZwOx02tceqCr10XrqI/WxiLFFuUX0LOhJfk6+DWc1DGO/wNpA9gI/AF9Uo4gIhdmFdC/qHjec1TAMY3/CxKIF+AH4IlEXGDA/J59ehb0oyCmw4ayGYRwQmFgkwR+xFI6GiUajRKNRehT0oCC7wKKzGoZxQGJi4RGOhtlT64ID5mTl0DWvK4W5hWzO3szg7oM71zjDMIxOxsQCF531oMKDKMgpsOGshmEYSTCviKtJ9Crq1dlmGIZhZCzW+G4YhmGkxMTCMAzDSImJhWEYhpESEwvDMAwjJSYWhmEYRkpMLAzDMIyUmFgYhmEYKTGxMAzDMFJiYmEYhmGkxMTCMAzDSImJhWEYhpESEwvDMAwjJSYWhmEYRkpMLAzDMIyUmFgYhmEYKTGxMAzDMFJiYmEYhmGkxMTCMAzDSImJhWEYhpESEwvDMAwjJSYWhmEYRkrSKhYiMkVEVovIWhGZlST/9yKy3FvWiMjOQF4kkLcgnXYahmEYzZOdrhOLSBZwN3AKsAFYKiILVHWlX0ZV/ztQ/mrg6MApqlV1TLrsMwzDMFpOOmsWE4C1qrpOVeuAecA5zZS/EHgijfYYhmEYe4moanpOLHIeMEVVL/P2vw0co6ozk5QdBLwFHKyqES8tDCwHwsAcVX0+yXEzgBkAffr0GTdv3rx2v4+KigqKi4vb/bztSabbmOn2QebbaPa1nUy3sbPsmzx58tuqWpqyoKqmZQHOAx4M7H8buKuJstcDf0hIG+CtDwU+AQ5r7nrjxo3TdLB48eK0nLc9yXQbM90+1cy30exrO5luY2fZB5RrC3x6OpuhNgIDA/sHe2nJuICEJihV3eit1wFlxPdnGIZhGB1IOsViKXC4iAwRkVycIDQa1SQiQ4HuwJuBtO4ikudt9wKOA1YmHmsYhmF0DGkbDaWqYRGZCbwIZAEPqeoKEbkVV+3xheMCYJ5XHfIZBtwnIlGcoM3RwCgqwzAMo2NJm1gAqOpCYGFC2s0J+7ckOe4NYFQ6bTMMwzBajs3
gNgzDMFJiYmEYhmGkxMTCMAzDSImJhWEYhpESEwvDMAwjJSYWhmEYRkpMLAzDMIyUmFgYhmEYKTGxMAzDMFJiYmEYhmGkxMTCMAzDSImJhWEYhpESEwvDMAwjJSYWhmEYRkpMLAzDMIyUmFgYhmEYKTGxMAzDMFJiYmEYhmGkxMTCMAzDSImJhWEYhpESEwvDMAwjJSYWhmEYRkpMLAzDMIyUmFgYhmEYKTGxMAzDMFJiYmEYhmGkJK1iISJTRGS1iKwVkVlJ8n8vIsu9ZY2I7AzkXSwiH3rLxem00zAMw2ie7HSdWESygLuBU4ANwFIRWaCqK/0yqvrfgfJXA0d72z2AnwGlgAJve8d+mS57DcMwjKZJZ81iArBWVdepah0wDzinmfIXAk9426cBi1R1hycQi4ApabTVMAzDaIa01SyAAcD6wP4G4JhkBUVkEDAE+Eczxw5IctwMYIa3WyEiq9toczJ6AdvScN72JNNtzHT7IPNtNPvaTqbb2Fn2DWpJoXSKRWu4AJivqpHWHKSq9wP3p8ckh4iUq2ppOq/RVjLdxky3DzLfRrOv7WS6jZluXzqboTYCAwP7B3tpybiAhiao1h5rGIZhpJl0isVS4HARGSIiuThBWJBYSESGAt2BNwPJLwKnikh3EekOnOqlGYZhGJ1A2pqhVDUsIjNxTj4LeEhVV4jIrUC5qvrCcQEwT1U1cOwOEbkNJzgAt6rqjnTZmoK0NnO1E5luY6bbB5lvo9nXdjLdxoy2TwI+2jAMwzCSYjO4DcMwjJSYWBiGYRgpMbEAROQTEXnfCztS7qX1EJFFXriRRV5HO+K40wth8p6IjE2DPQ+JyBYR+XcgrdX2pDNkShM23iIiGwMhXM4I5N3g2bhaRE4LpDcbEqYN9g0UkcUislJEVojID730jHiOzdiXSc8wX0SWiMi7no3/46UPEZF/edd70hvAgojkeftrvfzBqWxPk32PiMjHgWc4xkvvrP+VLBF5R0T+4u1nxPNrNap6wC/AJ0CvhLRfA7O87VnAr7ztM4C/AgIcC/wrDfZ8DRgL/Htv7QF6AOu8dXdvu3uabbwF+FGSssOBd4E83OTLj3CDHrK87UOBXK/M8Hayrx8w1tsuAdZ4dmTEc2zGvkx6hgIUe9s5wL+8Z/MUcIGXfi9wpbf9A+Beb/sC4MnmbE+jfY8A5yUp31n/K9cC/wv8xdvPiOfX2sVqFk1zDvCot/0o8M1A+p/U8RbQTUT6teeFVfWfQOLor9bak9aQKU3Y2BTn4Ea81arqx8BaXDiY1oaEaY19m1V1mbe9B/gAFwUgI55jM/Y1RWc8Q1XVCm83x1sUOAmY76UnPkP/2c4HThYRacb2dNnXFB3+vyIiBwNnAg96+0KGPL/WYmLhUODvIvK2uBAiAH1UdbO3/TnQx9tuUSiSNNBaezrLzpleFf8hv4mns230qvNH4948M+45JtgHGfQMvSaU5cAWnBP9CNipquEk14vZ4uXvAnqm08ZE+1TVf4azvWf4exHJS7QvwY50PsPbgR8DUW+/Jxn0/FqDiYXjeFUdC5wOXCUiXwtmqqsLZswY40yzJ8A9wGHAGGAz8NvONQdEpBh4BrhGVXcH8zLhOSaxL6OeoapGVHUMLorCBGBoZ9qTSKJ9IjISuAFn53hc09L1nWGbiJwFbFHVtzvj+u2NiQWgqhu99RbgOdw/xRd+85K33uIV76xQJK21p8PtVNUvvH/eKPAADVXlTrFRRHJwjniuqj7rJWfMc0xmX6Y9Qx9V3QksBibimm/8Cb3B68Vs8fK7Ats7wsaAfVO8Jj5V1VrgYTrvGR4HnC0in+CaB08C7iADn1+L6OhOkkxbgCKgJLD9Bq698v8S3xH6a2/7TOI7yZakya7BxHcet8oe3BvVx7gOu+7edo8029gvsP3fuHZWgBHEd9Ctw3XMZnvbQ2jonB3RTrYJ8Cfg9oT0jHiOzdiXSc+wN9DN2y4AXgXOAp4mvoP2B972VcR
30D7VnO1ptK9f4BnfDszJgP+VSTR0cGfE82v1PXT0BTNtwY0ieddbVgA3erjQwzQAAAR4SURBVOk9gZeBD4GX/B+P90O7G9d2+z5QmgabnsA1QdTj2icv3Rt7gO/hOsPWAt/tABsf82x4DxcHLOj4bvRsXA2cHkg/AzcS6CP/2beTfcfjmpjeA5Z7yxmZ8hybsS+TnuFRwDueLf8Gbg78zyzxnsfTQJ6Xnu/tr/XyD01le5rs+4f3DP8NPE7DiKlO+V/xzj+JBrHIiOfX2sXCfRiGYRgpsT4LwzAMIyUmFoZhGEZKTCwMwzCMlJhYGIZhGCkxsTAMwzBSYmJh7FOISM9ANNHPEyK05rbwHA+LyJEpylwlItPbx+rMQERe8yOwGkZrsaGzxj6LiNwCVKjqbxLSBffbjiY98ABFRF4DZqrq8s62xdj3sJqFsV8gIl8R922IubjJlf1E5H4RKfe+dXBzoOxrIjJGRLJFZKeIzPG+ifCmiBzklfm5iFwTKD/H+3bCahH5qpdeJCLPeNed712r0Zu7iIwXkVe8QJV/FZE+IpLj7R/vlfm/0vA9hv8RkaUi8m8RudcTP9+O33nXWSkipSLynLhvMNwSeA4rRGSeiHwgIk+JSEESm0737neZuG8oFAXsWOkF4ftVu/6RjH0aEwtjf2Io8HtVHa4u3tcsVS0FRgOniMjwJMd0BV5R1dHAm7iZvMkQVZ0AXAf4wnM18LmqDgduw0WOjT/IRTy9AzhXVcfhZhTfpqr1wHeB+0XkVGAy8HPvsDtUdTwwyrMvGC672runPwLPA1d45WaISDevzHBcGJFhQA3w/QSbDsKFOjlZXQDN94Afikgf3GzwEap6FPDLJp6FcQBiYmHsT3ykquWB/QtFZBmwDBiGc6KJVKvqX73tt3HxrpLxbJIyx+MCxKGqfriYRIbhYvu85IXSnoUXFE5V3/OOfwH4nicg4L5jsAQXguZE73ifBd76feB9dYEHa3Af8DrYy/tY3fcawInT8Qk2fRX3LN7wbJru3dMOXCjtB0RkKlDZxLMwDkCyUxcxjH2GmHMTkcOBHwITVHWniDyOi72TSF1gO0LT/xO1LSiTDAHeU9UTmsgfiftugd/8VQjchfuK3kYR+XmC3b4d0cC2v+/bldgRmbgvwN9U9duNjBUpBU4B/gO4Eji16VszDiSsZmHsr3QB9gC7peFraO3N68B/AojIKJLXXFYCA0RkglcuV0RGeNvnA8W4IHN3i0gXXPTUKLBNREqAc/fCriEiMt7b/hbwWkL+G8CJInKoZ0eRiBzuXa+Lqv4FF/G2UbOaceBiNQtjf2UZzlGvAj7FOfb25g/An0RkpXetlbhaQgxVrRWR84A7PTHIAn4rIltx/RyTVHWTiNyH62+5VEQe9c61mYav57WGD4Brvc7294H7E2z6QkQuBZ4MDDf+CVANPOv1s4Rw3442DMCGzhrGXiPuAzXZqlrjNXv9HThcGz6Z2Rk2fQWYr+7rcYbRbljNwjD2nmLgZU80BPh+ZwqFYaQTq1kYhmEYKbEObsMwDCMlJhaGYRhGSkwsDMMwjJSYWBiGYRgpMbEwDMMwUvL/A1EOSAQQi8ymAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "title = \"Learning Curves (XGB)\"\n", + "cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)\n", + "estimator = XGBClassifier()\n", + "plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4);" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# lbl_enc = preprocessing.LabelEncoder()\n", + "# y = lbl_enc.fit_transform(df.label.values)\n", + "# X_train, X_test, y_train, y_test = train_test_split(df.sentence.values, y, random_state=42, test_size=0.2)\n", + "# target_names = lbl_enc.classes_" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# countvec = CountVectorizer(analyzer='word',token_pattern=r'\\w{1,}',\n", + "# ngram_range=(1, 3), stop_words = 'english', binary=True)\n", + "# countvec.fit(list(X_train) + list(X_test))\n", + "# X_train_countvec = countvec.transform(X_train) \n", + "# X_test_countvec = countvec.transform(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# clf = XGBClassifier(n_jobs=-1)\n", + "# clf.fit(X_train_countvec.tocsc(), y_train)\n", + "# y_pred = clf.predict(X_test_countvec.tocsc())\n", + "# print(\"---Test Set Results---\")\n", + "# print(\"Accuracy with Xgboost: {}\".format(accuracy_score(y_test, y_pred)))\n", + "# print(classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# for x, y, y_hat in zip(X_test, lbl_enc.inverse_transform(y_test), lbl_enc.inverse_transform(y_pred)):\n", + "# if y != y_hat:\n", + "# print(f'sentence: {x} label: {y} label_pred: {y_hat}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fetch hotels in New York" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "conn = connect(\n", + " host='XXX',\n", + " port='XXX',\n", + " dbname='XXX',\n", + " user='XXX',\n", + " password='XXX')\n", + "conn.readonly = XXX\n", + "query = \"\"\"\n", + " XXX\n", + "\"\"\"\n", + "ny_hotels = pd.read_sql(query, con=conn)\n", + "conn.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namedescription
011 Howard11 Howard is located at the crossroads of the ...
11 Hotel Brooklyn BridgeWelcome to your Brooklyn Bridge waterfront ret...
21 Hotel Central Park1 Hotel Central Park is a 5-star property loca...
336 Hudson HotelTicket services and free Wi-Fi head the list o...
46 Columbus Central Park a Sixty Hotelhotelinformation Overlooking Columbus Circle a...
\n", + "
" + ], + "text/plain": [ + " name \\\n", + "0 11 Howard \n", + "1 1 Hotel Brooklyn Bridge \n", + "2 1 Hotel Central Park \n", + "3 36 Hudson Hotel \n", + "4 6 Columbus Central Park a Sixty Hotel \n", + "\n", + " description \n", + "0 11 Howard is located at the crossroads of the ... \n", + "1 Welcome to your Brooklyn Bridge waterfront ret... \n", + "2 1 Hotel Central Park is a 5-star property loca... \n", + "3 Ticket services and free Wi-Fi head the list o... \n", + "4 hotelinformation Overlooking Columbus Circle a... " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ny_hotels.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We have 18719 sentences in New York hotels\n" + ] + } + ], + "source": [ + "ny_hotels = pd.concat([pd.Series(str(row['name']), str(row['description']).split('. ')) \n", + " for _, row in ny_hotels.iterrows()]).reset_index()\n", + "ny_hotels.columns = ['sentence', 'name']\n", + "ny_hotels['sentence'] = ny_hotels['sentence'].map(lambda x: re.sub(r'\\W+', ' ', x))\n", + "print('We have ', len(ny_hotels), 'sentences in New York hotels')" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Welcome to your Brooklyn Bridge waterfront retreat with expansive views of the East River and Manhattan skyline\n", + "\n", + "The hotel is 100 non smoking\n", + "\n", + "A 350 cleaning fee will be charged to any guest who violates the smoking policy\n", + "\n", + " Fee subject to change 150 incidental deposit will be charged to guests with either a major credit card or debit card\n", + "\n", + " Fee subject to change Unless notified prior to or at check in a one night penalty will apply to departures that occur earlier than the date specified on the original reservation\n", + 
"\n", + "Must be 18 years of age to check in\n", + "\n", + "A daily Facility Fee is payable by the guest upon check in \n", + "\n" + ] + } + ], + "source": [ + "a = 22\n", + "for i in range(a,a+7):\n", + " print(ny_hotels.sentence[i])\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "X_final= ny_hotels['sentence']\n", + "X_final = count_vect.transform(X_final)\n", + "y_pred = xgb.predict(X_final)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "18719" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "18719" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(ny_hotels)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sentencename
011 Howard is located at the crossroads of the ...11 Howard
1Bookings of 8 rooms or more will be considered...11 Howard
2150 incidental deposit will be charged to gue...11 Howard
3Fee subject to change Must be 21 years of age...11 Howard
4Conscious design sophisticated dining options ...11 Howard
\n", + "
" + ], + "text/plain": [ + " sentence name\n", + "0 11 Howard is located at the crossroads of the ... 11 Howard\n", + "1 Bookings of 8 rooms or more will be considered... 11 Howard\n", + "2 150 incidental deposit will be charged to gue... 11 Howard\n", + "3 Fee subject to change Must be 21 years of age... 11 Howard\n", + "4 Conscious design sophisticated dining options ... 11 Howard" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ny_hotels.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1.,\n", + " 1., 1., 0.])" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_pred[:20]" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "results = ny_hotels\n", + "results['label'] = y_pred.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0 12931\n", + "0.0 5788\n", + "Name: label, dtype: int64" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results.label.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "results_keep = results.loc[results['label'] == 0.0]" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sentencenamelabel
011 Howard is located at the crossroads of the ...11 Howard0.0
9The 11 Howard is in Manhattan s SoHo neighbor...11 Howard0.0
11Walk one block to reach two subway stops offer...11 Howard0.0
12Located close to Soho Bowery Chinatown and Lit...11 Howard0.0
19One World Trade Center is 1 3 km from 11 Howa...11 Howard0.0
\n", + "
" + ], + "text/plain": [ + " sentence name label\n", + "0 11 Howard is located at the crossroads of the ... 11 Howard 0.0\n", + "9 The 11 Howard is in Manhattan s SoHo neighbor... 11 Howard 0.0\n", + "11 Walk one block to reach two subway stops offer... 11 Howard 0.0\n", + "12 Located close to Soho Bowery Chinatown and Lit... 11 Howard 0.0\n", + "19 One World Trade Center is 1 3 km from 11 Howa... 11 Howard 0.0" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results_keep.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "nyc_description = results_keep.groupby('name')['sentence'].agg(lambda col: ' '.join(col)).reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesentence
01 Hotel Central Park1 Hotel Central Park is a 5 star property loca...
111 Howard11 Howard is located at the crossroads of the ...
236 Hudson HotelTicket services and free Wi Fi head the list o...
36 Columbus Central Park a Sixty Hotelhotelinformation Overlooking Columbus Circle a...
4AC Hotel by Marriott New York DowntownA truly cosmopolitan hotel with an urban vibe ...
\n", + "
" + ], + "text/plain": [ + " name \\\n", + "0 1 Hotel Central Park \n", + "1 11 Howard \n", + "2 36 Hudson Hotel \n", + "3 6 Columbus Central Park a Sixty Hotel \n", + "4 AC Hotel by Marriott New York Downtown \n", + "\n", + " sentence \n", + "0 1 Hotel Central Park is a 5 star property loca... \n", + "1 11 Howard is located at the crossroads of the ... \n", + "2 Ticket services and free Wi Fi head the list o... \n", + "3 hotelinformation Overlooking Columbus Circle a... \n", + "4 A truly cosmopolitan hotel with an urban vibe ... " + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nyc_description.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesentence
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [name, sentence]\n", + "Index: []" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nyc_description.loc[nyc_description['name'] == '1 Hotel Brooklyn Bridge']" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics.pairwise import linear_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "nyc_description['sentence'] = nyc_description['sentence'].str.replace(r'[^\\w\\s]+', '')\n", + "nyc_description.set_index('name', inplace = True)\n", + "\n", + "tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=10, stop_words='english', token_pattern='[a-zA-Z0-9]{3,}')\n", + "tfidf_matrix = tf.fit_transform(nyc_description['sentence'])\n", + "cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "indices = pd.Series(nyc_description.index)\n", + "\n", + "def recommendations(name, cosine_similarities = cosine_similarities):\n", + " \n", + " recommended_hotels = []\n", + " \n", + " # getting the index of the hotel that matches the name\n", + " idx = indices[indices == name].index[0]\n", + "\n", + " # creating a Series with the similarity scores in descending order\n", + " score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)\n", + "\n", + " # getting the indexes of the 10 most similar hotels except itself\n", + " top_10_indexes = list(score_series.iloc[1:11].index)\n", + " \n", + " # populating the list with the names of the best 10 matching hotels\n", + " for i in top_10_indexes:\n", + " recommended_hotels.append(list(nyc_description.index)[i])\n", + " \n", + " return recommended_hotels" + ] + }, + { + "cell_type": "code", + "execution_count": 70, +
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Sheraton Brooklyn New York Hotel',\n", + " 'Hotel Indigo BROOKLYN',\n", + " 'Union Hotel, an Ascend Hotel Collection Member',\n", + " 'Hampton Inn Brooklyn Downtown NY',\n", + " 'EVEN Hotel Brooklyn',\n", + " 'La Quinta Inn & Suites Brooklyn Downtown',\n", + " 'Hotel Le Bleu',\n", + " 'Days Inn by Wyndham Brooklyn Borough Park',\n", + " 'Hilton Brooklyn New York',\n", + " 'NU Hotel Brooklyn']" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recommendations('New York Marriott at the Brooklyn Bridge')" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Hyatt Centric Times Square New York',\n", + " 'Renaissance New York Times Square Hotel',\n", + " 'Sheraton New York Times Square Hotel',\n", + " 'Novotel New York - Times Square',\n", + " 'Hotel Mela Times Square',\n", + " 'Paramount Times Square',\n", + " 'Crowne Plaza Times Square Manhattan',\n", + " 'The Manhattan at Times Square Hotel',\n", + " 'Millennium Broadway New York Times Square',\n", + " 'Hotel Edison']" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recommendations('W New York - Times Square')" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Trump International Hotel & Tower New York',\n", + " 'Hudson New York, Central Park',\n", + " '6 Columbus Central Park a Sixty Hotel',\n", + " 'Kimpton Ink48 Hotel',\n", + " 'Empire Hotel',\n", + " 'Parker New York',\n", + " 'The Time New York',\n", + " 'Hotel Sofitel New York',\n", + " 'Residence Inn New York Manhattan/Central Park',\n", + " 'JW Marriott Essex House New York']" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"recommendations('Mandarin Oriental, New York')" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Best Western Gregory Hotel',\n", + " 'Sleep Inn Coney Island',\n", + " 'Sleep Inn Brooklyn Downtown',\n", + " 'Holiday Inn Express Brooklyn',\n", + " 'Sheraton Brooklyn New York Hotel',\n", + " 'Wyndham Garden Brooklyn Sunset Park',\n", + " 'Hotel Indigo BROOKLYN',\n", + " 'Days Inn by Wyndham Jamaica / JFK Airport',\n", + " 'EVEN Hotel Brooklyn',\n", + " 'Aloft New York Brooklyn']" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recommendations('Best Western Plus Brooklyn Bay Hotel')" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Sheraton Brooklyn New York Hotel',\n", + " 'La Quinta Inn & Suites Brooklyn Downtown',\n", + " 'Sleep Inn Brooklyn Downtown',\n", + " 'Hampton Inn Brooklyn Downtown NY',\n", + " 'NU Hotel Brooklyn',\n", + " 'Hotel Indigo BROOKLYN',\n", + " 'Union Hotel, an Ascend Hotel Collection Member',\n", + " 'Days Inn by Wyndham Brooklyn Borough Park',\n", + " 'Wyndham Garden Brooklyn Sunset Park',\n", + " 'Brooklyn Way Hotel, BW Premier Collection']" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recommendations('Aloft New York Brooklyn')" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Ace Hotel New York',\n", + " 'INNSIDE by MeliĆ  New York Nomad',\n", + " 'Arlo NoMad',\n", + " 'DoubleTree by Hilton New York City - Chelsea',\n", + " 'Hampton Inn New York - 35th Street - Empire State Building',\n", + " 'DoubleTree by Hilton Hotel New York - Times Square South',\n", + " 'Holiday Inn Express - New York City Chelsea',\n", + " 'Nyma The New York Manhattan Hotel',\n", + " 'The 
Redbury New York',\n", + " 'Hotel Pennsylvania']" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recommendations('Avalon Hotel')" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Archer Hotel New York',\n", + " 'The Mansfield Hotel',\n", + " 'Element New York Times Square West',\n", + " 'Andaz 5th Avenue - a concept by Hyatt',\n", + " 'DoubleTree by Hilton Hotel New York - Times Square South',\n", + " 'The Redbury New York',\n", + " 'The Westin New York at Times Square',\n", + " 'Ace Hotel New York',\n", + " 'The Westin New York Grand Central',\n", + " 'Hotel Mela Times Square']" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recommendations('The Langham, New York, Fifth Avenue')" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['INNSIDE by MeliĆ  New York Nomad',\n", + " 'The New York EDITION',\n", + " 'Hotel Mela Times Square',\n", + " 'The Redbury New York',\n", + " 'W New York - Union Square',\n", + " 'The Westin New York at Times Square',\n", + " 'DoubleTree by Hilton New York City - Chelsea',\n", + " 'Holiday Inn Express New York City Times Square',\n", + " 'Arlo NoMad',\n", + " 'Ace Hotel New York']" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recommendations('The James New York - NoMad')" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Ramada by Wyndham Flushing Queens',\n", + " 'Aloft New York LaGuardia Airport',\n", + " 'Flushing Central Hotel',\n", + " 'Aloft Long Island City - Manhattan View',\n", + " 'Holiday Inn LaGuardia Airport',\n", + " 'LaGuardia Plaza Hotel',\n", + " 'Best Western JFK Airport Hotel',\n", 
+ " 'Days Inn by Wyndham Jamaica / JFK Airport',\n", + " 'Courtyard Newark Downtown',\n", + " 'Holiday Inn Express Kennedy Airport']" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recommendations('New York LaGuardia Airport Marriott')" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Hilton New York JFK Airport',\n", + " 'Hilton Garden Inn Queens/JFK Airport',\n", + " 'Hampton Inn JFK Airport',\n", + " 'Holiday Inn Express Kennedy Airport',\n", + " 'Howard Johnson by Wyndham Jamaica JFK Airport NYC',\n", + " 'Days Inn by Wyndham Jamaica / JFK Airport',\n", + " 'Wyndham Garden Brooklyn Sunset Park',\n", + " 'Hampton Inn Brooklyn Downtown NY',\n", + " 'TRYP By Wyndham Times Square South',\n", + " 'Holiday Inn New York JFK Airport Area']" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recommendations('Radisson Hotel JFK Airport')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}