diff --git a/Sentence Classification & Hotel Recommender.ipynb b/Sentence Classification & Hotel Recommender.ipynb
new file mode 100644
index 0000000..53a9031
--- /dev/null
+++ b/Sentence Classification & Hotel Recommender.ipynb
@@ -0,0 +1,1561 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import classification_report, accuracy_score\n",
+ "import numpy as np\n",
+ "from sklearn import preprocessing\n",
+ "from sklearn.model_selection import learning_curve\n",
+ "from sklearn.model_selection import ShuffleSplit\n",
+ "import matplotlib.pyplot as plt\n",
+ "from sklearn.metrics import confusion_matrix\n",
+ "import itertools\n",
+ "import seaborn as sns\n",
+ "from nltk.corpus import stopwords\n",
+ "from xgboost import XGBClassifier\n",
+ "import string\n",
+ "from psycopg2 import connect\n",
+ "import re"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv('sentence_training_set_1.csv', encoding=\"latin-1\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " sentence \n",
+ " label \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " LocationSituated right in the city centre of B... \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Located in the charming area of Albuquerque, t... \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Hotel The Ein Kerem is an elegant boutique ho... \n",
+ " 1.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " This hotel is located on Via del Tritone in th... \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Surrounded by wonderful Cretan countryside, t... \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sentence label\n",
+ "0 LocationSituated right in the city centre of B... 0.0\n",
+ "1 Located in the charming area of Albuquerque, t... 0.0\n",
+ "2 Hotel The Ein Kerem is an elegant boutique ho... 1.0\n",
+ "3 This hotel is located on Via del Tritone in th... 0.0\n",
+ "4 Surrounded by wonderful Cretan countryside, t... 0.0"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.0 2777\n",
+ "0.0 2454\n",
+ "11.0 1\n",
+ "Name: label, dtype: int64"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.label.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "sentence 0\n",
+ "label 2\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop rows whose label is missing, then the stray label value 11.\n",
+ "df = df.dropna(subset=['label'])\n",
+ "df = df[df.label != 11]\n",
+ "# Shuffle the rows. The fixed random_state keeps the row order, and therefore\n",
+ "# the later train/test split and scores, the same across kernel restarts.\n",
+ "df = df.sample(frac=1, random_state=42).reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def print_plot(index):\n",
+ "    \"\"\"Print the sentence and label of the row of the global `df` at `index`.\n",
+ "\n",
+ "    Nothing is printed when no row of `df` has that index label.\n",
+ "    \"\"\"\n",
+ "    matches = df[df.index == index][['sentence', 'label']].values\n",
+ "    # Guard on the number of matching rows *before* taking the first one:\n",
+ "    # indexing first raised IndexError for a missing index and left the\n",
+ "    # length check permanently true.\n",
+ "    if len(matches) > 0:\n",
+ "        sentence, label = matches[0]\n",
+ "        print(sentence)\n",
+ "        print('label:', label)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Take NY-27 West/Sunrise Highway to the belt Parkway West ramp on the left to Verrazano Bridge\n",
+ "label: 1.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "print_plot(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " All rooms are in good condition, and are traditionally furnished with burgundy features\n",
+ "label: 1.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "print_plot(100)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['length'] = df['sentence'].apply(len)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Compare the distribution of sentence lengths (in characters) for each label."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAagAAADQCAYAAABStPXYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAEt5JREFUeJzt3X+QnVd93/H3p5YNFKglmY3GI6kVGTShTmdQxNaYJuMJuDG2k4mciUNIM7HqaEaZqRvokNIq7UxJSH/gMq0bZ1JPnZhETkhixy1jDaU2qoDSdGIHGYR/YKgWB4+kka01thVcBojJt3/cs3C1rLV3tbves3ffr5ln7nnOc+6956z26LPPc5/7PKkqJEnqzV9b6Q5IkjQXA0qS1CUDSpLUJQNKktQlA0qS1CUDSpLUJQOqE0men2f7tiSPLPA1fzfJdYvrGSTZneRoW3a/SJuNSQ62NgeTbFjs+0rQ/dy4N8lzST5yljYvS3JnkqkkDyTZttj3XSsMKJ1Vko3Ae4E3AZcC732R8NkHHKqq7cChti6Nuw8APzdPmz3As1X1OuBm4KZl79WYMKA6k+RVSQ4l+UySh5PsGtq8LsmHkjyW5O4kf709541J/leSB5Pcl+TiJezS24CDVfVMVT0LHASumqPdLmB/K+8Hrl3CPkg9zg2q6hDw1XmaDc+Nu4ErkmQp+zGuDKj+fB34iaraCbwF+A9Dv8zfB/znqvrbwF8A/yjJ+cBvANdV1RuBDwL/5mxvkOQ9SY7MsdwyR/PNwLGh9eOtbrZNVXWylZ8ENo02XGlkvc2NUX17DlXVC8Bp4KJFvN6asW6lO6DvEuDfJrkc+CsGv9wz/9kfq6r/08q/D7wTuBf4O8DBNlfPA05yFlX1AQaHJpZFVVUSr6Glpbbq54YWxoDqz88CE8Abq+ovk3wZeHnbNvs//WIwaR+tqjeP+gZJ3tPeZ7ZPVdU7Z9WdAH54aH0L8Mk5nvtUkour6mQ7jHJq1P5II+ptbozqBLAVOJ5kHXAh8JVzfK01xUN8/bkQONUm4FuAvzW07W8mmZls/wD4E+CLwMRMfZLzk3z/2d6gqj5QVTvmWOaagPcBVybZ0E6OuLLVzXYAmDnDbzdwz4jjlUbV29wY1fDcuA74eHmV7pEYUP35EDCZ5GHgeuALQ9u+CNyY5DFgA3BrVX2TwS/9TUk+BxwB/t5SdaaqngF+Dfh0W97X6kjy20kmW9P3Az+S5Cjw99u6tJS6mhsASf438McMTnw4nuRtrf59SX68NbsduCjJFPBuPMN1ZDHIJUk9cg9KktQlA0qS1CUDSpLUJQNKktSlLgLqqquuKgbfW3BxGadl0ZwbLmO6jKSLgHr66adXugtSl5wbWsu6CChJkmYzoCRJXTKgJEldMqAkSV0yoCRJXTKgJEldGimgkqxvt1H+Qrul8puTbExyMMnR9rihtU2SW5JMJXkoyc7lHYKkuSRnLtJqM+oe1K8D91bV64E3AI8xuGT8oaraDhziO5eQvxrY3pa9wK1L2mNJ0powb0AluRC4nME9Taiqb1bVc8AuYH9rth+4tpV3AXfUwP3A+naHVUmSRjbKHtRrgWngd5J8tt2k7pXApqo62do8CWxq5c3AsaHnH291Z0iyN8nhJIenp6fPfQTSmHFuSAOjBNQ6YCeDO1T+APD/mHVHyHb74pGvr9Sec1tVTVbV5MTExEKeKo0154Y0MEpAHQeOV9UDbf1uBoH11Myhu/Z4qm0/AWwdev6WVidJ0sjmDaiqehI4luT7WtUVwOeBA8DuVrcbuKeVDwDXt7P5LgNODx0KlCRpJOtGbPeLwIeSXAA8DtzAINzuSrIHeAJ4e2v7UeAaYAr4WmsrSdKCjBRQVXUEmJxj0xVztC3gxkX2S5K0xnklCUlSlwwoSVKXDChJUpcMKElSl0Y9i0/SKjd8wdha0NfqpZXhHpQkqUsGlCSpSwaU
JKlLBpQkqUsGlCSpSwaUJKlLBpQkqUsGlCSpSwaUJKlLIwVUki8neTjJkSSHW93GJAeTHG2PG1p9ktySZCrJQ0l2LucAJEnjaSF7UG+pqh1VNXNfqH3AoaraDhxq6wBXA9vbshe4dak6K0laOxZziG8XsL+V9wPXDtXfUQP3A+uTXLyI95EkrUGjBlQBH0vyYJK9rW5TVZ1s5SeBTa28GTg29Nzjre4MSfYmOZzk8PT09Dl0XRpPzg1pYNSA+qGq2sng8N2NSS4f3thu876g6yNX1W1VNVlVkxMTEwt5qjTWnBvSwEgBVVUn2uMp4MPApcBTM4fu2uOp1vwEsHXo6VtanSRJI5s3oJK8MsmrZ8rAlcAjwAFgd2u2G7inlQ8A17ez+S4DTg8dCpQkaSSj3LBwE/DhDO52tg74g6q6N8mngbuS7AGeAN7e2n8UuAaYAr4G3LDkvZYkjb15A6qqHgfeMEf9V4Ar5qgv4MYl6Z0kac3yShKSpC4ZUJKkLhlQkqQuGVCSpC4ZUJKkLhlQkqQuGVCSpC6N8kVdSavE4Pv00nhwD0qS1CUDSpLUJQNKktQlA0qS1CUDSpLUJQNKktSlkQMqyXlJPpvkI239tUkeSDKV5M4kF7T6l7X1qbZ92/J0/cX6+Z1FkrR6LWQP6l3AY0PrNwE3V9XrgGeBPa1+D/Bsq7+5tZMkaUFGCqgkW4AfBX67rQd4K3B3a7IfuLaVd7V12vYrWntJkkY26h7UfwL+GfBXbf0i4LmqeqGtHwc2t/Jm4BhA2366tT9Dkr1JDic5PD09fY7dl8aPc0MamDegkvwYcKqqHlzKN66q26pqsqomJyYmlvKlv83Po7QavRRzQ1oNRrkW3w8CP57kGuDlwN8Afh1Yn2Rd20vaApxo7U8AW4HjSdYBFwJfWfKeSzpnw3+0Va1cP6SzmXcPqqp+uaq2VNU24B3Ax6vqZ4FPANe1ZruBe1r5QFunbf941fJOAfeUJGn8LOZ7UP8ceHeSKQafMd3e6m8HLmr17wb2La6LkqS1aEG326iqTwKfbOXHgUvnaPN14KeWoG+SpDXMK0lIkrpkQEmSumRASZK6ZEBJkrpkQEmSumRASZK6ZEBJkrpkQEmSumRASZK6ZEBJkrpkQEmSumRASZK6ZEBJkrpkQEmSujTKLd9fnuTPknwuyaNJfrXVvzbJA0mmktyZ5IJW/7K2PtW2b1uOjnuTQkkab6PsQX0DeGtVvQHYAVyV5DLgJuDmqnod8Cywp7XfAzzb6m9u7SRJWpBRbvleVfV8Wz2/LQW8Fbi71e8Hrm3lXW2dtv2KxP0cSdLCjPQZVJLzkhwBTgEHgS8Bz1XVC63JcWBzK28GjgG07acZ3BJ+9mvuTXI4yeHp6enFjUIaI84NaWCkgKqqb1XVDmALg9u8v36xb1xVt1XVZFVNTkxMLPblpLHh3JAGFnQWX1U9B3wCeDOwPsm6tmkLcKKVTwBbAdr2C4GvLElvJS254ROOPBivnoxyFt9EkvWt/ArgR4DHGATVda3ZbuCeVj7Q1mnbP15VtZSdliSNv3XzN+FiYH+S8xgE2l1V9ZEknwf+KMm/Bj4L3N7a3w78XpIp4BngHcvQb0nSmJs3oKrqIeAH5qh/nMHnUbPrvw781JL0TpK0ZnklCUlSlwwoSVKXRvkMaizMPjvJ0zYkqW/uQUmSumRASZK6ZEBJkrpkQEmSumRASZK6ZEBJkrpkQEmSumRASZK6ZEBJkrpkQEmSumRASZK6NMoNC7cm+USSzyd5NMm7Wv3GJAeTHG2PG1p9ktySZCrJQ0l2LvcgJEnjZ5Q9qBeAX6qqS4DLgBuTXALsAw5V1XbgUFsHuBrY3pa9wK1L3mtJ0tibN6Cq6mRVfaaVv8rgdu+bgV3A/tZsP3BtK+8C7qiB+4H1SS5e8p5Lksbagj6DSrKNwd11HwA2VdXJtulJYFMrbwaODT3teKub/Vp7kxxOcnh6enqB3ZbGl3NDGhg5oJK8
CvivwD+pqr8Y3lZVBSzoDktVdVtVTVbV5MTExEKeKo0154Y0MFJAJTmfQTh9qKr+W6t+aubQXXs81epPAFuHnr6l1UmSNLJRzuILcDvwWFX9x6FNB4DdrbwbuGeo/vp2Nt9lwOmhQ4HdSL6zSJL6M8ot338Q+Dng4SRHWt2/AN4P3JVkD/AE8Pa27aPANcAU8DXghiXtsSRpTZg3oKrqT4AX28+4Yo72Bdy4yH7Nyb0dSVo7vJKEJKlLBpQkqUsGlCSpSwaUJKlLBpQkqUsGlCSpS6N8D0pSx/z6hcaVe1CSpC4ZUJKkLhlQkqQu+RmUpDMMf6ZVC7qJjrS03IOSJHXJgJIkdcmAkiR1aZQbFn4wyakkjwzVbUxyMMnR9rih1SfJLUmmkjyUZOdydl6SNL5G2YP6XeCqWXX7gENVtR041NYBrga2t2UvcOvSdFOStNbMG1BV9SngmVnVu4D9rbwfuHao/o4auB9Yn+TipeqsJGntONfPoDZV1clWfhLY1MqbgWND7Y63OkmSFmTRJ0m0W7wv+NsSSfYmOZzk8PT09GK7IY0N54Y0cK4B9dTMobv2eKrVnwC2DrXb0uq+S1XdVlWTVTU5MTFxjt2Qxo9zQxo414A6AOxu5d3APUP117ez+S4DTg8dCuxWMvciSVo5817qKMkfAj8MvCbJceC9wPuBu5LsAZ4A3t6afxS4BpgCvgbcsAx9liStAfMGVFX9zItsumKOtgXcuNhOSeqD1+XTSvJKEpKkLhlQkqQuGVCSpC4ZUJKkLhlQkqQuGVCSpC4ZUJKkLhlQkqQuzftF3bXMLylK0soxoCSNxD/Y9FLzEJ8kqUsGlCSpSx7iG5GHN6TvmH07GueEloMBdQ4MK0lafgaUpEXzjzYth2X5DCrJVUm+mGQqyb7leI9eeAdeSVoeSx5QSc4DfhO4GrgE+Jkklyz1+/TIsJLOnAdnW6T5LMchvkuBqap6HCDJHwG7gM8vw3t1a9QJuJjDIR5W0Wq22JCa/Ts/DvPBk0/OtBwBtRk4NrR+HHjT7EZJ9gJ72+rzSb54ltd8DfD0kvWwI3NM0nMa6yr9i3Rs/12be6vqqoU+aYFzA8b/5zjjjHGe7Xd+lc6HYa8Bnh6DcbyYkebGip0kUVW3AbeN0jbJ4aqaXOYudcGxaiFzA9bOz3GtjBPW1ljPZjlOkjgBbB1a39LqJEka2XIE1KeB7Ulem+QC4B3AgWV4H0nSGFvyQ3xV9UKSfwzcB5wHfLCqHl3ky458uGMMOFYt1Fr5Oa6VccLaGuuLSq3100QkSV3yYrGSpC4ZUJKkLnUfUON22aQkX07ycJIjSQ63uo1JDiY52h43tPokuaWN/aEkO1e292eX5INJTiV5ZKhuwWNLsru1P5pk90qMZTUYt7kBzg/nxyxV1e3C4CSLLwHfC1wAfA64ZKX7tcgxfRl4zay6fw/sa+V9wE2tfA3wP4AAlwEPrHT/5xnb5cBO4JFzHRuwEXi8PW5o5Q0rPbbelnGcG21czo+zjG2tzY/e96C+fdmkqvomMHPZpHGzC9jfyvuBa4fq76iB+4H1SS5eiQ6Ooqo+BTwzq3qhY3sbcLCqnqmqZ4GDwIKvxrAGrJW5Ac6PNTs/eg+ouS6btHmF+rJUCvhYkgfbJW0ANlXVyVZ+EtjUyuMw/oWObRzG/FIY15+T82PA+YH3g1oJP1RVJ5J8D3AwyReGN1ZVJRnLc//HeWxaMs4PfVvve1Bjd9mkqjrRHk8BH2ZwqOapmUMT7fFUaz4O41/o2MZhzC+Fsfw5OT+cH8N6D6ixumxSklcmefVMGbgSeITBmGbOxtkN3NPKB4Dr2xk9lwGnhw4HrBYLHdt9wJVJNrQzmq5sdTrTWM0NcH7g/PhuK32WxnwLg7NZ/i+DM5b+5Ur3Z5Fj+V4GZ1t9Dnh0ZjzARcAh4CjwP4GNrT4Mbv74JeBhYHKlxzDP+P4QOAn8JYNj43vOZWzAzwNTbblhpcfV6zJOc6ON
x/nh/Dhj8VJHkqQu9X6IT5K0RhlQkqQuGVCSpC4ZUJKkLhlQkqQuGVCrUJLnl+E1dyS5Zmj9V5L806V+H2k5OTfGiwGlGTsYfK9G0pmcGyvEgFrlkrwnyafbPWN+tdVtS/JYkt9K8miSjyV5Rdv2d1vbI0k+kOSRdiWC9wE/3ep/ur38JUk+meTxJO9coSFK58S5sfoZUKtYkiuB7QyuV7YDeGOSy9vm7cBvVtX3A88BP9nqfwf4haraAXwLoAa3a/hXwJ1VtaOq7mxtX8/g8v6XAu9Ncv5LMCxp0Zwb48GAWt2ubMtngc8wmDTb27Y/r6ojrfwgsC3JeuDVVfWnrf4P5nn9/15V36iqpxlcxHLTPO2lXjg3xoC321jdAvy7qvovZ1Qm24BvDFV9C3jFObz+7Nfw90WrhXNjDLgHtbrdB/x8klcBJNnc7qMzp6p6Dvhqkje1qncMbf4q8Opl66n00nJujAEDahWrqo8xOBTxp0keBu5m/om0B/itJEeAVwKnW/0nGHzwO/xBsLQqOTfGg1czX2OSvKqqnm/lfcDFVfWuFe6WtOKcG/3xuOna86NJfpnBv/0TwD9c2e5I3XBudMY9KElSl/wMSpLUJQNKktQlA0qS1CUDSpLUJQNKktSl/w+bvoS4XATWSAAAAABJRU5ErkJggg==\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "graph = sns.FacetGrid(data=df,col='label')\n",
+ "graph.map(plt.hist,'length',bins=50,color='blue');"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0 The air-conditioned apartments at Mc Queen hot...\n",
+ "1 From our restaurant, where we serve our gener...\n",
+ "2 Our hotel in Flagstaff, AZ, brings you true r...\n",
+ "3 All of the rooms are suites are well-equipped ...\n",
+ "4 Guests will find themselves in close proximit...\n",
+ "Name: sentence, dtype: object\n",
+ "0 1.0\n",
+ "1 1.0\n",
+ "2 1.0\n",
+ "3 1.0\n",
+ "4 0.0\n",
+ "Name: label, dtype: float64\n"
+ ]
+ }
+ ],
+ "source": [
+ "X = df['sentence']\n",
+ "y = df['label']\n",
+ "print(X.head())\n",
+ "print(y.head())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Clean sentence\n",
+ "# Clean sentence\n",
+ "def text_process(text):\n",
+ "    \"\"\"Remove punctuation from `text` and return its non-stopword words.\n",
+ "\n",
+ "    Every character in `string.punctuation` is dropped, the result is split\n",
+ "    on whitespace, and words whose lowercase form is in NLTK's English\n",
+ "    stopword list are discarded. Kept words retain their original casing.\n",
+ "    \"\"\"\n",
+ "    # Fetch the stopword list once per call (as a set) instead of once per\n",
+ "    # word, which is what calling it inside the comprehension's filter did.\n",
+ "    english_stopwords = set(stopwords.words('english'))\n",
+ "    nopunc = ''.join(char for char in text if char not in string.punctuation)\n",
+ "    return [word for word in nopunc.split() if word.lower() not in english_stopwords]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "10819\n",
+ "The air-conditioned apartments at Mc Queen hotel are minimally decorated\n",
+ " (0, 3531)\t1\n",
+ " (0, 4381)\t1\n",
+ " (0, 5855)\t1\n",
+ " (0, 5960)\t1\n",
+ " (0, 6980)\t1\n",
+ " (0, 7959)\t1\n",
+ " (0, 8602)\t1\n"
+ ]
+ }
+ ],
+ "source": [
+ "# text_process is a callable analyzer, so it alone decides the tokens.\n",
+ "# CountVectorizer only applies token_pattern / ngram_range with its built-in\n",
+ "# analyzers, so those no-op arguments are dropped; the features are the\n",
+ "# single words returned by text_process, exactly as before.\n",
+ "count_vect = CountVectorizer(analyzer=text_process).fit(X)\n",
+ "print(len(count_vect.vocabulary_))\n",
+ "# Vectorize the sentence at index label 0 as a quick check of the vocabulary.\n",
+ "r0 = X[0]\n",
+ "print(r0)\n",
+ "vocab0 = count_vect.transform([r0])\n",
+ "print(vocab0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "antiques\n",
+ "tax\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(count_vect.get_feature_names()[5950])\n",
+ "print(count_vect.get_feature_names()[10216])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "CountVectorizer(analyzer=,\n",
+ " binary=False, decode_error='strict', dtype=,\n",
+ " encoding='utf-8', input='content', lowercase=True, max_df=1.0,\n",
+ " max_features=None, min_df=1, ngram_range=(1, 3), preprocessor=None,\n",
+ " stop_words=None, strip_accents=None, token_pattern='\\\\w{1,}',\n",
+ " tokenizer=None, vocabulary=None)"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "count_vect"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Shape of the sparse matrix: (5231, 10819)\n",
+ "Non-Zero occurences: 61369\n",
+ "Density of the matrix = 0.10843692803867197\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Vectorize from the source column rather than from X itself, so re-running\n",
+ "# this cell does not feed the already-transformed sparse matrix back in.\n",
+ "X = count_vect.transform(df['sentence'])\n",
+ "#Shape of the matrix:\n",
+ "print(\"Shape of the sparse matrix: \", X.shape)\n",
+ "#Non-zero occurrences:\n",
+ "print(\"Non-Zero occurences: \",X.nnz)\n",
+ "\n",
+ "# Density of the matrix: stored non-zero entries as a percentage of all cells\n",
+ "density = (X.nnz/(X.shape[0]*X.shape[1]))*100\n",
+ "print(\"Density of the matrix = \",density)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "---Training Set Results---\n",
+ "Confusion Matrix for XGBoost Classifier:\n",
+ "[[1461 515]\n",
+ " [ 107 2101]]\n",
+ "Score: 85.134\n",
+ "Classification Report:\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0.0 0.93 0.74 0.82 1976\n",
+ " 1.0 0.80 0.95 0.87 2208\n",
+ "\n",
+ " micro avg 0.85 0.85 0.85 4184\n",
+ " macro avg 0.87 0.85 0.85 4184\n",
+ "weighted avg 0.86 0.85 0.85 4184\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "xgb = XGBClassifier(n_jobs=-1)\n",
+ "xgb.fit(X_train,y_train)\n",
+ "y_pred = xgb.predict(X_train)\n",
+ "print(\"---Training Set Results---\")\n",
+ "print(\"Confusion Matrix for XGBoost Classifier:\")\n",
+ "print(confusion_matrix(y_train, y_pred))\n",
+ "print(\"Score: \",round(accuracy_score(y_train,y_pred)*100,3))\n",
+ "print(\"Classification Report:\")\n",
+ "print(classification_report(y_train, y_pred))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "---Test Set Results---\n",
+ "Confusion Matrix for XGBoost Classifier:\n",
+ "[[334 144]\n",
+ " [ 33 536]]\n",
+ "Score: 83.095\n",
+ "Classification Report:\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0.0 0.91 0.70 0.79 478\n",
+ " 1.0 0.79 0.94 0.86 569\n",
+ "\n",
+ " micro avg 0.83 0.83 0.83 1047\n",
+ " macro avg 0.85 0.82 0.82 1047\n",
+ "weighted avg 0.84 0.83 0.83 1047\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "pred = xgb.predict(X_test)\n",
+ "print(\"---Test Set Results---\")\n",
+ "print(\"Confusion Matrix for XGBoost Classifier:\")\n",
+ "print(confusion_matrix(y_test,pred))\n",
+ "print(\"Score: \",round(accuracy_score(y_test,pred)*100,3))\n",
+ "print(\"Classification Report:\")\n",
+ "print(classification_report(y_test,pred))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,\n",
+ "                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):\n",
+ "    \"\"\"Plot mean training and cross-validation scores against training-set size.\n",
+ "\n",
+ "    Scores come from `learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,\n",
+ "    train_sizes=train_sizes)`. Each curve is the mean over axis 1 of its\n",
+ "    score array, drawn with a shaded band of one standard deviation either\n",
+ "    side. `ylim`, when given, is unpacked into `plt.ylim`.\n",
+ "\n",
+ "    Returns the `plt` module.\n",
+ "    \"\"\"\n",
+ "    plt.figure()\n",
+ "    plt.title(title)\n",
+ "    if ylim is not None:\n",
+ "        plt.ylim(*ylim)\n",
+ "    plt.xlabel(\"Training examples\")\n",
+ "    plt.ylabel(\"Score\")\n",
+ "\n",
+ "    sizes, fit_scores, cv_scores = learning_curve(\n",
+ "        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)\n",
+ "    plt.grid()\n",
+ "\n",
+ "    # One entry per curve, training first: (score array, colour, legend label).\n",
+ "    curves = [\n",
+ "        (fit_scores, \"r\", \"Training score\"),\n",
+ "        (cv_scores, \"g\", \"Cross-validation score\"),\n",
+ "    ]\n",
+ "\n",
+ "    # Shaded bands are drawn before the lines, in the same order as the curves.\n",
+ "    for scores, colour, _ in curves:\n",
+ "        centre = np.mean(scores, axis=1)\n",
+ "        spread = np.std(scores, axis=1)\n",
+ "        plt.fill_between(sizes, centre - spread, centre + spread,\n",
+ "                         alpha=0.1, color=colour)\n",
+ "\n",
+ "    for scores, colour, caption in curves:\n",
+ "        plt.plot(sizes, np.mean(scores, axis=1), 'o-', color=colour,\n",
+ "                 label=caption)\n",
+ "\n",
+ "    plt.legend(loc=\"best\")\n",
+ "    return plt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEWCAYAAACXGLsWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJztvXmcFOW1//8+PfvGjmwioFFZBWFAiRpBo+ISDei9ajDRRCUa8cbrN0aMxng1JCTfLGr05xqXKFdU3Ei+JAYNY1wDI6IGBERc2JRNltmnu8/vj6eqp7qnZ3qGmZ5p4Lxfr3pV1fM8VXWqpud86tlOiapiGIZhGM0R6mwDDMMwjMzHxMIwDMNIiYmFYRiGkRITC8MwDCMlJhaGYRhGSkwsDMMwjJSYWBj7NSLyVxG5uLPtyCREZLiIlIuIdOA1+4jIByKS11HXNNoXEwsjLYjIJyLy9c62Q1VPV9VH03FuEekiIreLyGciUiEiH3n7vdJxvXbkNuA3qqoiUuz9rab7mSJS4t3TeYG0UhH5i4h8KSI7RWSliMwWke5e/iUiEvGeQ4WIrBORK/3jVfULYDEwowPv02hHTCyMfRYRye7Ea+cCLwMjgClAF2AisB2YsBfn65B7EZF+wGTgeQBVrQC+D9wuIr29Yr8GylV1vnfMV4Ey4HVgqKp2w91zGBgdOP2bqlqsqsXAucCvReToQP5c71rGvoiq2mJLuy/AJ8DXm8g7C1gO7ATeAI4K5M0CPgL2ACuBqYG8S3AO6/c4p/xzL+014DfAl8DHwOmBY8qAywLHN1d2CPBP79ovAXcDjzdxD5cBXwDFzTwDBb4S2H8E+Lm3PQnYAFwPfA48BnwAnBUonw1sBcZ6+8d6z2sn8C4wKeHZrPNs/xiY3oRN3wFeSpL+CPCEZ9d2oG8g7zXgDyn+3pcAryWkLQG+lXA/VcCgzv592tL6xWoWRofivWk+hHvD7AncBywItGV/BJwAdAX+B3jcexv2OQbnFPsAswNpq4FeuLfiPzbTHt9c2f/FObiewC3At5u5la8Df1P3Zr639AV6AINwzTNPABcG8k8DtqnqMhEZAPw/nED2AH4EPCMivUWkCLgTJ3wlwFdxYpyMUbj7T+S/cUIxH/iRqn4O4J17IvBMa25MRMYDRwDlfpqqhoG1xNdGjH0EEwujo5kB3Keq/1LViLr+hFrcWzOq+rSqblLVqKo+CXxIfLPOJlX9g6qGVbXaS/tUVR9Q1QjwKNAPJybJSFpWRA4BxgM3q2qdqr4GLGjmPnoCm/fqCTQQBX6mqrXevfwvcLaIFHr538IJCMBFwEJVXeg9m0U4R3xG4FwjRaRAVTer6oomrtkNV/uIQ1W/BFYAhcCzgazuOD/xuZ8gIr/2+i0qReSmQNljvfQ9ONF9DPf3C7LHs8HYxzCxMDqaQcD/8ZzKThHZCQwE+gOIyHdEZHkgbySuFuCzPsk5Y45MVau8zeImrt9U2f7AjkBaU9fy2Y4TmrawVVVrAvasxTVFfcMTjLNxAgLuuf1HwnM7HuinqpXA+cAVwGYR+X8iMrSJa34JlCQmishFwGBc89uvEspHCdyrqv5YXb/Fc7imJZ+3VLWbV7vpi+vP+UXCpUpwzWjGPoaJhdHRrAdme07FXwpV9QkRGQQ8AMwEenoO6d9AsEkpXWGSNwM9Am/14ESsKV4CTvOaaZqiCvem7tM3IT/ZvfhNUecAKz0BAffcHkt4bkWqOgdAVV9U1VNwTn0V7jkm4z1c81AMETkI1w90Oa558D9F5ATvvJXAv4BpzdxnI9SNfnoG+EbgOtnAV3D9LcY+homFkU5yRCQ/sGTjnNgVInKMOIpE5EwRKQGKcA50K4CIfBdXs0g7qvoprlnnFhHJFZGJBBxdEh7DOfBnRGSoiIREpKeI/ERE/Kah5cC3RCRLRKYAJ7bAlHnAqcCVNNQqAB7H1ThO886XLyKT
RORgbw7DOZ5w1QIVuNpAMhYBY0UkP5B2F/C8qi5W1c3Aj4EHAv1IPwa+JyKzPGFBRA7GDQhIioj0BKbimrZ8JgCfeM/a2McwsTDSyUKgOrDcoqrluDfYu3BNHGtxI2lQ1ZXAb4E3cSONRuFGP3UU02kY/vpz4Emc822EqtbiOrlX4Rzwblw7fS/cmzjAD3GCs9M79/OpDPCc9Zu4TuonA+nrcbWNn+DEdD1wHe5/OARcC2wCduBE6UqS4L3x/8M7FyLyTVxz1nWBMg9657rZ238NOAn4GrDGawL7G26k2R8Cp5/oz7PANadtBa4O5E8H7k31DIzMRFTt40eGkQwReRJYpao/62xb2hMRGY7r3J+gHeQAvBrJK8DRwX4aY9/BxMIwPLzhnjtw8xROxdUEJqrqO51qmGFkAJ02A9YwMpC+uGGjPXET5q40oTAMh9UsDMMwjJRYB7dhGIaRkv2mGapXr146ePDgdj9vZWUlRUXNDaXvfDLdxky3DzLfRrOv7WS6jZ1l39tvv71NVXunLNjZwanaaxk3bpymg8WLF6flvO1JptuY6fapZr6NZl/byXQbO8s+XIRhCyRoGIZhtB0TC8MwDCMlaRMLEXlIRLaIyL+byBcRuVNE1orIeyIyNpB3sYh86C32SUzDMIxOJp0d3I/gQjr8qYn804HDveUY4B7gGBHpAfwMKMXFCXpbRBaoC6FsGAckIsLHH39MTU1mTn7u2rUrH3zwQWeb0SyZbmO67cvPz+fggw8mJydnr45Pm1io6j9FZHAzRc4B/uR1sLwlIt28j9xMAhap6g4AEVmE+4TjE02eyTD2c4qKiigpKWHw4ME0/V2nzmPPnj2UlDSKfJ5RZLqN6bRPVdm+fTsbNmxgyJAm4z82S2cOnR1A/PcCNnhpTaU3QkRm4H0Avk+fPpSVlbW7kRUVFWk5b3uS6TZmun2Q+TaWlJSQm5tLRUVbPsyXPiKRCHv2NPqmUkaR6Tam277c3Fx27ty517/zfXqehareD9wPUFpaqpMmTWr3a5SVlZGO87YnmW5jptsHmW/jO++8Q5cuXTrbjCbJ9Ld2yHwbO8K+/Px8jj766L06tjNHQ20k/uMyB3tpTaUbhmEYnURnisUC4DveqKhjgV3qYvm/CJwqIt1FpDsu+ueLnWinYRzwbN++nTFjxjBmzBj69u3LgAEDYvt1dXUtOsd3v/tdVq9e3WyZu+++m7lz57aHyUY7k7ZmKBF5AtdZ3UtENuBGOOUAqOq9uA/jnIH7+E0V8F0vb4eI3AYs9U51q9/ZbRhGC5k7F268ET77DA45BGbPhunT9/p0PXv2ZPny5QDccsstFBcX86Mf/SiWX1tb2zDTN5T8HfThhx9OeZ2rrrpqr21MJ6nu7UAgbXeuqheqaj9VzVHVg1X1j6p6rycUeDPNr1LVw1R1lLovqPnHPqSqX/GW1L8wwzAamDsXZsyATz8FVbeeMcOltzNr165l+PDhXHrppYwYMYLNmzczY8YMSktLGTFiBLfeemus7PHHH8/y5csJh8N069aNWbNmMXr0aCZOnMiWLVsAuOmmm7j99ttj5WfNmsWECRM48sgjeeONNwAXQ+ncc89l+PDhnHfeeZSWlsaELMh1113H8OHDOeqoo7j++usB+PzzzznnnHM46qijGD16NP/6l/uo4a9//WtGjhzJyJEj+cMf/hB3b9OnT4/d21//+lcmTpzI2LFjOf/886msrGz3Z5qp7NMd3IZxQHLNNZDEOcZ46y2oTfgabFUVXHopPPBA8mPGjAHPSbeWVatWcc8993Diie4T43PmzKFHjx6Ew2EmT57Meeedx/Dhw+OO2bVrFyeeeCJz5szh2muv5aGHHmLWrFmNzq2qLFmyhAULFnDrrbfyt7/9jT/84Q/07duXZ555hnfffZexY8c2Ou6LL75g4cKFrFixAhFh586dgKu5nHLKKcycOZNwOExVVRX/+te/mDt3LkuXLiUcDjNhwgQmTZpE
QUEBq1at4k9/+hOlpaVs2bKFOXPm8PLLL1NYWMjs2bO54447+MlPfrJXz21f48CtUxnG/kqiUKRKbyOHHXZYnMN+4oknGDt2LGPHjuWDDz5g5cqVjY4pKCjg9NNPB2DcuHF88sknSc89bdq0RmVee+01LrjgAgBGjx7NiBEjGh3Xo0cPQqEQl19+Oc8991wsmmtZWRnf//73AcjOzqZLly689tprnHvuuRQUFFBSUsI3v/lNXn311di9lZaWAvDGG2+wcuVKvvrVrzJmzBjmzp3bpN37I1azMIx9jVQ1gMGDXdNTIoMGQRrmkgTDan/44YfccccdLFmyhG7dunHRRRclnXWem5sb287KyiIcDic9d15eXsoyycjJyaG8vJxFixbx9NNPc8899/DMM88AtGpSY/DeVJUpU6bw2GOPtfj4/QmrWRjG/sbs2VBYGJ9WWOjS08zu3bspKSmhS5cubN68mRdfbP+BjMcddxxPPfUUAO+//37SmsuePXvYvXs3Z511Fr///e955x33ddzJkydz7733Am4S3O7duznhhBN47rnnqK6upqKighdeeIETTjih0Tm/+tWv8sorr7Bu3TrA9Z18+OGH7X5/mYrVLAxjf8Mf9dSOo6FaytixYxk+fDhDhw5l0KBBHHfcce1+jauvvprvfOc7DB8+PLZ07do1rsyuXbuYNm0atbW1RKNRfve73wFw1113cfnll3PfffeRnZ3Nfffdx4QJE7jwwgsZP348AFdeeSWjRo1i7dq1cefs06cPf/zjHzn//PNjw4V/8YtfcPjhh7f7PWYkLfnoxb6w2MePMpdMt081821ctmxZZ5vQLLt37+6wa9XX12t1dbWqqq5Zs0YHDx6s9fX1KY/rSBv3ho6wb+XKlY3SaOHHj6xmYRjGPkVFRQUnn3wy4XAYVY3VEoz0Yk/YMIx9im7duvH22293thkHHNbBbRiGYaTExMIwDMNIiYmFYRiGkRITC8MwDCMlJhaGYbSIzz//nAsuuIDDDjuMcePGccYZZ7BmzZrONispgwcPZtu2bYCbTJeMSy65hPnz5zd7nkceeYRNmzbF9i+77LKkkwAPBEwsDGM/ZO77cxl8+2BC/xNi8O2Dmft+2yLOqipTp05l0qRJfPTRR7z99tv88pe/5Isvvogr15qQHB2FH612b0gUiwcffLBRUMRMoCOeu4mFYexnzH1/LjP+PINPd32Kony661Nm/HlGmwRj8eLF5OTkcMUVV8TSRo8ezQknnEBZWRmnnXYaZ599dsyR/u53v4uF/PZDjldWVnLmmWcyevRoRo4cyZNPPgnArFmzYqHEg9/I8Ln33nu57rrrYvuPPPIIM2fOBOCb3/wm48aNY8SIEdx///1JbS8uLgac4M2cOZMjjzySr3/967Gw6AC33nor48ePZ+TIkcyYMQNVZf78+ZSXlzN9+nTGjBlDdXU1kyZNorzcfU3hiSeeYNSoUYwcOTIWAt2/3o033sjo0aM59thjGwkqwCuvvBL7eNTRRx8d+/b2r371K0aNGsXo0aNjUXiXL1/Osccey1FHHcXUqVP58ssvAZg0aRLXXHMNpaWl3HHHHWzdupVzzz2X8ePHM378eF5//fWm/6B7Q0tm7u0Li83gzlwy3T7VzLcxOIP7h3/9oZ748IlNLnm35Sm30GjJuy2vyWN++NcfNnv9O+64Q6+55pqkeYsXL9bCwkJdt26dqqqWl5fryJEjtaKiQvfs2aPDhw/XZcuW6fz58/Wyyy6LHbdz507dtm2bHnHEERqNRlVV9csvv2x0/i1btuhhhx0W258yZYq++uqrqqq6fft2VVWtqqrSESNG6LZt21RVddCgQbp161ZVVS0qKlJV1ccff1y//vWvazgc1o0bN2rXrl316aefjjuPqupFF12kCxYsUFXVE088UZcuXRrL8/c3btyoAwcO1C1btmh9fb1OnjxZn3vuOVVVBWLHX3fddXrbbbc1uqezzjpLX3vtNVVV
3bNnj9bX1+v8+fN14sSJWllZGWfTqFGjtKysTFVVf/rTn+oPf/jDmC1XXnll7JwXXnhh7Ll8+umnOnTo0EbXbcsMbqtZGMZ+Rm0keSjyptLbg3HjxjFkyBDAhRCfOnUqRUVFFBcXM23aNF599VVGjRrFokWLuP7663n11Vfp2rUrXbt2JT8/n0svvZRnn32WwsQAiEDv3r059NBDeeutt9i+fTurVq2KxZy68847Y2/w69evbzaw3+uvv86FF15IVlYW/fv356STTorlLV68mGOOOYZRo0bxj3/8gxUrVjR7v0uXLmXSpEn07t2b7Oxspk+fzj//+U/ARdQ966yzYs8lWRjz4447jmuvvZY777yTnTt3kp2dTVlZGd/97ndjz6BHjx7s2rWLnTt3xr4VcvHFF8euA3D++efHtl966SVmzpzJmDFjOPvss9m9ezcVFRXN3kdrsBnchrGPcfuU5kOUD759MJ/uahyifFDXQZRdUrZX1xwxYkSzncHJnHwiRxxxBMuWLWPhwoXcdNNNnHzyydx8880sWbKEl19+mfnz53PXXXexaNEixo0bB8DZZ5/NrbfeygUXXMBTTz3F0KFDmTp1KiJCWVkZL730Em+++SaFhYVMmjQpaTj0VNTU1PCDH/yA8vJyBg4cyC233LJX5/HJycmJhUFvKrT6rFmzOPPMM1m4cCHHHXfcXkfnDYZQj0ajvPXWW+Tn5++d4SmwmoVh7GfMPnk2hTnxzrswp5DZJ+99iPKTTjqJ2trauH6B9957L/aRoCAnnHACzz//PFVVVVRWVvLcc89xwgknsGnTJgoLC7nooou47rrrWLZsGRUVFezatYszzjiD3//+97z77rtkZWWxfPlyli9fHvss69SpU3nhhRd44oknYh8+2rVrF927d6ewsJBVq1bx1ltvNXsPxx13HE8++SSRSITNmzezePFigJgw9OrVi4qKijhRLCkpifUnBJkwYQKvvPIK27ZtIxKJ8MQTT8Te/lvCRx99xKhRo7j++usZP348q1atYvLkyTz88MNUVVUBsGPHDrp27Ur37t1jz/mxxx5r8jqnnnpq7JOwQNJPzbYFq1kYxn7G9FEuFPmNL9/IZ7s+45CuhzD75Nmx9L1BRHjuuee45ppr+NWvfkV+fj6DBw/m9ttvZ+PGjXFlx44dyyWXXMKECRMAN9z06KOP5sUXX+S6664jFAqRk5PDPffcw549ezjnnHOoqalBVWOhxBPp3r07w4YNY+XKlbHzTpkyhXvvvZdhw4Zx5JFHcuyxxzZ7D9/4xjd48803GT58OIcccggTJ04EXKypyy+/nJEjR9K3b99YqHJww2uvuOIKCgoKePPNN2Pp/fr1Y86cOUyePBlV5cwzz+Scc85p8fO8/fbbWbx4MaFQiBEjRnD66adTV1fHmjVrKC0tJTc3lzPOOINf/OIXPProo1xxxRVUVVVx6KGH8vDDDyc955133slVV13FUUcdRTgc5mtf+1rs2x3tgbj+jX2f0tJS9UcptCdlZWVMmjSp3c/bnmS6jZluH2S+je+88w5HH310Z5vRJHv27KGkpKSzzWiWTLexI+z74IMPGDZsWFyaiLytqqWpjrVmKMMwDCMlJhaGYRhGSkwsDGMfYX9pMjY6h7b+fkwsDGMfIBKJsH37dhMMY69QVbZv396mYbU2Gsow9gEqKyvZs2cPW7du7WxTklJTU5O28f3tRabbmG778vPzOfjgg/f6+LSKhYhMAe4AsoAHVXVOQv4g4CGgN7ADuEhVN3h5EeB9r+hnqnp2Om01jExGVWMzpDORsrKyjB6tBZlvY6bblzaxEJEs4G7gFGADsFREFqhqML7vb4A/qeqjInIS8Evg215etaqOSZd9hmEYRstJZ5/FBGCtqq5T1TpgHpA4a2U48A9ve3GSfMMwDCMDSNukPBE5D5iiqpd5+98GjlHVmYEy/wv8S1XvEJFpwDNAL1XdLiJhYDkQBuao6vNJrjEDmAHQ
p0+fcfPmzWv3+6ioqIiFOM5UMt3GTLcPMt9Gs6/tZLqNnWXf5MmTWzQpL20hw4HzcP0U/v63gbsSyvQHngXewfVtbAC6eXkDvPWhwCfAYc1dz0KUZy6Zbp9q5tto9rWdTLexs+yjhSHK09nBvREYGNg/2EuLoaqbgGkAIlIMnKuqO728jd56nYiUAUcDH6XRXsMwDKMJ0tlnsRQ4XESGiEgucAGwIFhARHqJiG/DDbiRUYhIdxHJ88sAxwEH5odvDcMwMoC0iYWqhoGZwIvAB8BTqrpCRG4VEX8Y7CRgtYisAfoAfgzlYUC5iLyL6/ieo/GjqAzDMIwOJK3zLFR1IbAwIe3mwPZ8oNEXVVT1DWBUOm0zDMMwWo6F+zAMwzBSYmJhGIZhpMTEwjAMw0iJiYVhGIaREhMLwzAMIyUmFoZhGEZKTCwMwzCMlJhYGIZhGCkxsTAMwzBSYmJhGIZhpMTEwjAMw0iJiYVhGIaREhMLwzAMIyUmFoZhGEZKTCwMwzCMlJhYGIZhGCkxsTAMwzBSYmJhGIZhpMTEwjAMw0iJiYVhGIaREhMLwzAMIyUmFoZhGEZKTCwMwzCMlJhYGIZhGCkxsTAMwzBSYmJhGIZhpCStYiEiU0RktYisFZFZSfIHicjLIvKeiJSJyMGBvItF5ENvuTiddhqGYRjNkzaxEJEs4G7gdGA4cKGIDE8o9hvgT6p6FHAr8Evv2B7Az4BjgAnAz0Ske7psNQzDMJonnTWLCcBaVV2nqnXAPOCchDLDgX9424sD+acBi1R1h6p+CSwCpqTRVsMwDKMZ0ikWA4D1gf0NXlqQd4Fp3vZUoEREerbwWMMwDKODyO7k6/8IuEtELgH+CWwEIi09WERmADMA+vTpQ1lZWbsbWFFRkZbztieZbmOm2weZb6PZ13Yy3cZMtw9VTcsCTAReDOzfANzQTPliYIO3fSFwXyDvPuDC5q43btw4TQeLFy9Oy3nbk0y3MdPtU818G82+tpPpNnaWfUC5tsCnp7MZailwuIgMEZFc4AJgQbCAiPQSEd+GG4CHvO0XgVNFpLvXsX2ql2YYhmF0AmkTC1UNAzNxTv4D4ClVXSEit4rI2V6xScBqEVkD9AFme8fuAG7DCc5S4FYvzTAMw+gE0tpnoaoLgYUJaTcHtucD85s49iEaahqGYRhGJ2IzuA3DMIyUmFgYhmEYKTGxAFCF2lqIRjvbEsMwjIyks+dZZAY1NfDppxAKQV4eFBVBQQHk5na2ZYZhGBmBiYVPKATFxVBfD7t2wfbtIOJqHBs3QmEh5OdDTg5k22MzDOPAwrxeIjk5bvEJhZyAbNvmmqtUXX5hoVtyc90SshY9wzD2X0wsWoIvCD6RCFRVwe7d8WWCzVc5Oa5mYhiGsR9gYrE3ZGW5JUg47MTjyy9d7SMUcs1WxcWuHyQ315qvDMPYZzHv1V5kZ8eLgaprvtq+3dVERJyAFBW55qu8PFf7SBQdwzCMDMQa2ufOhSOPhKFDYcIEePbZ9jmviKtNFBZCSYmrYeTnu5FXX3zhRl999BF8/DFs2QIVFa4z3QVONAzDyCgO7JrF3LkwY4brfwA36unHP3bb06Y1fdze4g/NzctrSAuHYc8e13zl93Hk5zf0fyR2uBuGYXQCB7ZY3Hhjg1D4VFfDnDnpEYtkJGu+CoedeGzf7vazspxwFBc3dLZb85VhGB3IgS0Wn32WPH3jRrjkEjj0UPoVFLiO68MOg9690z/CSaRxbSIahbo611zlzzLPyWno/1B16TZ81zCMNHFgi8Uhh7i+g0Ty82HDBnj1VY6sqYHbb3fpxcVw6KFOOILrQw91jjtdJGu+ikRcP8euXU5I1q6Nn33uC44N3zUMox04sMVi9uz4PgtwjvbXv3bNUNEob5aVMTErC9atcx3S69bB0qXw/PPxndF9+zYIR1BMBg5Mz5BZv2kKmp59LtIwedBmnxuG0QZa7DlE5HjgcFV9WER6A8Wq+nH6TOsApk93
6xtucDWJ/v1h1qyG/opQiNo+fWDECDjxxPhjq6vhk0/iReSjj+Avf4GdOxvK5eTAoEHJayS9erXvm39i85U/fNdmnxuG0UZaJBYi8jOgFDgSeBjIAR4HjkufaR3E9OlOHNavd2/nLaWgAIYNc0siO3Y0CEhQTMrKXJORT5cuDbWRxGatwsI231ps+G5w9nk0arPPDcNoNS2tWUwFjgaWAajqJhEpSZtV+zo9erhl/Pj49EjEdZ4HayLr1sFbbzWe39GvX+OayGGHwcEHt20klD+zPIjNPjcMIwUt9QB1qqoiogAiksbe3P2YrCzXqX7IITB5cnxedbWboJcoJC+84PohfHJzYfDgOBHpEgq5PpMePfauRrA3s8+zs12a3zdiGMZ+TUvF4ikRuQ/oJiKXA98DHkifWQcgBQUwfLhbgqjGN2sF1y+/DPX1jPXLdusGQ4Y0rpEMGdLQGd4Smmq+qqlxEwiTEQq5JSurYTsUcqISibhaSzDdFxlfiIL7fpphGBlDi8RCVX8jIqcAu3H9Fjer6qK0WmY4RKBnT7dMmBCfFw7Dhg28t3gxR4XDDSLy+uswf3582QEDGgQkKCb9+7esWSvZ8N0gfgd6NOqWSKRhPxxumGDYknAmqg2C4QuJH7wxUYiCZZoSncQ0wzBaTUqxEJEs4CVVnQyYQGQS2dkweDA7JkxwI7aCVFXF10T85Zln4msHeXmu5hHsaPeFpEePxtd89lk3w33TpvjRY83VCEKhveuw98XFF536+vi04NLU8UFx8GfDB8XFF51wGLZujReh5kQncd8w9nNSioWqRkQkKiJdVXVXqvL7JP4/fkWF2w86mWjU9SckNrNkOoWFMHKkW4KouqG0iaO11qyBRYucQ/bp1i2+FrJ1Kzz+uAt4COmPpRV0xO0V3iRRZCIRJxTRqOvkTyZAyUQnUSCaaoIL7jfVDGciZOwDtLTPogJ4X0QWAZV+oqr+V1qs6mjy8uArX4lvQvG31693Q1zDYedI6+rcNiR3GllZDf/0wbfYTEHEhS3p3RuOPTY+Lxx295vYP/Lqq/D008nPV10N11wDd9/tnmN+fkNzlbd9RHW164DPz2/IT1IuaX7iuq1C3ZQTFmldv04i0Wh8LSgScaLaJq7UAAAgAElEQVT6wgvwm9/A5s1uhNu118I3vpH8HE2JUCjkfneffdZYkBJfYloqQpn0mzT2CVoqFs96y/5LsF08OLEtK8s51kQSRSUSaXhL9YWlvr6hTNARBNvk/esG//E7i+xs1yQ1ZEjjvIoKF8Y9WZNPJOJqHzU1bqmsdH0UtbVQW0uvigpXpqYmfp7J3pCb27TQNCU4LSjXbfNmZ5+fVlAQf2x2dvMONtnf7dlnXbDK6mq3v2kT/PSn7pwtrYkFazn+KLXE2o//+2ruHMlsTyY2yUTIfwFqToR8O0yI9lta2sH9qIjkAkd4SatVtb65Y/Z7WuPY/eaOxJpLUFTC4fhaS/DY4D9msuaMjqC42PVRbNzYOG/AAHig6cFxb6xYwSS/TyUadSJSUxO/9rf9JZiWuF1d3TjNX+/Y0fR5/CCMSRiT6v79uSdBAQkKTjJBeu65BqHwqa6Gm292Djh4bHAJpufmurUfYLI98WtBwdpQa/qEgtTWuvhkwefVnAglDk5oajRcUHwSa0UmSh1KS2dwTwIeBT4BBBgoIher6j9THDcFuAPIAh5U1TkJ+Yd45+3mlZmlqgtFZDDwAbDaK/qWql7RslvKQERaPqktOKIoKCyRSLyw+Gtw+Xv2xP/zBJvD/H/atv5zzZrl+iiCDrCgwKW3lFDIHdOWJp+9wQ/9HhQQX3Rqali+ejVj+vWLF5+mBC1xnaQ2FUtLxpdfwg9+0Crzv5aTE1/baUpcgiLTEjFKTPfFKTEtVZ+RH58s+Lz3Roj+/Gf43e8aN9ulqh1B4+a2YF4o1DAqL3hcS0Up2X6yvP2YljZD/RY4VVVXA4jIEcATwLimDvBGUd0NnAJsAJaK
yAJVXRkodhPwlKreIyLDgYXAYC/vI1VN+cK33xFsDmsJ0ahr3hg8uEFg/KYwf6mrc0tTb9bB0UHBN8FE/KaTZKOhMh3/zTwnx325MIGd+fmNR5S1lQkTktfE+vaFefMaRKWurkFkEsXJWzZs3MghxcWN0mOLH304WV5NTdvvJSenWdE5qq7ODe9uiSA1JWxvvAF33tkwgGLTJrjpJve7nTq1YfJo8LeZ2EyXOHghmBeJNMRta02tqTUk1vgTa0+JAhbcj0QaRiq2VLDmzXPPaP16N9l39uyGmHftTEvFIscXCgBVXSMiqerEE4C1qroOQETmAecAQbFQoIu33RXY1EJ7DB//h9bU/Icgwbe8YM0lsZ/Fbw5L9jZ32mlw+unxP9ra2sZNA4lrX6gOpCaEpmpiN94Ihx/eqlOtW7GCQ/ZWzFRbJEjNLs2Vr6khq7YWPv88/jrB2tfeOuWaGvcM/VF34H7z/vfr/UjKifv+Etg/qqYGunePz09SrsX7ya7v5/vbiWt/O1nZcNjVqFoy+g5cLeymmxpeBj791EXRhrQIRkvFolxEHsQFDwSYDpSnOGYAsD6wvwE4JqHMLcDfReRqoAj4eiBviIi8g5sIeJOqvtpCW42maE2tpbnmMF9cfAHw1001M/jOKli2mf6D2LlEmv7HaWqd6v6TrX17/H+6YHoqEWzunJlSExNpfkJlO/BOsF8qEb8JMJXwXHhh0xe48cb4l5pgzdkfSBJsog3ue0tWba1rhmrNeTqQE/3ab0tFcOXKxgNGqqrcs0qDWIi2QPFFJA+4CjjeS3oV+P9UtbaZY84DpqjqZd7+t4FjVHVmoMy1ng2/FZGJwB+BkbiotsWqul1ExgHPAyNUdXfCNWYAMwD69Okzbt68eS287ZZTUVFBcWui0XYCmW5jm+xrzRtpqrLNTN6rqK6m2O9HaW6S395ctx2oqK2luC3OvqU1ub2s8VXU1FCcGKCylRx70UXkb9nSKL3moIN46/HHkxzROlpto/fCFIpEkHAYCaxD/r6f5qcHy3rbjcp626HAcRKJUF9TQ55IwzkSyyZcr0d5Ocn+WirCK//4R4tvc/LkyW+rammqci0ViyKgRlUj3n4WkKeqVc0cMxG4RVVP8/ZvAFDVXwbKrMAJynpvfx1wrKpuSThXGfAjVW2yNlNaWqrl5akqO62nrKyMSZMmtft525NMtzHT7YM02BhsK2/L2tsue+stJh1zTPIyibW1VDW+lh7TVHoSQSn7+GMmJRtynYymaoULFrihxcE+lvx8+PnP0YS5KZpgQqIX00CKv/3auk85/rDBxHyeSFw5d974Ezc6b1wlN/5ZxM4lQpxfFSGq8TXpqMSfORqNsnztBsZ85WA0waooDceqauzcvSedSfamzTRi0CD3rZ0WIiItEouWNkO9jGsi8qY4UwD8HfhqM8csBQ4XkSHARuAC4FsJZT4DTgYeEZFhQD6w1fu40g5v9vihwOHAuhbaahidT3v3zfhRfzsYVee6/DWes1KNxqdt3EDVwL5xeagS1ShRjRCNujVANBIhihKNRoj659Eo0QunUJgXpsev7iR70+eE+/dh23Uz2XPOicDuBs+t2lizohpns2i8WAhQR5iPI9sDxyQ0hXr3JsH39YCTVxRJvA5CzDAloZbpC65bx+yNRuOvAQhCOFLPzi83ewUVvDKJvyD/2B1XXkKv2b8jVBNo4CksdJ3caaClYpGvqr5QoKoVItJssB9VDYvITOBF3LDYh1R1hYjcCpSr6gLg/wAPiMh/457OJV4o9K8Bt4pIPRAFrlDVHa2/PcPYt4k5ZCAcDcelBddA0jTnrBsWP6+pNFUlSpRobEACMb8Vc45BX6buunXRejZUfh6XJiIIElv7SLafnu1dwpXJRqi/+CK2XPxtl+551/ZqYA1tqKC4/+B2Ols7o0po90oKjhge209WJkj4qsPZdVBfSm77NVkbN8HAgcgvftHpo6EqRWSsqi4DEJFSoDrF
MajqQtxw2GDazYHtlST52p6qPgM800LbDKNDSeasg043uI54b8/BBZI48eC5kjjr2kgt63asiznhRGedmOb75qTOOokTFxGyJbtRWksJSYjivMztN8t4mhso0QzVF55H9YXnUVFXweE9Dk/rKMOWisU1wNMi4g9t7Qecnx6TDKP9aMqRJ6ZFNcqX1V82cuLhaDjOoTf11h1rIklME+dIm3PWIpI0LYg5Y6OzaVYsRGQ8sF5Vl4rIUOD7wDTgb8DHHWCfcYAQdNqJb+eN3ryTOPKoRolohEg0Eu/Um2g+SUyrj9azrWobIXHzVoJOOyShuLfu1rxxG0Y6efaDZ5nz2hw27dnEwK4D+cXJv2D6qM5phrqPhrkPE4GfAFfjQuncD5yXFquMjCcSjTTpyJM59S8qvmj01h6JRuKcelNNKolpjd7KA+vs0N459ZCEKMq1rwV3NkHn17+kP7OOn8W0YftAhIBO4NkPnuXHi35Mddj1CHy26zNm/NlNykuHYKQSi6xAx/L5wP1+f4KILG93a4yMJBwNUxepozZcS1V9FdXhatf23lTTi5fmO+yIRqisr2zk3HOzc+1N3YiR6Pw27tnIjxe5mdudLRiqSkQjhKNhIlFv7e37NVx/OxKNENZAuWik2WP98us/X8/y95Y3Ojas3jWi0bi8x997PPasfKrqq7jx5Rs7RyxEJFtVw7ghrjNacayxDxKOhqmP1FMXqaOyvpKq+qrYm78g5GTlUJBd0Oq39vzstk3Y2lc50N6UVTX2chFcaiO1sd9VoyVaR124jpvLbm7k/KrD1fzk5Z/w4Y4PY04yqUP2miCT5fnOfM/uPeSuyW2U5x8bdMyJTj1xnkTa+LD57CzJIjuUTVYoi6r65NPcPtv1WRoMS+3wnwBeEZFtuNFPrwKIyFeA/fOreQcQkWgk9g9bVV9FdX019VEX4iAkIbJD2eRn58fa8Y3Wkc43ZVWlPtqE823CSTfnsPc2z1+q66qJvOF+T4kT3drKnro93L3k7piTzA5lNzhNyWpIS8jLDmXHfsdZoSzys/LpVtAtvrxkEwq5MtmS3SgvO+TlB/KC+VmhrDgHnpiX1EbJjivv27ju/XUMGzMsLi/uHiQr7iVtwgMT2LincaDKQ7oe0q7P36dZsVDV2SLyMm7009+1YVpiCNd3YewjRKIR51zCThiq6qsIazjWfJQTyiE3O5d8OTBrAHtLJBqhqr6KirqKWE3M3/7Z4p8lfVO+4aUbeHvT2w2O2Pu7xN6yI+5NO5YXqaOyuhIt1zgH3Z5kh7LJCeWQl5VHbnau+z1k5Tba75rXlZysQF5WLjlZOVRsq6Bf/35xecnKNZf3H0//B59XfN7ItgElA1hy+ZI23+OKpSsYMb6dIwu3I5X5lfQr6dfi8rOOnxX3MgJQmFPI7JM7aVKeqr6VJG1NWqwx2oWoRqmL1FEfqaeqvorK+krqIw01hqxQFjlZOeSHDixhiGqU6vrqOMdeWVdJRV0Fq7eu5v1/v9+QV+eeW3C/or6Cyrp4QagJtz78d0V9Bc+vfj7OUSY61KKCorj9qh1VHNT3oAZn64l7UkecmBfKIS+7YTsxLzcrl6xQ275x3h6O+MYTbmzk/AqyC5h1fCu+l3IA4ddOM2U0lJHh+KOPKuoqYs6vPlIf62zOznJvjO3VZ9BRbfCqSk24prHD9rabc+CNHL2X11Qbb4xVDZvZoWyKc4opyi1yS45b9yzoSVFOEYW5hbH8wpxCinOLY2X89fde+B5fVH7R6DJ786ac6W/F7UGi8zsQ+njayrRh05g2bFpsUl46B4uYWOxDRDVKfaSe+mg91fXVVNZVUhuppS5Sx6bdm2I1hrzs9ISibrINXuGMI86Ie1MPOuk1X6xh6fKlVNZVttjRV9ZXtrhTMSShRs66MKeQ/iX9G9J8J57g0H1Hv3n1ZkaPHR1Lz83KbfPzuulrN9mbcivxnZ+ReZhYZCjBDsyaeveGXRupjc07yA5lk5OV
Q0l2SYfM7t1RvYNbym5J2gZ/9d+u5uq/pejC8houBYlz1r4DP6jooDhHH+f8m3D0/jovK6/Nb1QrNqzg4C4Ht+kcidibsrE/YWKRAfjCUB+pjzW91IRrYp3PWeJqDMW5HRPuQVVZt3Md5RvLWbJxCUs3LeWjLz9q9phZx89K7tBzi9j0wSbGjBtDcW4x+dn5B9S8CntTNvYXTCw6mOA49NpILZV1Thj8Wc/+cLminKIOc6q14Vre3/I+5ZvKWbpxKUs3LWV7tQvl3C2/G6X9S/nPEf/Jg8seZGvV1kbHDygZwNUTmqlZfAK9i3qnyXrDaF+CUXyb2m8uL+5cTZSLm9Dq7UejUSrqKhrl+6Fp/EgGifso5GSl+sp12zGxSDN+H0Nt2AlDdbja+wYAsfHdhTmFHfq2/WX1l5RvLqd8YzlLNy1l+efLXRMXMLjbYE4achLj+49n/IDxfKXHV2LzLPqX9Lc2eKNZWuJIE8OqB/Ni52miTHNOMy7elzvQEfsMfJSK2oom8/39kPdd+xANccKA2P9BsnWjgJCBuGLJjvHPGYzwuyFrAwO7DGwyv7n9kITS7kNMLNqRZGExIlH3wRd/kk1HC4Oq8umuT1mycYmrOWxayprtrgMhO5TNqINGcfGYi5049B/fbA3A2uDTT5MO1quRtvSttjVvtHGxtxLy/P1kb7PBsr4jjnO03qz/ljjZ5soE1y1xok3lbcrexKE9Dk3pgDuLkIQoyCnoVBuaw8RiL0lHWIz2oD5Sz7+3/Julm5bGmpT8pqMueV0o7VfK1KFTGd9/PGP6jmn1j/NAboMPBk1MFh036MiD35RIGsYckjpj/+008a0WXJnWOtjEss0527Y64iN6HdHaR9rhZIfM5e0t9uRaQCaHxdhVs4slO5bwl9f/wtKNS3nn83diE8UO6XoIJww6gQkDJjC+/3iO6HnEARW6ozXO3W8vDjZlJIYxzwplEZJQLGR5VlZg3wvL4C+JIc5b6oyb4tOsTxnUbVAan5ZhNI+JRQLBsBiV9ZXURercSKAMCIuhqqzfvT5WYyjfVM6qbatQlCzJYuRBI5k+ajrjB7gmpb7FfTvcxraQLMR5UueeGO22ie1QKESWOIeeJVlkZ2Unde7B9uJE557o6A3jQMXEAtektKN6BxV1FY3CYgjSYUNWk9m1cuvK2PDV8o3lfF7pYucU5xZT2q+UM484k957ejNt0jQKc5r9LHq701LnHtex2ELnHpIQOVk5Dc7eC9jmO/emHLrfRt5a557p7cWG0dmYWODa+bdXbY/NA4ijA18o99TuYdnmZTFxeOfzd2IhKgaUDGDiwImU9i9l/IDxDO05NBbPZ8XSFWkXClWlNlJLOBKOc+7+W3uicw++vW/I2sCALgPimmeSbRuGkbmYWHhkhbLaHEyttWzcvTHWEb1k0xJWbVtFVKOEJMTw3sO5YMQFlA4oZXz/8fQv6d+htqm6CKf+CJxQKERRThHFhcXkZbtgdS118PYVOsPY9zGx6CAi0QgfbPsg1t+wZOMSNldsBqAop4ix/cZyzTHXMH7AeMb2G9spTV/B0NeCUJhTSM+CnrGIpfb2bxgHLiYWaaKyrpK3N78dm9uwbPOy2OzMvsV9YyOUxvcfz7DewzplSJ8/9FfVjZ3Pz86nT1Ef8rLz2iXekmEY+w8mFu3E5j2b4+Y2rNy6kohGEIShvYZy7rBzY7OiB5QM6BRH7E8a9L+fnZedR6/CXuRn55OXnXdADas1DKN1mFjsBZFohNXbV8eJw4bdGwAX/uLofkdz9YSrY01KXfK6dJqddZE6N4vcG/bbLa8bhbmF5GXldXgfjWEY+y4mFi2gqr6Kdza/E5vbUL6pnD11ewDoU9SH0v6lXDb2Mib0n8Dw3sM7JKhXMqIapTZcS0Qj7sNHoWy65HWhMKeQvOw8m71qGMZek1bvISJTgDuALOBBVZ2TkH8I8CjQzSszS1UXenk3AJcCEeC/VPXFdNg49/253PDS
DWzYvSEW6+i4gce5WsOmpby65lU+ev0jwtEwgnBkzyM5Z+g5jO8/ngkDJsQmcnUG/nBWf/ZxSEKU5JbEvvHQWaJlGMb+R9rEQkSygLuBU4ANwFIRWaCqKwPFbgKeUtV7RGQ4sBAY7G1fAIwA+gMvicgRqhppTxvnvj+XGX+eEZvLsHHPRv7rr/8VCwGRn5XPEcVHcGXplYzvP55x/cfRLb9be5rQKhKHs4q4CYM5WTkM7ja4VcNZDcMwWkM6axYTgLWqug5AROYB5wBBsVDAb9DvCmzyts8B5qlqLfCxiKz1zvdmexp448s3Nvous6J0zevK49MeZ+RBI/lw2Yed+u3jukgd9ZH6mIAV5RTRo6AH+dn5seGsq2V1u3wG1DAMoynSKRYDgPWB/Q3AMQllbgH+LiJXA0XA1wPHvpVw7ID2NvCzXZ8lTd9du5ux/ca29+VahP/9i2jUfX+6IKeAboXdyM9x4mAjlgzD6Aw6u8fzQuARVf2tiEwEHhORkS09WERmADMA+vTpQ1lZWasuflDeQXxR+0Wj9N55vVmxdAUANZU1se10kBjWWkRcdFNaHgKjoqKi1ffekWS6fZD5Npp9bSfTbcx0+9IpFhuBgYH9g720IJcCUwBU9U0RyQd6tfBYVPV+4H6A0tJSnTRpUqsM/G3P38b1WYAb+vrTk37KiGGu6WnF0hXt2gwVHM6qKLlZuZTklrRpOGtZWRmtvfeOJNPtg8y30exrO5luY6bbl06xWAocLiJDcI7+AuBbCWU+A04GHhGRYUA+sBVYAPyviPwO18F9OLCkvQ2cPmo6QKPRUO35cZ+oRuM6pXNCOZTklVCUU2TDWQ3D2GdIm6dS1bCIzARexA2LfUhVV4jIrUC5qi4A/g/wgIj8N66z+xJ134FcISJP4TrDw8BV7T0Symf6qOlMGzqN9bvXt0s8Jn/Ekt8pnRXKojinmOK8YhvOahjGPktaX2u9ORMLE9JuDmyvBI5r4tjZwOx02tceqCr10XrqI/WxiLFFuUX0LOhJfk6+DWc1DGO/wNpA9gI/AF9Uo4gIhdmFdC/qHjec1TAMY3/CxKIF+AH4IlEXGDA/J59ehb0oyCmw4ayGYRwQmFgkwR+xFI6GiUajRKNRehT0oCC7wKKzGoZxQGJi4RGOhtlT64ID5mTl0DWvK4W5hWzO3szg7oM71zjDMIxOxsQCF531oMKDKMgpsOGshmEYSTCviKtJ9Crq1dlmGIZhZCzW+G4YhmGkxMTCMAzDSImJhWEYhpESEwvDMAwjJSYWhmEYRkpMLAzDMIyUmFgYhmEYKTGxMAzDMFJiYmEYhmGkxMTCMAzDSImJhWEYhpESEwvDMAwjJSYWhmEYRkpMLAzDMIyUmFgYhmEYKTGxMAzDMFJiYmEYhmGkxMTCMAzDSImJhWEYhpESEwvDMAwjJSYWhmEYRkrSKhYiMkVEVovIWhGZlST/9yKy3FvWiMjOQF4kkLcgnXYahmEYzZOdrhOLSBZwN3AKsAFYKiILVHWlX0ZV/ztQ/mrg6MApqlV1TLrsMwzDMFpOOmsWE4C1qrpOVeuAecA5zZS/EHgijfYYhmEYe4moanpOLHIeMEVVL/P2vw0co6ozk5QdBLwFHKyqES8tDCwHwsAcVX0+yXEzgBkAffr0GTdv3rx2v4+KigqKi4vb/bztSabbmOn2QebbaPa1nUy3sbPsmzx58tuqWpqyoKqmZQHOAx4M7H8buKuJstcDf0hIG+CtDwU+AQ5r7nrjxo3TdLB48eK0nLc9yXQbM90+1cy30exrO5luY2fZB5RrC3x6OpuhNgIDA/sHe2nJuICEJihV3eit1wFlxPdnGIZhGB1IOsViKXC4iAwRkVycIDQa1SQiQ4HuwJuBtO4ikudt9wKOA1YmHmsYhmF0DGkbDaWqYRGZCbwIZAEPqeoKEbkVV+3xheMCYJ5XHfIZBtwnIlGcoM3RwCgqwzAMo2NJm1gAqOpCYGFC2s0J+7ckOe4N
YFQ6bTMMwzBajs3gNgzDMFJiYmEYhmGkxMTCMAzDSImJhWEYhpESEwvDMAwjJSYWhmEYRkpMLAzDMIyUmFgYhmEYKTGxMAzDMFJiYmEYhmGkxMTCMAzDSImJhWEYhpESEwvDMAwjJSYWhmEYRkpMLAzDMIyUmFgYhmEYKTGxMAzDMFJiYmEYhmGkxMTCMAzDSImJhWEYhpESEwvDMAwjJSYWhmEYRkpMLAzDMIyUmFgYhmEYKTGxMAzDMFJiYmEYhmGkJK1iISJTRGS1iKwVkVlJ8n8vIsu9ZY2I7AzkXSwiH3rLxem00zAMw2ie7HSdWESygLuBU4ANwFIRWaCqK/0yqvrfgfJXA0d72z2AnwGlgAJve8d+mS57DcMwjKZJZ81iArBWVdepah0wDzinmfIXAk9426cBi1R1hycQi4ApabTVMAzDaIa01SyAAcD6wP4G4JhkBUVkEDAE+Eczxw5IctwMYIa3WyEiq9toczJ6AdvScN72JNNtzHT7IPNtNPvaTqbb2Fn2DWpJoXSKRWu4AJivqpHWHKSq9wP3p8ckh4iUq2ppOq/RVjLdxky3DzLfRrOv7WS6jZluXzqboTYCAwP7B3tpybiAhiao1h5rGIZhpJl0isVS4HARGSIiuThBWJBYSESGAt2BNwPJLwKnikh3EekOnOqlGYZhGJ1A2pqhVDUsIjNxTj4LeEhVV4jIrUC5qvrCcQEwT1U1cOwOEbkNJzgAt6rqjnTZmoK0NnO1E5luY6bbB5lvo9nXdjLdxoy2TwI+2jAMwzCSYjO4DcMwjJSYWBiGYRgpMbEAROQTEXnfCztS7qX1EJFFXriRRV5HO+K40wth8p6IjE2DPQ+JyBYR+XcgrdX2pDNkShM23iIiGwMhXM4I5N3g2bhaRE4LpDcbEqYN9g0UkcUislJEVojID730jHiOzdiXSc8wX0SWiMi7no3/46UPEZF/edd70hvAgojkeftrvfzBqWxPk32PiMjHgWc4xkvvrP+VLBF5R0T+4u1nxPNrNap6wC/AJ0CvhLRfA7O87VnAr7ztM4C/AgIcC/wrDfZ8DRgL/Htv7QF6AOu8dXdvu3uabbwF+FGSssOBd4E83OTLj3CDHrK87UOBXK/M8Hayrx8w1tsuAdZ4dmTEc2zGvkx6hgIUe9s5wL+8Z/MUcIGXfi9wpbf9A+Beb/sC4MnmbE+jfY8A5yUp31n/K9cC/wv8xdvPiOfX2sVqFk1zDvCot/0o8M1A+p/U8RbQTUT6teeFVfWfQOLor9bak9aQKU3Y2BTn4Ea81arqx8BaXDiY1oaEaY19m1V1mbe9B/gAFwUgI55jM/Y1RWc8Q1XVCm83x1sUOAmY76UnPkP/2c4HThYRacb2dNnXFB3+vyIiBwNnAg96+0KGPL/WYmLhUODvIvK2uBAiAH1UdbO3/TnQx9tuUSiSNNBaezrLzpleFf8hv4mns230qvNH4948M+45JtgHGfQMvSaU5cAWnBP9CNipquEk14vZ4uXvAnqm08ZE+1TVf4azvWf4exHJS7QvwY50PsPbgR8DUW+/Jxn0/FqDiYXjeFUdC5wOXCUiXwtmqqsLZswY40yzJ8A9wGHAGGAz8NvONQdEpBh4BrhGVXcH8zLhOSaxL6OeoapGVHUMLorCBGBoZ9qTSKJ9IjISuAFn53hc09L1nWGbiJwFbFHVtzvj+u2NiQWgqhu99RbgOdw/xRd+85K33uIV76xQJK21p8PtVNUvvH/eKPAADVXlTrFRRHJwjniuqj7rJWfMc0xmX6Y9Qx9V3QksBibimm/8Cb3B68Vs8fK7Ats7wsaAfVO8Jj5V1VrgYTrvGR4HnC0in+CaB08C7iADn1+L6OhOkkxbgCKgJLD9Bq698v8S3xH6a2/7TOI7yZakya7BxHcet8oe3BvVx7gOu+7edo8029gvsP3fuHZWgBHEd9Ctw3XMZnvbQ2jonB3RTrYJ8Cfg9oT0jHiOzdiXSc+wN9DN2y4AXgXO
Ap4mvoP2B972VcR30D7VnO1ptK9f4BnfDszJgP+VSTR0cGfE82v1PXT0BTNtwY0ieddbVgA3erjQwzQAAAR4SURBVOk9gZeBD4GX/B+P90O7G9d2+z5QmgabnsA1QdTj2icv3Rt7gO/hOsPWAt/tABsf82x4DxcHLOj4bvRsXA2cHkg/AzcS6CP/2beTfcfjmpjeA5Z7yxmZ8hybsS+TnuFRwDueLf8Gbg78zyzxnsfTQJ6Xnu/tr/XyD01le5rs+4f3DP8NPE7DiKlO+V/xzj+JBrHIiOfX2sXCfRiGYRgpsT4LwzAMIyUmFoZhGEZKTCwMwzCMlJhYGIZhGCkxsTAMwzBSYmJh7FOISM9ANNHPEyK05rbwHA+LyJEpylwlItPbx+rMQERe8yOwGkZrsaGzxj6LiNwCVKjqbxLSBffbjiY98ABFRF4DZqrq8s62xdj3sJqFsV8gIl8R922IubjJlf1E5H4RKfe+dXBzoOxrIjJGRLJFZKeIzPG+ifCmiBzklfm5iFwTKD/H+3bCahH5qpdeJCLPeNed712r0Zu7iIwXkVe8QJV/FZE+IpLj7R/vlfm/0vA9hv8RkaUi8m8RudcTP9+O33nXWSkipSLynLhvMNwSeA4rRGSeiHwgIk+JSEESm0737neZuG8oFAXsWOkF4ftVu/6RjH0aEwtjf2Io8HtVHa4u3tcsVS0FRgOniMjwJMd0BV5R1dHAm7iZvMkQVZ0AXAf4wnM18LmqDgduw0WOjT/IRTy9AzhXVcfhZhTfpqr1wHeB+0XkVGAy8HPvsDtUdTwwyrMvGC672runPwLPA1d45WaISDevzHBcGJFhQA3w/QSbDsKFOjlZXQDN94Afikgf3GzwEap6FPDLJp6FcQBiYmHsT3ykquWB/QtFZBmwDBiGc6KJVKvqX73tt3HxrpLxbJIyx+MCxKGqfriYRIbhYvu85IXSnoUXFE5V3/OOfwH4nicg4L5jsAQXguZE73ifBd76feB9dYEHa3Af8DrYy/tY3fcawInT8Qk2fRX3LN7wbJru3dMOXCjtB0RkKlDZxLMwDkCyUxcxjH2GmHMTkcOBHwITVHWniDyOi72TSF1gO0LT/xO1LSiTDAHeU9UTmsgfiftugd/8VQjchfuK3kYR+XmC3b4d0cC2v+/bldgRmbgvwN9U9duNjBUpBU4B/gO4Eji16VszDiSsZmHsr3QB9gC7peFraO3N68B/AojIKJLXXFYCA0RkglcuV0RGeNvnA8W4IHN3i0gXXPTUKLBNREqAc/fCriEiMt7b/hbwWkL+G8CJInKoZ0eRiBzuXa+Lqv4FF/G2UbOaceBiNQtjf2UZzlGvAj7FOfb25g/An0RkpXetlbhaQgxVrRWR84A7PTHIAn4rIltx/RyTVHWTiNyH62+5VEQe9c61mYav57WGD4Brvc7294H7E2z6QkQuBZ4MDDf+CVANPOv1s4Rw3442DMCGzhrGXiPuAzXZqlrjNXv9HThcGz6Z2Rk2fQWYr+7rcYbRbljNwjD2nmLgZU80BPh+ZwqFYaQTq1kYhmEYKbEObsMwDCMlJhaGYRhGSkwsDMMwjJSYWBiGYRgpMbEwDMMwUvL/A1EOSAQQi8ymAAAAAElFTkSuQmCC\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+    "# Learning curve for an XGBoost classifier with default hyper-parameters.\n",
+    "# plot_learning_curve, X and y are expected to come from earlier cells.\n",
+    "estimator = XGBClassifier()\n",
+    "# 100 random splits holding out 20% each time; the fixed seed keeps them repeatable.\n",
+    "cv = ShuffleSplit(test_size=0.2, n_splits=100, random_state=0)\n",
+    "title = \"Learning Curves (XGB)\"\n",
+    "# The trailing ';' keeps the call's return value out of the cell output.\n",
+    "plot_learning_curve(estimator, title, X, y, cv=cv, ylim=(0.7, 1.01), n_jobs=4);"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# lbl_enc = preprocessing.LabelEncoder()\n",
+ "# y = lbl_enc.fit_transform(df.label.values)\n",
+ "# X_train, X_test, y_train, y_test = train_test_split(df.sentence.values, y, random_state=42, test_size=0.2)\n",
+ "# target_names = lbl_enc.classes_"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# countvec = CountVectorizer(analyzer='word',token_pattern=r'\\w{1,}',\n",
+ "# ngram_range=(1, 3), stop_words = 'english', binary=True)\n",
+ "# countvec.fit(list(X_train) + list(X_test))\n",
+ "# X_train_countvec = countvec.transform(X_train) \n",
+ "# X_test_countvec = countvec.transform(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# clf = XGBClassifier(n_jobs=-1)\n",
+ "# clf.fit(X_train_countvec.tocsc(), y_train)\n",
+ "# y_pred = clf.predict(X_test_countvec.tocsc())\n",
+ "# print(\"---Test Set Results---\")\n",
+ "# print(\"Accuracy with Xgboost: {}\".format(accuracy_score(y_test, y_pred)))\n",
+ "# print(classification_report(y_test, y_pred))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "# for x, y, y_hat in zip(X_test, lbl_enc.inverse_transform(y_test), lbl_enc.inverse_transform(y_pred)):\n",
+ "# if y != y_hat:\n",
+ "# print(f'sentence: {x} label: {y} label_pred: {y_hat}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Fetch hotels in New York"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Fetch the New York hotels (name + description) from Postgres into a DataFrame.\n",
+    "# NOTE(review): the connection settings and the query are redacted placeholders;\n",
+    "# load real credentials from the environment or a secrets store, never from the notebook.\n",
+    "query = \"\"\"\n",
+    "    XXX\n",
+    "\"\"\"\n",
+    "conn = connect(\n",
+    "    host='XXX',\n",
+    "    port='XXX',\n",
+    "    dbname='XXX',\n",
+    "    user='XXX',\n",
+    "    password='XXX')\n",
+    "try:\n",
+    "    conn.readonly = XXX\n",
+    "    ny_hotels = pd.read_sql(query, con=conn)\n",
+    "finally:\n",
+    "    # Release the connection even when configuring the session or the query fails.\n",
+    "    conn.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " name \n",
+ " description \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 11 Howard \n",
+ " 11 Howard is located at the crossroads of the ... \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 1 Hotel Brooklyn Bridge \n",
+ " Welcome to your Brooklyn Bridge waterfront ret... \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 1 Hotel Central Park \n",
+ " 1 Hotel Central Park is a 5-star property loca... \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 36 Hudson Hotel \n",
+ " Ticket services and free Wi-Fi head the list o... \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 6 Columbus Central Park a Sixty Hotel \n",
+ " hotelinformation Overlooking Columbus Circle a... \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name \\\n",
+ "0 11 Howard \n",
+ "1 1 Hotel Brooklyn Bridge \n",
+ "2 1 Hotel Central Park \n",
+ "3 36 Hudson Hotel \n",
+ "4 6 Columbus Central Park a Sixty Hotel \n",
+ "\n",
+ " description \n",
+ "0 11 Howard is located at the crossroads of the ... \n",
+ "1 Welcome to your Brooklyn Bridge waterfront ret... \n",
+ "2 1 Hotel Central Park is a 5-star property loca... \n",
+ "3 Ticket services and free Wi-Fi head the list o... \n",
+ "4 hotelinformation Overlooking Columbus Circle a... "
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ny_hotels.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "We have 18719 sentences in New York hotels\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Explode every hotel description into one row per sentence (split on '. '),\n",
+    "# repeating the hotel name next to each of its sentences.\n",
+    "sentence_series = [\n",
+    "    pd.Series(str(hotel_name), index=str(description).split('. '))\n",
+    "    for hotel_name, description in zip(ny_hotels['name'], ny_hotels['description'])\n",
+    "]\n",
+    "ny_hotels = pd.concat(sentence_series).rename_axis('sentence').reset_index(name='name')\n",
+    "# Collapse each run of non-word characters (punctuation, whitespace) into one space.\n",
+    "non_word_run = re.compile(r'\\W+')\n",
+    "ny_hotels['sentence'] = [non_word_run.sub(' ', text) for text in ny_hotels['sentence']]\n",
+    "print('We have ', len(ny_hotels), 'sentences in New York hotels')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Welcome to your Brooklyn Bridge waterfront retreat with expansive views of the East River and Manhattan skyline\n",
+ "\n",
+ "The hotel is 100 non smoking\n",
+ "\n",
+ "A 350 cleaning fee will be charged to any guest who violates the smoking policy\n",
+ "\n",
+ " Fee subject to change 150 incidental deposit will be charged to guests with either a major credit card or debit card\n",
+ "\n",
+ " Fee subject to change Unless notified prior to or at check in a one night penalty will apply to departures that occur earlier than the date specified on the original reservation\n",
+ "\n",
+ "Must be 18 years of age to check in\n",
+ "\n",
+ "A daily Facility Fee is payable by the guest upon check in \n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Eyeball the sentence split: print 7 consecutive sentences starting at row `a`,\n",
+    "# each followed by a blank line.\n",
+    "a = 22\n",
+    "for offset in range(7):\n",
+    "    print(ny_hotels.sentence[a + offset], end='\\n\\n')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+    "# Vectorise the hotel sentences and classify them. count_vect and xgb are\n",
+    "# expected to be the vectoriser and model fitted in earlier cells.\n",
+    "X_final = count_vect.transform(ny_hotels['sentence'])\n",
+    "y_pred = xgb.predict(X_final)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "18719"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "18719"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(ny_hotels)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " sentence \n",
+ " name \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 11 Howard is located at the crossroads of the ... \n",
+ " 11 Howard \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Bookings of 8 rooms or more will be considered... \n",
+ " 11 Howard \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 150 incidental deposit will be charged to gue... \n",
+ " 11 Howard \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " Fee subject to change Must be 21 years of age... \n",
+ " 11 Howard \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Conscious design sophisticated dining options ... \n",
+ " 11 Howard \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sentence name\n",
+ "0 11 Howard is located at the crossroads of the ... 11 Howard\n",
+ "1 Bookings of 8 rooms or more will be considered... 11 Howard\n",
+ "2 150 incidental deposit will be charged to gue... 11 Howard\n",
+ "3 Fee subject to change Must be 21 years of age... 11 Howard\n",
+ "4 Conscious design sophisticated dining options ... 11 Howard"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ny_hotels.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1.,\n",
+ " 1., 1., 0.])"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y_pred[:20]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Attach the predicted label to every sentence.\n",
+    "# Work on a copy: a plain `results = ny_hotels` only aliases the frame, so adding\n",
+    "# the column would silently modify ny_hotels as well.\n",
+    "results = ny_hotels.copy()\n",
+    "results['label'] = y_pred.tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.0 12931\n",
+ "0.0 5788\n",
+ "Name: label, dtype: int64"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "results.label.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Keep only the sentences the classifier labelled 0.0.\n",
+    "# NOTE(review): 0.0 looks like the 'location' class in the training preview - confirm.\n",
+    "results_keep = results[results['label'].eq(0.0)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " sentence \n",
+ " name \n",
+ " label \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 11 Howard is located at the crossroads of the ... \n",
+ " 11 Howard \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " The 11 Howard is in Manhattan s SoHo neighbor... \n",
+ " 11 Howard \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 11 \n",
+ " Walk one block to reach two subway stops offer... \n",
+ " 11 Howard \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 12 \n",
+ " Located close to Soho Bowery Chinatown and Lit... \n",
+ " 11 Howard \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 19 \n",
+ " One World Trade Center is 1 3 km from 11 Howa... \n",
+ " 11 Howard \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sentence name label\n",
+ "0 11 Howard is located at the crossroads of the ... 11 Howard 0.0\n",
+ "9 The 11 Howard is in Manhattan s SoHo neighbor... 11 Howard 0.0\n",
+ "11 Walk one block to reach two subway stops offer... 11 Howard 0.0\n",
+ "12 Located close to Soho Bowery Chinatown and Lit... 11 Howard 0.0\n",
+ "19 One World Trade Center is 1 3 km from 11 Howa... 11 Howard 0.0"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "results_keep.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Stitch the kept sentences back together into one space-separated text per hotel.\n",
+    "# A hotel with no kept sentence gets no row in the result.\n",
+    "nyc_description = (\n",
+    "    results_keep\n",
+    "    .groupby('name')['sentence']\n",
+    "    .agg(lambda sentences: ' '.join(sentences))\n",
+    "    .reset_index()\n",
+    ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " name \n",
+ " sentence \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 Hotel Central Park \n",
+ " 1 Hotel Central Park is a 5 star property loca... \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 11 Howard \n",
+ " 11 Howard is located at the crossroads of the ... \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 36 Hudson Hotel \n",
+ " Ticket services and free Wi Fi head the list o... \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 6 Columbus Central Park a Sixty Hotel \n",
+ " hotelinformation Overlooking Columbus Circle a... \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " AC Hotel by Marriott New York Downtown \n",
+ " A truly cosmopolitan hotel with an urban vibe ... \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name \\\n",
+ "0 1 Hotel Central Park \n",
+ "1 11 Howard \n",
+ "2 36 Hudson Hotel \n",
+ "3 6 Columbus Central Park a Sixty Hotel \n",
+ "4 AC Hotel by Marriott New York Downtown \n",
+ "\n",
+ " sentence \n",
+ "0 1 Hotel Central Park is a 5 star property loca... \n",
+ "1 11 Howard is located at the crossroads of the ... \n",
+ "2 Ticket services and free Wi Fi head the list o... \n",
+ "3 hotelinformation Overlooking Columbus Circle a... \n",
+ "4 A truly cosmopolitan hotel with an urban vibe ... "
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "nyc_description.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " name \n",
+ " sentence \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [name, sentence]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "nyc_description.loc[nyc_description['name'] == '1 Hotel Brooklyn Bridge']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics.pairwise import linear_kernel"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Strip anything that is neither a word character nor whitespace. `regex=True` is\n",
+    "# spelled out because newer pandas versions treat the pattern as a literal string by default.\n",
+    "nyc_description['sentence'] = nyc_description['sentence'].str.replace(r'[^\\w\\s]+', '', regex=True)\n",
+    "# Index the frame by hotel name. Guarded (and not in-place) so that re-running\n",
+    "# this cell does not fail once 'name' has already become the index.\n",
+    "if 'name' in nyc_description.columns:\n",
+    "    nyc_description = nyc_description.set_index('name')\n",
+    "\n",
+    "# TF-IDF over unigrams and bigrams of alphanumeric tokens with 3+ characters,\n",
+    "# ignoring English stop words and terms that occur in fewer than 10 hotels.\n",
+    "tf = TfidfVectorizer(\n",
+    "    analyzer='word',\n",
+    "    ngram_range=(1, 2),\n",
+    "    min_df=10,\n",
+    "    stop_words='english',\n",
+    "    token_pattern='[a-zA-Z0-9]{3,}',\n",
+    ")\n",
+    "tfidf_matrix = tf.fit_transform(nyc_description['sentence'])\n",
+    "# Pairwise dot products of the TF-IDF rows; with the vectoriser's default L2\n",
+    "# normalisation these are the cosine similarities between hotels.\n",
+    "cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Positional lookup table: row position -> hotel name, in the same order as the\n",
+    "# rows that were fed to the TF-IDF vectoriser.\n",
+    "indices = pd.Series(nyc_description.index)\n",
+    "\n",
+    "def recommendations(name, cosine_similarities = cosine_similarities, top_n = 10):\n",
+    "    \"\"\"Return the names of the hotels most similar to `name`.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    name : str\n",
+    "        Hotel name exactly as it appears in `nyc_description.index`.\n",
+    "    cosine_similarities : 2-D array\n",
+    "        Pairwise similarity matrix; row/column i must correspond to the i-th\n",
+    "        entry of `nyc_description.index`.\n",
+    "    top_n : int, default 10\n",
+    "        Maximum number of hotels to return.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    list\n",
+    "        Up to `top_n` hotel names, most similar first, never containing `name`.\n",
+    "\n",
+    "    Raises\n",
+    "    ------\n",
+    "    IndexError\n",
+    "        If `name` is not one of the indexed hotels.\n",
+    "    \"\"\"\n",
+    "    # getting the row position of the hotel that matches the name\n",
+    "    matches = indices[indices == name].index\n",
+    "    if len(matches) == 0:\n",
+    "        raise IndexError(f'Unknown hotel name: {name!r}')\n",
+    "    idx = matches[0]\n",
+    "\n",
+    "    # Remove the query hotel explicitly instead of assuming it sorts into slot 0:\n",
+    "    # a tie with its self-similarity could otherwise leave it in the results.\n",
+    "    scores = pd.Series(cosine_similarities[idx]).drop(idx)\n",
+    "\n",
+    "    # mergesort is stable, so tied scores always come out in the same order\n",
+    "    top_positions = scores.sort_values(ascending = False, kind = 'mergesort').head(top_n).index\n",
+    "\n",
+    "    # translate row positions back into hotel names\n",
+    "    hotel_names = nyc_description.index\n",
+    "    return [hotel_names[position] for position in top_positions]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Sheraton Brooklyn New York Hotel',\n",
+ " 'Hotel Indigo BROOKLYN',\n",
+ " 'Union Hotel, an Ascend Hotel Collection Member',\n",
+ " 'Hampton Inn Brooklyn Downtown NY',\n",
+ " 'EVEN Hotel Brooklyn',\n",
+ " 'La Quinta Inn & Suites Brooklyn Downtown',\n",
+ " 'Hotel Le Bleu',\n",
+ " 'Days Inn by Wyndham Brooklyn Borough Park',\n",
+ " 'Hilton Brooklyn New York',\n",
+ " 'NU Hotel Brooklyn']"
+ ]
+ },
+ "execution_count": 70,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "recommendations('New York Marriott at the Brooklyn Bridge')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Hyatt Centric Times Square New York',\n",
+ " 'Renaissance New York Times Square Hotel',\n",
+ " 'Sheraton New York Times Square Hotel',\n",
+ " 'Novotel New York - Times Square',\n",
+ " 'Hotel Mela Times Square',\n",
+ " 'Paramount Times Square',\n",
+ " 'Crowne Plaza Times Square Manhattan',\n",
+ " 'The Manhattan at Times Square Hotel',\n",
+ " 'Millennium Broadway New York Times Square',\n",
+ " 'Hotel Edison']"
+ ]
+ },
+ "execution_count": 71,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "recommendations('W New York - Times Square')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Trump International Hotel & Tower New York',\n",
+ " 'Hudson New York, Central Park',\n",
+ " '6 Columbus Central Park a Sixty Hotel',\n",
+ " 'Kimpton Ink48 Hotel',\n",
+ " 'Empire Hotel',\n",
+ " 'Parker New York',\n",
+ " 'The Time New York',\n",
+ " 'Hotel Sofitel New York',\n",
+ " 'Residence Inn New York Manhattan/Central Park',\n",
+ " 'JW Marriott Essex House New York']"
+ ]
+ },
+ "execution_count": 72,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "recommendations('Mandarin Oriental, New York')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Best Western Gregory Hotel',\n",
+ " 'Sleep Inn Coney Island',\n",
+ " 'Sleep Inn Brooklyn Downtown',\n",
+ " 'Holiday Inn Express Brooklyn',\n",
+ " 'Sheraton Brooklyn New York Hotel',\n",
+ " 'Wyndham Garden Brooklyn Sunset Park',\n",
+ " 'Hotel Indigo BROOKLYN',\n",
+ " 'Days Inn by Wyndham Jamaica / JFK Airport',\n",
+ " 'EVEN Hotel Brooklyn',\n",
+ " 'Aloft New York Brooklyn']"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "recommendations('Best Western Plus Brooklyn Bay Hotel')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Sheraton Brooklyn New York Hotel',\n",
+ " 'La Quinta Inn & Suites Brooklyn Downtown',\n",
+ " 'Sleep Inn Brooklyn Downtown',\n",
+ " 'Hampton Inn Brooklyn Downtown NY',\n",
+ " 'NU Hotel Brooklyn',\n",
+ " 'Hotel Indigo BROOKLYN',\n",
+ " 'Union Hotel, an Ascend Hotel Collection Member',\n",
+ " 'Days Inn by Wyndham Brooklyn Borough Park',\n",
+ " 'Wyndham Garden Brooklyn Sunset Park',\n",
+ " 'Brooklyn Way Hotel, BW Premier Collection']"
+ ]
+ },
+ "execution_count": 74,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "recommendations('Aloft New York Brooklyn')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Ace Hotel New York',\n",
+ " 'INNSIDE by MeliĆ New York Nomad',\n",
+ " 'Arlo NoMad',\n",
+ " 'DoubleTree by Hilton New York City - Chelsea',\n",
+ " 'Hampton Inn New York - 35th Street - Empire State Building',\n",
+ " 'DoubleTree by Hilton Hotel New York - Times Square South',\n",
+ " 'Holiday Inn Express - New York City Chelsea',\n",
+ " 'Nyma The New York Manhattan Hotel',\n",
+ " 'The Redbury New York',\n",
+ " 'Hotel Pennsylvania']"
+ ]
+ },
+ "execution_count": 76,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "recommendations('Avalon Hotel')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Archer Hotel New York',\n",
+ " 'The Mansfield Hotel',\n",
+ " 'Element New York Times Square West',\n",
+ " 'Andaz 5th Avenue - a concept by Hyatt',\n",
+ " 'DoubleTree by Hilton Hotel New York - Times Square South',\n",
+ " 'The Redbury New York',\n",
+ " 'The Westin New York at Times Square',\n",
+ " 'Ace Hotel New York',\n",
+ " 'The Westin New York Grand Central',\n",
+ " 'Hotel Mela Times Square']"
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "recommendations('The Langham, New York, Fifth Avenue')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['INNSIDE by MeliĆ New York Nomad',\n",
+ " 'The New York EDITION',\n",
+ " 'Hotel Mela Times Square',\n",
+ " 'The Redbury New York',\n",
+ " 'W New York - Union Square',\n",
+ " 'The Westin New York at Times Square',\n",
+ " 'DoubleTree by Hilton New York City - Chelsea',\n",
+ " 'Holiday Inn Express New York City Times Square',\n",
+ " 'Arlo NoMad',\n",
+ " 'Ace Hotel New York']"
+ ]
+ },
+ "execution_count": 78,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "recommendations('The James New York - NoMad')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Ramada by Wyndham Flushing Queens',\n",
+ " 'Aloft New York LaGuardia Airport',\n",
+ " 'Flushing Central Hotel',\n",
+ " 'Aloft Long Island City - Manhattan View',\n",
+ " 'Holiday Inn LaGuardia Airport',\n",
+ " 'LaGuardia Plaza Hotel',\n",
+ " 'Best Western JFK Airport Hotel',\n",
+ " 'Days Inn by Wyndham Jamaica / JFK Airport',\n",
+ " 'Courtyard Newark Downtown',\n",
+ " 'Holiday Inn Express Kennedy Airport']"
+ ]
+ },
+ "execution_count": 79,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "recommendations('New York LaGuardia Airport Marriott')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Hilton New York JFK Airport',\n",
+ " 'Hilton Garden Inn Queens/JFK Airport',\n",
+ " 'Hampton Inn JFK Airport',\n",
+ " 'Holiday Inn Express Kennedy Airport',\n",
+ " 'Howard Johnson by Wyndham Jamaica JFK Airport NYC',\n",
+ " 'Days Inn by Wyndham Jamaica / JFK Airport',\n",
+ " 'Wyndham Garden Brooklyn Sunset Park',\n",
+ " 'Hampton Inn Brooklyn Downtown NY',\n",
+ " 'TRYP By Wyndham Times Square South',\n",
+ " 'Holiday Inn New York JFK Airport Area']"
+ ]
+ },
+ "execution_count": 80,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "recommendations('Radisson Hotel JFK Airport')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}