"""Bayesian Personalized Ranking (BPR) movie recommender with TensorFlow/Keras.

Script form of the ``BPR_movie_RecSys_TF.ipynb`` notebook (added in commit
8bdfd9b, "Add notebook").  ``# %%`` markers delimit the original notebook
cells; bare expressions at the end of a cell are the values the notebook
displayed.  The notebook was last executed with a Python 3.8.8 kernel.
"""

# %%
from collections import OrderedDict
from os import path
from typing import Dict, Iterable, List, Mapping, Sequence, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.metrics import roc_auc_score
from tensorflow.keras.layers import Embedding, Flatten, Input, Lambda
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm

# IPython magic from the notebook; only meaningful inside Jupyter.
# %matplotlib inline

# %%
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# %%
df_full = pd.read_csv('data/training_ratings_for_kaggle_comp.csv')
df_full.head(1)

# %%
# Raw movie ids are sparse (the notebook output ends at 3952 while only 3551
# distinct movies exist) ...
sorted(df_full.movie.unique())[-10:]

# %%
# ... and raw user ids do not start at zero, so both are remapped below to
# dense, zero-based indices usable as embedding rows.
sorted(df_full.user.unique())[:10]

# %%
unique_users = df_full.user.unique()
user_ids = dict(zip(unique_users, np.arange(unique_users.shape[0], dtype=np.int32)))

unique_movies = df_full.movie.unique()
movie_ids = dict(zip(unique_movies, np.arange(unique_movies.shape[0], dtype=np.int32)))

# %%
# ``Series.map`` does the dictionary lookup without a Python-level lambda.
df_full['user_id'] = df_full.user.map(user_ids)
df_full['movie_id'] = df_full.movie.map(movie_ids)

df_full.head(1)

# %%
# Number of 5-star ratings per user.
tmp_test = df_full[df_full.rating > 4]
tmp_test = tmp_test.groupby('user').movie.count().reset_index()
tmp_test.shape

# %%
# Test set: for every user with more than 20 five-star ratings, hold out the
# first two of those ratings.
conditions = (df_full.user.isin(tmp_test[tmp_test.movie > 20].user)) & (df_full.rating > 4)
df_test = df_full[conditions].groupby('user').head(2)
# Remember the original row labels so the same rows can be removed from the
# training set by label rather than by whole-row equality.
test_index = df_test.index
df_test = df_test.reset_index(drop=True)
df_test.shape

# %%
ground_truth_test = df_test.groupby('user_id').movie_id.agg(list).reset_index()

ground_truth_test.head(1)

# %%
# Training set = every rating that was not held out for testing.
df_train = df_full.drop(index=test_index)
df_train.shape

# %%
ground_truth_train = df_train[df_train.rating > 3].groupby('user_id').movie_id.agg(list).reset_index()

ground_truth_train.head(1)

# %%
# Build (user, positive movie, negative movie) triplets: every movie a user
# rated above 3 is paired with every movie the same user rated 3 or lower.
# The pairs are produced per user with ``np.repeat``/``np.tile`` instead of
# one Python dict per triplet followed by ``DataFrame.append`` (removed in
# pandas 2.0); the resulting row order is identical to the nested loops
# "for positive: for negative".
# %%time  (IPython cell magic in the notebook)

user_parts: List[np.ndarray] = []
positive_parts: List[np.ndarray] = []
negative_parts: List[np.ndarray] = []
users_without_data = []

# ``sort=False`` keeps users in order of first appearance, matching
# ``df_train.user_id.unique()``.
for user_id, user_ratings in tqdm(df_train.groupby('user_id', sort=False)):
    ratings = user_ratings.rating.to_numpy()
    movies = user_ratings.movie_id.to_numpy()
    positive_movies = movies[ratings > 3]
    negative_movies = movies[ratings <= 3]

    if negative_movies.shape[0] == 0 or positive_movies.shape[0] == 0:
        # No pair can be formed for this user.
        users_without_data.append(user_id)
        continue

    n_pos, n_neg = positive_movies.shape[0], negative_movies.shape[0]
    user_parts.append(np.full(n_pos * n_neg, user_id, dtype=np.int32))
    positive_parts.append(np.repeat(positive_movies, n_neg))
    negative_parts.append(np.tile(negative_movies, n_pos))


def _concat_int32(parts: List[np.ndarray]) -> np.ndarray:
    """Concatenate *parts* into one int32 array; an empty list gives an empty array."""
    if not parts:
        return np.empty(0, dtype=np.int32)
    return np.concatenate(parts).astype(np.int32, copy=False)


# Columns are created with an integer dtype from the start, so the notebook's
# later ``.astype(str).astype(int)`` round-trip is no longer needed.
# NOTE: the variable name keeps the notebook's spelling ("triplest").
df_triplest = pd.DataFrame({
    'user_id': _concat_int32(user_parts),
    'positive_m_id': _concat_int32(positive_parts),
    'negative_m_id': _concat_int32(negative_parts),
})
del user_parts, positive_parts, negative_parts  # release the per-user chunks

# %%
df_triplest.shape, df_train.shape

# %%
num_users = unique_users.shape[0]
num_items = unique_movies.shape[0]

num_users, num_items

# %%
unique_movie_ids = list(df_full.movie_id.unique())


# %%
def _embedding_weights(model: Model,
                       user_layer: str = 'user_embedding',
                       item_layer: str = 'item_embedding') -> Tuple[np.ndarray, np.ndarray]:
    """
    Fetch the user and item embedding matrices from the model

    :param model: model that contains the two named embedding layers
    :param user_layer: name of the user embedding layer
    :param item_layer: name of the item embedding layer
    :return: tuple (user matrix, item matrix), the first weight array of each layer
    """
    user_matrix = model.get_layer(user_layer).get_weights()[0]
    item_matrix = model.get_layer(item_layer).get_weights()[0]
    return user_matrix, item_matrix


def bpr_predict(model: Model, user_id: int, item_ids: list, user_layer='user_embedding', item_layer='item_embedding'):
    """
    Score items for one user as the dot product of the user vector with each item vector

    :param model: model that contains the two named embedding layers
    :param user_id: row of the user in the user embedding matrix
    :param item_ids: rows of the items in the item embedding matrix
    :param user_layer: name of the user embedding layer
    :param item_layer: name of the item embedding layer
    :return: array of scores, one per entry of item_ids and in the same order
    """
    user_matrix, item_matrix = _embedding_weights(model, user_layer, item_layer)

    return np.dot(user_matrix[user_id], item_matrix[item_ids].T)


# %%
@tf.function
def identity_loss(_, y_pred):
    """
    Use the model output itself as the loss

    The model already outputs the per-triplet loss, so the target passed by
    Keras is ignored and the batch mean of the output is returned.
    """
    return tf.math.reduce_mean(y_pred)


# %%
@tf.function
def bpr_triplet_loss(X: Sequence[tf.Tensor]):
    """
    Calculate the triplet loss - the higher the positive interaction is above
    the negative interaction, the lower the loss

    Computes ``1 - sigmoid(user . positive - user . negative)`` per row.

    :param X: three tensors in this order: positive item latent vectors,
        negative item latent vectors, user latent vectors
    :return: tensor of per-row losses (the reduced axis is kept with size 1)
    """
    positive_item_latent, negative_item_latent, user_latent = X

    positive_interactions = tf.math.reduce_sum(tf.math.multiply(user_latent, positive_item_latent), axis=-1, keepdims=True)
    negative_interactions = tf.math.reduce_sum(tf.math.multiply(user_latent, negative_item_latent), axis=-1, keepdims=True)

    return tf.math.subtract(tf.constant(1.0), tf.sigmoid(tf.math.subtract(positive_interactions, negative_interactions)))


# %%
def out_shape(shapes):
    """Return the first of the input shapes (passed to ``Lambda`` as ``output_shape``)."""
    return shapes[0]


def build_model(num_users: int, num_items: int, latent_dim: int) -> Model:
    """
    Build a model for Bayesian personalized ranking

    The model takes a user index, a positive item index and a negative item
    index, and outputs the value of ``bpr_triplet_loss`` for that triplet.

    :param num_users: number of unique users
    :param num_items: number of unique movies
    :param latent_dim: vector length for the latent representation
    :return: Model
    """
    user_input = Input((1,), name='user_input')

    positive_item_input = Input((1,), name='positive_item_input')
    negative_item_input = Input((1,), name='negative_item_input')
    # One embedding layer is shared between positive and negative items.
    # ``input_length`` is not passed: the inputs already fix the length to 1
    # and the argument is deprecated in current Keras.
    item_embedding_layer = Embedding(num_items, latent_dim, name='item_embedding')

    positive_item_embedding = Flatten()(item_embedding_layer(positive_item_input))
    negative_item_embedding = Flatten()(item_embedding_layer(negative_item_input))

    user_embedding = Embedding(num_users, latent_dim, name='user_embedding')(user_input)
    user_embedding = Flatten()(user_embedding)

    # The list order must match the unpacking order in bpr_triplet_loss.
    triplet_loss = Lambda(bpr_triplet_loss, output_shape=out_shape)([positive_item_embedding,
                                                                     negative_item_embedding,
                                                                     user_embedding])

    model = Model(inputs=[positive_item_input, negative_item_input, user_input], outputs=triplet_loss)

    return model


# %%
latent_dim = 350
batch_size = 256
num_epochs = 1
lr = 0.001

model = build_model(num_users, num_items, latent_dim)
model.compile(loss=identity_loss, optimizer=Adam(learning_rate=lr))

# %%
# ``np.sum`` of an empty list is the float 0.0, which made the totals print
# as floats; cast to int so the counts are formatted as integers.
trainable_count = int(np.sum([K.count_params(w) for w in model.trainable_weights]))
non_trainable_count = int(np.sum([K.count_params(w) for w in model.non_trainable_weights]))

print('Total number of parameters: {:,}'.format(trainable_count + non_trainable_count))
print('Trainable number of parameters: {:,}'.format(trainable_count))
print('Non-trainable number of parameters: {:,}'.format(non_trainable_count))

print('Training data length: {:,}'.format(df_triplest.shape[0]))

# %%
# %%time  (IPython cell magic in the notebook)

X = {
    'user_input': tf.convert_to_tensor(df_triplest.user_id),
    'positive_item_input': tf.convert_to_tensor(df_triplest.positive_m_id),
    'negative_item_input': tf.convert_to_tensor(df_triplest.negative_m_id)
}

# The targets are dummies: identity_loss ignores them.
model.fit(X,
          tf.ones(df_triplest.shape[0]),
          batch_size=batch_size,
          epochs=num_epochs)


# %%
# Ground truth may be a real mapping or, as in the calls below, an iterable
# of (user_id, item ids) pairs such as ``DataFrame.values``.
GroundTruth = Union[Mapping[int, Sequence[int]], Iterable[Tuple[int, Sequence[int]]]]


def _iter_ground_truth(ground_truth: GroundTruth):
    """Iterate (user_id, item ids) pairs from either a mapping or an iterable of pairs."""
    if isinstance(ground_truth, Mapping):
        return ground_truth.items()
    return ground_truth


def full_auc(model: Model, ground_truth: GroundTruth, items: list) -> float:
    """
    Measure the mean per-user ROC AUC of the model over all items

    Users with no ground-truth items, or for whom every item is a
    ground-truth item, are skipped because AUC is undefined with one class.

    :param model: trained model with 'user_embedding' and 'item_embedding' layers
    :param ground_truth: mapping, or iterable of pairs, of user id to the
        highly rated movie ids of that user
    :param items: list of all available movie ids (assumed to be unique)
    :return: mean AUC over the scored users, NaN when no user could be scored
    :raises KeyError: when a ground-truth movie id is not in items
    """
    number_of_items = len(items)
    # Fetch the weights once instead of once per user.
    user_matrix, item_matrix = _embedding_weights(model)
    item_matrix = item_matrix[items]
    # O(1) lookup of an item's position instead of ``list.index`` per item.
    position = {item: index for index, item in enumerate(items)}
    scores = []

    for user_id, true_item_ids in _iter_ground_truth(ground_truth):
        if len(true_item_ids) == 0:
            continue

        grnd = np.zeros(number_of_items, dtype=np.int32)
        grnd[[position[p] for p in true_item_ids]] = 1

        if grnd.all():
            continue

        predictions = np.dot(user_matrix[user_id], item_matrix.T)
        scores.append(roc_auc_score(grnd, predictions))

    return float(np.mean(scores)) if scores else float('nan')


# %%
def mean_average_precision_k(model: Model,
                             ground_truth: GroundTruth,
                             items: list,
                             k=100) -> float:
    """
    Calculate the mean average precision at k over users

    For every user the items are ranked by predicted score (ties keep the
    order of items), and the average precision of the top k is computed
    against the user's ground-truth items.  Users with no ground-truth items
    are skipped.

    :param model: trained model with 'user_embedding' and 'item_embedding' layers
    :param ground_truth: mapping, or iterable of pairs, of user id to the
        highly rated movie ids of that user
    :param items: list of all available movie ids
    :param k: top N recommendations per user
    :return: mean average precision, NaN when no user could be scored
    :raises ValueError: when k is smaller than 1
    """
    if k < 1:
        raise ValueError('k must be at least 1')

    user_matrix, item_matrix = _embedding_weights(model)
    item_matrix = item_matrix[items]
    scores = []

    for user, actual in _iter_ground_truth(ground_truth):
        relevant = set(actual)
        if not relevant:
            continue

        predictions = np.dot(user_matrix[user], item_matrix.T)
        # Stable sort on the negated scores = descending order with ties
        # left in their original order.
        top_positions = np.argsort(-predictions, kind='stable')[:k]

        score = 0.0
        num_hits = 0.0

        for rank, item_position in enumerate(top_positions, start=1):
            if items[item_position] in relevant:
                num_hits += 1.0
                score += num_hits / rank

        scores.append(score / min(len(actual), k))

    return float(np.mean(scores)) if scores else float('nan')


# %%
print(f'AUC train: {full_auc(model, ground_truth_train.values, unique_movie_ids)}')

# %%
print(f'Mean average precision train: {mean_average_precision_k(model, ground_truth_train.values, unique_movie_ids)}')

# %%
print(f'AUC test: {full_auc(model, ground_truth_test.values, unique_movie_ids)}')

# %%
print(f'Mean average precision test: {mean_average_precision_k(model, ground_truth_test.values, unique_movie_ids)}')