From 6cce29f8d50f157d6b138ee7eb0c339838744dee Mon Sep 17 00:00:00 2001
From: Susan Li
Date: Fri, 31 Dec 2021 00:07:07 -0500
Subject: [PATCH] Add py files

---
 bpr.py        |  72 ++++++++++++++++++++++++++
 classes.py    |  49 ++++++++++++++++++
 filereader.py |  81 +++++++++++++++++++++++++++++
 hitrate.py    | 118 ++++++++++++++++++++++++++++++++++++++++++
 util.py       |  49 ++++++++++++++++++
 5 files changed, 369 insertions(+)
 create mode 100644 bpr.py
 create mode 100644 classes.py
 create mode 100644 filereader.py
 create mode 100644 hitrate.py
 create mode 100644 util.py

diff --git a/bpr.py b/bpr.py
new file mode 100644
index 0000000..ba0ad9b
--- /dev/null
+++ b/bpr.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[ ]:
+
+
+from util import m_normal, learning_rate, get_lambda
+from classes import ret
+import random as random
+import numpy as np
+import math
+
+
+def bpr_update(users, movies):
+    """Run one BPR stochastic-gradient pass over every user, in place.
+
+    For each user with training movies, one positive movie is drawn from
+    the user's training set and one candidate negative from the whole
+    catalogue.  If the candidate is genuinely absent from the training set,
+    the user, positive-item and negative-item factors each take one
+    gradient-ascent step on ln(sigmoid(x_uij)) with L2 regularisation.
+
+    Args:
+        users: dict of user id -> user; ``factor`` is overwritten.
+        movies: dict of movie id -> movie; ``factor`` is overwritten.
+    """
+    lr = learning_rate()
+    lam = get_lambda()
+    # random.sample() rejects dict views on Python 3.11+, so draw with
+    # random.choice() from real sequences; the catalogue list is built once.
+    movie_ids = list(movies)
+
+    for u in users.values():
+        if not u.movies_train:
+            continue
+
+        rand_pos = random.choice(list(u.movies_train))
+        rand_neg = random.choice(movie_ids)
+        if rand_neg in u.movies_train:
+            # Not a true negative for this user; skip them for this pass.
+            continue
+
+        Vu = u.factor
+        Vi = movies[rand_pos].factor
+        Vj = movies[rand_neg].factor
+        firstterm = calculate_first_term(Vu, Vi, Vj)
+
+        # Every gradient is taken at the OLD parameters before anything is
+        # written back.  The L2 penalty contributes the vector lam * V (not
+        # the scalar norm) and is subtracted because this is an ascent step.
+        grad_u = firstterm * (Vi - Vj) - lam * Vu
+        grad_i = firstterm * Vu - lam * Vi
+        grad_j = -firstterm * Vu - lam * Vj
+
+        u.factor = Vu + lr * grad_u
+        movies[rand_pos].factor = Vi + lr * grad_i
+        movies[rand_neg].factor = Vj + lr * grad_j
+
+
+def calculate_first_term(Vu, Vi, Vj):
+    """Return exp(-x) / (1 + exp(-x)) for x = Vu.Vi - Vu.Vj.
+
+    This equals 1 - sigmoid(x), the scalar shared by all three BPR
+    gradients.  The branch keeps the argument of exp() non-positive, so
+    the result cannot overflow for any finite x.
+    """
+    xuij = float(np.dot(Vu, Vi) - np.dot(Vu, Vj))
+    if xuij >= 0:
+        ex = math.exp(-xuij)
+        return ex / (1 + ex)
+    return 1 / (1 + math.exp(xuij))
+
diff --git a/classes.py b/classes.py
new file mode 100644
index 0000000..d7502fe
--- /dev/null
+++ b/classes.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[ ]:
+
+
+from util import random_vector
+
+
+class user:
+    """A user: rating dicts (movie id -> rating) plus a latent factor."""
+
+    def __init__(self, userid):
+        self.userid = userid
+        self.movies_train = dict()
+        self.movies_test = dict()
+        self.movies_all = dict()
+        self.factor = random_vector()
+
+
+class movie:
+    """A movie: optional metadata plus a latent factor."""
+
+    def __init__(self, movieid, rating=0, title=None, genres=None):
+        self.movieid = movieid
+        self.rating = rating
+        self.title = title
+        self.genres = genres
+        self.factor = random_vector()
+
+
+class ret:
+    """Plain result record whose fields start at neutral defaults."""
+
+    def __init__(self):
+        self.userid = None
+        self.movieid = None
+        self.isuser = True
+        self.retvalue = []
+
+
+class usermovie:
+    """A (user id, movie id, rating) record."""
+
+    def __init__(self):
+        self.userid = None
+        self.movieid = None
+        self.rating = 0
+
diff --git a/filereader.py b/filereader.py
new file mode 100644
index 0000000..0434f76
--- /dev/null
+++ b/filereader.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[ ]:
+
+
+from classes import user
+from classes import movie
+from numpy import random
+from util import min_rating, random_vector, num_users
+from random import seed
+import pandas as pd
+import numpy as np
+
+
+def read_ratings(filename):
+    """Read a ratings CSV and split each user's liked movies into train/test.
+
+    Rows are expected as user_id, movie_id, rating, timestamp with no header.
+    Ratings of at least min_rating() go to the training set with probability
+    0.7 and to the test set otherwise; a user exists in the result only if
+    they have at least one such rating.  Every rating of those users,
+    whatever its value, is also recorded in movies_all.
+
+    Args:
+        filename: path (or buffer) accepted by pandas.read_csv.
+
+    Returns:
+        dict of user id (int) -> user.
+    """
+    seed(42)
+    np.random.seed(42)
+    r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
+    ratings = pd.read_csv(filename, sep=',', names=r_cols, encoding='latin-1')
+
+    ratings['user_id'] = ratings['user_id'].astype(int)
+    ratings['movie_id'] = ratings['movie_id'].astype(int)
+    ratings['rating'] = ratings['rating'].astype(float)
+
+    ratings = ratings[ratings['user_id'] < num_users()]
+    minmovierating = min_rating()
+    users = dict()
+
+    # itertuples() avoids building a Series per row as iterrows() does.
+    for row in ratings.itertuples(index=False):
+        rating1 = float(row.rating)
+        if not rating1 >= minmovierating:  # also skips NaN ratings
+            continue
+        in_train = random.random() < 0.7
+        userid = int(row.user_id)
+        user1 = users.get(userid)
+        if user1 is None:
+            # user() already draws its own random factor.
+            user1 = users[userid] = user(userid)
+        target = user1.movies_train if in_train else user1.movies_test
+        target[int(row.movie_id)] = rating1
+
+    # Second pass: users now exist, so ratings seen before a user was
+    # created (or below the threshold) can be attached too.
+    for row in ratings.itertuples(index=False):
+        user1 = users.get(int(row.user_id))
+        if user1 is not None:
+            user1.movies_all[int(row.movie_id)] = float(row.rating)
+
+    return users
+
+
+def read_movies(filename):
+    """Read a movies CSV (movie_id, title, genres) into a dict of movies.
+
+    Args:
+        filename: path (or buffer) accepted by pandas.read_csv.
+
+    Returns:
+        dict of movie id (as parsed by pandas) -> movie.
+    """
+    r_cols = ['movie_id', 'title', 'genres']
+    df = pd.read_csv(filename, sep=",", encoding='latin-1', names=r_cols)
+    # movie() already draws its own random factor.
+    return {movieid: movie(movieid, 0) for movieid in df['movie_id']}
+
diff --git a/hitrate.py b/hitrate.py
new file mode 100644
index 0000000..b81f674
--- /dev/null
+++ b/hitrate.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[ ]:
+
+
+from classes import usermovie
+import heapq
+import numpy as np
+from sklearn.metrics import mean_squared_error
+from math import sqrt
+
+# Number of top-ranked movies inspected per user when counting hits.
+_TOP_K = 10
+
+
+def _exact_key(movieid, movies_test):
+    """Return movieid if it is a key of movies_test, else None."""
+    return movieid if movieid in movies_test else None
+
+
+def _loose_key(movieid, movies_test):
+    """Return the str/int form of movieid found in movies_test, or None."""
+    for key in (str(movieid), int(movieid)):
+        if key in movies_test:
+            return key
+    return None
+
+
+def _rescale(values, top=5):
+    """Min-max scale values onto [0, top]; all-equal input maps to 0.0."""
+    if not values:
+        return []
+    least = min(values)
+    span = max(values) - least
+    if span == 0:
+        return [0.0] * len(values)
+    return [(x - least) / span * top for x in values]
+
+
+def _evaluate(users, movies, score, test_key):
+    """Score every (user, movie) pair and collect hit and error data.
+
+    Args:
+        users: dict of user id -> user.
+        movies: dict of movie id -> movie.
+        score: callable (user, movie) -> predicted score.
+        test_key: callable (movieid, movies_test) -> matching key or None.
+
+    Returns:
+        (hits, actual, predicted, actualall, predictedall): hits counts the
+        users with a test movie among their _TOP_K best-scored movies; the
+        lists pair true ratings with scores for test and for all ratings.
+    """
+    hits = 0
+    actual, predicted = [], []
+    actualall, predictedall = [], []
+    for u in users.values():
+        scored = []
+        for m in movies.values():
+            dotp = score(u, m)
+            if m.movieid in u.movies_all:
+                actualall.append(u.movies_all[m.movieid])
+                predictedall.append(float(dotp))
+
+            key = test_key(m.movieid, u.movies_test)
+            if key is not None:
+                # Look the rating up under the key that actually matched.
+                actual.append(u.movies_test[key])
+                predicted.append(float(dotp))
+            scored.append((dotp, key is not None))
+
+        # Same result as a full descending sort cut to _TOP_K, without
+        # sorting the whole catalogue for every user.
+        best = heapq.nlargest(_TOP_K, scored, key=lambda pair: pair[0])
+        if any(in_test for _, in_test in best):
+            hits += 1
+    return hits, actual, predicted, actualall, predictedall
+
+
+def hit_rate(users, movies):
+    """Evaluate the factor model: top-10 hit count and rescaled RMSE.
+
+    Scores are dot products of user and movie factors.  Before the RMSE is
+    taken they are min-max scaled onto [0, 5] so they are comparable with
+    ratings.
+
+    Returns:
+        (hits, denom, rms, rmsall): users with a test movie in their top 10,
+        number of users, RMSE on test ratings, RMSE on all ratings.
+    """
+    hits, actual, predicted, actualall, predictedall = _evaluate(
+        users, movies, lambda u, m: np.dot(u.factor, m.factor), _exact_key)
+
+    rms = sqrt(mean_squared_error(actual, _rescale(predicted)))
+    rmsall = sqrt(mean_squared_error(actualall, _rescale(predictedall)))
+    return hits, len(users), rms, rmsall
+
+
+def hit_rate_SVD(users, movies, svd):
+    """Evaluate a fitted SVD model: top-10 hit count and RMSE.
+
+    Scores are element 3 of svd.predict(user id, movie id), used unscaled.
+    Test membership accepts movie ids stored as str or int.
+
+    Returns:
+        (hits, denom, rms, rmsall), as for hit_rate.
+    """
+    def score(u, m):
+        return float(svd.predict(int(u.userid), int(m.movieid))[3])
+
+    hits, actual, predicted, actualall, predictedall = _evaluate(
+        users, movies, score, _loose_key)
+
+    rms = sqrt(mean_squared_error(actual, predicted))
+    rmsall = sqrt(mean_squared_error(actualall, predictedall))
+    return hits, len(users), rms, rmsall
+
diff --git a/util.py b/util.py
new file mode 100644
index 0000000..199e5b5
--- /dev/null
+++ b/util.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[ ]:
+
+
+from numpy import random
+import numpy as np
+
+
+def num_users():
+    """Return the exclusive upper bound on user ids to load."""
+    return 1000000
+
+
+def dimension():
+    """Return the length of every latent factor vector."""
+    return 50
+
+
+def min_rating():
+    """Return the lowest rating that counts as a positive interaction."""
+    return 4
+
+
+def learning_rate():
+    """Return the SGD step size used by the BPR update."""
+    return 1
+
+
+def get_lambda():
+    """Return the regularisation weight used by the BPR update."""
+    return 0.1
+
+
+def random_vector():
+    """Draw a factor vector from N(0, cov_matrix())."""
+    return random.multivariate_normal(np.zeros(dimension()), cov_matrix())
+
+
+def cov_matrix():
+    """Return the dimension() x dimension() covariance 0.1 * identity."""
+    return np.eye(dimension(), dtype=float) * 0.1
+
+
+def m_normal(mean):
+    """Draw a vector from N(mean, cov_matrix())."""
+    return random.multivariate_normal(mean=mean, cov=cov_matrix())
+