Last active
August 11, 2019 14:05
-
-
Save badalnabizade/d190e8016b2ee5d7f773cc7cefd84ceb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import tensorflow as tf | |
from tensorflow.keras import datasets, layers, models | |
from tensorflow.keras.layers import Input, add, dot, Flatten, Embedding,Dropout, concatenate | |
from tensorflow.keras.regularizers import l2 | |
from tensorflow.keras import optimizers | |
# --- Load and align the MovieLens data --------------------------------------
ratings = pd.read_csv('./data/ratings.csv')
# Only userId / movieId / rating are needed for the factorization model.
# NOTE: the original `ratings.drop('timestamp', 1)` used a positional `axis`
# argument, which was removed in pandas 2.0 — use the explicit keyword form.
ratings.drop(columns=['timestamp'], inplace=True)

movies = pd.read_csv('./data/movies.csv')
# Keep only movies that actually appear in ratings.csv. A set makes each
# membership test O(1); the original list comprehension rescanned the whole
# ratings id array for every movie (O(n*m)).
rated_ids = set(ratings['movieId'].unique())
movies = movies[movies['movieId'].isin(rated_ids)].reset_index(drop=True)
def proc_data_for_model(ratings_data, movie_col, user_col, movies_data=None):
    """
    Re-index user and movie ids as contiguous 0-based integers for Embedding layers.

    Replaces the values of ``user_col`` and ``movie_col`` in ``ratings_data``
    in place, and remaps ``movie_col`` in ``movies_data`` with the same mapping.

    BUG FIX: the original body mutated the module-level ``ratings`` frame and
    ignored the ``ratings_data`` parameter entirely; it now uses the parameter.

    Parameters
    ----------
    ratings_data : pandas.DataFrame
        Ratings table; modified in place.
    movie_col, user_col : str
        Names of the movie-id and user-id columns.
    movies_data : pandas.DataFrame, optional
        Movies table whose ``movie_col`` is remapped in place. Defaults to the
        module-level ``movies`` frame, preserving the original behavior.

    Returns
    -------
    tuple of int
        ``(n_users, n_movies)`` — unique users and movies in ``ratings_data``.
    """
    if movies_data is None:
        movies_data = movies  # backward-compatible fallback to the global frame

    # Map raw user ids to 0..n_users-1 in first-seen order.
    unique_users = ratings_data[user_col].unique()
    user_to_indx = {v: i for i, v in enumerate(unique_users)}
    ratings_data[user_col] = ratings_data[user_col].map(user_to_indx)

    # Map raw movie ids to 0..n_movies-1; apply the same mapping to both frames.
    unique_movies = ratings_data[movie_col].unique()
    movie_to_indx = {o: i for i, o in enumerate(unique_movies)}
    movies_data[movie_col] = movies_data[movie_col].map(movie_to_indx)
    ratings_data[movie_col] = ratings_data[movie_col].map(movie_to_indx)

    return len(unique_users), len(unique_movies)
# Remap raw ids to contiguous indices in-place and capture the vocabulary
# sizes needed to dimension the embedding matrices below.
n_users, n_movies = proc_data_for_model(ratings, 'movieId', 'userId')
n_factors = 50 # size of embedding matrix.
def get_input(n_inputs, shape=(1,)):
    """Return a list of ``n_inputs`` Keras Input tensors, each with ``shape``."""
    return [Input(shape=shape) for _ in range(n_inputs)]
def get_emb(input_, input_dim, output_dim, inp_length, reg_func, name):
    """Attach a named, regularized Embedding layer to ``input_`` and return its output."""
    layer = Embedding(
        input_dim,
        output_dim,
        input_length=inp_length,
        embeddings_regularizer=reg_func,
        name=name,
    )
    return layer(input_)
def get_bias_emb(input_, input_dim):
    """Per-id scalar bias: a 1-dimensional embedding flattened to (batch, 1)."""
    bias = Embedding(input_dim=input_dim, output_dim=1, input_length=1)(input_)
    return Flatten()(bias)
# --- Build and train the matrix-factorization model --------------------------
u_inp, m_inp = get_input(2)

# Latent-factor embeddings for users and movies (L2-regularized).
u_emb = get_emb(u_inp, n_users, n_factors, 1, l2(1e-4), 'user_embedding')
m_emb = get_emb(m_inp, n_movies, n_factors, 1, l2(1e-4), 'movie_embedding')

# Per-user and per-movie scalar bias terms.
u_b, m_b = get_bias_emb(u_inp, n_users), get_bias_emb(m_inp, n_movies)

opt = optimizers.SGD(1e-1, decay=2e-4, momentum=0.9)

# Predicted rating = dot(user_factors, movie_factors) + user_bias + movie_bias.
x = dot([u_emb, m_emb], axes=2)
x = Flatten()(x)
x = add([x, u_b])
x = add([x, m_b])

model = models.Model([u_inp, m_inp], x)
model.compile(opt, loss='mse')

# BUG FIX: `train` and `val` were referenced by the original fit() call but
# never defined anywhere in the file (NameError at runtime). Hold out ~20%
# of the ratings for validation with a random mask.
mask = np.random.rand(len(ratings)) < 0.8
train, val = ratings[mask], ratings[~mask]

model.fit([train.userId, train.movieId], train.rating, batch_size=512, epochs=15,
          validation_data=([val.userId, val.movieId], val.rating))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment