An implementation of a basic Recommendation System built using Embedding Matrices in a Neural Net
# required libraries - numpy, pandas, pytorch
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import random
# loading the ratings table as a pandas dataframe
ratings = pd.read_csv('ratings.csv')
# getting the three column names from a pandas dataframe
user_col, item_col, rating_col = ratings.columns
# this function returns a python dictionary
# which maps each id to a corresponding index value
def list_2_dict(id_list:list):
    d = {}
    for index, each_id in enumerate(id_list):
        d[each_id] = index
    return d
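# for illustration: list_2_dict maps each raw id to a dense 0-based index,
# e.g. list_2_dict([3, 7, 42]) -> {3: 0, 7: 1, 42: 2}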
# splits the ratings dataframe into training and validation dataframes
def get_data(ratings, valid_pct:float = 0.2):
    # shuffle the row positions
    ln = random.sample(range(0, len(ratings)), len(ratings))
    # split based on the given validation set percentage
    part = int(len(ln) * valid_pct)
    valid_index = ln[0:part]
    train_index = ln[part:]
    valid = ratings.iloc[valid_index]
    train = ratings.iloc[train_index]
    return [train, valid]
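# for illustration: with valid_pct=0.2 and 1000 ratings, get_data returns
# [train (800 rows), valid (200 rows)], with rows sampled without replacement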
# get a batch -> (user, item and rating arrays) from the dataframe
def get_batch(ratings, start:int, end:int):
    # .iloc makes the slice positional, which is what we want on a shuffled dataframe
    return (ratings[user_col].iloc[start:end].values,
            ratings[item_col].iloc[start:end].values,
            ratings[rating_col].iloc[start:end].values)
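# for illustration (values are hypothetical, they depend on your ratings.csv):
# get_batch(ratings, 0, 3) -> (array([1, 1, 2]), array([31, 1029, 10]), array([2.5, 3. , 4. ])),
# i.e. the user ids, item ids and ratings of rows 0..2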
# get list of unique user ids
users = sorted(list(set(ratings[user_col].values)))
# get list of unique item ids
items = sorted(list(set(ratings[item_col].values)))
# generate dict of corresponding indexes for the user ids
user2idx = list_2_dict(users)
# generate dict of corresponding indexes for the item ids
item2idx = list_2_dict(items)
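# for illustration: user2idx[users[0]] == 0 and item2idx[items[-1]] == len(items) - 1,
# so arbitrary raw ids are mapped onto contiguous rows of the embedding matrices below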
# neural net based on Embedding matrices
class EmbeddingModel(nn.Module):
    def __init__(self, n_factors, n_users, n_items, y_range, initialise = 0.01):
        super().__init__()
        self.y_range = y_range
        self.u_weight = nn.Embedding(n_users, n_factors)
        self.i_weight = nn.Embedding(n_items, n_factors)
        self.u_bias = nn.Embedding(n_users, 1)
        self.i_bias = nn.Embedding(n_items, 1)
        # initialise the weights of the embeddings
        self.u_weight.weight.data.uniform_(-initialise, initialise)
        self.i_weight.weight.data.uniform_(-initialise, initialise)
        self.u_bias.weight.data.uniform_(-initialise, initialise)
        self.i_bias.weight.data.uniform_(-initialise, initialise)

    def forward(self, users, items):
        # element-wise multiply the embedding vectors of the given user and item ids
        dot = self.u_weight(users) * self.i_weight(items)
        # sum over the factor dimension (a dot product) and add both bias terms
        res = dot.sum(1) + self.u_bias(users).squeeze() + self.i_bias(items).squeeze()
        # squash the output into the given rating range
        return torch.sigmoid(res) * (self.y_range[1] - self.y_range[0]) + self.y_range[0]
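# for illustration, the tensor shapes inside forward() for a batch of B (user, item) pairs:
# self.u_weight(users) and self.i_weight(items) are (B, n_factors),
# dot.sum(1) and each bias .squeeze() are (B,), so the model returns B predicted ratings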
# create a model object
# y_range is extended to (0-11), beyond the required (1-10), so that the actual
# ratings lie in the near-linear region of the sigmoid rather than its saturated ends
model = EmbeddingModel(10, len(users), len(items), [0,11], initialise = 0.01).cuda()
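# a quick worked example of the range mapping: with y_range = [0, 11] a raw score of 0
# gives sigmoid(0) * 11 = 5.5, and a target rating of 10 only needs sigmoid(res) ~ 0.91,
# which keeps the real ratings (1-10) away from the flat tails of the sigmoid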
# split the data, returns a list [train, valid]
data = get_data(ratings, 0.1)
# loss = mean((target_rating - predicted_rating)**2)
loss_function = nn.MSELoss()
# optimizer function will update the weights of the Neural Net
optimizer = optim.SGD(model.parameters(), lr=0.05, momentum=0.9)
# batch size for each input
bs = 128
def train(epochs = 10, bs = 64):
    for epoch in range(epochs):
        # training the model
        model.train()  # switch back to training mode (validation below sets eval mode)
        i = 0
        total_loss = 0.0
        ct = 0
        while i < len(data[0]):
            x1, x2, y = get_batch(data[0], i, i+bs)
            i += bs
            ct += 1
            user_ids = torch.LongTensor([user2idx[u] for u in x1]).cuda()
            item_ids = torch.LongTensor([item2idx[b] for b in x2]).cuda()
            y = torch.Tensor(y).cuda()
            # disregard/zero the gradients from the previous computation
            model.zero_grad()
            preds = model(user_ids, item_ids)
            loss = loss_function(preds, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        total_loss /= ct
        # getting the loss on the validation set
        i = 0
        total_val_loss = 0.0
        cv = 0
        m = model.eval()  # setting the model to evaluation mode
        with torch.no_grad():  # no gradients are needed for validation
            while i < len(data[1]):
                x11, x21, y1 = get_batch(data[1], i, i+bs)
                i += bs
                cv += 1
                user_ids = torch.LongTensor([user2idx[u] for u in x11]).cuda()
                item_ids = torch.LongTensor([item2idx[b] for b in x21]).cuda()
                y1 = torch.Tensor(y1).cuda()
                preds = m(user_ids, item_ids)
                loss = loss_function(preds, y1)
                total_val_loss += loss.item()
        total_val_loss /= cv
        print('epoch', epoch+1, ' train loss', "%.3f" % total_loss,
              ' valid loss', "%.3f" % total_val_loss)
# train for 20 epochs with the batch size defined above
train(epochs=20, bs=bs)
# recommend the highest-predicted unseen items for a given user
def recommend_item_for_user(model, user_id):
    m = model.eval().cpu()
    user_ids = torch.LongTensor([user2idx[u] for u in [user_id]*len(items)])
    item_ids = torch.LongTensor([item2idx[b] for b in items])
    # items the user has already rated should not be recommended again
    remove = set(ratings[ratings[user_col] == user_id][item_col].values)
    preds = m(user_ids, item_ids).detach().numpy()
    pred_item = [(p, b) for p, b in sorted(zip(preds, items), reverse=True) if b not in remove]
    return pred_item
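# for illustration (the user id is picked arbitrarily from the data):
# recommend_item_for_user(model, users[0])[:10] gives the 10 not-yet-rated items with the
# highest predicted rating for that user, as (predicted_rating, item_id) pairs, best first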
# recommend the users predicted to rate a given item the highest
def recommend_user_for_item(model, item_id):
    m = model.eval().cpu()
    user_ids = torch.LongTensor([user2idx[u] for u in users])
    book_ids = torch.LongTensor([item2idx[b] for b in [item_id]*len(users)])
    # users who have already rated this item should not be recommended again
    remove = set(ratings[ratings[item_col] == item_id][user_col].values)
    preds = m(user_ids, book_ids).detach().numpy()
    pred_user = [(p, u) for p, u in sorted(zip(preds, users), reverse=True) if u not in remove]
    return pred_user
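# for illustration: recommend_user_for_item(model, items[0])[:10] gives the 10 users who
# have not yet rated that item but are predicted to rate it the highest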
# Making life easier with fast.ai
# required library -> fastai (https://github.com/fastai/fastai/)
from fastai.collab import *
from fastai.tabular import *
# loading the dataframe
ratings = pd.read_csv('ratings.csv')
# creating data object
data = CollabDataBunch.from_df(ratings, seed=42, valid_pct=0.1)
# creating a learner object used for training and testing
learn = collab_learner(data, n_factors=40, y_range=[0,11])
# training method using the fit_one_cycle technique of training
learn.fit_one_cycle(5, 5e-3)
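# (optional) a quick sanity check, assuming the fastai v1 Learner API used above:
# get_preds() returns (predictions, targets) for the validation set by default
preds, targets = learn.get_preds()
print(preds[:5], targets[:5])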