Skip to content

Instantly share code, notes, and snippets.

@dzenanh
Created January 3, 2017 15:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dzenanh/8383ef7326636ad1d04fa27b7af16d60 to your computer and use it in GitHub Desktop.
Save dzenanh/8383ef7326636ad1d04fa27b7af16d60 to your computer and use it in GitHub Desktop.
# coding: utf-8
# ## import some stuff ##
# In[246]:
import numpy as np
import scipy as sc
from pandas import Series,DataFrame
import pandas as pd
from scipy import spatial
from sklearn import preprocessing
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from collections import OrderedDict
from fractions import Fraction
# Render matplotlib figures inline in the notebook output. ``run_line_magic``
# is the supported IPython API; the older ``magic()`` shortcut is deprecated.
get_ipython().run_line_magic('matplotlib', 'inline')
# Default (width, height) in inches for every figure created below.
mpl.rcParams['figure.figsize'] = (10.0, 5)
# # Part 1 #
# ## Collaborative filtering item-item ##
#
#
# This notebook is an implementation of the item-item collaborative filtering algorithm in Python.
# The missing rating of User5 for Hotel1 is going to be predicted.
# Recommendations are made based on these calculations.
#
# Have fun...
# In[247]:
# Column labels of the rating matrix: User1 .. User12.
user_labels = ['User{}'.format(number) for number in range(1, 13)]
# One entry per hotel: (label, stars given by User1 .. User12).
# A 0 marks "this user did not rate this hotel".
ratings_by_hotel = [
    ('Hotel1', [1, 0, 3, 0, 0, 5, 0, 0, 5, 0, 4, 0]),
    ('Hotel2', [0, 0, 5, 4, 0, 0, 4, 0, 0, 2, 1, 3]),
    ('Hotel3', [2, 4, 0, 1, 2, 0, 3, 0, 4, 3, 5, 0]),
    ('Hotel4', [0, 2, 4, 0, 5, 0, 0, 4, 0, 0, 2, 0]),
    ('Hotel5', [0, 0, 4, 3, 4, 2, 0, 0, 0, 0, 2, 5]),
    ('Hotel6', [1, 0, 3, 0, 3, 0, 0, 2, 0, 0, 4, 0]),
]
# Build the hotels-by-users frame directly (rows = hotels, columns = users).
df = pd.DataFrame(
    [stars for _, stars in ratings_by_hotel],
    index=[hotel for hotel, _ in ratings_by_hotel],
    columns=user_labels,
)
df
# In[248]:
# Visual sanity check of rating support: one stacked horizontal bar per user,
# with one segment per hotel, to see whether there is enough data to predict from.
df.T.plot(kind='barh', stacked=True)
# In[249]:
# Boolean frame, same shape as df: True wherever the stored rating is 0.
no_rating_mask = df.eq(0)
no_rating_mask
# In[250]:
#comes after
#df[no_rating_mask] = None
#df
# In[251]:
# Per-hotel mean over actual ratings only: cells flagged in no_rating_mask are
# blanked to NaN first, and mean() skips NaN by default.
hotel_rating_averages = df.where(~no_rating_mask).mean(axis=1)
hotel_rating_averages
# In[252]:
# Centre every hotel row on that hotel's own mean rating, keeping one decimal.
# Cells that held 0 (no rating) become -mean at this point.
dfn = df.subtract(hotel_rating_averages, axis='index').round(1)
dfn
# In[253]:
# Reset every cell flagged in no_rating_mask to 0, then round the frame to
# one decimal again.
dfn = dfn.mask(no_rating_mask, 0).round(1)
# In[254]:
dfn
# In[255]:
# Scatter-plot matrix with one variable per hotel (users are the observations),
# as a visual check of which hotels are rated alike.
sns.pairplot(dfn.T)
# In[256]:
# we could also plot hotel recommendation values vectors
# Rows of ``soa`` are users; unzipping the rows yields one tuple per hotel.
soa = dfn.transpose().values
# Materialise the zip so it prints its contents and can be unpacked afterwards
# on both Python 2 and Python 3.
hotel_vectors = list(zip(*soa))
print(hotel_vectors)
X, Y, U, V, Z, E = hotel_vectors
plt.figure()
ax = plt.gca()
# quiver() accepts at most X, Y, U, V (plus one optional colour array) as
# positional data, so the six hotel vectors cannot all be passed. The first
# four are used as arrow origins (X, Y) and components (U, V); Z and E are
# not drawn.
# NOTE(review): confirm which four hotel vectors the plot is meant to show.
ax.quiver(X, Y, U, V, angles='xy', scale_units='xy', scale=1)
ax.set_xlim([-1, 4])
ax.set_ylim([-1, 1])
plt.draw()
plt.show()
# In[257]:
# Hotel-to-hotel similarity as Pearson correlation, rounded to two decimals.
# corr() correlates columns, so the frame is transposed to put hotels there.
hotel_similarity_df = dfn.T.corr(method='pearson').round(2)
hotel_similarity_df
# In[258]:
# Annotated heatmap of the similarity matrix.
sns.heatmap(data=hotel_similarity_df, annot=True)
# In[259]:
# we could also calculate hotel similarities this way
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
# ``.values`` yields the underlying ndarray; it replaces
# ``DataFrame.as_matrix()``, which was deprecated and removed in pandas 1.0.
A_sparse = sparse.csr_matrix(dfn.values)
# dense_output=False keeps the pairwise result as a scipy sparse matrix
similarities_sparse = cosine_similarity(A_sparse, dense_output=False)
print('hotel pairwise similarity:\n {}\n'.format(similarities_sparse))
# # Now let's calculate how User5 would rate Hotel1
# In[260]:
# Hotel1 is most similar to the Hotels 3 and 6
# Boolean Series over hotels: True where the correlation with Hotel1 is > 0.30.
mask = hotel_similarity_df["Hotel1"] > 0.30
mask
# In[261]:
# take ratings of most similar hotels (3 and 6)
# Hotel1 itself normally passes the threshold (correlation with itself), so it
# is removed by label instead of by slicing off position 0, which would only
# be correct while Hotel1 happens to be the first selected row. Hotels that
# User5 has not rated (stored as 0) are excluded as well, so a missing rating
# is never averaged in as a zero-star rating.
user5_ratings = df["User5"]
neighbours = mask & (mask.index != "Hotel1") & (user5_ratings != 0)
hotel_ratings = user5_ratings[neighbours].values
hotel_ratings
# In[262]:
# take similarities of most similar hotels (3 and 6)
hotel_sim = hotel_similarity_df["Hotel1"][neighbours].values
hotel_sim
# In[263]:
#calculate rating for hotel 1 from user 5
# predict by taking weighted average: sum(rating * similarity) / sum(similarity)
r_15 = (hotel_ratings * hotel_sim).sum() / hotel_sim.sum()
# Single pre-formatted string so the output is the same on Python 2 and 3.
print("User 5 would rate Hotel 1 with:  {}  stars".format(round(r_15, 1)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment