Skip to content

Instantly share code, notes, and snippets.

@wy36101299
Last active August 29, 2015 14:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wy36101299/35b71ab48f38ac890212 to your computer and use it in GitHub Desktop.
Save wy36101299/35b71ab48f38ac890212 to your computer and use it in GitHub Desktop.
collaborative filtering
import numpy as np
# linalg:Linear algebra
# 日式 中式 美式 泰式 韓式
# ---------------------------
# sam 2 0 0 4 4
#john 5 5 5 3 3
# tim 2 4 2 1 2
#以下矩陣可看此 user 對 item 進行評分
#藉由協同過濾 猜出 sam對中式和美式的評價後,推薦sam吃什麼
def loadItemScore():
tmp = [[4, 4, 0, 2, 2],
[4, 0, 0, 3, 3],
[4, 0, 0, 1, 1],
[1, 1, 1, 2, 0],
[2, 2, 2, 0, 0],
[5, 5, 5, 0, 0],
[1, 1, 1, 0, 0]]
itemScore = np.array(tmp,dtype=np.float)
return itemScore
def loadItemScore2():
tmp = [[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
[0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
[0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
[3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
[5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
[0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
[4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
[0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
[0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
[0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
[1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]]
itemScore = np.array(tmp,dtype=np.float)
return itemScore
# SVD 矩陣奇異值分解
# 利用SVD奇異值分解可以大幅減低矩陣的儲存量
U,sigma,VT = np.linalg.svd(loadItemScore2())
# 矩陣能量為 sum(sigma**2) 需保留90%為典型作法
sig2 = sigma**2
print("矩陣總能量為:")
print(sum(sig2))
print("90%矩陣能量:")
print(sum(sig2)*0.9)
print("轉換為二維矩陣能量:")
print(sum(sig2[:2]))
print("轉換三維矩陣能量:")
print(sum(sig2[:3]))
# 轉換為二維能量太低,需為三維 svdMatrix則為還原後的矩陣
Sig4 = np.mat(np.eye(3)*sigma[:3])
svdMatrix = U[:,:3] * Sig4 * VT[:3,:]
# 資料來源:http://en.wikipedia.org/wiki/Singular_value_decomposition
# Euclid Similarity
def eulidSim(a,b):
eulid = np.linalg.norm(a-b)
norEulid = 1.0/(1.0 + eulid)
return norEulid
# Adjusted Cosine Similarity
# 不超過3 Similarity =1
def adjCosSim(a,b):
if len(a)<3:
return 1
else:
a = a-a.mean()
b = b-b.mean()
cos = float(np.inner(a,b))/(np.linalg.norm(a)*np.linalg.norm(b))
norCos = 0.5+0.5*cos
return norCos
# Cosine Similarity
def cosSim(a,b):
cos = float(np.inner(a,b))/(np.linalg.norm(a)*np.linalg.norm(b))
norCos = 0.5+0.5*cos
return norCos
# Person Correlation Coefficient Similarity
# 不超過3 Similarity =1
def pearsSim(a,b):
if len(a)<3:
return 1
else:
corcoef = np.corrcoef(a,b,rowvar=0)[0][1]
norCorcoef = 0.5+0.5*corcoef
return norCorcoef
# 資料來源:http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
# Person Correlation Coefficient Similarity 和 Adjusted Cosine Similarity 比較
# In pearson correlation,減去該物品ㄉ
# In adjusted cosine correlation 不同使用者可能對物品會有較高或是較低的評分表現 因此減去使用者平均的評分 e.x.(4,5)(1,2)視為行為相似
# 資料來源:http://www.cs.carleton.edu/cs_comps/0607/recommend/recommender/itembased.html
# 概念藉由使用者都評分物品的分數做相似度運算,其結果*該未評分之物品 總和取平均 即為可能的分數
def itemSimRec(itemScore, user, simMethods, item):
# user number : shape(itemScore)[0] item number : shape(itemScore)[1]
n = np.shape(itemScore)[1]
simTotal = 0.0; ratSimTotal = 0.0
for i in range(n):
userItemRating = itemScore[user,i]
# pass norating item
if userItemRating == 0 or i==item:
pass
else:
# find same rating item
sameRatingItem = np.nonzero(np.logical_and(itemScore[:,item] > 0,itemScore[:,i] > 0))[0]
if len(sameRatingItem) == 0:
similarity = 0
else:
similarity = simMethods(itemScore[sameRatingItem,item],itemScore[sameRatingItem,i])
# print(itemScore[sameRatingItem,item])
# print(itemScore[sameRatingItem,i])
# print 'the %d and %d similarity is: %f' % (item, i, similarity)
simTotal += similarity
ratSimTotal += similarity * userItemRating
if simTotal == 0:
return 0
else:
return ratSimTotal/simTotal
def recommend(itemScore, user, N=3, simMethods=cosSim, estMethod=itemSimRec):
#find unrated items
unratedItems = np.nonzero(itemScore[user,:] ==0)[0]
if len(unratedItems) == 0:
return 'you rated everything'
else:
itemScores = []
for item in unratedItems:
estimatedScore = estMethod(itemScore, user, simMethods, item)
itemScores.append((item, estimatedScore))
recItem = sorted(itemScores, key=lambda x: x[1], reverse=True)[:N]
return recItem
# SVD分解再還原 只有有多出SVD的部分,其餘和itemSimRec算法相同
def svdItemSimRec(itemScore, user, simMethods, item):
n = np.shape(itemScore)[1]
simTotal = 0.0; ratSimTotal = 0.0
U,Sigma,VT = np.linalg.svd(itemScore)
Sig4 = np.mat(np.eye(3)*Sigma[:3]) #arrange Sig4 into a diagonal matrix
xformedItems = U[:,:3] * Sig4 * VT[:3,:] #create transformed items
for i in range(n):
userItemRating = itemScore[user,i]
# pass norating item
if userItemRating == 0 or i==item:
pass
else:
# find same rating item
sameRatingItem = np.nonzero(np.logical_and(itemScore[:,item] > 0,itemScore[:,i] > 0))[0]
if len(sameRatingItem) == 0:
similarity = 0
else:
similarity = simMethods(itemScore[sameRatingItem,item],itemScore[sameRatingItem,i])
# print(itemScore[sameRatingItem,item])
# print(itemScore[sameRatingItem,i])
# print 'the %d and %d similarity is: %f' % (item, i, similarity)
simTotal += similarity
ratSimTotal += similarity * userItemRating
if simTotal == 0:
return 0
else:
return ratSimTotal/simTotal
# [(4, 5.0), (9, 5.0), (10, 4.7297297297297298)]
# 物品4 分數5 物品9 分數5 物品10 分數4.7 依序推薦
print("estMethod=itemSimRec")
print("--------------------")
print("simMethods=eulidSim")
print(recommend(loadItemScore2(),4,simMethods=eulidSim,estMethod=itemSimRec))
print("simMethods=cosSim")
print(recommend(loadItemScore2(),4,simMethods=cosSim,estMethod=itemSimRec))
print("simMethods=adjCosSim")
print(recommend(loadItemScore2(),4,simMethods=adjCosSim,estMethod=itemSimRec))
print("simMethods=pearsSim")
print(recommend(loadItemScore2(),4,simMethods=pearsSim,estMethod=itemSimRec))
# 照理說經過SVD分解後,可大幅減少儲存量,但對於矩陣還原可能會失真,故會比itemSimRec中有更多誤差存在
print("estMethod=svdItemSimRec")
print("-----------------------")
print("simMethods=eulidSim")
print(recommend(loadItemScore2(),4,simMethods=eulidSim,estMethod=svdItemSimRec))
print("simMethods=cosSim")
print(recommend(loadItemScore2(),4,simMethods=cosSim,estMethod=svdItemSimRec))
print("simMethods=adjCosSim")
print(recommend(loadItemScore2(),4,simMethods=adjCosSim,estMethod=svdItemSimRec))
print("simMethods=pearsSim")
print(recommend(loadItemScore2(),4,simMethods=pearsSim,estMethod=svdItemSimRec))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment