# Source: GitHub gist dimart/3d4a3e803757e1775ec6 ("ver2"),
# created by @dimart on October 20, 2015 09:35.
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
import hdf5_getters as h5
import glob
import os
from math import log
import scipy
from sets import Set
# Minimum number of (song, play count) entries a user must have for mk_data
# to keep that user.
NUM_SONG_PER_USER = 25
# Number of qualifying users mk_data collects before it stops reading.
USER_COUNT = 1000
def mk_data(fname="./train_triplets.txt", max_users=None, min_songs=None):
    """Load per-user listening histories from a triplets file.

    Each non-blank line must hold three whitespace-separated fields:
    user id, song id and an integer play count.  Consecutive lines with the
    same user id are treated as one user's history (assumes the file is
    grouped by user -- a user id that reappears later overwrites its earlier
    entry).

    Args:
        fname: Path of the triplets file.
        max_users: Stop once this many users have been kept.  Defaults to
            the module constant USER_COUNT.
        min_songs: Keep only users with at least this many entries.
            Defaults to the module constant NUM_SONG_PER_USER.

    Returns:
        A ``(users, songs)`` pair: ``users`` maps user id to a list of
        ``(song id, play count)`` tuples, ``songs`` is the set of song ids
        that occur in the kept users' histories.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields
            or its play count is not an integer.

    Side effects:
        Prints the number of user runs encountered and the number of
        distinct songs kept.
    """
    if max_users is None:
        max_users = USER_COUNT
    if min_songs is None:
        min_songs = NUM_SONG_PER_USER

    users = {}
    songs = set()
    user_count = 0
    total = 0
    cur_user = None
    songs_per_user = []

    # The context manager closes the file even if parsing raises.
    with open(fname, "r") as txt:
        for line in txt:
            if user_count >= max_users:
                break
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing newline at end of file).
                continue
            uId, sId, count = fields
            if uId != cur_user:
                # A new user's run starts: commit the previous one first.
                total += 1
                if _keep_user(users, songs, cur_user, songs_per_user,
                              min_songs):
                    user_count += 1
                songs_per_user = []
                cur_user = uId
            songs_per_user.append((sId, int(count)))

    # End of file reached before the quota was met: the last user's run was
    # never followed by another user id, so commit it here.
    if user_count < max_users:
        if _keep_user(users, songs, cur_user, songs_per_user, min_songs):
            user_count += 1

    print(total)
    print(len(songs))
    return users, songs


def _keep_user(users, songs, user_id, plays, min_songs):
    """Record user_id's plays if there are at least min_songs; return whether kept."""
    if user_id is None or len(plays) < min_songs:
        return False
    users[user_id] = plays
    songs.update(song_id for song_id, _ in plays)
    return True
def cluster_kmeans(users, n_clusters=5):
    """Turn each user's play counts into 1..n_clusters labels with k-means.

    For every user the play counts are clustered as one-dimensional samples
    and each song is assigned its cluster label plus one.

    NOTE(review): the label is the k-means cluster index, which is not
    ordered by play count, so a higher label does not necessarily mean
    "played more" -- confirm this is acceptable before using the labels as
    ordinal ratings.

    Args:
        users: Mapping of user id to a list of ``(song id, play count)``.
        n_clusters: Number of clusters, i.e. distinct labels per user.

    Returns:
        A new dict mapping user id to a list of ``(song id, label)``.
        Users with fewer than ``n_clusters`` entries are left out, because
        k-means cannot form that many clusters from them.
    """
    est = KMeans(n_clusters=n_clusters)
    good_users = {}
    for u, plays in users.items():
        if len(plays) < n_clusters:
            # Skip only this user; the remaining users must still be
            # processed (the original `break` dropped all of them).
            continue
        songs = [song_id for song_id, _ in plays]
        # KMeans expects a 2-D (n_samples, n_features) array.
        counts = np.array([cnt for _, cnt in plays]).reshape(-1, 1)
        est.fit(counts)
        ratings = est.labels_ + 1
        # Materialise the pairs so the result is reusable on Python 3 too.
        good_users[u] = list(zip(songs, ratings.tolist()))
    return good_users
def rate_generate(play_cnt):
    """Map a play count onto a rating from 1 to 5.

    Counts 1-2 give 1, 3-4 give 2, 5-6 give 3 and 7-9 give 4.  Any value
    that falls in none of these closed ranges (10 and above, but also values
    below 1 or between two ranges) gives 5.
    """
    # (lowest count, highest count, rating) -- bounds are inclusive.
    buckets = ((1, 2, 1), (3, 4, 2), (5, 6, 3), (7, 9, 4))
    for lowest, highest, rating in buckets:
        if lowest <= play_cnt <= highest:
            return rating
    return 5
def cluster_switch(users):
    """Replace every play count in ``users`` by its rate_generate rating.

    ``users`` maps a user id to a list of ``(song id, play count)`` pairs.
    The mapping is updated in place (each list is replaced by a new list of
    ``(song id, rating)`` pairs) and also returned.
    """
    for user_id, plays in list(users.items()):
        users[user_id] = [
            (song_id, rate_generate(play_cnt)) for song_id, play_cnt in plays
        ]
    return users
def main():
    """Evaluate user-based collaborative filtering on the loaded data.

    Steps:
      1. Load the listening histories (mk_data) and convert play counts to
         1-5 ratings (cluster_switch).
      2. Put every fifth rating aside as test data; the rest fill the
         user x song matrix R (0 means "no rating").
      3. For each held-out rating, predict it as the cosine-weighted mean of
         the ratings given to that song by the (up to) five most similar
         users who rated it.
      4. Print the mean absolute error, the root mean squared error and the
         ratio of the DCG of the predicted ratings to the DCG of the true
         ratings.

    Side effects:
        Prints the total number of ratings, then each user index as it is
        evaluated, then the three metrics on one line.

    Returns:
        0 on completion.

    Raises:
        ZeroDivisionError: If not a single test rating could be predicted,
            so the metrics are undefined.
    """
    users, songs = mk_data()
    users = cluster_switch(users)
    R, test_data, rating_count = _split_ratings(users, list(songs))
    print(rating_count)

    similarity = _cosine_similarity(R)

    squared_error = 0.0
    absolute_error = 0.0
    prediction_count = 0
    predicted_dcg = 0.0
    true_dcg = 0.0
    for u_iter in range(R.shape[0] - 1, -1, -1):
        predicted = []
        actual = []
        for test_song, true_rate in test_data[u_iter]:
            new_rate = _predict_rating(R, similarity, u_iter, test_song)
            if new_rate is None:
                # Nobody comparable rated this song: no prediction possible.
                continue
            prediction_count += 1
            squared_error += (new_rate - true_rate) ** 2
            absolute_error += abs(new_rate - true_rate)
            predicted.append(new_rate)
            actual.append(true_rate)
        # NOTE(review): both lists are ranked independently, so this is the
        # DCG of the predicted values versus the DCG of the true values, not
        # the textbook nDCG (true relevance in predicted order) -- confirm
        # this is the intended metric.
        predicted_dcg += _dcg(predicted)
        true_dcg += _dcg(actual)
        print(u_iter)

    if prediction_count == 0:
        raise ZeroDivisionError(
            "no test rating could be predicted; metrics are undefined")

    RMSE = (squared_error / prediction_count) ** 0.5
    # MAE is the plain mean of the absolute errors; only RMSE takes a root.
    MAE = absolute_error / prediction_count
    # Dividing both sums by the user count would cancel out in the ratio.
    dcg_ratio = float(predicted_dcg / true_dcg)
    print("%s %s %s" % (MAE, RMSE, dcg_ratio))
    return 0


def _split_ratings(users, songs, holdout_every=5):
    """Build the training matrix and hold out every holdout_every-th rating for testing."""
    # One dict lookup per rating instead of a linear list.index() scan.
    song_index = {song_id: j for j, song_id in enumerate(songs)}
    R = np.zeros((len(users), len(songs)))
    test_data = {}
    seen = 0
    for i, user_id in enumerate(users):
        test_data[i] = []
        for song_id, rate in users[user_id]:
            seen += 1
            j = song_index[song_id]
            if seen % holdout_every == 0:
                test_data[i].append((j, rate))
            else:
                R[i, j] = rate
    return R, test_data, seen


def _cosine_similarity(R):
    """Return the matrix of cosine similarities between all pairs of rows of R."""
    # Unrated entries are 0, so the dot product only sums co-rated songs.
    dots = R.dot(R.T)
    # The diagonal of R.R^T holds each row's squared norm.
    squared_norms = dots.diagonal()
    denominators = np.sqrt(np.outer(squared_norms, squared_norms))
    similarity = np.zeros_like(dots)
    # Rows without any rating get similarity 0 instead of NaN.
    defined = denominators > 0
    similarity[defined] = dots[defined] / denominators[defined]
    return similarity


def _predict_rating(R, similarity, user, song, neighbor_count=5):
    """Predict user's rating of song from its most similar raters; None if there are none."""
    raters = np.nonzero(R[:, song])[0].tolist()
    neighbors = [
        (similarity[user, other], other)
        for other in raters
        if other != user and similarity[user, other] != 0
    ]
    if not neighbors:
        return None
    # Most similar first; ties go to the higher user index.
    neighbors.sort(reverse=True)
    neighbors = neighbors[:neighbor_count]
    weighted_sum = sum(cos * R[other, song] for cos, other in neighbors)
    weight_total = sum(abs(cos) for cos, _ in neighbors)
    return float(weighted_sum / weight_total)


def _dcg(values):
    """Return the discounted cumulative gain of values ranked in descending order."""
    ranked = sorted(values, reverse=True)
    # Ranks 1 and 2 are undiscounted (log2 <= 1); rank r >= 3 uses log2(r).
    return sum(
        value / max(1.0, log(rank, 2))
        for rank, value in enumerate(ranked, 1)
    )
# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment