Created
October 20, 2015 09:35
-
-
Save dimart/3d4a3e803757e1775ec6 to your computer and use it in GitHub Desktop.
ver2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from sklearn.naive_bayes import GaussianNB | |
from sklearn.cluster import KMeans | |
import hdf5_getters as h5 | |
import glob | |
import os | |
from math import log | |
import scipy | |
from sets import Set | |
# Minimum number of listened songs a user needs to be kept.
NUM_SONG_PER_USER = 25
# Maximum number of users to load from the triplets file.
USER_COUNT = 1000


def mk_data(fname="./train_triplets.txt", max_users=None, min_songs=None):
    """Load per-user play counts from a whitespace-separated triplets file.

    Every non-blank line must hold three fields: user id, song id and an
    integer play count.  Lines of one user are expected to be contiguous;
    a change of user id closes the previous user's record.

    Args:
        fname: Path of the triplets file.
        max_users: Stop once this many users have been kept.  Defaults to
            the module constant USER_COUNT.
        min_songs: Keep only users with at least this many songs.  Defaults
            to the module constant NUM_SONG_PER_USER.

    Returns:
        A pair ``(users, songs)``: ``users`` maps a user id to its list of
        ``(song_id, play_count)`` tuples and ``songs`` is the set of song
        ids listened to by the kept users.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields
            or its play count is not an integer.

    Side effects:
        Prints the number of distinct user runs seen, then the number of
        distinct songs kept.
    """
    if max_users is None:
        max_users = USER_COUNT
    if min_songs is None:
        min_songs = NUM_SONG_PER_USER

    users = {}
    songs = set()
    user_count = 0
    total = 0
    cur_user = None
    songs_per_user = []

    # The context manager closes the file even when a line fails to parse.
    with open(fname, "r") as txt:
        for line in txt:
            if user_count >= max_users:
                break
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            uId, sId, count = fields
            if uId != cur_user:
                total += 1
                if cur_user is not None and len(songs_per_user) >= min_songs:
                    users[cur_user] = songs_per_user
                    songs.update(s for (s, _) in songs_per_user)
                    user_count += 1
                songs_per_user = []
                cur_user = uId
            songs_per_user.append((sId, int(count)))

    # If the file ended before the quota was reached, the last user's record
    # is complete and has not been committed yet.  After a `break` the quota
    # is already full, so this branch is skipped.
    if (cur_user is not None and user_count < max_users
            and len(songs_per_user) >= min_songs):
        users[cur_user] = songs_per_user
        songs.update(s for (s, _) in songs_per_user)

    print(total)
    print(len(songs))
    return users, songs
def cluster_kmeans(users):
    """Turn each user's play counts into 1..5 labels with per-user k-means.

    Args:
        users: Mapping of user id to a list of ``(song_id, play_count)``
            tuples.

    Returns:
        A new dict mapping each user that has at least five songs to a list
        of ``(song_id, rating)`` tuples, where ``rating`` is the k-means
        cluster label plus one.  Users with fewer than five songs are left
        out.
    """
    good_users = {}
    for u in users:
        if len(users[u]) < 5:
            # Too few counts to split into five clusters: skip this user
            # only.  (A `break` here would drop every user after it.)
            continue
        songs = [sId for (sId, _) in users[u]]
        play_count = [cnt for (_, cnt) in users[u]]
        # One sample per song, a single feature column.
        samples = np.array(play_count).reshape(-1, 1)
        est = KMeans(n_clusters=5)
        est.fit(samples)
        # NOTE(review): cluster label ids are assigned by the estimator and
        # are not ordered by play count, so a larger rating does not
        # necessarily mean more plays -- confirm whether the labels should be
        # ranked by cluster centre before use.
        ratings = est.labels_ + 1
        # Materialise the pairs so the result can be iterated more than once.
        good_users[u] = list(zip(songs, ratings.tolist()))
    return good_users
def rate_generate(play_cnt):
    """Map a play count to a rating between 1 and 5.

    Buckets use inclusive upper bounds: up to 2 plays -> 1, up to 4 -> 2,
    up to 6 -> 3, up to 9 -> 4, anything larger -> 5.  For integer counts
    of 1 or more this is the same mapping as the former range checks.  The
    cumulative bounds also close the gaps those checks left: a fractional
    count such as 2.5 or a non-positive count used to fall through to the
    top rating 5, and now lands in the bucket its value belongs to.

    Args:
        play_cnt: Number of times the song was played.

    Returns:
        An int rating in the range 1..5.
    """
    for rating, upper_bound in enumerate((2, 4, 6, 9), 1):
        if play_cnt <= upper_bound:
            return rating
    return 5
def cluster_switch(users):
    """Replace every play count in *users* with its rating, in place.

    Args:
        users: Mapping of user id to a list of ``(song_id, play_count)``
            tuples.  The mapping itself is modified.

    Returns:
        The same ``users`` mapping, whose values are now lists of
        ``(song_id, rating)`` tuples produced by ``rate_generate``.
    """
    for u in users:
        # A list comprehension avoids the tuple-parameter lambda, which is a
        # syntax error on Python 3, and always yields a real list.
        users[u] = [(sId, rate_generate(pc)) for (sId, pc) in users[u]]
    return users
def _split_ratings(users, songs):
    """Build the training rating matrix and the held-out test ratings.

    Every fifth rating, counted across all users in iteration order, is
    held out for testing; the others are written into the matrix.

    Args:
        users: Mapping of user id to a list of ``(song_id, rating)`` tuples.
        songs: List of distinct song ids; a song's position is its column.

    Returns:
        A triple ``(R, test_data, seen)``: ``R`` is a users-by-songs array
        with 0 for "no training rating", ``test_data`` maps a row index to
        its list of held-out ``(column, rating)`` pairs, and ``seen`` is the
        total number of ratings visited.
    """
    # Dict lookup instead of list.index(), which rescans the list each time.
    song_index = dict((sId, j) for j, sId in enumerate(songs))
    R = np.zeros((len(users), len(songs)))
    test_data = {}
    seen = 0
    for i, u in enumerate(users):
        held_out = []
        for sId, rate in users[u]:
            seen += 1
            j = song_index[sId]
            if seen % 5 == 0:
                held_out.append((j, rate))
            else:
                R[i, j] = rate
        test_data[i] = held_out
    return R, test_data, seen


def _cosine_similarity(R):
    """Return the matrix of cosine similarities between the rows of R.

    Unrated entries are 0, so the full dot product equals the sum over
    co-rated songs.  A row with no ratings gets similarity 0 to every row
    instead of a NaN from dividing by a zero norm.
    """
    sq_norms = (R ** 2).sum(axis=1)
    denom = np.sqrt(np.outer(sq_norms, sq_norms))
    # Where a norm is 0 the dot product is 0 too, so dividing by 1 gives 0.
    denom[denom == 0] = 1.0
    return R.dot(R.T) / denom


def _dcg(values):
    """Sum values discounted by max(1, log2(position)), positions from 1."""
    return sum((v / max(1.0, log(pos, 2)) for pos, v in enumerate(values, 1)),
               0.0)


def main():
    """Evaluate user-based collaborative filtering on the loaded play counts.

    Loads the data with ``mk_data``, converts play counts to ratings with
    ``cluster_switch``, holds out every fifth rating, and predicts each
    held-out rating as the similarity-weighted mean of the ratings given to
    that song by the (at most) five most similar users who rated it.

    Prints the number of ratings visited, a countdown of user row indices
    as progress, and finally one line with MAE, RMSE and the ratio of the
    discounted gain of the predictions to that of the true ratings.  The
    metrics are NaN when no held-out rating could be predicted.

    Returns:
        0, always.
    """
    users, songs = mk_data()
    users = cluster_switch(users)
    R, test_data, seen = _split_ratings(users, list(songs))
    print(seen)

    n = R.shape[0]
    sims = _cosine_similarity(R)
    everyone = np.arange(n)

    sq_err = 0.0
    abs_err = 0.0
    predicted = 0
    gain_pred = 0.0
    gain_true = 0.0
    for u_iter in range(n - 1, -1, -1):
        row = sims[u_iter]
        others = everyone[everyone != u_iter]
        # Most similar first; ties are broken by the higher row index first.
        ranked = others[np.lexsort((others, row[others]))[::-1]]
        ranked = ranked[row[ranked] != 0]

        pred_rates = []
        true_rates = []
        for test_song, true_rate in test_data[u_iter]:
            neighbors = ranked[R[ranked, test_song] != 0][:5]
            if len(neighbors) == 0:
                continue  # nobody similar rated this song: no prediction
            weights = row[neighbors]
            new_rate = float(weights.dot(R[neighbors, test_song])
                             / np.abs(weights).sum())
            predicted += 1
            sq_err += (new_rate - true_rate) ** 2
            abs_err += abs(new_rate - true_rate)
            pred_rates.append(new_rate)
            true_rates.append(true_rate)

        # NOTE(review): predictions and true ratings are sorted independently
        # before discounting, so this is not the textbook nDCG (which orders
        # the true ratings by predicted score) -- kept as is; confirm intent.
        pred_rates.sort(reverse=True)
        true_rates.sort(reverse=True)
        gain_pred += _dcg(pred_rates)
        gain_true += _dcg(true_rates)
        print(u_iter)

    nan = float("nan")
    if predicted:
        RMSE = (sq_err / predicted) ** 0.5
        # MAE is the plain mean of the absolute errors; it takes no root.
        MAE = abs_err / predicted
    else:
        RMSE = MAE = nan
    # Both sums used to be divided by the same constant first, which cancels
    # in the ratio, so the ratio is taken directly.
    gain_ratio = gain_pred / gain_true if gain_true else nan
    print("%s %s %s" % (MAE, RMSE, gain_ratio))
    return 0


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment