Last active
June 14, 2019 10:46
-
-
Save dmarx/10590950 to your computer and use it in GitHub Desktop.
Simple recommender system
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
partial_reconstruction = function(svd, n){ | |
# Takes an SVD object, reconstructs the matrix using | |
# the top 'n' singular values | |
j = nrow(svd$u) | |
k = nrow(svd$v) | |
with(svd, u[,1:n] %*% (d[1:n]*diag(n)) %*% t(v[,1:n]) ) | |
} | |
SVD_K_Test = function(mat, svd_ob=NULL, k=NULL){ | |
# takes an SVD object and a vector of integers k. Let j be | |
# a value in k: reconstructs the decomposed matrix using the | |
# first j singular values for each j in k. Calculates the similarity | |
# between the input matrix and the lower-dimension reconstructions | |
# for each j in k as the frobenius norm, returns the vector of these | |
# similarity measurements. | |
if(is.null(svd_ob)){svd_ob=svd(mat)} | |
n = length(svd_ob$d) | |
if(is.null(k)){k=1:n} | |
sim = rep(NA, length(k)) | |
for (j in 1:length(k)){ | |
M_k = partial_reconstruction(svd_ob, k[j]) | |
sim[j] = norm(mat - M_k, type='F') | |
} | |
sim | |
} | |
# Problem 3 | |
ratings = as.matrix(read.table('user-shows.txt')) | |
system.time(r_svd <- svd(ratings)) # Just 9 sec, not bad! | |
library(ggplot2) | |
qplot(r_svd$u[,1], r_svd$u[,2], alpha=.25) | |
# Describing the ratings matrix in just two dimensions does not appear to reveal any latent structure in the data (i.e. there aren't any clear user clusters in 2D genre space). | |
k=c(2,10,20,25,30,40,50,75,100,150,200,300, 400, 500) | |
sim = SVD_K_Test(ratings, r_svd, k) | |
plot(k, sim) | |
points(40,sim[k==40], col='red', pch=4) | |
# There appear to be diminishing (approximately linear) returns for setting $k>40$, | |
# so I will attempt using that as the reduced dimensionality into which I will project the ratings matrix. | |
# To calculate recommendations, I will perform user-user similarity measurements | |
# in the dimension-reduced latent "topic" space, and then develop a "recommendation" vector as a weighted average | |
# of the K-nearest neighbors to the target user's vector (Alex), using the similarity measurements as the weighting scheme. | |
# For simplicity, I'm going to use euclidean distance for now, but it would probably be better to use cosine similarity. | |
################################################################### | |
# Step 1: cluster users in the latent space with k=40 | |
k=40 | |
system.time(adj_mat <- dist(r_svd$u[,1:k])) # 14sec, not bad | |
m = as.matrix(adj_mat) | |
# Extract and rank similarities to our target user. | |
target_ix = 500 | |
ranks = m[500,] | |
plot(sort(ranks[-500])) | |
plot(sort(ranks[-500])[1:500]) | |
plot(sort(ranks[-500])[1:50]) # let's use the top 20 nearest-neighbors | |
NN_20 = order(ranks[-500])[1:20] | |
NN_20_sim = ranks[NN_20] | |
# step 2: Extract ratings from relevant users and average ratings | |
# according to user-user similarity. | |
relevant = ratings[NN_20,] # 20 users x 563 movies | |
# weight by similarity score, take column average to build recommendations | |
recommendations = colMeans(NN_20_sim*relevant) | |
##Should ensure that recommendations are new items to the user | |
#rated = recommendations%in%m[500,] | |
#new = rep(0, 562) | |
#new[!rated] = recommendations[!rated] | |
# We actually only need recommendations for the first 100 items. | |
new = recommendations[1:100] | |
# top 5 recommendations | |
#recs=order(recommendations, decreasing=TRUE) # 141 91 157 69 61 | |
recs=order(new, decreasing=TRUE)#[1:5] | |
# step 3: evaluate results | |
alex = which(read.table('alex.txt')==1) | |
sum(recs[1:5]%in%alex) # 3 out of 5, not bad | |
sum(recs%in%alex) | |
precision= cumsum(recs%in%alex) | |
plot(precision, type='l', xlab='n results') | |
plot(precision[1:100], type='l', xlab='n results') # 1/2 precision up to about 20 docs, then 1/4th for more |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment