# Simple recommender system
partial_reconstruction = function(svd_ob, n){
  # Takes an SVD object (as returned by svd()) and reconstructs the
  # decomposed matrix using only the top 'n' singular values/vectors
  with(svd_ob, u[, 1:n] %*% diag(d[1:n], nrow = n) %*% t(v[, 1:n]))
}
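# A quick sanity check on the helper (a small synthetic matrix, not the
# assignment data): reconstructing with every singular value should recover
# the original matrix up to floating-point error.
M_check = matrix(rnorm(20), nrow = 5)
max(abs(M_check - partial_reconstruction(svd(M_check), 4)))  # ~1e-15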
SVD_K_Test = function(mat, svd_ob=NULL, k=NULL){
  # Takes a matrix, an optional precomputed SVD object, and a vector of
  # integers k. For each value j in k, reconstructs the decomposed matrix
  # using only the first j singular values, then measures the reconstruction
  # error as the Frobenius norm of the difference between the input matrix
  # and the rank-j reconstruction. Returns the vector of these errors
  # (smaller = closer to the original matrix).
  if(is.null(svd_ob)){svd_ob = svd(mat)}
  n = length(svd_ob$d)
  if(is.null(k)){k = 1:n}
  sim = rep(NA_real_, length(k))
  for (j in seq_along(k)){
    M_k = partial_reconstruction(svd_ob, k[j])
    sim[j] = norm(mat - M_k, type='F')
  }
  sim
}
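# Usage sketch (on the synthetic M_check from above): with no k supplied,
# SVD_K_Test sweeps every possible rank, which gets slow for a large ratings
# matrix; a coarse grid of ranks (as done below for the real data) is usually
# enough to find the elbow.
SVD_K_Test(M_check)                 # full sweep over ranks 1..4
SVD_K_Test(M_check, k = c(1, 2, 4)) # coarse sweep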
# Problem 3
ratings = as.matrix(read.table('user-shows.txt'))
system.time(r_svd <- svd(ratings)) # Just 9 sec, not bad!
library(ggplot2)
qplot(r_svd$u[,1], r_svd$u[,2], alpha=I(.25)) # I() fixes the transparency rather than mapping it as an aesthetic
# Describing the ratings matrix in just two dimensions does not appear to reveal any latent structure in the data (i.e. there aren't any clear user clusters in 2D genre space).
k=c(2,10,20,25,30,40,50,75,100,150,200,300, 400, 500)
sim = SVD_K_Test(ratings, r_svd, k)
plot(k, sim)
points(40,sim[k==40], col='red', pch=4)
# There appear to be diminishing (approximately linear) returns for increasing $k$ beyond 40,
# so I will use k=40 as the reduced dimensionality into which I project the ratings matrix.
# To calculate recommendations, I will perform user-user similarity measurements
# in the dimension-reduced latent "topic" space, and then develop a "recommendation" vector as a weighted average
# of the K-nearest neighbors to the target user's vector (Alex), using the similarity measurements as the weighting scheme.
# For simplicity, I'm going to use euclidean distance for now, but it would probably be better to use cosine similarity.
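# A possible cosine-similarity variant (sketch only; not used below): cosine
# similarity between users in the k-dimensional latent space is the dot product
# of their row-normalized latent vectors. Larger values mean more similar, so
# neighbors would be taken with decreasing=TRUE rather than by smallest distance.
cosine_sim = function(U){
  U_norm = U / sqrt(rowSums(U^2))  # unit-normalize each user's latent vector
  U_norm %*% t(U_norm)             # all pairwise cosine similarities
}
# e.g.: cs = cosine_sim(r_svd$u[, 1:40]); order(cs[500, ], decreasing=TRUE)[2:21]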
###################################################################
# Step 1: compute user-user distances in the reduced latent space with k=40
k=40
system.time(adj_mat <- dist(r_svd$u[,1:k])) # 14sec, not bad
m = as.matrix(adj_mat)
# Extract and rank distances to our target user (Alex, user 500).
target_ix = 500
ranks = m[target_ix,]
plot(sort(ranks[-target_ix]))
plot(sort(ranks[-target_ix])[1:500])
plot(sort(ranks[-target_ix])[1:50]) # let's use the top 20 nearest neighbors
# order(ranks)[1] is the target user itself (distance 0), so skip it to get the
# 20 nearest neighbors, with indices into the full ratings matrix.
NN_20 = order(ranks)[2:21]
NN_20_sim = ranks[NN_20]
# step 2: Extract ratings from the relevant users and average them,
# weighting each neighbor by its similarity to the target user.
relevant = ratings[NN_20,] # 20 users x 563 shows
# dist() returns distances (smaller = more similar), so convert to similarity
# weights before taking the weighted column average that builds the recommendations.
w = 1/(1 + NN_20_sim)
recommendations = colSums(w*relevant)/sum(w)
## Should ensure that recommendations are new items to the user, e.g.:
# already_rated = ratings[target_ix,] > 0
# recommendations[already_rated] = 0
# We actually only need recommendations for the first 100 items.
new = recommendations[1:100]
# top 5 recommendations
#recs=order(recommendations, decreasing=TRUE) # 141 91 157 69 61
recs=order(new, decreasing=TRUE)#[1:5]
# step 3: evaluate results
alex = which(read.table('alex.txt')==1)
sum(recs[1:5]%in%alex) # 3 out of 5, not bad
sum(recs%in%alex)
precision = cumsum(recs %in% alex) / seq_along(recs) # precision@n for each cutoff n
plot(precision, type='l', xlab='n results')
plot(precision[1:100], type='l', xlab='n results') # 1/2 precision up to about 20 docs, then 1/4th for more
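# For reference, precision at a specific cutoff n is just the fraction of the
# top-n recommendations that Alex actually watched; a small helper (not part of
# the original analysis) makes it easy to read individual points off the curve.
precision_at = function(n) sum(recs[1:n] %in% alex) / n
# precision_at(5); precision_at(20); precision_at(100)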