# Simple recommender system
partial_reconstruction = function(svd_ob, n){
  # Takes an SVD object (as returned by svd()) and reconstructs the
  # decomposed matrix using only the top 'n' singular values/vectors
  with(svd_ob, u[, 1:n] %*% diag(d[1:n], nrow = n) %*% t(v[, 1:n]))
}
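# A quick sanity check on the helper (a small synthetic matrix, not the
# assignment data): reconstructing with every singular value should recover
# the original matrix up to floating-point error.
M_check = matrix(rnorm(20), nrow = 5)
max(abs(M_check - partial_reconstruction(svd(M_check), 4)))  # ~1e-15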
SVD_K_Test = function(mat, svd_ob=NULL, k=NULL){
  # Takes a matrix, an optional precomputed SVD object, and a vector of
  # integers k. For each value j in k, reconstructs the decomposed matrix
  # using only the first j singular values, then measures the reconstruction
  # error as the Frobenius norm of the difference between the input matrix
  # and the rank-j reconstruction. Returns the vector of these errors
  # (smaller = closer to the original matrix).
  if(is.null(svd_ob)){svd_ob = svd(mat)}
  n = length(svd_ob$d)
  if(is.null(k)){k = 1:n}
  sim = rep(NA_real_, length(k))
  for (j in seq_along(k)){
    M_k = partial_reconstruction(svd_ob, k[j])
    sim[j] = norm(mat - M_k, type='F')
  }
  sim
}
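# Usage sketch (on the synthetic M_check from above): with no k supplied,
# SVD_K_Test sweeps every possible rank, which gets slow for a large ratings
# matrix; a coarse grid of ranks (as done below for the real data) is usually
# enough to find the elbow.
SVD_K_Test(M_check)                 # full sweep over ranks 1..4
SVD_K_Test(M_check, k = c(1, 2, 4)) # coarse sweep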
# Problem 3
ratings = as.matrix(read.table('user-shows.txt'))
system.time(r_svd <- svd(ratings)) # Just 9 sec, not bad!
library(ggplot2)
qplot(r_svd$u[,1], r_svd$u[,2], alpha=I(.25)) # I() fixes the transparency rather than mapping it as an aesthetic
# Describing the ratings matrix in just two dimensions does not appear to reveal any latent structure in the data (i.e. there aren't any clear user clusters in 2D genre space).
k=c(2,10,20,25,30,40,50,75,100,150,200,300, 400, 500)
sim = SVD_K_Test(ratings, r_svd, k)
plot(k, sim)
points(40,sim[k==40], col='red', pch=4)
# There appear to be diminishing (approximately linear) returns for increasing $k$ beyond 40,
# so I will use k=40 as the reduced dimensionality into which I project the ratings matrix.
# To calculate recommendations, I will perform user-user similarity measurements
# in the dimension-reduced latent "topic" space, and then develop a "recommendation" vector as a weighted average
# of the K-nearest neighbors to the target user's vector (Alex), using the similarity measurements as the weighting scheme.
# For simplicity, I'm going to use euclidean distance for now, but it would probably be better to use cosine similarity.
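# A possible cosine-similarity variant (sketch only; not used below): cosine
# similarity between users in the k-dimensional latent space is the dot product
# of their row-normalized latent vectors. Larger values mean more similar, so
# neighbors would be taken with decreasing=TRUE rather than by smallest distance.
cosine_sim = function(U){
  U_norm = U / sqrt(rowSums(U^2))  # unit-normalize each user's latent vector
  U_norm %*% t(U_norm)             # all pairwise cosine similarities
}
# e.g.: cs = cosine_sim(r_svd$u[, 1:40]); order(cs[500, ], decreasing=TRUE)[2:21]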
###################################################################
# Step 1: compute user-user distances in the reduced latent space with k=40
k=40
system.time(adj_mat <- dist(r_svd$u[,1:k])) # 14sec, not bad
m = as.matrix(adj_mat)
# Extract and rank distances to our target user (Alex, user 500).
target_ix = 500
ranks = m[target_ix,]
plot(sort(ranks[-target_ix]))
plot(sort(ranks[-target_ix])[1:500])
plot(sort(ranks[-target_ix])[1:50]) # let's use the top 20 nearest neighbors
# order(ranks)[1] is the target user itself (distance 0), so skip it to get the
# 20 nearest neighbors, with indices into the full ratings matrix.
NN_20 = order(ranks)[2:21]
NN_20_sim = ranks[NN_20]
# step 2: Extract ratings from the relevant users and average them,
# weighting each neighbor by its similarity to the target user.
relevant = ratings[NN_20,] # 20 users x 563 shows
# dist() returns distances (smaller = more similar), so convert to similarity
# weights before taking the weighted column average that builds the recommendations.
w = 1/(1 + NN_20_sim)
recommendations = colSums(w*relevant)/sum(w)
## Should ensure that recommendations are new items to the user, e.g.:
# already_rated = ratings[target_ix,] > 0
# recommendations[already_rated] = 0
# We actually only need recommendations for the first 100 items.
new = recommendations[1:100]
# top 5 recommendations
#recs=order(recommendations, decreasing=TRUE) # 141 91 157 69 61
recs=order(new, decreasing=TRUE)#[1:5]
# step 3: evaluate results
alex = which(read.table('alex.txt')==1)
sum(recs[1:5]%in%alex) # 3 out of 5, not bad
sum(recs%in%alex)
precision = cumsum(recs %in% alex) / seq_along(recs) # precision@n for each cutoff n
plot(precision, type='l', xlab='n results')
plot(precision[1:100], type='l', xlab='n results') # 1/2 precision up to about 20 docs, then 1/4th for more
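# For reference, precision at a specific cutoff n is just the fraction of the
# top-n recommendations that Alex actually watched; a small helper (not part of
# the original analysis) makes it easy to read individual points off the curve.
precision_at = function(n) sum(recs[1:n] %in% alex) / n
# precision_at(5); precision_at(20); precision_at(100)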