Last active
August 20, 2018 13:38
-
-
Save abelsonlive/6689344 to your computer and use it in GitHub Desktop.
Cosine Distance Recommendation / Collaborative Filtering Example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Let's make some dummy data: a binary attendance matrix with one row per
# politician and one column per event (1 = attended, 0 = did not attend).
n_rows <- 1000
n_cols <- 100
mat <- matrix(
  rbinom(n_rows * n_cols, size = 1, prob = 0.1),
  nrow = n_rows,
  ncol = n_cols,
  dimnames = list(
    paste0("pol", seq_len(n_rows)),
    paste0("event", seq_len(n_cols))
  )
)

# Let's take a look at it before we do some math.
head(mat)

# Now we're going to write a script that, given the vector of events a
# politician has gone to, returns the cosine similarity to every politician.
# This could easily be adapted to search across ALL politicians, so that we
# could identify the nearest X neighbors of each politician.
# I got this code from here:
# http://www2.research.att.com/~volinsky/DataMining/Columbia2011/HW/HW6-Solution.pdf

# Let's take one pol for starters.
sample_pol <- "pol100"
sample_pol_vec <- mat[sample_pol, ]

# Cosine similarity in three steps: dot product of every row with the sample
# row, the product of the two vector norms, and their ratio.
# NOTE: despite the variable name, higher values mean MORE similar.
# Rows with no events at all have a zero norm and therefore score NaN.
numerator <- drop(mat %*% sample_pol_vec)
denominator <- sqrt(sum(sample_pol_vec^2) * rowSums(mat * mat))
cosine_distance <- numerator / denominator

# We then order the other pols by their similarity with our sample_pol,
# keeping the top 10. The sample pol is removed by row index rather than by
# assuming it ranks first: a pol with an identical attendance record could
# tie with it, and an all-zero sample row makes every score NaN.
# order() places NaN scores last, so pols without events are never preferred.
ranked <- order(cosine_distance, decreasing = TRUE)
cosine_order <- head(setdiff(ranked, match(sample_pol, rownames(mat))), 10)
nearest_10_neighbors <- names(cosine_distance[cosine_order])

# The nearest 10 neighbors are:
print(nearest_10_neighbors)

# The 10 events these nearest 10 have gone to the most are:
nearest_10_neighbor_events <- colSums(mat[nearest_10_neighbors, , drop = FALSE])
print(head(sort(nearest_10_neighbor_events, decreasing = TRUE), 10))

# The cosine similarity between our sample_pol and her nearest neighbor is:
print(cosine_distance[cosine_order][1])

# The events that both of these pols have gone to are:
nearest_pol <- names(cosine_distance[cosine_order][1])
events <- colnames(mat)
nearest_pol_events <- events[mat[nearest_pol, ] == 1]
sample_pol_events <- events[mat[sample_pol, ] == 1]
shared_events <- intersect(sample_pol_events, nearest_pol_events)
print(shared_events)
I'm trying to write a wrapper function using your code. Can you please help me remove the events? I tried the following, but it did not work.
#' Recommend events a user has not attended yet
#'
#' Ranks every other user by cosine similarity to `userid`, sums the event
#' attendance of the selected neighbours, and returns those totals for the
#' events that `userid` has not attended.
#'
#' @param df Numeric matrix (or data frame coercible by `as.matrix()`) with
#'   one row per user, one column per event, and user ids as row names.
#' @param userid Single character string: the row name of the target user.
#' @param n_neighbors Optional number of most-similar users to aggregate.
#'   `NULL` (the default) uses every other user.
#' @return Named numeric vector of neighbour attendance counts for the events
#'   where the target user's entry is 0, sorted in decreasing order.
recomender <- function(df, userid, n_neighbors = NULL) {
  mat <- as.matrix(df)
  stopifnot(
    is.character(userid),
    length(userid) == 1L,
    !is.null(rownames(mat)),
    userid %in% rownames(mat)
  )

  # Look the user up by the value of the argument, not the literal "userid".
  sample_user_vec <- mat[userid, ]

  # Cosine similarity of every row with the target row. Rows without any
  # events have a zero norm and score NaN; order() ranks those last.
  numerator <- drop(mat %*% sample_user_vec)
  denominator <- sqrt(sum(sample_user_vec^2) * rowSums(mat * mat))
  cosine_similarity <- numerator / denominator

  # Exclude the target user by row index instead of assuming it ranks first
  # (ties or an all-zero target row would break that assumption).
  ranked <- order(cosine_similarity, decreasing = TRUE)
  neighbor_rows <- setdiff(ranked, match(userid, rownames(mat)))
  if (!is.null(n_neighbors)) {
    neighbor_rows <- head(neighbor_rows, n_neighbors)
  }

  # drop = FALSE keeps a matrix even when only one neighbour is selected.
  neighbor_events <- colSums(mat[neighbor_rows, , drop = FALSE])

  # Keep only the events the target user has not attended.
  sort(neighbor_events[sample_user_vec == 0], decreasing = TRUE)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The events that the sample pol has attended are not removed, I think?