-
-
Save merveydn/d762c24f82e9ff6f05a21c75ed50e6a2 to your computer and use it in GitHub Desktop.
Cosine Distance Recommendation / Collaborative Filtering Example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Let's make some dummy data: a binary attendance matrix with one row per
# politician and one column per event (1 = attended, 0 = did not attend).
n_rows <- 1000
n_cols <- 100
mat <- matrix(
  rbinom(n_rows * n_cols, size = 1, prob = 0.1),
  nrow = n_rows,
  ncol = n_cols,
  dimnames = list(
    paste0("pol", seq_len(n_rows)),
    paste0("event", seq_len(n_cols))
  )
)

# Let's take a look at it before we do some math.
print(head(mat))

# Now we're going to write a script that, given the vector of events a
# politician has gone to, returns the cosine similarity of all other
# politicians. This could easily be adapted to search across ALL politicians,
# so that we could identify the nearest X neighbors of each politician.
# I got this code from here:
# http://www2.research.att.com/~volinsky/DataMining/Columbia2011/HW/HW6-Solution.pdf

# Let's take one pol for starters.
sample_pol <- "pol100"
stopifnot(sample_pol %in% rownames(mat))
sample_pol_vec <- mat[sample_pol, ]

# Now we're going to compute cosine similarity in three steps.

# Dot product of one politician's attendance vector with a reference vector.
#
# pol_x:   numeric vector of attendance flags for one politician.
# ref_vec: vector to compare against; defaults to the global `sample_pol_vec`,
#          looked up at call time.
# Returns the sum of the element-wise products.
mat_fcn <- function(pol_x, ref_vec = sample_pol_vec) {
  sum(pol_x * ref_vec)
}

# Step 1: dot product of every row with the sample politician's row.
numerator <- apply(mat, 1, mat_fcn)
# Step 2: product of the two Euclidean norms.
denominator <- sqrt(sum(sample_pol_vec^2) * rowSums(mat^2))
# Step 3: the ratio. Despite the variable name this is a cosine *similarity*
# (larger means more alike). A row with no events has a zero norm and yields
# NaN here; order() below sorts those entries last.
cosine_distance <- numerator / denominator

# We then order the other pols by their similarity with our sample_pol,
# keeping the top 10. The sample_pol is removed by name rather than by
# assuming it ranks first: a politician with identical attendance ties at
# similarity 1 and may sort ahead of it.
ranking <- order(cosine_distance, decreasing = TRUE)
ranking <- ranking[names(cosine_distance)[ranking] != sample_pol]
cosine_order <- head(ranking, 10)
nearest_10_neighbors <- names(cosine_distance)[cosine_order]

# The nearest 10 neighbors are:
print(nearest_10_neighbors)

# The 10 events these nearest 10 have gone to the most are:
# (drop = FALSE keeps a matrix even if only one neighbor is available)
nearest_10_neighbor_events <- colSums(
  mat[nearest_10_neighbors, , drop = FALSE]
)
print(head(sort(nearest_10_neighbor_events, decreasing = TRUE), 10))

# The cosine similarity between our sample_pol and her nearest neighbor is:
print(cosine_distance[cosine_order][1])

# The events that both these two pols have gone to are:
nearest_pol <- nearest_10_neighbors[1]
events <- colnames(mat)
nearest_pol_events <- events[mat[nearest_pol, ] == 1]
sample_pol_events <- events[mat[sample_pol, ] == 1]
shared_events <- intersect(sample_pol_events, nearest_pol_events)
print(shared_events)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment