Instantly share code, notes, and snippets.

# abelsonlive/cosine_similarity.R

Last active Aug 20, 2018
Cosine Distance Recommendation / Collaborative Filtering Example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 # Cosine similarity recommendation / collaborative filtering example.
#
# Builds a random politician-by-event attendance matrix, ranks every other
# politician by cosine similarity to one sample politician, and reports the
# events her nearest neighbours (and her single nearest neighbour) attended.

# let's make some dummy data: 1 = attended the event, 0 = did not
n_rows <- 1000
n_cols <- 100
mat <- matrix(
  rbinom(n_rows * n_cols, size = 1, prob = 0.1),
  nrow = n_rows,
  ncol = n_cols
)
colnames(mat) <- paste0("event", seq_len(n_cols))
rownames(mat) <- paste0("pol", seq_len(n_rows))

# let's take a look at it before we do some math
head(mat)

# now we're going to write a script that,
# given a vector of events a politician has gone to,
# will return the cosine similarity of all other
# politicians. This could easily be adapted to search across
# ALL politicians, so that we could identify the nearest X
# neighbors of each politician
# I got this code from here:
# http://www2.research.att.com/~volinsky/DataMining/Columbia2011/HW/HW6-Solution.pdf

# let's take one pol for starters
sample_pol <- "pol100"
sample_pol_vec <- mat[sample_pol, ]

# now we're going to compute cosine similarity in three steps
# 1. dot product of every politician's row with the sample vector
#    (drop() turns the n x 1 result into a vector named by politician)
numerator <- drop(mat %*% sample_pol_vec)
# 2. product of the two vector norms
denominator <- sqrt(sum(sample_pol_vec^2) * rowSums(mat^2))
# 3. the ratio. NOTE: despite the historical variable name this is a
#    similarity (higher = more alike), not a distance.
cosine_distance <- numerator / denominator
# A politician with no events has zero norm, which gives 0 / 0 = NaN;
# treat that as "no similarity" so the ranking stays well defined.
cosine_distance[denominator == 0] <- 0

# We then order the other pols by their similarity with our sample_pol,
# keeping the top 10. The sample_pol is excluded by name rather than by
# position, because a tie with an identical attendance record could
# otherwise push her out of first place.
cosine_order <- order(cosine_distance, decreasing = TRUE)
cosine_order <- cosine_order[names(cosine_distance)[cosine_order] != sample_pol]
cosine_order <- head(cosine_order, 10)
nearest_10_neighbors <- names(cosine_distance)[cosine_order]

# The nearest 10 neighbors are:
print(nearest_10_neighbors)

# The 10 events these nearest 10 have gone to the most are:
nearest_10_neighbor_events <- colSums(mat[nearest_10_neighbors, , drop = FALSE])
print(head(sort(nearest_10_neighbor_events, decreasing = TRUE), 10))

# The cosine similarity between our sample_pol and each of her nearest
# neighbors (most similar first) is:
print(cosine_distance[cosine_order])

# The events that both our sample_pol and her single nearest neighbor
# have gone to are:
nearest_pol <- nearest_10_neighbors[1]
events <- colnames(mat)
nearest_pol_events <- events[mat[nearest_pol, ] == 1]
sample_pol_events <- events[mat[sample_pol, ] == 1]
shared_events <- intersect(sample_pol_events, nearest_pol_events)
print(shared_events)

### merveydn commented Aug 20, 2018

The events that the sample pol has attended are not removed, I think?

### merveydn commented Aug 20, 2018

I'm trying to write a wrapper function using your code. Can you please help me remove the events that the user has already attended? I tried the following, but it did not work.

#' Recommend events a user has not attended yet
#'
#' Ranks every other user by cosine similarity to `userid`, sums event
#' attendance over the nearest neighbours, and returns those totals for the
#' events that `userid` has not attended.
#'
#' @param df Numeric (0/1) matrix or data frame with one row per user and one
#'   column per event. Row names must be the user ids.
#' @param userid Row name of the user to build recommendations for.
#' @param n_neighbors Number of most-similar users to aggregate over. `NULL`
#'   (the default) aggregates over every other user.
#' @return Named numeric vector of neighbour attendance counts, restricted to
#'   events `userid` has not attended, sorted in decreasing order.
recomender <- function(df, userid, n_neighbors = NULL) {
  # Coerce once so the arithmetic below is plain matrix algebra even when a
  # data frame is passed in.
  mat <- as.matrix(df)
  userid <- as.character(userid)
  if (!is.numeric(mat)) {
    stop("`df` must contain only numeric columns.", call. = FALSE)
  }
  if (length(userid) != 1L || !(userid %in% rownames(mat))) {
    stop("`userid` must be a single row name of `df`.", call. = FALSE)
  }

  # Look the user up by the *value* of `userid`, not the literal "userid".
  sample_user_vec <- mat[userid, ]

  numerator <- drop(mat %*% sample_user_vec)
  denominator <- sqrt(sum(sample_user_vec^2) * rowSums(mat^2))
  cosine_similarity <- numerator / denominator
  # Users with no events have zero norm (0 / 0 = NaN); score them as 0.
  cosine_similarity[denominator == 0] <- 0

  # Exclude the user by name: ties could otherwise leave them in the ranking.
  other_idx <- which(rownames(mat) != userid)
  ranked <- other_idx[order(cosine_similarity[other_idx], decreasing = TRUE)]
  if (!is.null(n_neighbors)) {
    ranked <- head(ranked, n_neighbors)
  }

  nearest_neighbor_events <- colSums(mat[ranked, , drop = FALSE])
  # Keep only the events the user has not attended yet.
  sort(nearest_neighbor_events[sample_user_vec == 0], decreasing = TRUE)
}