# merveydn/cosine_similarity.R

## cosine_similarity.R
# Let's make some dummy data: a binary attendance matrix with one row per
# politician and one column per event. Every cell is an independent
# Bernoulli(0.1) draw (1 = attended, 0 = did not attend). No seed is set, so
# the matrix differs on every run.
n_rows <- 1000
n_cols <- 100
mat <- matrix(
  rbinom(n_rows * n_cols, size = 1, prob = 0.1),
  nrow = n_rows,
  ncol = n_cols,
  dimnames = list(
    paste0("pol", seq_len(n_rows)),
    paste0("event", seq_len(n_cols))
  )
)

# Let's take a look at it before we do some math.
head(mat)

# Now we're going to write a script that, given the vector of events a
# politician has gone to, returns the cosine similarity between that
# politician and every other politician. This could easily be adapted to
# search across ALL politicians, so that we could identify the nearest X
# neighbors of each politician.

# I got this code from here:
# http://www2.research.att.com/~volinsky/DataMining/Columbia2011/HW/HW6-Solution.pdf

# Let's take one pol for starters.
sample_pol <- "pol100"
# Look the row up by its row name; the result is a named 0/1 vector with one
# entry per event.
sample_pol_vec <- mat[sample_pol, ]


# Now we're going to compute cosine similarity in three steps.

# mat_fcn: dot product of one politician's attendance vector with a
# reference vector (the numerator of the cosine similarity).
#
# Args:
#   pol_x:   numeric vector of event attendance for one politician.
#   ref_vec: numeric vector of the same length to compare against. Defaults
#            to the script-level `sample_pol_vec`, which is looked up when
#            the function is called, so `apply(mat, 1, mat_fcn)` still works.
#
# Returns:
#   A single number, sum(pol_x * ref_vec).
mat_fcn <- function(pol_x, ref_vec = sample_pol_vec) {
  # R would otherwise silently recycle the shorter vector and return a
  # meaningless dot product.
  if (length(pol_x) != length(ref_vec)) {
    stop("pol_x and ref_vec must have the same length", call. = FALSE)
  }
  sum(pol_x * ref_vec)
}

# Step 1: dot product of every politician's row with the sample vector.
numerator <- apply(X = mat, MARGIN = 1, FUN = mat_fcn)
# Step 2: product of the two Euclidean norms, taken as one square root of
# the product of the squared norms (per-row sum of squares times the sample
# vector's sum of squares).
denominator <- sqrt(rowSums(mat^2) * sum(sample_pol_vec * sample_pol_vec))
# Step 3: the ratio. Despite the variable name this is the cosine
# *similarity* (1 = same direction). A politician with no events has a zero
# row sum, so the ratio is 0/0 = NaN.
cosine_distance <- numerator / denominator

# We then order the other pols by their similarity with our sample_pol,
# keeping the top 10 (not including our sample_pol!).
#
# sample_pol is removed by name instead of assuming it ranks first: any pol
# with an identical attendance vector also scores exactly 1 and can be
# ranked ahead of it. Pols whose score is NaN (no events attended) are
# dropped by na.last = NA.
ranked <- order(cosine_distance, decreasing = TRUE, na.last = NA)
ranked <- ranked[names(cosine_distance)[ranked] != sample_pol]
# Integer positions into cosine_distance of the (up to) 10 best matches.
cosine_order <- head(ranked, 10)
if (length(cosine_order) == 0) {
  stop("no politician has a defined similarity to ", sample_pol, call. = FALSE)
}
nearest_10_neighbors <- names(cosine_distance)[cosine_order]

# The nearest 10 neighbors are:
print(nearest_10_neighbors)

# The 10 events these nearest 10 have gone to the most are:
# (drop = FALSE keeps a matrix even when only one neighbor was found)
nearest_10_neighbor_events <- colSums(
  mat[nearest_10_neighbors, , drop = FALSE]
)
print(head(sort(nearest_10_neighbor_events, decreasing = TRUE), 10))

# The cosine similarity between our sample_pol and her nearest neighbor is:
print(cosine_distance[cosine_order[1]])

# The events that both of these pols have gone to are:
nearest_pol <- nearest_10_neighbors[1]
events <- colnames(mat)
nearest_pol_events <- events[mat[nearest_pol, ] == 1]
sample_pol_events <- events[mat[sample_pol, ] == 1]
shared_events <- intersect(sample_pol_events, nearest_pol_events)
print(shared_events)
	# Let's make some dummy data: a binary attendance matrix with one row per
	# politician and one column per event. Every cell is an independent
	# Bernoulli(0.1) draw (1 = attended, 0 = did not attend). No seed is set,
	# so the matrix differs on every run.
	n_rows <- 1000
	n_cols <- 100
	mat <- matrix(
	  rbinom(n_rows * n_cols, size = 1, prob = 0.1),
	  nrow = n_rows,
	  ncol = n_cols,
	  dimnames = list(
	    paste0("pol", seq_len(n_rows)),
	    paste0("event", seq_len(n_cols))
	  )
	)

	# Let's take a look at it before we do some math.
	head(mat)

	# Now we're going to write a script that, given the vector of events a
	# politician has gone to, returns the cosine similarity between that
	# politician and every other politician. This could easily be adapted to
	# search across ALL politicians, so that we could identify the nearest X
	# neighbors of each politician.

	# I got this code from here:
	# http://www2.research.att.com/~volinsky/DataMining/Columbia2011/HW/HW6-Solution.pdf

	# Let's take one pol for starters.
	sample_pol <- "pol100"
	# Look the row up by its row name; the result is a named 0/1 vector with
	# one entry per event.
	sample_pol_vec <- mat[sample_pol, ]


	# Now we're going to compute cosine similarity in three steps.

	# mat_fcn: dot product of one politician's attendance vector with a
	# reference vector (the numerator of the cosine similarity).
	#
	# Args:
	#   pol_x:   numeric vector of event attendance for one politician.
	#   ref_vec: numeric vector of the same length to compare against.
	#            Defaults to the script-level `sample_pol_vec`, which is
	#            looked up when the function is called, so
	#            `apply(mat, 1, mat_fcn)` still works.
	#
	# Returns:
	#   A single number, sum(pol_x * ref_vec).
	mat_fcn <- function(pol_x, ref_vec = sample_pol_vec) {
	  # R would otherwise silently recycle the shorter vector and return a
	  # meaningless dot product.
	  if (length(pol_x) != length(ref_vec)) {
	    stop("pol_x and ref_vec must have the same length", call. = FALSE)
	  }
	  sum(pol_x * ref_vec)
	}

	# Step 1: dot product of every politician's row with the sample vector.
	numerator <- apply(X = mat, MARGIN = 1, FUN = mat_fcn)
	# Step 2: product of the two Euclidean norms, taken as one square root
	# of the product of the squared norms (per-row sum of squares times the
	# sample vector's sum of squares).
	denominator <- sqrt(rowSums(mat^2) * sum(sample_pol_vec * sample_pol_vec))
	# Step 3: the ratio. Despite the variable name this is the cosine
	# *similarity* (1 = same direction). A politician with no events has a
	# zero row sum, so the ratio is 0/0 = NaN.
	cosine_distance <- numerator / denominator

	# We then order the other pols by their similarity with our sample_pol,
	# keeping the top 10 (not including our sample_pol!).
	#
	# sample_pol is removed by name instead of assuming it ranks first: any
	# pol with an identical attendance vector also scores exactly 1 and can
	# be ranked ahead of it. Pols whose score is NaN (no events attended)
	# are dropped by na.last = NA.
	ranked <- order(cosine_distance, decreasing = TRUE, na.last = NA)
	ranked <- ranked[names(cosine_distance)[ranked] != sample_pol]
	# Integer positions into cosine_distance of the (up to) 10 best matches.
	cosine_order <- head(ranked, 10)
	if (length(cosine_order) == 0) {
	  stop(
	    "no politician has a defined similarity to ", sample_pol,
	    call. = FALSE
	  )
	}
	nearest_10_neighbors <- names(cosine_distance)[cosine_order]

	# The nearest 10 neighbors are:
	print(nearest_10_neighbors)

	# The 10 events these nearest 10 have gone to the most are:
	# (drop = FALSE keeps a matrix even when only one neighbor was found)
	nearest_10_neighbor_events <- colSums(
	  mat[nearest_10_neighbors, , drop = FALSE]
	)
	print(head(sort(nearest_10_neighbor_events, decreasing = TRUE), 10))

	# The cosine similarity between our sample_pol and her nearest neighbor is:
	print(cosine_distance[cosine_order[1]])

	# The events that both of these pols have gone to are:
	nearest_pol <- nearest_10_neighbors[1]
	events <- colnames(mat)
	nearest_pol_events <- events[mat[nearest_pol, ] == 1]
	sample_pol_events <- events[mat[sample_pol, ] == 1]
	shared_events <- intersect(sample_pol_events, nearest_pol_events)
	print(shared_events)