Skip to content

Instantly share code, notes, and snippets.

@boundsj
Created June 24, 2012 20:27
Show Gist options
  • Save boundsj/2984781 to your computer and use it in GitHub Desktop.
Save boundsj/2984781 to your computer and use it in GitHub Desktop.
Cosine Similarity Examples
# -*- coding: utf-8 -*-
from math import sqrt
def vector_length(vector):
    """
    Compute the length (aka magnitude, Euclidean norm) of a vector.

    The vector is given as a mapping from component name to a numeric
    value (for example, a coin collection mapping each kind of coin to
    how many of them there are).  Only the values take part in the
    computation; the keys are ignored.

    The length is the ordinary distance from the origin to the point
    described by the vector, a consequence of the Pythagorean theorem:
    every component is squared, the squares are summed, and the square
    root of that sum is returned.

    An empty vector has length 0.0.

    example:
    >>> vector_length({'pennies': 1, 'nickels': 2, 'dimes': 3, 'quarters': 4})
    5.477225575051661
    """
    # Component names are irrelevant to the norm, so walk the values only.
    return sqrt(sum(count * count for count in vector.values()))
def dot_product(vector_A, vector_B):
    """
    Compute the sum of the products of the matching components of two vectors.

    Both vectors are mappings from component name to a numeric value.
    A component that is present in only one of the two vectors is
    ignored (equivalent to treating the missing value as zero).  If the
    vectors share no components the result is 0.

    example:
    # 2 * 1 + 4 * 2 = 10
    >>> dot_product({'pennies': 2, 'nickels': 4}, {'pennies': 1, 'nickels': 2})
    10
    """
    return sum(
        vector_A[component_type] * vector_B[component_type]
        for component_type in vector_A  # type of thing (i.e. coin type)
        if component_type in vector_B
    )
def cos_similarity(vector_A, vector_B):
    """
    Compute the relative similarity of two vectors:
    http://en.wikipedia.org/wiki/Cosine_similarity

    This is the algebraic form, where the similarity is the dot product
    of the two vectors divided by the product of their magnitudes.

    For vectors whose components are all non-negative (such as counts)
    the result falls between 0.0 (completely dissimilar) and 1.0
    (completely similar).  If components may be negative the result can
    go as low as -1.0.

    If either vector has zero magnitude (it is empty or all of its
    components are zero) the cosine is mathematically undefined; 0.0 is
    returned in that case instead of raising ZeroDivisionError.
    """
    magnitude = vector_length(vector_A) * vector_length(vector_B)
    if magnitude == 0:
        # A zero-length vector has no direction, so there is nothing to
        # compare; report "no similarity" rather than dividing by zero.
        return 0.0
    return float(dot_product(vector_A, vector_B)) / magnitude
###
# TESTS: compare the similarity of the coin collections
# (but these could just as easily be word frequency counts in documents)
##
# Each collection maps a kind of coin to how many of that coin it holds.
you = {'pennies': 1, 'nickels': 2, 'dimes': 3, 'quarters': 4}
me = {'pennies': 0, 'nickels': 3, 'dimes': 1, 'quarters': 1}
her = {'pennies': 2, 'nickels': 1, 'dimes': 0, 'quarters': 3, 'pesos': 1}
his = {'pennies': 1, 'nickels': 2, 'dimes': 3, 'quarters': 4, 'pesos': 20}

# Single-argument print(...) behaves the same under Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print("Similarity of your collection and mine: ")
print(cos_similarity(you, me))
print("Similarity of your collection and her's: ")
print(cos_similarity(you, her))
print("Similarity of my collection and her's: ")
print(cos_similarity(me, her))
print("Similarity of your collection and his's: ")
print(cos_similarity(you, his))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment