Created
June 24, 2012 20:27
-
-
Save boundsj/2984781 to your computer and use it in GitHub Desktop.
Cosine Similarity Examples
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from math import sqrt | |
def vector_length(vector):
    """
    Compute the length (aka magnitude, Euclidean norm) of a vector.

    This gives the ordinary distance from the origin to the point
    described by the vector, a consequence of the Pythagorean theorem.

    The vector is a mapping of component name -> numeric count (for
    example a coin collection keyed by coin type). The length is the
    square root of the sum of the squares of the counts; the component
    names themselves play no part in the result.

    An empty vector has length 0.0.

    example:

    >>> vector_length({'pennies': 1, 'nickels': 2, 'dimes': 3, 'quarters': 4})
    5.477225575051661
    """
    # Only the counts matter here, so iterate the values rather than items.
    return sqrt(sum(count * count for count in vector.values()))
def dot_product(vector_A, vector_B):
    """
    Compute the sum of the products of the matching components of two vectors.

    Both vectors are mappings of component name -> numeric count. Only
    components present in BOTH vectors contribute; a component missing
    from either side is ignored (equivalent to treating it as zero).
    Returns 0 when the vectors share no components.

    example:

    # 2 * 1 + 4 * 2 = 10
    >>> dot_product({'pennies': 2, 'nickels': 4}, {'pennies': 1, 'nickels': 2})
    10
    """
    return sum(
        vector_A[component_type] * vector_B[component_type]
        for component_type in vector_A  # type of thing (i.e. coin type)
        if component_type in vector_B
    )
def cos_similarity(vector_A, vector_B):
    """
    Compute the cosine similarity of two vectors:
    http://en.wikipedia.org/wiki/Cosine_similarity

    This is the algebraic form: the dot product of the vectors divided
    by the product of their magnitudes. For vectors whose components are
    all non-negative (such as counts) the result falls between 0.0
    (nothing in common) and 1.0 (same direction); if negative components
    are present it can be as low as -1.0.

    If either vector has zero magnitude (it is empty or all of its
    components are zero) the similarity is undefined; 0.0 is returned
    rather than raising ZeroDivisionError.
    """
    magnitudes = vector_length(vector_A) * vector_length(vector_B)
    if magnitudes == 0:
        # A zero-length vector has no direction to compare against.
        return 0.0
    return float(dot_product(vector_A, vector_B)) / magnitudes
###
# TESTS: compare the similarity of the coin collections
# (but these could just as easily be word frequency counts in documents)
##
you = {'pennies': 1, 'nickels': 2, 'dimes': 3, 'quarters': 4 }
me = {'pennies': 0, 'nickels': 3, 'dimes': 1, 'quarters': 1 }
her = {'pennies': 2, 'nickels': 1, 'dimes': 0, 'quarters': 3, 'pesos': 1 }
his = {'pennies': 1, 'nickels': 2, 'dimes': 3, 'quarters': 4, 'pesos': 20 }

# Single-argument print(...) calls behave the same on Python 2 and Python 3,
# whereas the bare `print x` statement form is a SyntaxError on Python 3.
print("Similarity of your collection and mine: ")
print(cos_similarity(you, me))
print("Similarity of your collection and her's: ")
print(cos_similarity(you, her))
print("Similarity of my collection and her's: ")
print(cos_similarity(me, her))
print("Similarity of your collection and his's: ")
print(cos_similarity(you, his))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment