# boundsj/cosine_sim_examples.py

## cosine_sim_examples.py
# -*- coding: utf-8 -*-

from math import sqrt

def vector_length(vector):
    """
    Compute the length (aka magnitude, Euclidean norm) of a vector.

    The vector is a mapping from component name to numeric value, for
    example a coin collection mapping each category of coin to its count.
    The length is the square root of the sum of the squares of the
    components: the ordinary distance from the origin to the point, a
    consequence of the Pythagorean theorem.

    Component names play no part in the length, so only the values are
    read. An empty vector has length 0.0.

    example:

        >>> vector_length({'pennies': 3, 'nickels': 4})
        5.0
    """
    return sqrt(sum(count * count for count in vector.values()))

def dot_product(vector_A, vector_B):
    """
    Compute the dot product of two sparse vectors.

    Each vector is a mapping from component name to numeric value. The
    result is the sum, over the component names present in BOTH vectors,
    of the product of the two values. A component missing from either
    vector contributes nothing to the total.

    Returns 0 when the vectors have no component names in common.

    example:
        # 2 * 1 + 4 * 2 = 10
        >>> dot_product({'pennies': 2, 'nickels': 4}, {'pennies': 1, 'nickels': 2})
        10
    """
    return sum(
        vector_A[component_type] * vector_B[component_type]
        for component_type in vector_A  # type of thing (i.e. coin type)
        if component_type in vector_B
    )

def cos_similarity(vector_A, vector_B):
    """
    Compute the relative similarity of two vectors:
    http://en.wikipedia.org/wiki/Cosine_similarity

    This is the algebraic form: the dot product of the two vectors divided
    by the product of their magnitudes. For vectors whose components are
    all non-negative (such as counts) the result falls between 0.0
    (completely dissimilar) and 1.0 (completely similar); negative
    components can take it as low as -1.0.

    Cosine similarity is undefined when either vector has zero magnitude
    (it is empty or all of its components are zero). In that case 0.0 is
    returned instead of raising ZeroDivisionError.
    """
    magnitudes = vector_length(vector_A) * vector_length(vector_B)
    if magnitudes == 0:
        # A zero-length vector has no direction, so there is nothing to compare.
        return 0.0
    # float() keeps the division a true division under Python 2 as well.
    return float(dot_product(vector_A, vector_B)) / magnitudes

###
# TESTS: compare the similarity of the coin collections
# (but these could just as easily be word frequency counts in documents)
##

# Each collection is a sparse vector: coin type -> number of coins held.
you = {'pennies': 1, 'nickels': 2, 'dimes': 3, 'quarters': 4 }
me = {'pennies': 0, 'nickels': 3, 'dimes': 1, 'quarters': 1 }
her = {'pennies': 2, 'nickels': 1, 'dimes': 0, 'quarters': 3, 'pesos': 1 }
his = {'pennies': 1, 'nickels': 2, 'dimes': 3, 'quarters': 4, 'pesos': 20 }

# A parenthesised single argument prints the same under the Python 2 print
# statement and the Python 3 print function, so the script runs on both.
print("Similarity of your collection and mine: ")
print(cos_similarity(you, me))
print("Similarity of your collection and her's: ")
print(cos_similarity(you, her))
print("Similarity of my collection and her's: ")
print(cos_similarity(me, her))
print("Similarity of your collection and his's: ")
print(cos_similarity(you, his))
	# -- coding: utf-8 --

	from math import sqrt

	def vector_length(vector):
		"""
		Compute the length (aka magnitude, Euclidean norm) of a vector.

		The vector is a mapping from component name to numeric value, for
		example a coin collection mapping each category of coin to its count.
		The length is the square root of the sum of the squares of the
		components: the ordinary distance from the origin to the point, a
		consequence of the Pythagorean theorem.

		Component names play no part in the length, so only the values are
		read. An empty vector has length 0.0.

		example:

			>>> vector_length({'pennies': 3, 'nickels': 4})
			5.0
		"""
		return sqrt(sum(count * count for count in vector.values()))

	def dot_product(vector_A, vector_B):
		"""
		Compute the dot product of two sparse vectors.

		Each vector is a mapping from component name to numeric value. The
		result is the sum, over the component names present in BOTH vectors,
		of the product of the two values. A component missing from either
		vector contributes nothing to the total.

		Returns 0 when the vectors have no component names in common.

		example:
			# 2 * 1 + 4 * 2 = 10
			>>> dot_product({'pennies': 2, 'nickels': 4}, {'pennies': 1, 'nickels': 2})
			10
		"""
		return sum(
			vector_A[component_type] * vector_B[component_type]
			for component_type in vector_A  # type of thing (i.e. coin type)
			if component_type in vector_B
		)

	def cos_similarity(vector_A, vector_B):
		"""
		Compute the relative similarity of two vectors:
		http://en.wikipedia.org/wiki/Cosine_similarity

		This is the algebraic form: the dot product of the two vectors divided
		by the product of their magnitudes. For vectors whose components are
		all non-negative (such as counts) the result falls between 0.0
		(completely dissimilar) and 1.0 (completely similar); negative
		components can take it as low as -1.0.

		Cosine similarity is undefined when either vector has zero magnitude
		(it is empty or all of its components are zero). In that case 0.0 is
		returned instead of raising ZeroDivisionError.
		"""
		magnitudes = vector_length(vector_A) * vector_length(vector_B)
		if magnitudes == 0:
			# A zero-length vector has no direction, so there is nothing to compare.
			return 0.0
		# float() keeps the division a true division under Python 2 as well.
		return float(dot_product(vector_A, vector_B)) / magnitudes

	###
	# TESTS: compare the similarity of the coin collections
	# (but these could just as easily be word frequency counts in documents)
	##

	# Each collection is a sparse vector: coin type -> number of coins held.
	you = {'pennies': 1, 'nickels': 2, 'dimes': 3, 'quarters': 4 }
	me = {'pennies': 0, 'nickels': 3, 'dimes': 1, 'quarters': 1 }
	her = {'pennies': 2, 'nickels': 1, 'dimes': 0, 'quarters': 3, 'pesos': 1 }
	his = {'pennies': 1, 'nickels': 2, 'dimes': 3, 'quarters': 4, 'pesos': 20 }

	# A parenthesised single argument prints the same under the Python 2 print
	# statement and the Python 3 print function, so the script runs on both.
	print("Similarity of your collection and mine: ")
	print(cos_similarity(you, me))
	print("Similarity of your collection and her's: ")
	print(cos_similarity(you, her))
	print("Similarity of my collection and her's: ")
	print(cos_similarity(me, her))
	print("Similarity of your collection and his's: ")
	print(cos_similarity(you, his))