keweishang/gist:44eb1a9656fa9e374e61e4957c2bde8d

## gistfile1.txt
#!/usr/bin/python
# docdist8.py - treat whole file as a single "line"
#
# Original version by Erik D. Demaine on January 31, 2011,
# based on code by Ronald L. Rivest (see docdist[1-7].py).
#
# Usage:
#    docdist8.py filename1 filename2
#
# This program computes the "distance" between two text files
# as the angle between their word frequency vectors (in radians).
#
# For each input file, a word-frequency vector is computed as follows:
#    (1) the specified file is read in
#    (2) it is converted into a list of alphanumeric "words"
#        Here a "word" is a sequence of consecutive alphanumeric
#        characters.  Non-alphanumeric characters are treated as blanks.
#        Case is not significant.
#    (3) for each word, its frequency of occurrence is determined
#    (4) the word/frequency lists are sorted into order alphabetically
#
# The "distance" between two vectors is the angle between them.
# If x = (x1, x2, ..., xn) is the first vector (xi = freq of word i)
# and y = (y1, y2, ..., yn) is the second vector,
# then the angle between them is defined as:
#    d(x,y) = arccos(inner_product(x,y) / (norm(x)*norm(y)))
# where:
#    inner_product(x,y) = x1*y1 + x2*y2 + ... xn*yn
#    norm(x) = sqrt(inner_product(x,x))

import math
    # math.acos(x) is the arccosine of x.
    # math.sqrt(x) is the square root of x.

import string

import sys

##################################
# Operation 1: read a text file ##
##################################
def read_file(filename):
    """
    Read the text file with the given filename;
    return a list of the lines of text in the file.
    """
    try:
        f = open(filename, 'r')
        return f.read()
    except IOError:
        print "Error opening or reading input file: ",filename
        sys.exit()

#################################################
# Operation 2: split the text lines into words ##
#################################################

# global variables needed for fast parsing
# translation table maps upper case to lower case and punctuation to spaces
translation_table = string.maketrans(string.punctuation+string.uppercase,
                                     " "*len(string.punctuation)+string.lowercase)

def get_words_from_line_list(text):
    """
    Parse the given text into words.
    Return list of all words found.
    """
    text = text.translate(translation_table)
    word_list = text.split()
    return word_list

##############################################
# Operation 3: count frequency of each word ##
##############################################
def count_frequency(word_list):
    """
    Return a dictionary mapping words to frequency.
    """
    D = {}
    for new_word in word_list:
        if new_word in D:
            D[new_word] = D[new_word]+1
        else:
            D[new_word] = 1
    return D

#############################################
## compute word frequencies for input file ##
#############################################
def word_frequencies_for_file(filename):
    """
    Return dictionary of (word,frequency) pairs for the given file.
    """

    line_list = read_file(filename)
    word_list = get_words_from_line_list(line_list)
    freq_mapping = count_frequency(word_list)

    print "File",filename,":",
    print len(line_list),"lines,",
    print len(word_list),"words,",
    print len(freq_mapping),"distinct words"

    return freq_mapping

def inner_product(D1,D2):
    """
    Inner product between two vectors, where vectors
    are represented as dictionaries of (word,freq) pairs.

    Example: inner_product({"and":3,"of":2,"the":5},
                           {"and":4,"in":1,"of":1,"this":2}) = 14.0
    """
    sum = 0.0
    for key in D1:
        if key in D2:
            sum += D1[key] * D2[key]
    return sum

def vector_angle(D1,D2):
    """
    The input is a list of (word,freq) pairs, sorted alphabetically.

    Return the angle between these two vectors.
    """
    numerator = inner_product(D1,D2)
    denominator = math.sqrt(inner_product(D1,D1)*inner_product(D2,D2))
    return math.acos(numerator/denominator)

def main():
    if len(sys.argv) != 3:
        print "Usage: docdist8.py filename_1 filename_2"
    else:
        filename_1 = sys.argv[1]
        filename_2 = sys.argv[2]
        sorted_word_list_1 = word_frequencies_for_file(filename_1)
        sorted_word_list_2 = word_frequencies_for_file(filename_2)
        distance = vector_angle(sorted_word_list_1,sorted_word_list_2)
        print "The distance between the documents is: %0.6f (radians)"%distance

if __name__ == "__main__":
    import profile
    profile.run("main()")
	#!/usr/bin/python
	# docdist8.py - treat whole file as a single "line"
	#
	# Original version by Erik D. Demaine on January 31, 2011,
	# based on code by Ronald L. Rivest (see docdist[1-7].py).
	#
	# Usage:
	# docdist8.py filename1 filename2
	#
	# This program computes the "distance" between two text files
	# as the angle between their word frequency vectors (in radians).
	#
	# For each input file, a word-frequency vector is computed as follows:
	# (1) the specified file is read in
	# (2) it is converted into a list of alphanumeric "words"
	# Here a "word" is a sequence of consecutive alphanumeric
	# characters. Non-alphanumeric characters are treated as blanks.
	# Case is not significant.
	# (3) for each word, its frequency of occurrence is determined
	# (4) the word/frequency lists are sorted into order alphabetically
	#
	# The "distance" between two vectors is the angle between them.
	# If x = (x1, x2, ..., xn) is the first vector (xi = freq of word i)
	# and y = (y1, y2, ..., yn) is the second vector,
	# then the angle between them is defined as:
	# d(x,y) = arccos(inner_product(x,y) / (norm(x)*norm(y)))
	# where:
	# inner_product(x,y) = x1y1 + x2y2 + ... xn*yn
	# norm(x) = sqrt(inner_product(x,x))

	import math
	# math.acos(x) is the arccosine of x.
	# math.sqrt(x) is the square root of x.

	import string

	import sys

	##################################
	# Operation 1: read a text file ##
	##################################
	def read_file(filename):
	"""
	Read the text file with the given filename;
	return a list of the lines of text in the file.
	"""
	try:
	f = open(filename, 'r')
	return f.read()
	except IOError:
	print "Error opening or reading input file: ",filename
	sys.exit()

	#################################################
	# Operation 2: split the text lines into words ##
	#################################################

	# global variables needed for fast parsing
	# translation table maps upper case to lower case and punctuation to spaces
	translation_table = string.maketrans(string.punctuation+string.uppercase,
	" "*len(string.punctuation)+string.lowercase)

	def get_words_from_line_list(text):
	"""
	Parse the given text into words.
	Return list of all words found.
	"""
	text = text.translate(translation_table)
	word_list = text.split()
	return word_list

	##############################################
	# Operation 3: count frequency of each word ##
	##############################################
	def count_frequency(word_list):
	"""
	Return a dictionary mapping words to frequency.
	"""
	D = {}
	for new_word in word_list:
	if new_word in D:
	D[new_word] = D[new_word]+1
	else:
	D[new_word] = 1
	return D

	#############################################
	## compute word frequencies for input file ##
	#############################################
	def word_frequencies_for_file(filename):
	"""
	Return dictionary of (word,frequency) pairs for the given file.
	"""

	line_list = read_file(filename)
	word_list = get_words_from_line_list(line_list)
	freq_mapping = count_frequency(word_list)

	print "File",filename,":",
	print len(line_list),"lines,",
	print len(word_list),"words,",
	print len(freq_mapping),"distinct words"

	return freq_mapping

	def inner_product(D1,D2):
	"""
	Inner product between two vectors, where vectors
	are represented as dictionaries of (word,freq) pairs.

	Example: inner_product({"and":3,"of":2,"the":5},
	{"and":4,"in":1,"of":1,"this":2}) = 14.0
	"""
	sum = 0.0
	for key in D1:
	if key in D2:
	sum += D1[key] * D2[key]
	return sum

	def vector_angle(D1,D2):
	"""
	The input is a list of (word,freq) pairs, sorted alphabetically.

	Return the angle between these two vectors.
	"""
	numerator = inner_product(D1,D2)
	denominator = math.sqrt(inner_product(D1,D1)*inner_product(D2,D2))
	return math.acos(numerator/denominator)

	def main():
	if len(sys.argv) != 3:
	print "Usage: docdist8.py filename_1 filename_2"
	else:
	filename_1 = sys.argv[1]
	filename_2 = sys.argv[2]
	sorted_word_list_1 = word_frequencies_for_file(filename_1)
	sorted_word_list_2 = word_frequencies_for_file(filename_2)
	distance = vector_angle(sorted_word_list_1,sorted_word_list_2)
	print "The distance between the documents is: %0.6f (radians)"%distance

	if __name__ == "__main__":
	import profile
	profile.run("main()")