Created
June 1, 2016 06:55
-
-
Save keweishang/44eb1a9656fa9e374e61e4957c2bde8d to your computer and use it in GitHub Desktop.
mit lec02_code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# docdist8.py - treat whole file as a single "line" | |
# | |
# Original version by Erik D. Demaine on January 31, 2011, | |
# based on code by Ronald L. Rivest (see docdist[1-7].py). | |
# | |
# Usage: | |
# docdist8.py filename1 filename2 | |
# | |
# This program computes the "distance" between two text files | |
# as the angle between their word frequency vectors (in radians). | |
# | |
# For each input file, a word-frequency vector is computed as follows: | |
# (1) the specified file is read in | |
# (2) it is converted into a list of alphanumeric "words" | |
# Here a "word" is a sequence of consecutive alphanumeric | |
# characters. Non-alphanumeric characters are treated as blanks. | |
# Case is not significant. | |
# (3) for each word, its frequency of occurrence is determined | |
# (4) the word/frequency lists are sorted into order alphabetically | |
# | |
# The "distance" between two vectors is the angle between them. | |
# If x = (x1, x2, ..., xn) is the first vector (xi = freq of word i) | |
# and y = (y1, y2, ..., yn) is the second vector, | |
# then the angle between them is defined as: | |
# d(x,y) = arccos(inner_product(x,y) / (norm(x)*norm(y))) | |
# where: | |
# inner_product(x,y) = x1*y1 + x2*y2 + ... xn*yn | |
# norm(x) = sqrt(inner_product(x,x)) | |
import math | |
# math.acos(x) is the arccosine of x. | |
# math.sqrt(x) is the square root of x. | |
import string | |
import sys | |
################################## | |
# Operation 1: read a text file ## | |
################################## | |
def read_file(filename): | |
""" | |
Read the text file with the given filename; | |
return a list of the lines of text in the file. | |
""" | |
try: | |
f = open(filename, 'r') | |
return f.read() | |
except IOError: | |
print "Error opening or reading input file: ",filename | |
sys.exit() | |
################################################# | |
# Operation 2: split the text lines into words ## | |
################################################# | |
# global variables needed for fast parsing | |
# translation table maps upper case to lower case and punctuation to spaces | |
translation_table = string.maketrans(string.punctuation+string.uppercase, | |
" "*len(string.punctuation)+string.lowercase) | |
def get_words_from_line_list(text): | |
""" | |
Parse the given text into words. | |
Return list of all words found. | |
""" | |
text = text.translate(translation_table) | |
word_list = text.split() | |
return word_list | |
############################################## | |
# Operation 3: count frequency of each word ## | |
############################################## | |
def count_frequency(word_list): | |
""" | |
Return a dictionary mapping words to frequency. | |
""" | |
D = {} | |
for new_word in word_list: | |
if new_word in D: | |
D[new_word] = D[new_word]+1 | |
else: | |
D[new_word] = 1 | |
return D | |
############################################# | |
## compute word frequencies for input file ## | |
############################################# | |
def word_frequencies_for_file(filename): | |
""" | |
Return dictionary of (word,frequency) pairs for the given file. | |
""" | |
line_list = read_file(filename) | |
word_list = get_words_from_line_list(line_list) | |
freq_mapping = count_frequency(word_list) | |
print "File",filename,":", | |
print len(line_list),"lines,", | |
print len(word_list),"words,", | |
print len(freq_mapping),"distinct words" | |
return freq_mapping | |
def inner_product(D1,D2): | |
""" | |
Inner product between two vectors, where vectors | |
are represented as dictionaries of (word,freq) pairs. | |
Example: inner_product({"and":3,"of":2,"the":5}, | |
{"and":4,"in":1,"of":1,"this":2}) = 14.0 | |
""" | |
sum = 0.0 | |
for key in D1: | |
if key in D2: | |
sum += D1[key] * D2[key] | |
return sum | |
def vector_angle(D1,D2): | |
""" | |
The input is a list of (word,freq) pairs, sorted alphabetically. | |
Return the angle between these two vectors. | |
""" | |
numerator = inner_product(D1,D2) | |
denominator = math.sqrt(inner_product(D1,D1)*inner_product(D2,D2)) | |
return math.acos(numerator/denominator) | |
def main(): | |
if len(sys.argv) != 3: | |
print "Usage: docdist8.py filename_1 filename_2" | |
else: | |
filename_1 = sys.argv[1] | |
filename_2 = sys.argv[2] | |
sorted_word_list_1 = word_frequencies_for_file(filename_1) | |
sorted_word_list_2 = word_frequencies_for_file(filename_2) | |
distance = vector_angle(sorted_word_list_1,sorted_word_list_2) | |
print "The distance between the documents is: %0.6f (radians)"%distance | |
if __name__ == "__main__": | |
import profile | |
profile.run("main()") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment