Created
August 25, 2012 10:59
-
-
Save daramcq/3463722 to your computer and use it in GitHub Desktop.
RecDep information retrieval system
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
This is a Python translation of RecDep.java (gist: 2883635) with minor improvements (nested dictionaries replacing nested lists).
'''
import nltk | |
import nltk.data | |
import pprint | |
import string | |
def preprocess(word):
    """Normalise a raw token before it is added to the index.

    The token is lower-cased and every character listed in
    ``string.punctuation`` is removed.

    :param word: token text.
    :return: the lower-cased token without ASCII punctuation; this is the
        empty string when the token consisted of punctuation only.
    """
    # Filter character by character instead of using string.maketrans with
    # the two-argument translate(): that pair only exists for Python 2 byte
    # strings and fails on unicode text and on Python 3.
    punctuation = string.punctuation
    return "".join(ch for ch in word.lower() if ch not in punctuation)
def addTerm(term, termList, docName, termPos):
    """Record one occurrence of ``term`` in the positional index.

    ``termList`` is updated in place and has the shape
    ``{term: {docName: [termPos, ...]}}``.

    :param term: the term to index.
    :param termList: nested dictionary holding the index.
    :param docName: key identifying the document the term occurred in.
    :param termPos: position of the occurrence inside that document.
    """
    # Create the per-term and per-document containers on first sight, then
    # append the position to the document's posting list.
    postings = termList.setdefault(term, {})
    postings.setdefault(docName, []).append(termPos)
def addDoc(f, termList, docList):
    """Index every token of one open document.

    The whole file is read, tokenised with ``nltk.word_tokenize``, and each
    token is normalised with ``preprocess`` and recorded through ``addTerm``
    under the key ``f.name`` with its token position (starting at 0).

    :param f: open, readable file object; ``f.name`` is used as document key.
    :param termList: positional index, updated in place.
    :param docList: mapping of document name to its number of tokens,
        updated in place.
    """
    tokens = nltk.word_tokenize(f.read())
    docName = f.name
    # Tokens made only of punctuation normalise to "" and are still indexed
    # under that key, so recorded positions match the tokeniser's numbering.
    for position, token in enumerate(tokens):
        addTerm(preprocess(token), termList, docName, position)
    # Store the real token count. Taking it from len(tokens) rather than the
    # last loop index avoids being one short and also gives an empty
    # document a valid entry (0) instead of failing on an unbound variable.
    docList[docName] = len(tokens)
def addDocArray(fArr, termList, docList):
    """Index a collection of open documents.

    Each element of ``fArr`` is passed to ``addDoc`` in iteration order.

    :param fArr: iterable of open file objects.
    :param termList: positional index, updated in place.
    :param docList: per-document token counts, updated in place.
    """
    # Iterate directly so any iterable is accepted, not only sequences that
    # support len() and integer indexing.
    for doc in fArr:
        addDoc(doc, termList, docList)
# Build the index for the sample documents and print it.
docPaths = [
    '../RTE_2012_5_1_Rupert_Murdoch.txt',
    '../BBC_2012_5_1_NATO_Afghanistan.txt',
    '../ATIMES_2012_4_28_BRIC_Future.txt',
]
fileArr = []
termList = {}
docList = {}
pp = pprint.PrettyPrinter(indent=4)
try:
    # Open inside the try block so that files already opened are still
    # closed if a later open() or the indexing itself raises.
    for path in docPaths:
        fileArr.append(open(path))
    addDocArray(fileArr, termList, docList)
finally:
    for handle in fileArr:
        handle.close()
pp.pprint(termList)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment