Skip to content

Instantly share code, notes, and snippets.

@daramcq
Created August 25, 2012 10:59
Show Gist options
  • Save daramcq/3463722 to your computer and use it in GitHub Desktop.
Save daramcq/3463722 to your computer and use it in GitHub Desktop.
RecDep information retrieval system
'''
This is a Python translation of RecDep.java (gist: 2883635) with minor improvements (nested dictionaries replacing nested lists).
'''
import nltk
import nltk.data
import pprint
import string
def preprocess(word):
    """Normalise a token by lower-casing it and stripping ASCII punctuation.

    Args:
        word: The raw token string.

    Returns:
        The lower-cased token with every character that appears in
        ``string.punctuation`` removed.  The result is the empty string when
        the token consists solely of punctuation.
    """
    # Filter character by character instead of using the two-argument
    # ``translate`` with ``string.maketrans``: that form only exists for
    # Python 2 byte strings and fails for unicode tokens and on Python 3.
    punctuation = frozenset(string.punctuation)
    return "".join(ch for ch in word.lower() if ch not in punctuation)
def addTerm(term, termList, docName, termPos):
    """Record one occurrence of a term in the nested index.

    The index has the shape ``{term: {docName: [position, ...]}}`` and is
    modified in place.

    Args:
        term: The term to index.
        termList: The nested index dictionary to update.
        docName: Key identifying the document in which the term occurred.
        termPos: Position of the occurrence; appended to the list of
            positions for this term/document pair.

    Returns:
        None.
    """
    # setdefault creates the per-term dict and the per-document position
    # list on first sight, then hands back the existing one afterwards.
    termList.setdefault(term, {}).setdefault(docName, []).append(termPos)
def addDoc(f, termList, docList):
    """Index every token of one document and record the document's length.

    The whole file is read, tokenised with ``nltk.word_tokenize``, and each
    token is normalised with ``preprocess`` before being added to the index
    under the file's name together with its zero-based token position.

    Args:
        f: An open, readable file object; ``f.name`` is used as the document
            key.
        termList: Nested index ``{term: {docName: [positions]}}``, updated in
            place via ``addTerm``.
        docList: Dictionary updated in place with ``f.name`` mapped to the
            number of tokens in the document.

    Returns:
        None.
    """
    data = f.read()
    tokens = nltk.word_tokenize(data)
    for position, token in enumerate(tokens):
        addTerm(preprocess(token), termList, f.name, position)
    # Store the real token count.  The previous code stored the last index
    # (one less than the count) and raised NameError for an empty document.
    docList[f.name] = len(tokens)
def addDocArray(fArr, termList, docList):
    """Index a collection of documents by calling ``addDoc`` on each in order.

    Args:
        fArr: Iterable of open file objects to index.
        termList: Nested term index, updated in place.
        docList: Per-document dictionary, updated in place.

    Returns:
        None.
    """
    # Iterate the files directly; no positional index is needed.
    for doc in fArr:
        addDoc(doc, termList, docList)
# Build the index over the three sample articles and pretty-print it.
termList = {}
docList = {}
pp = pprint.PrettyPrinter(indent=4)
# The with-statement guarantees all three files are closed, even if
# tokenising or indexing raises part-way through.
with open('../RTE_2012_5_1_Rupert_Murdoch.txt') as f1, \
        open('../BBC_2012_5_1_NATO_Afghanistan.txt') as f2, \
        open('../ATIMES_2012_4_28_BRIC_Future.txt') as f3:
    fileArr = [f1, f2, f3]
    addDocArray(fileArr, termList, docList)
pp.pprint(termList)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment