# daramcq/Recdep.py

## Recdep.py
'''
This is a Python translation of RecDep.java (gist: 2883635) with minor improvements (nested dictionaries replacing nested lists).
'''

import nltk
import nltk.data
import pprint
import string

def preprocess(word):
	'''
	Normalise a token: lower-case it and strip all ASCII punctuation.

	word -- the raw token (a str; unicode objects work as well on Python 2).

	Returns the lower-cased token with every character listed in
	string.punctuation removed. A token consisting only of punctuation
	becomes the empty string.
	'''
	# Filtering character by character avoids str.translate(table, deletechars),
	# which is only available for Python 2 byte strings: it is rejected for
	# unicode objects, and string.maketrans no longer exists on Python 3.
	punctuation = string.punctuation
	return "".join(ch for ch in word.lower() if ch not in punctuation)


def addTerm(term, termList, docName, termPos):
	'''
	Record one occurrence of term in the positional index.

	termList maps term -> {docName: [positions]}. termPos is appended to the
	position list for (term, docName); missing levels are created on first
	use. The index is updated in place and nothing is returned.
	'''
	postings = termList.setdefault(term, {})
	postings.setdefault(docName, []).append(termPos)

def addDoc(f, termList, docList):
	'''
	Index every token of one open document.

	f        -- readable file-like object with a .name attribute; its whole
	            content is read and tokenised with nltk.word_tokenize.
	termList -- positional index (term -> {docName: [positions]}), updated
	            in place through addTerm with the preprocessed tokens.
	docList  -- dict updated in place: docList[f.name] is set to the index
	            of the document's last token (token count minus one).
	'''
	tokens = nltk.word_tokenize(f.read())
	docName = f.name

	for position, token in enumerate(tokens):
		addTerm(preprocess(token), termList, docName, position)

	# len(tokens) - 1 equals the last loop index, and stays defined (-1) when
	# the document produced no tokens at all.
	# NOTE(review): a true word count would be len(tokens) - confirm which
	# value the consumers of docList expect.
	docList[docName] = len(tokens) - 1

def addDocArray(fArr, termList, docList):
	'''
	Index a collection of open documents.

	Each item of fArr is handed to addDoc in order, so termList and docList
	are updated in place with every document. Any iterable of file-like
	objects is accepted.
	'''
	for doc in fArr:
		addDoc(doc, termList, docList)


# Build the positional index for the three sample articles and dump it.
termList = {}
docList = {}

pp = pprint.PrettyPrinter(indent=4)

# The with statement closes all three files once indexing is finished,
# even if tokenising one of them raises.
with open('../RTE_2012_5_1_Rupert_Murdoch.txt') as f1, \
		open('../BBC_2012_5_1_NATO_Afghanistan.txt') as f2, \
		open('../ATIMES_2012_4_28_BRIC_Future.txt') as f3:
	fileArr = [f1, f2, f3]
	addDocArray(fileArr, termList, docList)

pp.pprint(termList)
	'''
	This is a Python translation of RecDep.java (gist: 2883635) with minor improvements (nested dictionaries replacing nested lists).
	'''

	import nltk
	import nltk.data
	import pprint
	import string

	def preprocess(word):
		'''
		Normalise a token: lower-case it and strip all ASCII punctuation.

		word -- the raw token (a str; unicode objects work as well on Python 2).

		Returns the lower-cased token with every character listed in
		string.punctuation removed. A token consisting only of punctuation
		becomes the empty string.
		'''
		# Filtering character by character avoids str.translate(table, deletechars),
		# which is only available for Python 2 byte strings: it is rejected for
		# unicode objects, and string.maketrans no longer exists on Python 3.
		punctuation = string.punctuation
		return "".join(ch for ch in word.lower() if ch not in punctuation)


	def addTerm(term, termList, docName, termPos):
		'''
		Record one occurrence of term in the positional index.

		termList maps term -> {docName: [positions]}. termPos is appended to the
		position list for (term, docName); missing levels are created on first
		use. The index is updated in place and nothing is returned.
		'''
		postings = termList.setdefault(term, {})
		postings.setdefault(docName, []).append(termPos)

	def addDoc(f, termList, docList):
		'''
		Index every token of one open document.

		f        -- readable file-like object with a .name attribute; its whole
		            content is read and tokenised with nltk.word_tokenize.
		termList -- positional index (term -> {docName: [positions]}), updated
		            in place through addTerm with the preprocessed tokens.
		docList  -- dict updated in place: docList[f.name] is set to the index
		            of the document's last token (token count minus one).
		'''
		tokens = nltk.word_tokenize(f.read())
		docName = f.name

		for position, token in enumerate(tokens):
			addTerm(preprocess(token), termList, docName, position)

		# len(tokens) - 1 equals the last loop index, and stays defined (-1) when
		# the document produced no tokens at all.
		# NOTE(review): a true word count would be len(tokens) - confirm which
		# value the consumers of docList expect.
		docList[docName] = len(tokens) - 1

	def addDocArray(fArr, termList, docList):
		'''
		Index a collection of open documents.

		Each item of fArr is handed to addDoc in order, so termList and docList
		are updated in place with every document. Any iterable of file-like
		objects is accepted.
		'''
		for doc in fArr:
			addDoc(doc, termList, docList)


	# Build the positional index for the three sample articles and dump it.
	termList = {}
	docList = {}

	pp = pprint.PrettyPrinter(indent=4)

	# The with statement closes all three files once indexing is finished,
	# even if tokenising one of them raises.
	with open('../RTE_2012_5_1_Rupert_Murdoch.txt') as f1, \
			open('../BBC_2012_5_1_NATO_Afghanistan.txt') as f2, \
			open('../ATIMES_2012_4_28_BRIC_Future.txt') as f3:
		fileArr = [f1, f2, f3]
		addDocArray(fileArr, termList, docList)

	pp.pprint(termList)