@pbdeuchler · Created February 22, 2012 19:01
CSCI 345 Project 1
import shelve
import os
import sys
# Global vars
SHELF = os.getcwd() + '/' + 'store'  # path of the persistent database
LOC = os.getcwd() + '/files/'  # corpus location (os.getcwd() gets the current directory)
write = sys.stdout.write  # shorthand for printing query results
db = {}  # in-memory index: {'term': [freq, [list of doc ids], # of docs found]}
# -----------------------------------------------------------------------------------------------
# Main function
def main():
    directory = os.listdir(LOC)
    count = 0
    print "Indexing..."
    for x in directory:  # loop through files in the supplied directory
        count = count + 1  # for progress tracking purposes
        if x == '.DS_Store':  # skip OS X .DS_Store files
            continue
        x = LOC + x
        FileRead(x)  # creates a FileRead object, which then processes the data
    # Show queries for the required terms
    write(getpostings('file'))
    write(getpostings('performance'))
    write(getpostings('read'))
    write(getpostings('window'))
    write(getpostings('subject'))
    sh = shelve.open(SHELF)  # open the persistent database
    sh['index'] = db  # store the index for future reference
    sh.close()  # close the database connection
# -----------------------------------------------------------------------------------------------
# Helper functions
def getpostings(query):  # required function; also formats the db query result
    try:
        value = db[query]
    except KeyError:
        return 'Key not found'
    value[1] = map(int, value[1])  # doc ids are numeric filenames
    value[1].sort()
    fifty = str(value[1][0:50])[1:-1]
    freq = value[0]
    docs = value[2]
    output = '''
%s was found %i times in %i documents.
The first 50 were:
%s
''' % (query.capitalize(), freq, docs, fifty)
    return output
# -----------------------------------------------------------------------------------------------
# Class definitions
class FileRead(object):
    def __init__(self, input):
        self.location = input  # for the file open
        self.filename = input.split('.')[0].split('/')[-1]  # for tracking and indexing
        self.filetype = input.split('.')[1]  # just in case
        try:
            self.terms = list(self.yield_valid_terms())  # materializes the generator into a list of terms
        except IOError:
            print input.upper() + " READ FAILED"
            return
        self.index = {}  # only initialized if everything else has passed thus far
        try:
            self.make_index()  # indexes terms into {'term': freq} format
        except Exception:
            print self.filename.upper() + '.' + self.filetype.upper() + " INDEX FAILED"
        try:
            self.store()  # merges the index into the main database; format is {'term': [freq, [list of doc ids], # of docs found]}
        except Exception:
            print self.filename.upper() + '.' + self.filetype.upper() + " STORE FAILED"

    def yield_valid_terms(self):
        with open(self.location) as f:  # opens the file
            for line in f:  # iterates through each line
                for term in line.split():  # splits the line on whitespace
                    if any(c.isalpha() for c in term):  # a valid term needs at least one alphabetic character
                        yield term

    def make_index(self):
        for term in self.terms:
            try:
                self.index[term] = self.index[term] + 1  # if the term already exists in the index, increment its freq
            except KeyError:
                self.index[term] = 1  # if the term doesn't exist, create an entry with a starting freq of 1

    def store(self):
        for term in self.index:
            try:
                value = db[term]  # if the term already exists in the database, get it
                value[0] = value[0] + self.index[term]  # add this file's count to the freq
                value[1].append(self.filename)  # add this file's id to the doc id list
                value[2] = len(value[1])  # update the document count
                db[term] = value  # save the updated entry
            except KeyError:
                db[term] = [self.index[term], [self.filename], 1]  # if the term isn't found, create a new entry
# -----------------------------------------------------------------------------------------------
# Called on execution
if __name__ == '__main__':
    main()  # call the main function
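
For a quick sanity check after main() has run, the persisted index can be read back from the shelve file. The snippet below is a minimal sketch, not part of the original gist; it assumes the script was run from the same working directory, so the 'store' file exists and holds the merged index under the 'index' key.

import shelve

sh = shelve.open('store')  # same file that main() writes (SHELF resolves to ./store)
index = sh['index']        # {'term': [freq, [list of doc ids], # of docs found]}
freq, doc_ids, num_docs = index.get('file', [0, [], 0])  # 'file' is one of the required query terms
print "'file' occurs %i times across %i documents" % (freq, num_docs)
sh.close()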