@pbdeuchler · Created February 22, 2012 19:01
CSCI 345 Project 1
import shelve
import os
import sys
# Global vars
SHELF = os.getcwd() + '/' + 'store'  # path of the persistent database
LOC = os.getcwd() + '/files/'  # corpus location (os.getcwd() gets the current directory)
write = sys.stdout.write  # shorthand for printing query results
db = {}  # in-memory index: {'term': [freq, [list of doc ids], # of docs found]}
# -----------------------------------------------------------------------------------------------
# Main function
def main():
    directory = os.listdir(LOC)
    count = 0
    print "Indexing..."
    for x in directory:  # loop through files in the supplied directory
        count = count + 1  # for progress tracking purposes
        if x == '.DS_Store':  # skip OS X .DS_Store files
            continue
        x = LOC + x
        FileRead(x)  # creates a FileRead object, which then processes the data
    # Show queries for the required terms
    write(getpostings('file'))
    write(getpostings('performance'))
    write(getpostings('read'))
    write(getpostings('window'))
    write(getpostings('subject'))
    sh = shelve.open(SHELF)  # open the persistent database
    sh['index'] = db  # store the index for future reference
    sh.close()  # close the database connection
# -----------------------------------------------------------------------------------------------
# Helper functions
def getpostings(query):  # required function; also formats the db query result
    try:
        value = db[query]
    except KeyError:
        return 'Key not found'
    value[1] = map(int, value[1])  # doc ids are numeric filenames
    value[1].sort()
    fifty = str(value[1][0:50])[1:-1]
    freq = value[0]
    docs = value[2]
    output = '''
%s was found %i times in %i documents.
The first 50 were:
%s
''' % (query.capitalize(), freq, docs, fifty)
    return output
# -----------------------------------------------------------------------------------------------
# Class definitions
class FileRead(object):
    def __init__(self, input):
        self.location = input  # for the file open
        self.filename = input.split('.')[0].split('/')[-1]  # for tracking and indexing
        self.filetype = input.split('.')[1]  # just in case
        try:
            self.terms = list(self.yield_valid_terms())  # materializes the generator into a list of terms
        except IOError:
            print input.upper() + " READ FAILED"
            return
        self.index = {}  # only initialized if everything else has passed thus far
        try:
            self.make_index()  # indexes terms into {'term': freq} format
        except Exception:
            print self.filename.upper() + '.' + self.filetype.upper() + " INDEX FAILED"
        try:
            self.store()  # merges the index into the main database; format is {'term': [freq, [list of doc ids], # of docs found]}
        except Exception:
            print self.filename.upper() + '.' + self.filetype.upper() + " STORE FAILED"

    def yield_valid_terms(self):
        with open(self.location) as f:  # opens the file
            for line in f:  # iterates through each line
                for term in line.split():  # splits the line on whitespace
                    if any(c.isalpha() for c in term):  # a valid term needs at least one alphabetic character
                        yield term

    def make_index(self):
        for term in self.terms:
            try:
                self.index[term] = self.index[term] + 1  # if the term already exists in the index, increment its freq
            except KeyError:
                self.index[term] = 1  # if the term doesn't exist, create an entry with a starting freq of 1

    def store(self):
        for term in self.index:
            try:
                value = db[term]  # if the term already exists in the database, get it
                value[0] = value[0] + self.index[term]  # add this file's count to the freq
                value[1].append(self.filename)  # add this file's id to the doc id list
                value[2] = len(value[1])  # update the document count
                db[term] = value  # save the updated entry
            except KeyError:
                db[term] = [self.index[term], [self.filename], 1]  # if the term isn't found, create a new entry
# -----------------------------------------------------------------------------------------------
# Called on execution
if __name__ == '__main__':
    main()  # call the main function
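
For a quick sanity check after main() has run, the persisted index can be read back from the shelve file. The snippet below is a minimal sketch, not part of the original gist; it assumes the script was run from the same working directory, so the 'store' file exists and holds the merged index under the 'index' key.

import shelve

sh = shelve.open('store')  # same file that main() writes (SHELF resolves to ./store)
index = sh['index']        # {'term': [freq, [list of doc ids], # of docs found]}
freq, doc_ids, num_docs = index.get('file', [0, [], 0])  # 'file' is one of the required query terms
print "'file' occurs %i times across %i documents" % (freq, num_docs)
sh.close()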