Created
March 17, 2010 22:11
-
-
Save markng/335788 to your computer and use it in GitHub Desktop.
Nowhere near finished: parses OCR text from Arizona campaign contributions.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import curses.ascii | |
import pickle | |
import pprint | |
import re | |
from optparse import OptionParser | |
class Page(object):
    """A page of contributions, split into entries from its raw text.

    Attributes:
        number: page number; stays 0 until the caller assigns one.
        entries: list of Entry objects collected by parse().
        text: the raw page text handed to the constructor.
    """
    number = 0
    text = ""

    # Compiled once for the class instead of on every parse() call.
    _FIRST_ENTRY = re.compile('a Name', re.IGNORECASE)
    # Finds the seam between entries.
    _ENTRY_SEAM = re.compile('(b Name|c Name|d Name|e Name)', re.IGNORECASE)
    # Finds the end of the last entry.
    _LAST_ENTRY_END = re.compile('ENTER TOTAL ONLY', re.IGNORECASE)

    def __init__(self, text):
        """Store the page text and parse it into entries immediately."""
        self.text = text
        # Must be a per-instance list: a class-level list would be shared
        # (and appended to) by every page, and would never be pickled.
        self.entries = []
        self.parse()

    def __str__(self):
        """Return a one-line summary: number, entry count, text length."""
        return('Page %d , %d entries, %d text'
               % (self.number, len(self.entries), len(self.text)))

    def parse(self):
        """Split the page text into entries, line by line.

        Lines before the first-entry marker are skipped, and the marker
        line itself is discarded. Following lines are accumulated until a
        seam marker closes the current entry; the end-of-last-entry marker
        (or running out of lines) stops the scan, and whatever has been
        accumulated by then is kept as the final entry.

        Returns:
            True if the first-entry marker was found, False if the page
            contains no such marker (the entries list is left untouched).
        """
        lines = iter(self.text.splitlines())

        # Look for the start of the first entry.
        for line in lines:
            if self._FIRST_ENTRY.search(line):
                break
        else:
            # Empty page or no marker at all: nothing to parse.
            return False
        print("found first entry")

        entrytext = []
        for line in lines:
            if self._LAST_ENTRY_END.search(line):
                break
            if self._ENTRY_SEAM.search(line):
                self._add_entry(entrytext)
                entrytext = []
            else:
                entrytext.append(line)
        else:
            # Ran out of lines before seeing the end marker.
            print("break")

        # The last entry is closed by the end marker (or end of text), not
        # by a seam, so it has to be flushed here or it would be lost.
        if entrytext:
            self._add_entry(entrytext)
        return True

    def _add_entry(self, entrytext):
        """Wrap the accumulated lines in an Entry and record it."""
        print("new entry")
        entry = Entry(entrytext)
        print(entry)
        self.entries.append(entry)
class Entry(object):
    """A contribution entry: the raw text lines belonging to one entry.

    Attributes:
        number: entry number; nothing in this class assigns it, so it
            stays 0 unless a caller sets it.
        text: the list of lines passed to the constructor.
    """
    number = 0

    def __init__(self, text):
        """Keep the given list of lines as this entry's text."""
        # Set per instance; there is deliberately no class-level list
        # default, which would be a single object shared by all entries.
        self.text = text

    def __str__(self):
        """Return the string form of the stored lines."""
        return(str(self.text))
def run(argv=None):
    """Run the program: split a text file into pages and pickle them.

    The input file is read line by line. A line that starts with a form
    feed character begins a new page: the text accumulated so far becomes
    a Page (numbered from 1 in order of creation) and accumulation starts
    again with that line. The resulting list is pickled to 'out.pickle'
    in the current working directory.

    Args:
        argv: full argument vector including the program name; defaults
            to sys.argv. The first positional argument is the input file.

    Returns:
        The list of Page objects that was pickled.

    Raises:
        SystemExit: via the option parser when no input file is given.
    """
    if argv is None:
        argv = sys.argv
    parser = OptionParser('usage: %prog <file>')
    _options, arguments = parser.parse_args(argv[1:])
    if not arguments:
        # Report a usage error instead of failing with a bare IndexError.
        parser.error('missing <file> argument')
    filename = os.path.abspath(arguments[0])

    pages = []

    def add_page(text):
        """Build a Page from text and append it with the next number."""
        page = Page(text)
        page.number = len(pages) + 1
        pages.append(page)

    pagelines = []
    with open(filename) as thefile:
        for line in thefile:
            # '\x0c' is the form feed (^L) that starts a new page.
            if line.startswith('\x0c'):
                add_page("".join(pagelines))
                pagelines = []
            pagelines.append(line)

    # Text after the last form feed is a page too; skip it only when it
    # is blank (e.g. a lone trailing form feed), so no content is lost.
    trailing = "".join(pagelines)
    if trailing.strip():
        add_page(trailing)

    # Pickle data must go to a binary-mode file.
    with open('out.pickle', "wb") as outfile:
        pickle.dump(pages, outfile)
    return pages
# Script entry point: parse the file named on the command line and print a
# one-line summary banner for every page found.
if __name__ == '__main__':
    pages = run()
    for page in pages:
        # Single-argument print(...) behaves the same on Python 2 and 3;
        # the one-element tuple keeps %-formatting safe for any object.
        print('----------- NEXT PAGE %s ---------' % (page,))
        # Debug aid left disabled by the author:
        # for entry in page.entries:
        #     print("--------NEXT----------")
        #     print(entry.text[2])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment