Skip to content

Instantly share code, notes, and snippets.

@markng
Created March 17, 2010 22:11
Show Gist options
  • Save markng/335788 to your computer and use it in GitHub Desktop.
Save markng/335788 to your computer and use it in GitHub Desktop.
Nowhere near finished: parses OCR output from Arizona campaign contribution filings.
import os
import sys
import curses.ascii
import pickle
import pprint
import re
from optparse import OptionParser
class Page(object):
    """One page of OCR'd contribution text, split into entries.

    The page text is scanned line by line.  Everything up to and including
    the first line matching ``a Name`` is skipped; lines matching ``b Name``
    through ``e Name`` are seams separating one entry from the next; a line
    matching ``ENTER TOTAL ONLY`` ends the entries.  All matches are
    case-insensitive substring searches.

    Attributes:
        number: page number; defaults to 0 and is never set by this class.
        entries: list of ``Entry`` objects built by ``parse``.
        text: the raw page text given to the constructor.
    """

    # Class-level defaults.  ``entries`` is rebound to a fresh list on every
    # parse, so this shared list is never mutated through an instance.
    number = 0
    entries = []
    text = ""

    # Compiled once for the class instead of on every parse() call.
    _FIRST_ENTRY = re.compile('a Name', re.IGNORECASE)
    # The seam between two consecutive entries.
    _ENTRY_SEAM = re.compile('(b Name|c Name|d Name|e Name)', re.IGNORECASE)
    # The end of the last entry on the page.
    _LAST_ENTRY_END = re.compile('ENTER TOTAL ONLY', re.IGNORECASE)

    def __init__(self, text):
        """Store *text* and parse it into entries immediately."""
        self.text = text
        self.parse()

    def __str__(self):
        """Return a one-line summary: page number, entry count, text length."""
        return 'Page %d , %d entries, %d text' % (
            self.number, len(self.entries), len(self.text))

    def parse(self):
        """Rebuild ``self.entries`` from ``self.text``.

        A page with no ``a Name`` header (including an empty page) yields no
        entries rather than raising.  The lines collected after the last
        seam are stored as the final entry when there are any.

        Returns:
            True, always.
        """
        # Per-instance list: without this every Page would append to the
        # single list stored on the class.
        self.entries = []

        lines = iter(self.text.splitlines())

        # Skip ahead to the header of the first entry; the header line itself
        # is consumed and discarded.
        for line in lines:
            if self._FIRST_ENTRY.search(line):
                break
        else:
            return True  # no header on this page, so nothing to parse
        print("found first entry")

        entrytext = []
        for line in lines:
            if self._LAST_ENTRY_END.search(line):
                break
            if self._ENTRY_SEAM.search(line):
                # Seam lines close the current entry and are not kept.
                self._add_entry(entrytext)
                entrytext = []
            else:
                entrytext.append(line)
        else:
            # Ran out of lines before seeing the end-of-entries marker.
            print("break")

        # Lines gathered after the final seam belong to the last entry.
        if entrytext:
            self._add_entry(entrytext)
        return True

    def _add_entry(self, entrytext):
        """Wrap *entrytext* in an ``Entry``, report it, and store it."""
        print("new entry")
        entry = Entry(entrytext)
        print(entry)
        self.entries.append(entry)
class Entry(object):
    """A single contribution entry holding the raw text that belongs to it."""

    # Class-level defaults; ``text`` is always replaced per instance below.
    number = 0
    text = []

    def __init__(self, text):
        """Keep *text* on the instance exactly as it was passed in."""
        self.text = text

    def __str__(self):
        """Return the ``str()`` form of the stored text."""
        rendered = str(self.text)
        return rendered
def run(argv=None):
    """Split an OCR text file into pages, parse each, and pickle the result.

    The input file is read line by line.  A line that starts with a form
    feed character closes the text accumulated so far, which becomes one
    ``Page``; the form-feed line itself starts the next page's text.  The
    list of pages is pickled to ``out.pickle`` in the current directory.

    Args:
        argv: argument vector including the program name at index 0;
            defaults to ``sys.argv``.  The first positional argument is the
            path of the file to read.

    Returns:
        The list of ``Page`` objects, numbered from 1 in file order.

    Raises:
        SystemExit: via ``OptionParser.error`` when no file is given.
    """
    if argv is None:
        argv = sys.argv
    parser = OptionParser('usage: %prog <file>')
    _options, arguments = parser.parse_args(argv[1:])
    if not arguments:
        # Print the usage message and exit instead of failing with IndexError.
        parser.error('no input file given')
    filename = os.path.abspath(arguments[0])

    pages = []
    pagelines = []
    with open(filename) as thefile:
        for line in thefile:
            if line.startswith('\x0c'):  # form feed (^L)
                page = Page(''.join(pagelines))
                page.number = len(pages) + 1
                pages.append(page)
                pagelines = []
            pagelines.append(line)
    # NOTE(review): text after the last form feed is never turned into a
    # Page, so a file without any form feed yields no pages.  Left as-is;
    # confirm whether a trailing page should be emitted.

    # Binary mode: pickle writes bytes, and text mode can corrupt the stream.
    with open('out.pickle', 'wb') as outfile:
        pickle.dump(pages, outfile)
    return pages
if __name__ == '__main__':
    # Parse the file named on the command line, then print one banner line
    # per page using the page's string summary.
    pages = run()
    for page in pages:
        banner = '----------- NEXT PAGE %s ---------' % (page,)
        print(banner)
        # Disabled debugging aid: dump the third line of every entry.
        #for entry in page.entries:
        #    print "--------NEXT----------"
        #    print entry.text[2]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment