# markng/ocrpaio.py

## ocrpaio.py
import os
import sys
import curses.ascii
import pickle
import pprint
import re
from optparse import OptionParser

class Page(object):
    """One page of contribution text, split into entries.

    Constructing a ``Page`` stores the raw page text and parses it
    straight away (see :meth:`parse`).

    Attributes:
        number: Page number. Defaults to 0; the constructor does not set it.
        entries: List of ``Entry`` objects found on the page.
        text: Raw text of the page.
    """

    number = 0
    # Immutable placeholder only.  parse() gives every instance its own list;
    # a shared class-level list would be appended to by every page and would
    # not be stored in the instance (and therefore not pickled with it).
    entries = ()
    text = ""

    # Compiled once for the class instead of on every parse() call.
    _FIRST_ENTRY = re.compile('a Name', re.IGNORECASE)
    # The seam between two consecutive entries.
    _ENTRY_SEAM = re.compile('(b Name|c Name|d Name|e Name)', re.IGNORECASE)
    # The line that ends the last entry of the page.
    _LAST_ENTRY_END = re.compile('ENTER TOTAL ONLY', re.IGNORECASE)

    def __str__(self):
        """Return a summary: page number, entry count and text length."""
        return 'Page %d , %d entries, %d text' % (
            self.number, len(self.entries), len(self.text))

    def __init__(self, text):
        """Store *text* and parse it into entries."""
        self.text = text
        self.parse()

    def parse(self):
        """Split ``self.text`` into entries, line by line.

        Lines up to and including the first one matching 'a Name' are
        skipped.  From there, every line matching 'b Name' .. 'e Name'
        closes the entry being collected and starts the next one; a line
        matching 'ENTER TOTAL ONLY' (or the end of the text) closes the
        last entry.  Marker lines themselves are not kept.

        ``self.entries`` is rebuilt from scratch on every call.  A page
        without an 'a Name' line ends up with no entries.

        Returns:
            True, always.
        """
        self.entries = []
        lines = iter(self.text.splitlines())

        # Look for the start of the first entry; the marker line is dropped.
        for line in lines:
            if self._FIRST_ENTRY.search(line):
                break
        else:
            # No entry marker anywhere on this page: nothing to collect.
            return True
        print("found first entry")

        entrytext = []
        for line in lines:
            if self._LAST_ENTRY_END.search(line):
                break
            if self._ENTRY_SEAM.search(line):
                self._add_entry(entrytext)
                entrytext = []
            else:
                entrytext.append(line)
        else:
            # Ran out of lines before seeing the end-of-entries marker.
            print("break")

        # Whatever was collected after the last seam is the final entry;
        # without this it would be thrown away.
        self._add_entry(entrytext)
        return True

    def _add_entry(self, lines):
        """Wrap *lines* in an ``Entry``, echo it and append it to the page."""
        print("new entry")
        entry = Entry(lines)
        print(entry)
        self.entries.append(entry)

class Entry(object):
    """A single contribution entry.

    ``text`` holds whatever the creator handed in (``Page.parse`` passes the
    list of lines collected for the entry).  ``number`` is a class-level
    default of 0 that nothing in this class changes.
    """

    number = 0
    text = []

    def __str__(self):
        """Return the ``str()`` of the stored text."""
        rendered = str(self.text)
        return rendered

    def __init__(self, text):
        """Remember *text* as the content of this entry."""
        self.text = text

def run(argv=None):
    """Split a text file into pages, parse each page and pickle the result.

    The input file is read line by line.  A line whose first character is
    a form feed (Ctrl-L) starts a new page: the text gathered so far is
    turned into a ``Page`` and the form-feed line becomes the first line
    of the next page.  Text left over after the last form feed is turned
    into a final page if it contains anything other than whitespace.

    The list of pages is pickled to ``out.pickle`` in the current working
    directory.

    Args:
        argv: Full argument vector with the program name in position 0;
            defaults to ``sys.argv``.  The first positional argument is
            the path of the input file.

    Returns:
        The list of ``Page`` objects, numbered from 1 in file order.

    Raises:
        SystemExit: raised by ``OptionParser.error`` when no input file
            is given.
    """
    if argv is None:
        argv = sys.argv

    parser = OptionParser('usage: %prog <file>')
    _options, arguments = parser.parse_args(argv[1:])
    if not arguments:
        # Print the usage line and exit instead of failing with IndexError.
        parser.error('missing <file> argument')
    filename = os.path.abspath(arguments[0])

    pages = []
    pagelines = []  # lines of the page currently being gathered

    def flush_page():
        """Turn the gathered lines into a numbered Page and start afresh."""
        page = Page(''.join(pagelines))
        page.number = len(pages) + 1
        pages.append(page)
        del pagelines[:]

    with open(filename) as thefile:
        for line in thefile:
            if line.startswith('\x0c'):  # form feed, shown as ^L
                flush_page()
            pagelines.append(line)

    # Keep the tail of a file that does not end with a form feed; a tail
    # holding only whitespace (a lone form feed included) is not a page.
    if ''.join(pagelines).strip():
        flush_page()

    # Pickle data is binary, so the file must be opened in binary mode.
    with open('out.pickle', 'wb') as outfile:
        pickle.dump(pages, outfile)
    return pages

if __name__ == '__main__':
    # Script entry point: process the file named on the command line and
    # print the one-line summary of every page.
    pages = run()
    for page in pages:
        banner = '----------- NEXT PAGE %s ---------' % (page,)
        print(banner)
        # To inspect individual entries, iterate over page.entries here
        # and print entry.text.
	import os
	import sys
	import curses.ascii
	import pickle
	import pprint
	import re
	from optparse import OptionParser

	class Page(object):
	    """One page of contribution text, split into entries.

	    Constructing a ``Page`` stores the raw page text and parses it
	    straight away (see :meth:`parse`).

	    Attributes:
	        number: Page number. Defaults to 0; the constructor does not set it.
	        entries: List of ``Entry`` objects found on the page.
	        text: Raw text of the page.
	    """

	    number = 0
	    # Immutable placeholder only.  parse() gives every instance its own
	    # list; a shared class-level list would be appended to by every page
	    # and would not be stored in the instance (nor pickled with it).
	    entries = ()
	    text = ""

	    # Compiled once for the class instead of on every parse() call.
	    _FIRST_ENTRY = re.compile('a Name', re.IGNORECASE)
	    # The seam between two consecutive entries.  Plain '|' is regex
	    # alternation; an escaped pipe would only match a literal '|'.
	    _ENTRY_SEAM = re.compile('(b Name|c Name|d Name|e Name)', re.IGNORECASE)
	    # The line that ends the last entry of the page.
	    _LAST_ENTRY_END = re.compile('ENTER TOTAL ONLY', re.IGNORECASE)

	    def __str__(self):
	        """Return a summary: page number, entry count and text length."""
	        return 'Page %d , %d entries, %d text' % (
	            self.number, len(self.entries), len(self.text))

	    def __init__(self, text):
	        """Store *text* and parse it into entries."""
	        self.text = text
	        self.parse()

	    def parse(self):
	        """Split ``self.text`` into entries, line by line.

	        Lines up to and including the first one matching 'a Name' are
	        skipped.  From there, every line matching 'b Name' .. 'e Name'
	        closes the entry being collected and starts the next one; a line
	        matching 'ENTER TOTAL ONLY' (or the end of the text) closes the
	        last entry.  Marker lines themselves are not kept.

	        ``self.entries`` is rebuilt from scratch on every call.  A page
	        without an 'a Name' line ends up with no entries.

	        Returns:
	            True, always.
	        """
	        self.entries = []
	        lines = iter(self.text.splitlines())

	        # Look for the start of the first entry; the marker line is dropped.
	        for line in lines:
	            if self._FIRST_ENTRY.search(line):
	                break
	        else:
	            # No entry marker anywhere on this page: nothing to collect.
	            return True
	        print("found first entry")

	        entrytext = []
	        for line in lines:
	            if self._LAST_ENTRY_END.search(line):
	                break
	            if self._ENTRY_SEAM.search(line):
	                self._add_entry(entrytext)
	                entrytext = []
	            else:
	                entrytext.append(line)
	        else:
	            # Ran out of lines before seeing the end-of-entries marker.
	            print("break")

	        # Whatever was collected after the last seam is the final entry;
	        # without this it would be thrown away.
	        self._add_entry(entrytext)
	        return True

	    def _add_entry(self, lines):
	        """Wrap *lines* in an ``Entry``, echo it and append it to the page."""
	        print("new entry")
	        entry = Entry(lines)
	        print(entry)
	        self.entries.append(entry)

	class Entry(object):
	    """A single contribution entry.

	    ``text`` holds whatever the creator handed in; ``number`` is a
	    class-level default of 0 that nothing in this class changes.
	    """

	    number = 0
	    text = []

	    def __init__(self, text):
	        """Remember *text* as the content of this entry."""
	        self.text = text

	    def __str__(self):
	        """Return the ``str()`` of the stored text."""
	        return str(self.text)

	def run(argv=None):
	    """Split a text file into pages, parse each page and pickle the result.

	    The input file is read line by line.  A line whose first character is
	    a form feed (Ctrl-L) starts a new page: the text gathered so far is
	    turned into a ``Page`` and the form-feed line becomes the first line
	    of the next page.  Text left over after the last form feed is turned
	    into a final page if it contains anything other than whitespace.

	    The list of pages is pickled to ``out.pickle`` in the current working
	    directory.

	    Args:
	        argv: Full argument vector with the program name in position 0;
	            defaults to ``sys.argv``.  The first positional argument is
	            the path of the input file.

	    Returns:
	        The list of ``Page`` objects, numbered from 1 in file order.

	    Raises:
	        SystemExit: raised by ``OptionParser.error`` when no input file
	            is given.
	    """
	    if argv is None:
	        argv = sys.argv

	    parser = OptionParser('usage: %prog <file>')
	    _options, arguments = parser.parse_args(argv[1:])
	    if not arguments:
	        # Print the usage line and exit instead of failing with IndexError.
	        parser.error('missing <file> argument')
	    filename = os.path.abspath(arguments[0])

	    pages = []
	    pagelines = []  # lines of the page currently being gathered

	    def flush_page():
	        """Turn the gathered lines into a numbered Page and start afresh."""
	        page = Page(''.join(pagelines))
	        page.number = len(pages) + 1
	        pages.append(page)
	        del pagelines[:]

	    with open(filename) as thefile:
	        for line in thefile:
	            if line.startswith('\x0c'):  # form feed, shown as ^L
	                flush_page()
	            pagelines.append(line)

	    # Keep the tail of a file that does not end with a form feed; a tail
	    # holding only whitespace (a lone form feed included) is not a page.
	    if ''.join(pagelines).strip():
	        flush_page()

	    # Pickle data is binary, so the file must be opened in binary mode.
	    with open('out.pickle', 'wb') as outfile:
	        pickle.dump(pages, outfile)
	    return pages

	if __name__ == '__main__':
	    # Script entry point: process the file named on the command line and
	    # print the one-line summary of every page.
	    pages = run()
	    for page in pages:
	        print('----------- NEXT PAGE %s ---------' % (page,))
	        # To inspect individual entries, iterate over page.entries here
	        # and print entry.text.