Skip to content

Instantly share code, notes, and snippets.

@mrtopf
Created October 2, 2010 23:01
Show Gist options
  • Save mrtopf/608079 to your computer and use it in GitHub Desktop.
Save mrtopf/608079 to your computer and use it in GitHub Desktop.
import sys
from elementtree.ElementTree import parse
class Record(object):
    """One budget line: a function code, a purpose text and three amounts.

    The three amounts are handed in as text and stored as ``int`` values
    (see ``_toint``); ``funktion`` and ``zweck`` are stored unchanged.
    """

    def _toint(self, s):
        """Convert a textual amount to ``int``.

        Spaces are removed first, so grouped digits such as ``"1 000"``
        and a padded dash such as ``" - "`` are both accepted.  A lone
        dash is mapped to ``-1``.

        Raises:
            ValueError: if the remaining text is not an integer literal.
        """
        # Strip the spaces *before* testing for the dash so that a dash
        # surrounded by padding is not passed on to int().
        s = s.replace(" ", "")
        if s == "-":
            return -1
        return int(s)

    def __init__(self, funktion, zweck, soll11, soll10, ist09):
        """Store the five column values of one record row.

        Args:
            funktion: text of the first column, stored as given.
            zweck: text of the second column, stored as given.
            soll11: amount text shown as "Soll 2011" by ``__str__``.
            soll10: amount text shown as "Soll 2010" by ``__str__``.
            ist09: amount text shown as "Ist 2009" by ``__str__``.
        """
        self.funktion = funktion
        self.zweck = zweck
        self.soll11 = self._toint(soll11)
        self.soll10 = self._toint(soll10)
        self.ist09 = self._toint(ist09)

    def __str__(self):
        """Return a one-line, human-readable summary of the record."""
        # Format everything as text first.  The previous version encoded
        # ``zweck`` to bytes and then mixed those bytes into a unicode
        # template, which fails (Python 2) or prints ``b'...'`` (Python 3)
        # as soon as the text contains non-ASCII characters.
        text = u"""<Record: %s: %s, Soll 2011: %s, Soll 2010: %s, Ist 2009: %s>""" % (
            self.funktion,
            self.zweck,
            self.soll11,
            self.soll10,
            self.ist09,
        )
        if not isinstance(text, str):
            # Python 2 only: ``text`` is ``unicode`` there, but __str__ has
            # to return a byte string.  Keep the latin-1/"ignore" encoding
            # the original code used.
            text = text.encode("latin-1", "ignore")
        return text
class Row(object):
    """A single text fragment together with its ``top``/``left`` position."""

    def __init__(self, line):
        """Copy position and text from *line*.

        *line* has to provide an ``attrib`` mapping whose ``'top'`` and
        ``'left'`` entries are accepted by ``int`` and a ``text`` attribute.
        """
        attrs = line.attrib
        self.top, self.left = int(attrs['top']), int(attrs['left'])
        self.text = line.text
class Page(object):
    """One page: its number plus the rows found on it, grouped by ``top``."""

    def __init__(self, no):
        """Create an empty page carrying the page number *no*."""
        self.rows = {}
        self.no = no

    def add(self, line):
        """Wrap *line* in a ``Row`` and file it under its ``top`` value."""
        key = int(line.attrib['top'])
        # Fetch (or create) the bucket first, then build the Row, exactly
        # in that order.
        bucket = self.rows.setdefault(key, [])
        bucket.append(Row(line))
class HaushaltsParser(object):
    """Parse a budget ("Haushalt") XML file into pages, rows and records."""

    # ``left`` value of the column that starts a record row (the original
    # comment calls it "the column with F- in front").
    _RECORD_LEFT = 68
    # Number of columns a row must have to be turned into a ``Record``.
    _RECORD_COLUMNS = 5

    def __init__(self, filename):
        """Remember the file to parse and start with empty results.

        Args:
            filename: path of the XML file.  The original note says it has
                to be created with ``htmltopdf -x <file.pdf>``.
                NOTE(review): presumably ``pdftohtml -xml`` is meant --
                confirm.
        """
        self.filename = filename
        self.pages = []
        self.records = []  # filled by find_record()

    def read(self):
        """Read the file and build one ``Page`` (with its rows) per page."""
        tree = parse(self.filename)
        for pageobj in tree.getroot().findall("page"):
            page = Page(int(pageobj.attrib['number']))
            for line in pageobj.findall("text"):
                page.add(line)
            self.pages.append(page)

    def process(self):
        """Turn the rows of all pages into records.

        Rows are visited in ascending ``top`` order within each page, so
        the result no longer depends on dictionary iteration order.
        """
        for page in self.pages:
            for top in sorted(page.rows):
                row = page.rows[top]
                for column in row:
                    if column.left == self._RECORD_LEFT:
                        self.find_record(column, row, page)

    def find_record(self, column, row, page):
        """Try to build a ``Record`` from the row that *column* belongs to.

        The row is looked up via ``page.rows[column.top]``; *row* is
        accepted for interface compatibility only.  On success the record
        is appended to ``self.records`` and printed.  Rows that do not have
        exactly five text columns, or whose amounts cannot be parsed, are
        skipped.
        """
        # sorted() returns a new list.  The previous in-place sort
        # reordered the very list process() is iterating over.
        columns = sorted(page.rows[column.top], key=lambda c: c.left)
        cols_text = [c.text for c in columns]
        if len(cols_text) != self._RECORD_COLUMNS or None in cols_text:
            return
        try:
            record = Record(*cols_text)
        except ValueError:
            # An amount column did not contain a number: not a record row.
            return
        self.records.append(record)
        # Single pre-formatted argument: prints the same space-separated
        # line under the Python 2 print statement and Python 3 print().
        print("%s %s %s %s %s" % (
            record.funktion,
            record.zweck,
            record.soll11,
            record.soll10,
            record.ist09,
        ))
if __name__ == "__main__":
    # Parse the file named on the command line; without an argument fall
    # back to the sample file name that used to be hard-coded here.
    filename = sys.argv[1] if len(sys.argv) > 1 else "epl01.xml"
    p = HaushaltsParser(filename)
    p.read()
    p.process()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment