Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save danieroux/236fd31aff25ebfa5b5963d0f44db786 to your computer and use it in GitHub Desktop.
Save danieroux/236fd31aff25ebfa5b5963d0f44db786 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#
# Rough but serviceable Standard Bank South Africa PDF statement to CSV extraction.
# Exports to Xero's import format
#
# Needs:
# - poppler
# - lxml for python to be installed (pip install lxml)
from lxml import etree as ET
from subprocess import call
import re
import os
import csv
class ParseStandardBankPDF(object):
    """Convert one Standard Bank PDF statement into a CSV file.

    The PDF is converted to XML with poppler's ``pdftohtml``. On every page
    that contains a "BALANCE BROUGHT FORWARD" ``<text>`` element, the sibling
    elements that follow it are consumed in groups of five (reference,
    description, amount, date, balance) and written out as
    ``Date, Amount, Description, Reference`` CSV rows.
    """

    # Amounts look like "1.234,56" with an optional trailing "-":
    #   group 1: whole part, may contain "." separators (stripped later)
    #   group 2: digits after the comma
    #   group 3: trailing minus sign, or the empty string
    # Compiled once here instead of on every makeAmount() call.
    _AMOUNT_PATTERN = re.compile(r'([\d.]*),(\d*)(-?)')

    def __init__(self, year, pdf_location):
        """Remember the statement year and derive output paths from the PDF path.

        year -- value placed in front of every transaction date.
        pdf_location -- path of the PDF statement; the CSV and XML paths are
            the same path with the extension replaced.
        """
        base_name, _ext = os.path.splitext(pdf_location)
        self.pdf_location = pdf_location
        self.filename = base_name
        self.csv_location = base_name + ".csv"
        self.xml_location = base_name + ".xml"
        self.year = year

    def makeXmlTree(self, pdf_location):
        """Run ``pdftohtml -xml`` on *pdf_location* and parse the resulting XML.

        Raises RuntimeError when pdftohtml exits with a non-zero status, so a
        failed conversion can never be masked by a stale XML file left over
        from an earlier run.
        """
        # Argument list instead of a shell string: paths containing quotes or
        # other shell metacharacters are passed through untouched.
        exit_code = call(["pdftohtml", "-xml", "-nodrm", pdf_location])
        if exit_code != 0:
            raise RuntimeError(
                "pdftohtml failed with exit code {} for '{}'".format(exit_code, pdf_location))
        # Parse the XML that belongs to the PDF we were actually given.
        # NOTE(review): assumes pdftohtml writes "<pdf path without extension>.xml"
        # when no output name is supplied -- confirm against the installed poppler.
        xml_location = os.path.splitext(pdf_location)[0] + ".xml"
        return ET.parse(xml_location)

    def parseTransactions(self):
        """Return the raw transaction rows from every page of the statement.

        Each row is a list of five strings as produced by doPageWithEntries().
        Pages without a "BALANCE BROUGHT FORWARD" element are skipped.
        """
        tree = self.makeXmlTree(self.pdf_location)
        transactions = []
        first_page_done = False
        for page in tree.getroot().iter('page'):
            markers = page.xpath('.//text[text()="BALANCE BROUGHT FORWARD"]')
            if not markers:
                continue
            entries = self.doPageWithEntries(markers[0].itersiblings(), first_page_done)
            transactions.extend(entries)
            first_page_done = True
        return transactions

    def doPageWithEntries(self, cursor, first_page_done):
        """Read transaction rows from *cursor*, an iterator of elements with ``.text``.

        cursor -- iterator positioned just after the "BALANCE BROUGHT FORWARD"
            element of a page.
        first_page_done -- False for the first page that carries entries; that
            page has one extra leading element which is skipped.

        Returns a list of ``[reference, description, amount, date, balance]``
        text lists. A trailing incomplete group is discarded.
        """
        transactions_on_page = []
        try:
            # Skip the element directly after the marker; the first page has
            # one more element to skip (presumably header cells -- TODO confirm).
            next(cursor)
            if not first_page_done:
                next(cursor)
        except StopIteration:
            # Nothing follows the marker: the page simply has no entries.
            return transactions_on_page

        while True:
            try:
                # next() works on both Python 2 and Python 3 iterators,
                # unlike the Python-2-only cursor.next() method.
                type_and_number = next(cursor)
                description = next(cursor)
                amount = next(cursor)
                if amount.text == '##':
                    # A '##' cell sits in front of the real amount; step over it.
                    amount = next(cursor)
                date_ish = next(cursor)
                balance = next(cursor)
            except StopIteration:
                # Ran out of elements, possibly mid-row: drop the partial row.
                break
            transactions_on_page.append([
                type_and_number.text,
                description.text,
                amount.text,
                date_ish.text,
                balance.text,
            ])
        return transactions_on_page

    def makeAmountFromMatch(self, matchobject):
        """Turn one _AMOUNT_PATTERN match into ``[-]<whole>.<cents>``."""
        rand = matchobject.group(1).replace(".", "")
        cents = matchobject.group(2)
        minus = matchobject.group(3)
        return "{}{}.{}".format(minus, rand, cents)

    def makeAmount(self, string_amount):
        """Rewrite amounts such as ``"1.234,56-"`` as ``"-1234.56"``.

        Text in *string_amount* that does not match the amount pattern is
        returned unchanged.
        """
        return self._AMOUNT_PATTERN.sub(self.makeAmountFromMatch, string_amount)

    def makeDate(self, string_date):
        """Return ``"<year>-<month>-<day>"`` for a ``"<month> <day>"`` string.

        The two fields may be separated by any run of whitespace. Raises
        ValueError when *string_date* does not contain exactly two fields.
        """
        # split() without an argument tolerates repeated or surrounding
        # whitespace, which split(" ") turned into a ValueError.
        month, day = string_date.split()
        return "{}-{}-{}".format(self.year, month, day)

    def makeLine(self, list):
        """Map one raw five-field transaction row to a CSV row dict.

        The running balance (fifth field) is not exported. The parameter keeps
        its original name for backward compatibility even though it shadows
        the builtin inside this method.
        """
        type_and_number, description, amount, date_ish, _balance = list
        return {
            'Reference': type_and_number,
            'Description': description,
            'Amount': self.makeAmount(amount),
            'Date': self.makeDate(date_ish),
        }

    def writeTransactions(self, listOfList):
        """Write the raw transaction rows to ``self.csv_location`` with a header row."""
        import sys

        # The csv module wants newline translation disabled on the underlying
        # file: newline='' on Python 3, binary mode on Python 2. Otherwise
        # rows are separated by "\r\r\n" on platforms that translate "\n".
        if sys.version_info[0] >= 3:
            csvfile = open(self.csv_location, 'w', newline='')
        else:
            csvfile = open(self.csv_location, 'wb')
        with csvfile:
            fieldnames = ['Date', 'Amount', 'Description', 'Reference']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for transaction in listOfList:
                writer.writerow(self.makeLine(transaction))

    def go(self):
        """Parse the PDF statement and write its transactions to the CSV file."""
        parsed = self.parseTransactions()
        self.writeTransactions(parsed)
# Convert each 2015 statement in turn; every PDF gets a CSV file next to it.
for statement_pdf in (
    'Statement number 24.pdf',
    'Statement number 25.pdf',
    'Statement number 26.pdf',
    'Statement number 27.pdf',
):
    ParseStandardBankPDF("2015", statement_pdf).go()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment